# Monthly Payment Features

In [1]:
import numpy as np
import pandas as pd
import gc

pd.set_option('display.float_format', '{:.10f}'.format)
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('./data/train.csv')
historical_transactions = pd.read_csv('./data/historical_transactions.csv')
new_merchant_transactions = pd.read_csv('./data/new_merchant_transactions.csv')

In [3]:
all_transactions = pd.concat([historical_transactions, new_merchant_transactions], axis=0)

In [4]:
all_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.70333091,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.73312848,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.73535241,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.72286538,2017-03-10 01:14:19,1.0,16,37


In [5]:
def shift_round(col):
    # shift_round: reverse normalization ?
    # where are the two numbers come from ?
    return np.round(col / 0.00150265118 + 497.06, 2)

all_transactions["purchase_amount_new"] = shift_round(all_transactions["purchase_amount"])

In [6]:
all_transactions.dtypes

authorized_flag          object
card_id                  object
city_id                   int64
category_1               object
installments              int64
category_3               object
merchant_category_id      int64
merchant_id              object
month_lag                 int64
purchase_amount         float64
purchase_date            object
category_2              float64
state_id                  int64
subsector_id              int64
purchase_amount_new     float64
dtype: object

# monthly_payment

In [7]:
monthly_payment = (all_transactions.groupby(["card_id", "month_lag"])
                                   .purchase_amount_new.sum()
                                   .reset_index(name="monthly_pa")
                                   .sort_values(["card_id", "month_lag"]))

In [8]:
monthly_payment.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,-12,1100.2
1,C_ID_00007093c1,-11,1168.9
2,C_ID_00007093c1,-10,1633.25
3,C_ID_00007093c1,-9,2691.13
4,C_ID_00007093c1,-8,2876.63


# psum with rolling ratio

In [9]:
part = monthly_payment[:10000].copy()

part["ratio_1"] = part.rolling(window=2).monthly_pa.apply(lambda x: x[1] / x[0], raw=True)
part["lag_offset"] = part.rolling(window=2).month_lag.apply(lambda x: x[-1] - x[0], raw=True)
part.loc[part.lag_offset < 0, "ratio_1"] = np.NaN
del part["lag_offset"]

for i in range(3):
    window_size = i + 2
    sum_col = "sum_{}".format(window_size)
    ratio_col = "ratio_{}".format(window_size)
    part["lag_offset"] = part.rolling(window=window_size).month_lag.apply(lambda x: x[-1] - x[0], raw=True)
    part[sum_col] = part.rolling(window=window_size).monthly_pa.sum()
    part.loc[part.lag_offset < 0, sum_col] = np.NaN
    part[ratio_col] = part.rolling(window=2)[sum_col].apply(lambda x: x[1] / x[0], raw=True)
    part.loc[part.lag_offset < 0, ratio_col] = np.NaN
    del part["lag_offset"]

part["sum_1"] = part.monthly_pa

In [10]:
monthly_payment_features = pd.DataFrame()

for month in range(15):
    month_lag = month - 12
    temp = part[part.month_lag == month_lag].set_index("card_id").drop(["month_lag", "monthly_pa"], axis=1).rename(columns=lambda x: "{}_lag{}".format(x, month_lag))
    monthly_payment_features = pd.concat([monthly_payment_features, temp], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [11]:
def calc_monthly_payment_features(part):
    part["ratio_1"] = part.rolling(window=2).monthly_pa.apply(lambda x: x[1] / x[0], raw=True)
    part["lag_offset"] = part.rolling(window=2).month_lag.apply(lambda x: x[-1] - x[0], raw=True)
    part.loc[part.lag_offset < 0, "ratio_1"] = np.NaN
    del part["lag_offset"]

    for i in range(3):
        window_size = i + 2
        sum_col = "sum_{}".format(window_size)
        ratio_col = "ratio_{}".format(window_size)
        part["lag_offset"] = part.rolling(window=window_size).month_lag.apply(lambda x: x[-1] - x[0], raw=True)
        part[sum_col] = part.rolling(window=window_size).monthly_pa.sum()
        part.loc[part.lag_offset < 0, sum_col] = np.NaN
        part[ratio_col] = part.rolling(window=2)[sum_col].apply(lambda x: x[1] / x[0], raw=True)
        part.loc[part.lag_offset < 0, ratio_col] = np.NaN
        del part["lag_offset"]

    part["sum_1"] = part.monthly_pa
    
    monthly_payment_features = pd.DataFrame()

    for month in range(15):
        month_lag = month - 12
        temp = part[part.month_lag == month_lag].set_index("card_id").drop(["month_lag", "monthly_pa"], axis=1).rename(columns=lambda x: "{}_lag{}".format(x, month_lag))
        monthly_payment_features = pd.concat([monthly_payment_features, temp], axis=1)
    return monthly_payment_features

In [12]:
# features_df = calc_monthly_payment_features(monthly_payment)
# features_df.shape
# features_df.index.nunique()
# features_df.reset_index().head()

In [13]:
from data_io import features_downcast
from downcast import save_dataframe32, load_dataframe32

In [14]:
save_dataframe32

<function downcast.save_dataframe32(path, dataframe, keep=[])>

In [15]:
# save_dataframe32(features_downcast("monthly_psum"), features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

# next

1. pmax with rolling ratio
2. c-m std
3. m std
4. max-m count
5. max-m count/ month count
6. max-m std

# pmax with rolling ratio

In [21]:
monthly_payment_pmax = (all_transactions.groupby(["card_id", "month_lag"])
                                        .purchase_amount_new.max()
                                        .reset_index(name="monthly_pa")
                                        .sort_values(["card_id", "month_lag"]))

In [22]:
monthly_payment_pmax.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,-12,521.8
1,C_ID_00007093c1,-11,230.0
2,C_ID_00007093c1,-10,500.0
3,C_ID_00007093c1,-9,702.0
4,C_ID_00007093c1,-8,500.0


In [23]:
features_df = calc_monthly_payment_features(monthly_payment_pmax)
save_dataframe32(features_downcast("monthly_pmax"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

In [26]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.8202826
1,2017-01,C_ID_3d0044924f,4,1,0,0.39291325
2,2016-08,C_ID_d639edf6cd,2,2,0,0.68805599
3,2017-09,C_ID_186d6a6901,4,3,0,0.1424952
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.15974919


# merchant_pmax

In [31]:
monthly_payment_merchant_psum =  (all_transactions.groupby(["card_id", "month_lag", "merchant_id"])
                                                  .purchase_amount_new.sum()
                                                  .reset_index(name="monthly_pa")
                                                  .sort_values(["card_id", "month_lag"]))

In [33]:
monthly_payment_merchant_psum.shape

(20509676, 4)

In [34]:
monthly_payment_merchant_psum.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa
0,C_ID_00007093c1,-12,M_ID_69423b34e4,521.8
1,C_ID_00007093c1,-12,M_ID_9400cf2342,116.0
2,C_ID_00007093c1,-12,M_ID_a33355a1b7,462.4
3,C_ID_00007093c1,-11,M_ID_69423b34e4,515.9
4,C_ID_00007093c1,-11,M_ID_9400cf2342,653.0


In [35]:
monthly_payment_merchant_psum_pmax = (monthly_payment_merchant_psum.groupby(["card_id", "month_lag"])
                                                                   .monthly_pa.max()
                                                                   .reset_index(name="monthly_pa")
                                                                   .sort_values(["card_id", "month_lag"]))

In [36]:
features_df = calc_monthly_payment_features(monthly_payment_merchant_psum_pmax)

In [42]:
save_dataframe32(features_downcast("monthly_merchant_pmax"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

# max_merchant 

In [43]:
monthly_payment_merchant_psum_pmax.head()

Unnamed: 0,card_id,month_lag,monthly_pa,ratio_1,sum_2,ratio_2,sum_3,ratio_3,sum_4,ratio_4,sum_1
0,C_ID_00007093c1,-12,521.8,,,,,,,,521.8
1,C_ID_00007093c1,-11,653.0,1.2514373323,1174.8,,,,,,653.0
2,C_ID_00007093c1,-10,718.45,1.100229709,1371.45,1.1673901941,1893.25,,,,718.45
3,C_ID_00007093c1,-9,2204.0,3.0677152203,2922.45,2.1309198294,3575.45,1.8885250231,4097.25,,2204.0
4,C_ID_00007093c1,-8,1400.0,0.6352087114,3604.0,1.2332118599,4322.45,1.2089247507,4975.45,1.2143388858,1400.0


In [44]:
monthly_payment_merchant_psum["card_id_extend"] = monthly_payment_merchant_psum.card_id + "_" + monthly_payment_merchant_psum.month_lag.astype(str)
monthly_payment_merchant_psum_pmax["card_id_extend"] = monthly_payment_merchant_psum_pmax.card_id + "_" + monthly_payment_merchant_psum_pmax.month_lag.astype(str)

In [45]:
monthly_payment_merchant_psum.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend
0,C_ID_00007093c1,-12,M_ID_69423b34e4,521.8,C_ID_00007093c1_-12
1,C_ID_00007093c1,-12,M_ID_9400cf2342,116.0,C_ID_00007093c1_-12
2,C_ID_00007093c1,-12,M_ID_a33355a1b7,462.4,C_ID_00007093c1_-12
3,C_ID_00007093c1,-11,M_ID_69423b34e4,515.9,C_ID_00007093c1_-11
4,C_ID_00007093c1,-11,M_ID_9400cf2342,653.0,C_ID_00007093c1_-11


In [47]:
monthly_payment_merchant_psum_pmax.head()

Unnamed: 0,card_id,month_lag,monthly_pa,ratio_1,sum_2,ratio_2,sum_3,ratio_3,sum_4,ratio_4,sum_1,card_id_extend
0,C_ID_00007093c1,-12,521.8,,,,,,,,521.8,C_ID_00007093c1_-12
1,C_ID_00007093c1,-11,653.0,1.2514373323,1174.8,,,,,,653.0,C_ID_00007093c1_-11
2,C_ID_00007093c1,-10,718.45,1.100229709,1371.45,1.1673901941,1893.25,,,,718.45,C_ID_00007093c1_-10
3,C_ID_00007093c1,-9,2204.0,3.0677152203,2922.45,2.1309198294,3575.45,1.8885250231,4097.25,,2204.0,C_ID_00007093c1_-9
4,C_ID_00007093c1,-8,1400.0,0.6352087114,3604.0,1.2332118599,4322.45,1.2089247507,4975.45,1.2143388858,1400.0,C_ID_00007093c1_-8


In [48]:
monthly_payment_merchant_psum["monthly_pa_max"] = monthly_payment_merchant_psum.card_id_extend.map(monthly_payment_merchant_psum_pmax.set_index("card_id_extend").monthly_pa)

In [49]:
monthly_payment_merchant_psum.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max
0,C_ID_00007093c1,-12,M_ID_69423b34e4,521.8,C_ID_00007093c1_-12,521.8
1,C_ID_00007093c1,-12,M_ID_9400cf2342,116.0,C_ID_00007093c1_-12,521.8
2,C_ID_00007093c1,-12,M_ID_a33355a1b7,462.4,C_ID_00007093c1_-12,521.8
3,C_ID_00007093c1,-11,M_ID_69423b34e4,515.9,C_ID_00007093c1_-11,653.0
4,C_ID_00007093c1,-11,M_ID_9400cf2342,653.0,C_ID_00007093c1_-11,653.0


In [50]:
monthly_main_merchant = monthly_payment_merchant_psum[monthly_payment_merchant_psum.monthly_pa == monthly_payment_merchant_psum.monthly_pa_max]

In [53]:
monthly_main_merchant.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max
0,C_ID_00007093c1,-12,M_ID_69423b34e4,521.8,C_ID_00007093c1_-12,521.8
4,C_ID_00007093c1,-11,M_ID_9400cf2342,653.0,C_ID_00007093c1_-11,653.0
6,C_ID_00007093c1,-10,M_ID_69423b34e4,718.45,C_ID_00007093c1_-10,718.45
11,C_ID_00007093c1,-9,M_ID_15446e939c,2204.0,C_ID_00007093c1_-9,2204.0
22,C_ID_00007093c1,-8,M_ID_ee8fcd02ca,1400.0,C_ID_00007093c1_-8,1400.0


In [52]:
monthly_main_merchant.card_id_extend.nunique()

3040375

In [54]:
key_vc = monthly_main_merchant.card_id_extend.value_counts()
key_dup = key_vc[key_vc > 1]

In [57]:
monthly_main_merchant = monthly_main_merchant.drop_duplicates("card_id_extend")

In [58]:
monthly_main_merchant.shape

(3040375, 6)

In [59]:
monthly_main_merchant.card_id_extend.nunique()

3040375

In [63]:
main_merchant_count = monthly_main_merchant.groupby("card_id")[["month_lag", "merchant_id"]].nunique()

In [65]:
main_merchant_count["ratio"] = main_merchant_count.merchant_id / main_merchant_count.month_lag

In [66]:
main_merchant_count.head(20)

Unnamed: 0_level_0,month_lag,merchant_id,ratio
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C_ID_00007093c1,14,9,0.6428571429
C_ID_0001238066,8,4,0.5
C_ID_0001506ef0,14,6,0.4285714286
C_ID_0001793786,12,11,0.9166666667
C_ID_000183fdda,9,9,1.0
C_ID_00024e244b,13,10,0.7692307692
C_ID_0002709b5a,14,11,0.7857142857
C_ID_00027503e2,9,6,0.6666666667
C_ID_000298032a,10,9,0.9
C_ID_0002ba3c2e,9,5,0.5555555556


In [67]:
save_dataframe32(features_downcast("main_merchant_count"),
                 main_merchant_count.reset_index(), keep=["card_id"])

# merchant avg std

In [69]:
merchant_avg_std = (all_transactions.groupby(["merchant_id", "card_id"])
                                    .purchase_amount_new.std().reset_index(name="pa_std")
                                    .groupby("merchant_id").pa_std.mean().reset_index(name="pa_std_avg"))

In [70]:
merchant_avg_std.head()

Unnamed: 0,merchant_id,pa_std_avg
0,M_ID_000025127f,0.5773502692
1,M_ID_0000699140,27.1220987316
2,M_ID_00006a5552,
3,M_ID_000087311e,269.2202923001
4,M_ID_0000ab0b2d,5.0041993186


In [71]:
save_dataframe32(features_downcast("merchant_avg_std"),
                 merchant_avg_std, keep=["merchant_id"])

In [72]:
monthly_main_merchant["monthly_pa"] = monthly_main_merchant.merchant_id.map(merchant_avg_std.set_index("merchant_id").pa_std_avg)

In [77]:
monthly_main_merchant.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max,ratio_1,sum_2,ratio_2,sum_3,ratio_3,sum_4,ratio_4,sum_1
0,C_ID_00007093c1,-12,M_ID_69423b34e4,44.263705028,C_ID_00007093c1_-12,521.8,,,,,,,,44.263705028
4,C_ID_00007093c1,-11,M_ID_9400cf2342,68.3237688669,C_ID_00007093c1_-11,653.0,1.5435619053,112.587473895,,,,,,68.3237688669
6,C_ID_00007093c1,-10,M_ID_69423b34e4,44.263705028,C_ID_00007093c1_-10,718.45,0.6478522154,112.587473895,1.0,156.851178923,,,,44.263705028
11,C_ID_00007093c1,-9,M_ID_15446e939c,29.2447842358,C_ID_00007093c1_-9,2204.0,0.6606944497,73.5084892639,0.652901133,141.8322581308,0.9042473197,186.0959631588,,29.2447842358
22,C_ID_00007093c1,-8,M_ID_ee8fcd02ca,137.8139649003,C_ID_00007093c1_-8,1400.0,4.7124288485,167.0587491361,2.2726456605,211.3224541641,1.4899463419,279.6462230311,1.5026990284,137.8139649003


In [78]:
features_df = calc_monthly_payment_features(monthly_main_merchant[["card_id", "month_lag", "monthly_pa"]])
save_dataframe32(features_downcast("monthly_merchant_avg_std"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

In [79]:
features_df.head().T

Unnamed: 0,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda
ratio_1_lag-12,,,2.1933054913,,
sum_2_lag-12,,,22.7978406080,,
ratio_2_lag-12,,,,,
sum_3_lag-12,,,,,
ratio_3_lag-12,,,,,
sum_4_lag-12,,,,,
ratio_4_lag-12,,,,,
sum_1_lag-12,44.2637050280,,15.6585798420,,
ratio_1_lag-11,1.5435619053,,12.2231728986,,
sum_2_lag-11,112.5874738950,,207.0561085971,,
