# Monthly Payment Features

In [1]:
import numpy as np
import pandas as pd
import gc

pd.set_option('display.float_format', '{:.10f}'.format)
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('./data/train.csv')
historical_transactions = pd.read_csv('./data/historical_transactions.csv')
new_merchant_transactions = pd.read_csv('./data/new_merchant_transactions.csv')

In [3]:
all_transactions = pd.concat([historical_transactions, new_merchant_transactions], axis=0)

In [4]:
all_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.70333091,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.73312848,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.73535241,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.72286538,2017-03-10 01:14:19,1.0,16,37


In [5]:
def shift_round(col, scale=0.00150265118, offset=497.06, decimals=2):
    """Apply a linear map to normalized purchase amounts and round the result.

    Computes ``round(col / scale + offset, decimals)``.

    Parameters
    ----------
    col : scalar, numpy array or pandas Series
        Normalized ``purchase_amount`` values.
    scale, offset : float, optional
        Coefficients of the linear map. NOTE(review): these presumably undo the
        normalization applied to ``purchase_amount`` in the raw data, but their
        origin is not documented in this notebook -- confirm before relying on
        the result as an amount in currency units.
    decimals : int, optional
        Number of decimal places kept by the rounding.

    Returns
    -------
    Same kind of object as ``col`` with the transformed, rounded values.
    """
    return np.round(col / scale + offset, decimals)

all_transactions["purchase_amount_new"] = shift_round(all_transactions["purchase_amount"])

In [6]:
all_transactions.dtypes

authorized_flag          object
card_id                  object
city_id                   int64
category_1               object
installments              int64
category_3               object
merchant_category_id      int64
merchant_id              object
month_lag                 int64
purchase_amount         float64
purchase_date            object
category_2              float64
state_id                  int64
subsector_id              int64
purchase_amount_new     float64
dtype: object

In [7]:
# Encode the calendar month of every purchase as the integer YYYYMM
# (the first seven characters of the timestamp with the hyphen removed).
all_transactions["month_abs"] = (
    all_transactions["purchase_date"]
    .str[:7]
    .str.replace("-", "", regex=False)
    .astype("int64")
)

In [8]:
# Rank the distinct YYYYMM codes in ascending order: smallest code -> 0, next -> 1, ...
month_map = {
    month_code: position
    for position, month_code in enumerate(sorted(all_transactions["month_abs"].unique()))
}

In [9]:
# NOTE: this overwrites the `month_lag` column that came from the CSV files with
# the 0-based month index built in `month_map`; the original values are not kept.
all_transactions["month_lag"] = all_transactions.month_abs.map(month_map)

In [11]:
all_transactions.month_lag.value_counts().sort_index()

0     1147922
1     1206048
2     1456591
3     1448973
4     1562029
5     1698931
6     2045658
7     2233312
8     2432105
9     2547286
10    2872400
11    3480297
12    2937362
13    2541919
14     806902
15     657657
Name: month_lag, dtype: int64

In [19]:
all_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_new,month_abs
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,5,-0.70333091,2017-06-25 15:33:07,1.0,16,37,29.0,201706
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,6,-0.73312848,2017-07-15 12:10:45,1.0,16,16,9.17,201707
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,7,-0.720386,2017-08-09 22:04:29,1.0,16,37,17.65,201708
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,8,-0.73535241,2017-09-02 10:06:26,1.0,16,34,7.69,201709
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,2,-0.72286538,2017-03-10 01:14:19,1.0,16,37,16.0,201703


# monthly_payment

In [23]:
# Total purchase amount per card and month index, one row per (card_id, month_lag).
monthly_payment = (
    all_transactions
    .groupby(["card_id", "month_lag"])["purchase_amount_new"]
    .sum()
    .rename("monthly_pa")
    .reset_index()
    .sort_values(by=["card_id", "month_lag"])
)

In [24]:
monthly_payment.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,1,1100.2
1,C_ID_00007093c1,2,1168.9
2,C_ID_00007093c1,3,1633.25
3,C_ID_00007093c1,4,2691.13
4,C_ID_00007093c1,5,2876.63


# psum with rolling ratio

In [9]:
# Prototype of the rolling features on the first 10,000 rows.
# Rows are expected to be sorted by (card_id, month_lag), so every card occupies
# a contiguous run of rows in chronological order.
part = monthly_payment[:10000].copy()

# A value built from the current row and the rows above it is only meaningful
# when all of those rows belong to the same card. Because a card's rows are
# contiguous, comparing card_id with the row `k` positions up is enough to check
# the whole window. (Testing the sign of the month_lag difference instead misses
# the case where the next card starts in a later or equal month than the
# previous card ended.)
part["ratio_1"] = (part["monthly_pa"] / part["monthly_pa"].shift(1)).where(
    part["card_id"].eq(part["card_id"].shift(1))
)

for window_size in range(2, 5):
    sum_col = "sum_{}".format(window_size)
    ratio_col = "ratio_{}".format(window_size)
    window_in_one_card = part["card_id"].eq(part["card_id"].shift(window_size - 1))
    # Sum of the last `window_size` recorded months of the same card.
    part[sum_col] = part["monthly_pa"].rolling(window=window_size).sum().where(window_in_one_card)
    # Growth of that windowed sum versus the previous row; it is NaN whenever
    # either of the two sums is NaN, so no extra masking is needed.
    part[ratio_col] = part[sum_col] / part[sum_col].shift(1)

part["sum_1"] = part["monthly_pa"]

In [10]:
# Pivot the long per-(card, month) table into one row per card: every feature
# column is repeated once per month index with a "_lag<month>" suffix.
# The loop variable is used directly; the previous version read an unrelated
# name (`month_lag`) that is not defined by this cell.
monthly_frames = []
for month in range(16):
    temp = (part[part.month_lag == month]
            .set_index("card_id")
            .drop(["month_lag", "monthly_pa"], axis=1)
            .rename(columns=lambda x: "{}_lag{}".format(x, month)))
    monthly_frames.append(temp)

# Concatenate once (instead of growing the frame inside the loop) and ask for a
# sorted index explicitly rather than relying on the version-dependent default.
monthly_payment_features = pd.concat(monthly_frames, axis=1, sort=True)
# Keep the index unnamed, as it was when the frame was grown from an empty DataFrame.
monthly_payment_features.index.name = None

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [25]:
def calc_monthly_payment_features(part, n_months=16):
    """Build per-card rolling sum / ratio features from a long monthly table.

    Parameters
    ----------
    part : pandas.DataFrame
        Long table with the columns ``card_id``, ``month_lag`` (0-based month
        index) and ``monthly_pa`` (the monthly value), holding at most one row
        per (card_id, month_lag). The frame is not modified.
    n_months : int, optional
        Number of month indices (``0 .. n_months - 1``) to spread into columns.
        Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per card (unnamed index holding the card ids, sorted) and, for
        every month index ``m``, the columns ``ratio_1_lag{m}``,
        ``sum_2_lag{m}``, ``ratio_2_lag{m}``, ``sum_3_lag{m}``,
        ``ratio_3_lag{m}``, ``sum_4_lag{m}``, ``ratio_4_lag{m}`` and
        ``sum_1_lag{m}``.

    Notes
    -----
    * ``sum_k`` is the sum of ``monthly_pa`` over the card's last ``k`` recorded
      rows up to and including the month; ``ratio_k`` is ``sum_k`` divided by
      the ``sum_k`` of the card's previous row (``sum_1`` is ``monthly_pa``).
    * Windows are counted in rows, not calendar months: a card with a missing
      month has its neighbouring recorded months treated as adjacent.
    * A window that would reach into another card's rows yields NaN.
    """
    # sort_values returns a new frame, so the caller's data is left untouched,
    # and it guarantees that each card's rows are contiguous and chronological.
    part = part.sort_values(["card_id", "month_lag"])
    card = part["card_id"]
    amount = part["monthly_pa"]

    def _same_card(rows_back):
        # True where the row `rows_back` positions above belongs to the same
        # card. With contiguous cards this means the whole window does.
        return card.eq(card.shift(rows_back))

    part["ratio_1"] = (amount / amount.shift(1)).where(_same_card(1))

    for window_size in range(2, 5):
        window_sum = amount.rolling(window=window_size).sum().where(_same_card(window_size - 1))
        part["sum_{}".format(window_size)] = window_sum
        # NaN propagates from either operand, so a ratio touching a window that
        # crosses a card boundary is NaN without additional masking.
        part["ratio_{}".format(window_size)] = window_sum / window_sum.shift(1)

    part["sum_1"] = amount

    # Spread the long table into one block of columns per month index and join
    # the blocks on card_id in a single concat.
    monthly_frames = []
    for month in range(n_months):
        month_rows = (part[part.month_lag == month]
                      .set_index("card_id")
                      .drop(["month_lag", "monthly_pa"], axis=1)
                      .rename(columns=lambda name, m=month: "{}_lag{}".format(name, m)))
        monthly_frames.append(month_rows)

    monthly_payment_features = pd.concat(monthly_frames, axis=1, sort=True)
    # Callers rely on reset_index() producing an "index" column, so keep the
    # index unnamed.
    monthly_payment_features.index.name = None
    return monthly_payment_features

In [26]:
from data_io import features_downcast
from downcast import save_dataframe32, load_dataframe32

In [27]:
# Build the wide rolling-sum feature table from the monthly purchase totals.
features_df = calc_monthly_payment_features(monthly_payment)
# NOTE: the three bare expressions below are evaluated but not shown -- a cell
# only renders its final expression, and this cell ends with the save call.
features_df.shape
features_df.index.nunique()
features_df.reset_index().head()

# Persist the features with the card ids as a regular "card_id" column
# (reset_index() names the column "index", which is then renamed).
# NOTE(review): save_dataframe32 / features_downcast are project helpers whose
# behaviour is not visible in this notebook.
save_dataframe32(features_downcast("monthly_psum_abs"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

# next

1. pmax with rolling ratio
2. c-m std
3. m std
4. max-m count
5. max-m count/ month count
6. max-m std

# pmax with rolling ratio

In [30]:
# Largest single purchase per card and month index, one row per (card_id, month_lag).
monthly_payment_pmax = (
    all_transactions
    .groupby(["card_id", "month_lag"])["purchase_amount_new"]
    .max()
    .rename("monthly_pa")
    .reset_index()
    .sort_values(by=["card_id", "month_lag"])
)

In [31]:
monthly_payment_pmax.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,1,521.8
1,C_ID_00007093c1,2,230.0
2,C_ID_00007093c1,3,500.0
3,C_ID_00007093c1,4,702.0
4,C_ID_00007093c1,5,500.0


In [32]:
# Same rolling features as for the monthly totals, computed on the monthly maxima.
features_df = calc_monthly_payment_features(monthly_payment_pmax)
# Persist with the card ids as a regular "card_id" column.
# NOTE(review): save_dataframe32 / features_downcast are project helpers whose
# behaviour is not visible in this notebook.
save_dataframe32(features_downcast("monthly_pmax_abs"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

# main_merchant

In [45]:
all_transactions.card_id.nunique()

325540

In [12]:
# Total purchase amount per card, month index and merchant.
monthly_payment_merchant_psum = (
    all_transactions
    .groupby(["card_id", "month_lag", "merchant_id"])["purchase_amount_new"]
    .sum()
    .rename("monthly_pa")
    .reset_index()
    .sort_values(by=["card_id", "month_lag"])
)

In [13]:
monthly_payment_merchant_psum.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa
0,C_ID_00007093c1,1,M_ID_69423b34e4,521.8
1,C_ID_00007093c1,1,M_ID_9400cf2342,116.0
2,C_ID_00007093c1,1,M_ID_a33355a1b7,462.4
3,C_ID_00007093c1,2,M_ID_69423b34e4,515.9
4,C_ID_00007093c1,2,M_ID_9400cf2342,653.0


In [14]:
# For every card and month index, the largest per-merchant total.
monthly_payment_merchant_psum_pmax = (
    monthly_payment_merchant_psum
    .groupby(["card_id", "month_lag"])["monthly_pa"]
    .max()
    .rename("monthly_pa")
    .reset_index()
    .sort_values(by=["card_id", "month_lag"])
)

In [15]:
monthly_payment_merchant_psum_pmax.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,1,521.8
1,C_ID_00007093c1,2,653.0
2,C_ID_00007093c1,3,718.45
3,C_ID_00007093c1,4,2204.0
4,C_ID_00007093c1,5,1400.0


In [47]:
# Build a "<card_id>_<month_lag>" key on both tables so that the top per-merchant
# total of each card-month can be looked up for every (card, month, merchant) row.
monthly_payment_merchant_psum["card_id_extend"] = (
    monthly_payment_merchant_psum.card_id + "_" + monthly_payment_merchant_psum.month_lag.astype(str)
)
monthly_payment_merchant_psum_pmax["card_id_extend"] = (
    monthly_payment_merchant_psum_pmax.card_id + "_" + monthly_payment_merchant_psum_pmax.month_lag.astype(str)
)
monthly_payment_merchant_psum["monthly_pa_max"] = monthly_payment_merchant_psum.card_id_extend.map(
    monthly_payment_merchant_psum_pmax.set_index("card_id_extend").monthly_pa
)

# Keep the merchant rows whose total equals the card-month maximum. The explicit
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
monthly_main_merchant = monthly_payment_merchant_psum[
    monthly_payment_merchant_psum.monthly_pa == monthly_payment_merchant_psum.monthly_pa_max
].copy()

In [66]:
monthly_main_merchant.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max,m_code
0,C_ID_00007093c1,1,M_ID_69423b34e4,521.8,C_ID_00007093c1_1,521.8,100434
4,C_ID_00007093c1,2,M_ID_9400cf2342,653.0,C_ID_00007093c1_2,653.0,141523
6,C_ID_00007093c1,3,M_ID_69423b34e4,718.45,C_ID_00007093c1_3,718.45,100434
11,C_ID_00007093c1,4,M_ID_15446e939c,2204.0,C_ID_00007093c1_4,2204.0,20468
22,C_ID_00007093c1,5,M_ID_ee8fcd02ca,1400.0,C_ID_00007093c1_5,1400.0,227801


In [49]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the merchant ids of the main-merchant rows.
m_ecd = LabelEncoder()
# assign() returns a new frame, which avoids writing a column into what may be
# a slice of another DataFrame (the cause of SettingWithCopyWarning).
monthly_main_merchant = monthly_main_merchant.assign(
    m_code=m_ecd.fit_transform(monthly_main_merchant.merchant_id)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
monthly_main_merchant.shape

(3053315, 7)

In [51]:
monthly_main_merchant.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max,m_code
0,C_ID_00007093c1,1,M_ID_69423b34e4,521.8,C_ID_00007093c1_1,521.8,100434
4,C_ID_00007093c1,2,M_ID_9400cf2342,653.0,C_ID_00007093c1_2,653.0,141523
6,C_ID_00007093c1,3,M_ID_69423b34e4,718.45,C_ID_00007093c1_3,718.45,100434
11,C_ID_00007093c1,4,M_ID_15446e939c,2204.0,C_ID_00007093c1_4,2204.0,20468
22,C_ID_00007093c1,5,M_ID_ee8fcd02ca,1400.0,C_ID_00007093c1_5,1400.0,227801


In [52]:
# Several merchants can tie for the top total within one card-month; keep a
# single row per card_id_extend key (drop_duplicates keeps the first occurrence).
monthly_main_merchant = monthly_main_merchant.drop_duplicates(subset="card_id_extend")
monthly_main_merchant.shape

(3040375, 7)

In [57]:
# One column per month index with each card's main-merchant code shifted by +1;
# cards without a row for a month get NaN in that column.
monthly_payment_features = pd.DataFrame({
    "main_m_lag%d" % month: (
        monthly_main_merchant
        .loc[monthly_main_merchant["month_lag"] == month]
        .set_index("card_id")["m_code"]
        .add(1)
    )
    for month in range(16)
})

In [64]:
monthly_main_merchant.merchant_id.nunique()

244022

In [61]:
monthly_payment_features = monthly_payment_features.fillna(0).astype(int)

In [58]:
monthly_payment_features.shape

(325540, 16)

In [67]:
# One column per month index with the amount spent at each card's main merchant.
# NOTE: the column names reuse the "main_m_lag<month>" pattern of the merchant-code
# table even though the values here are amounts.
monthly_payment_pa = pd.DataFrame({
    "main_m_lag%d" % month: (
        monthly_main_merchant
        .loc[monthly_main_merchant["month_lag"] == month]
        .set_index("card_id")["monthly_pa"]
    )
    for month in range(16)
})

In [68]:
monthly_payment_pa.head()

Unnamed: 0,main_m_lag0,main_m_lag1,main_m_lag2,main_m_lag3,main_m_lag4,main_m_lag5,main_m_lag6,main_m_lag7,main_m_lag8,main_m_lag9,main_m_lag10,main_m_lag11,main_m_lag12,main_m_lag13,main_m_lag14,main_m_lag15
C_ID_00007093c1,,521.8,653.0,718.45,2204.0,1400.0,331.0,2900.0,220.0,413.0,527.0,782.0,430.0,500.0,,60.0
C_ID_0001238066,,,,,,,,,127.13,430.01,661.56,792.71,1008.22,771.83,444.94,250.0
C_ID_0001506ef0,39.0,79.0,58.2,,30.0,16.33,1017.0,842.0,750.0,910.74,881.14,750.0,1100.0,1491.0,9.92,
C_ID_0001793786,1174.83,1475.58,2290.18,2078.17,1621.61,3527.8,5733.9,1650.8,3445.76,1443.25,2580.0,1500.0,,,,
C_ID_000183fdda,,,,,,,,2158.2,609.2,541.22,1764.9,1960.0,2337.0,1374.93,191.2,425.4


In [63]:
monthly_payment_features.to_csv("./data/train_test_main_merchant.csv")

In [69]:
monthly_payment_pa.to_csv("./data/train_test_main_merchant_pa.csv")

# merchant_length

More features to consider:
1. daynum
2. ratio

In [38]:
# Number of distinct month indices in which each card bought from each merchant
# (monthly_payment_merchant_psum holds one row per card, month and merchant).
merchant_dura = (
    monthly_payment_merchant_psum
    .groupby(["card_id", "merchant_id"])
    .size()
    .rename("merchant_dura")
    .reset_index()
)

In [40]:
merchant_dura.head()

Unnamed: 0,card_id,merchant_id,merchant_dura
0,C_ID_00007093c1,M_ID_00a6ca8a8a,1
1,C_ID_00007093c1,M_ID_0379adb435,4
2,C_ID_00007093c1,M_ID_06a8d84366,1
3,C_ID_00007093c1,M_ID_08f01305af,1
4,C_ID_00007093c1,M_ID_0dabc5a70c,2


In [41]:
# For every card, how many merchants share each merchant_dura value.
merchant_dura_count = (
    merchant_dura
    .groupby(["card_id", "merchant_dura"])
    .size()
    .rename("count")
    .reset_index()
)

In [42]:
merchant_dura_count.head()

Unnamed: 0,card_id,merchant_dura,count
0,C_ID_00007093c1,1,15
1,C_ID_00007093c1,2,7
2,C_ID_00007093c1,3,5
3,C_ID_00007093c1,4,1
4,C_ID_00007093c1,7,1


In [45]:
# Spread merchant_dura_count into one "count_dura<k>" column per duration k = 1..14,
# indexed by card id. The per-duration frames are collected first and joined in a
# single concat instead of growing the result inside the loop.
dura_frames = []
for dura in range(14):
    temp = (merchant_dura_count[merchant_dura_count.merchant_dura == dura + 1]
            .set_index("card_id")
            .drop(["merchant_dura"], axis=1)
            .rename(columns=lambda x: "{}_dura{}".format(x, dura + 1)))
    dura_frames.append(temp)

# Sort the card ids explicitly rather than relying on the version-dependent default.
df_features = pd.concat(dura_frames, axis=1, sort=True)
# Keep the index unnamed so that reset_index() still yields an "index" column.
df_features.index.name = None

In [49]:
# Sum only the count columns. Summing the whole frame would also add up
# count_total and the ratio columns if this cell is executed a second time.
count_cols = ["count_dura{}".format(dura + 1) for dura in range(14)]
df_features["count_total"] = df_features[count_cols].sum(axis=1)

# Share of the card's merchants that fall into each duration bucket.
for dura in range(14):
    count_col = "count_dura{}".format(dura + 1)
    ratio_col = "ratio_dura{}".format(dura + 1)
    df_features[ratio_col] = df_features[count_col] / df_features["count_total"]

In [51]:
# Persist the duration-count features with the card ids as a "card_id" column.
# NOTE(review): "duar_count" looks like a typo for "dura_count"; it is left as is
# because anything that loads this feature set would have to use the same name.
save_dataframe32(features_downcast("duar_count"),
                 df_features.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

In [50]:
df_features.head().T

Unnamed: 0,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda
count_dura1,15.0,82.0,22.0,128.0,62.0
count_dura2,7.0,4.0,3.0,12.0,12.0
count_dura3,5.0,1.0,,3.0,6.0
count_dura4,1.0,,2.0,2.0,3.0
count_dura5,,2.0,,2.0,1.0
count_dura6,,1.0,1.0,1.0,
count_dura7,1.0,,,2.0,
count_dura8,,,,,
count_dura9,,,1.0,,
count_dura10,1.0,,,,


In [39]:
merchant_dura.merchant_dura.value_counts()

1     9687983
2     1605279
3      671822
4      350670
5      204864
6      129588
7       84699
8       57202
9       40143
10      28105
11      20239
12      15009
13      11280
14      10637
Name: merchant_dura, dtype: int64

# merchant avg std

In [52]:
# Per merchant: the standard deviation of purchase amounts is computed for each
# card that bought there, then averaged over those cards.
merchant_avg_std = (
    all_transactions
    .groupby(["merchant_id", "card_id"])["purchase_amount_new"]
    .std()
    .rename("pa_std")
    .reset_index()
    .groupby("merchant_id")["pa_std"]
    .mean()
    .rename("pa_std_avg")
    .reset_index()
)

In [53]:
merchant_avg_std.head()

Unnamed: 0,merchant_id,pa_std_avg
0,M_ID_000025127f,0.5773502692
1,M_ID_0000699140,27.1220987316
2,M_ID_00006a5552,
3,M_ID_000087311e,269.2202923001
4,M_ID_0000ab0b2d,5.0041993186


In [54]:
save_dataframe32(features_downcast("merchant_avg_std"),
                 merchant_avg_std, keep=["merchant_id"])

In [57]:
monthly_payment_merchant_psum.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa
0,C_ID_00007093c1,1,M_ID_69423b34e4,521.8
1,C_ID_00007093c1,1,M_ID_9400cf2342,116.0
2,C_ID_00007093c1,1,M_ID_a33355a1b7,462.4
3,C_ID_00007093c1,2,M_ID_69423b34e4,515.9
4,C_ID_00007093c1,2,M_ID_9400cf2342,653.0


In [58]:
# Build a "<card_id>_<month_lag>" key on both tables so that the top per-merchant
# total of each card-month can be looked up for every (card, month, merchant) row.
monthly_payment_merchant_psum["card_id_extend"] = (
    monthly_payment_merchant_psum.card_id + "_" + monthly_payment_merchant_psum.month_lag.astype(str)
)
monthly_payment_merchant_psum_pmax["card_id_extend"] = (
    monthly_payment_merchant_psum_pmax.card_id + "_" + monthly_payment_merchant_psum_pmax.month_lag.astype(str)
)
monthly_payment_merchant_psum["monthly_pa_max"] = monthly_payment_merchant_psum.card_id_extend.map(
    monthly_payment_merchant_psum_pmax.set_index("card_id_extend").monthly_pa
)

# Keep the merchant rows whose total equals the card-month maximum. The explicit
# .copy() makes the result an independent frame, so adding or replacing columns
# on it later does not raise SettingWithCopyWarning.
monthly_main_merchant = monthly_payment_merchant_psum[
    monthly_payment_merchant_psum.monthly_pa == monthly_payment_merchant_psum.monthly_pa_max
].copy()

In [59]:
# Replace monthly_pa (the amount spent at the main merchant) with that merchant's
# pa_std_avg, so the frame can be fed to the monthly feature builder unchanged.
# assign() returns a new frame instead of writing into what may be a slice of
# monthly_payment_merchant_psum.
monthly_main_merchant = monthly_main_merchant.assign(
    monthly_pa=monthly_main_merchant.merchant_id.map(
        merchant_avg_std.set_index("merchant_id").pa_std_avg
    )
)

In [62]:
monthly_main_merchant = monthly_main_merchant.drop_duplicates("card_id_extend")

In [63]:
monthly_main_merchant.head()

Unnamed: 0,card_id,month_lag,merchant_id,monthly_pa,card_id_extend,monthly_pa_max
0,C_ID_00007093c1,1,M_ID_69423b34e4,44.263705028,C_ID_00007093c1_1,521.8
4,C_ID_00007093c1,2,M_ID_9400cf2342,68.3237688669,C_ID_00007093c1_2,653.0
6,C_ID_00007093c1,3,M_ID_69423b34e4,44.263705028,C_ID_00007093c1_3,718.45
11,C_ID_00007093c1,4,M_ID_15446e939c,29.2447842358,C_ID_00007093c1_4,2204.0
22,C_ID_00007093c1,5,M_ID_ee8fcd02ca,137.8139649003,C_ID_00007093c1_5,1400.0


In [64]:
# Rolling features over the main merchant's pa_std_avg per card and month.
# A copy of the three-column selection is passed so that the feature builder
# works on an independent frame rather than on a slice of monthly_main_merchant.
features_df = calc_monthly_payment_features(
    monthly_main_merchant[["card_id", "month_lag", "monthly_pa"]].copy()
)
# Persist with the card ids as a regular "card_id" column.
# NOTE(review): save_dataframe32 / features_downcast are project helpers whose
# behaviour is not visible in this notebook.
save_dataframe32(features_downcast("monthly_merchant_avg_std_abs"),
                 features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

In [65]:
features_df.head().T

Unnamed: 0,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda
ratio_1_lag0,,,,,
sum_2_lag0,,,,,
ratio_2_lag0,,,,,
sum_3_lag0,,,,,
ratio_3_lag0,,,,,
sum_4_lag0,,,,,
ratio_4_lag0,,,,,
sum_1_lag0,,,7.1392607660,83.1361047020,
ratio_1_lag1,,,2.1933054913,,
sum_2_lag1,,,22.7978406080,,
