# Monthly Payment Features

In [3]:
import numpy as np
import pandas as pd
import gc

pd.set_option('display.float_format', '{:.10f}'.format)
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw CSV inputs from ./data: the training set and the two transaction
# logs (historical and new-merchant) that are stacked into `all_transactions` below.
# NOTE(review): `train` is not referenced anywhere in the visible part of this
# notebook — confirm it is still needed before paying for the read.
train = pd.read_csv('./data/train.csv')
historical_transactions = pd.read_csv('./data/historical_transactions.csv')
new_merchant_transactions = pd.read_csv('./data/new_merchant_transactions.csv')

In [4]:
# Stack the historical and new-merchant transactions into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; both inputs are read
# with their own 0-based default index, so without it row labels would be duplicated.
all_transactions = pd.concat(
    [historical_transactions, new_merchant_transactions],
    axis=0,
    ignore_index=True,
)

In [5]:
all_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.70333091,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.73312848,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.73535241,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.72286538,2017-03-10 01:14:19,1.0,16,37


In [10]:
def shift_round(col, scale=0.00150265118, offset=497.06):
    """Apply the affine map ``col / scale + offset`` and round to two decimals.

    NOTE(review): this looks like it reverses the normalisation applied to
    ``purchase_amount``. Where the default ``scale`` and ``offset`` come from
    is not recorded in this notebook — TODO confirm and cite their source.

    Parameters
    ----------
    col : scalar, numpy array or pandas Series
        Values to transform.
    scale : float, optional
        Divisor applied to ``col`` before shifting.
    offset : float, optional
        Constant added after the division.

    Returns
    -------
    Same kind of object as ``col``, rounded to 2 decimal places.
    """
    return np.round(col / scale + offset, 2)

all_transactions["purchase_amount_new"] = shift_round(all_transactions["purchase_amount"])

In [6]:
all_transactions.dtypes

authorized_flag          object
card_id                  object
city_id                   int64
category_1               object
installments              int64
category_3               object
merchant_category_id      int64
merchant_id              object
month_lag                 int64
purchase_amount         float64
purchase_date            object
category_2              float64
state_id                  int64
subsector_id              int64
dtype: object

# monthly_payment

In [9]:
# Total converted spend per (card_id, month_lag), one row per pair,
# ordered by card and then by month so that consecutive rows of a card are adjacent.
monthly_payment = (
    all_transactions
    .groupby(["card_id", "month_lag"], as_index=False)["purchase_amount_new"]
    .sum()
    .rename(columns={"purchase_amount_new": "monthly_pa"})
    .sort_values(["card_id", "month_lag"])
)

In [11]:
monthly_payment.head()

Unnamed: 0,card_id,month_lag,monthly_pa
0,C_ID_00007093c1,-12,1100.2
1,C_ID_00007093c1,-11,1168.9
2,C_ID_00007093c1,-10,1633.25
3,C_ID_00007093c1,-9,2691.13
4,C_ID_00007093c1,-8,2876.63


# Payment sums (psum) with rolling ratios

In [35]:
# Prototype the rolling features on the first 10,000 (card_id, month_lag) rows.
part = monthly_payment[:10000].copy()

# Rows are ordered by card_id then month_lag (see the sort when monthly_payment is built),
# so a window of k consecutive rows lies inside one card exactly when its oldest
# row carries the same card_id as its newest row.
card = part["card_id"]

# ratio_1: spend of this row divided by the spend of the card's previous row.
# The boundary test compares card_id directly; testing "month_lag decreased"
# misses a card whose first month_lag is not smaller than the previous card's
# last one, which lets values leak from one card into the next.
part["ratio_1"] = (part["monthly_pa"] / part["monthly_pa"].shift(1)).where(card.eq(card.shift(1)))

for window_size in range(2, 5):
    sum_col = "sum_{}".format(window_size)
    ratio_col = "ratio_{}".format(window_size)
    within_card = card.eq(card.shift(window_size - 1))
    # Rolling spend over the last `window_size` rows, blanked when the window crosses cards.
    part[sum_col] = part["monthly_pa"].rolling(window=window_size).sum().where(within_card)
    # Growth of that rolling sum versus the previous row; NaN propagates from any
    # blanked sum, so no extra masking is needed here.
    part[ratio_col] = part[sum_col] / part[sum_col].shift(1)

part["sum_1"] = part.monthly_pa

In [47]:
# Pivot the long per-month rows into one wide row per card: every feature column
# is suffixed with the month_lag it belongs to (lags -12 .. 2).
# The per-lag frames are collected first and concatenated once; concatenating
# inside the loop re-copies the growing frame on every iteration.
lag_frames = [
    part[part.month_lag == month_lag]
    .set_index("card_id")
    .drop(["month_lag", "monthly_pa"], axis=1)
    .add_suffix("_lag{}".format(month_lag))
    for month_lag in range(-12, 3)
]
# sort=True makes the card ordering explicit instead of relying on the
# deprecated implicit sort that triggers a FutureWarning.
monthly_payment_features = pd.concat(lag_frames, axis=1, sort=True)
# Keep the index unnamed, as it was when the frame was grown from an empty DataFrame.
monthly_payment_features.index.name = None

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [50]:
def calc_monthly_payment_features(part, max_window=4, month_lags=range(-12, 3)):
    """Build one wide row of rolling monthly-payment features per card.

    For every ``(card_id, month_lag)`` row the following are computed over the
    card's consecutive rows (ordered by ``month_lag``):

    * ``sum_1``  - the row's own ``monthly_pa``;
    * ``sum_k``  - sum of ``monthly_pa`` over the last ``k`` rows (k = 2..max_window);
    * ``ratio_1`` - ``monthly_pa`` divided by the previous row's ``monthly_pa``;
    * ``ratio_k`` - ``sum_k`` divided by the previous row's ``sum_k``.

    A value is NaN whenever the rows it needs do not all belong to the same
    card. The rows are then spread into columns named ``<feature>_lag<month_lag>``.

    Parameters
    ----------
    part : pandas.DataFrame
        Must contain ``card_id``, ``month_lag`` and ``monthly_pa`` with one row
        per ``(card_id, month_lag)``. It is not modified.
    max_window : int, optional
        Largest rolling window (in rows) to compute.
    month_lags : iterable of int, optional
        The ``month_lag`` values to emit columns for, in column order.

    Returns
    -------
    pandas.DataFrame
        Indexed by card id (index left unnamed, so ``reset_index()`` yields an
        ``index`` column), sorted by card id, with one block of feature columns
        per requested month lag.
    """
    # sort_values returns a new frame, so the caller's data is never touched, and
    # the positional shifts below no longer depend on the caller having sorted it.
    frame = part.sort_values(["card_id", "month_lag"]).reset_index(drop=True)
    card = frame["card_id"]
    spend = frame["monthly_pa"]

    # With rows grouped by card, a window of k rows is inside one card exactly
    # when its oldest row has the same card_id as its newest row. Comparing
    # card_id (instead of checking that month_lag did not decrease) also catches
    # a card whose first month_lag is not smaller than the previous card's last.
    frame["ratio_1"] = (spend / spend.shift(1)).where(card.eq(card.shift(1)))

    for window_size in range(2, max_window + 1):
        sum_col = "sum_{}".format(window_size)
        ratio_col = "ratio_{}".format(window_size)
        within_card = card.eq(card.shift(window_size - 1))
        frame[sum_col] = spend.rolling(window=window_size).sum().where(within_card)
        # NaN propagates from any blanked sum, so the ratio needs no extra mask.
        frame[ratio_col] = frame[sum_col] / frame[sum_col].shift(1)

    frame["sum_1"] = spend

    # One frame per month lag, columns suffixed with the lag; concatenated once
    # at the end rather than growing a DataFrame inside the loop.
    lag_frames = [
        frame[frame["month_lag"] == month_lag]
        .set_index("card_id")
        .drop(["month_lag", "monthly_pa"], axis=1)
        .add_suffix("_lag{}".format(month_lag))
        for month_lag in month_lags
    ]
    if not lag_frames:
        return pd.DataFrame()

    # sort=True makes the card ordering explicit (older pandas sorted implicitly
    # and warned that the default would change).
    monthly_payment_features = pd.concat(lag_frames, axis=1, sort=True)
    # Keep the index unnamed for compatibility with callers that rename the
    # "index" column produced by reset_index().
    monthly_payment_features.index.name = None
    return monthly_payment_features

In [51]:
features_df = calc_monthly_payment_features(monthly_payment)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [52]:
features_df.shape

(325540, 120)

In [54]:
features_df.index.nunique()

325540

In [55]:
features_df.reset_index().head()

Unnamed: 0,index,ratio_1_lag-12,sum_2_lag-12,ratio_2_lag-12,sum_3_lag-12,ratio_3_lag-12,sum_4_lag-12,ratio_4_lag-12,sum_1_lag-12,ratio_1_lag-11,...,ratio_4_lag1,sum_1_lag1,ratio_1_lag2,sum_2_lag2,ratio_2_lag2,sum_3_lag2,ratio_3_lag2,sum_4_lag2,ratio_4_lag2,sum_1_lag2
0,C_ID_00007093c1,,,,,,,,1100.2,1.0624431921,...,,,0.0609357516,1915.18,0.6401494767,3101.77,0.623222068,5086.99,0.8596896526,110.0
1,C_ID_0001238066,,,,,,,,,,...,1.0106644286,1957.98,0.5531261811,3040.99,0.7649134722,5058.61,0.6435448455,8943.55,0.7464322765,1083.01
2,C_ID_0001506ef0,3.617254902,235.48,,,,,,184.48,1.5531764961,...,0.8128161412,30.92,,,,,,,,
3,C_ID_0001793786,,,,,,,,,,...,0.9461002596,11552.9,0.3205385661,15256.05,0.8553875051,21538.4,0.6572342177,36474.42,0.8591347557,3703.15
4,C_ID_000183fdda,,,,,,,,,,...,0.9121978247,575.28,0.8800584063,1081.56,0.264739143,4591.66,0.3543535386,13464.13,0.7795910545,506.28


In [58]:
from data_io import features_downcast
from downcast import save_dataframe32, load_dataframe32

In [59]:
save_dataframe32

<function downcast.save_dataframe32(path, dataframe, keep=[])>

In [61]:
# Persist the feature table. The unnamed index holding the card ids is turned into
# a regular column and renamed to `card_id`, which is also passed via `keep`.
# NOTE(review): presumably `keep` exempts those columns from the 32-bit downcast and
# features_downcast("monthly_psum") resolves the output path — confirm against
# downcast.save_dataframe32 and data_io.features_downcast.
save_dataframe32(features_downcast("monthly_psum"), features_df.reset_index().rename(columns={"index":"card_id"}), keep=["card_id"])

# Next steps

1. pmax with rolling ratio
2. c-m std
3. m std
4. max-m count
5. max-m count / month count
6. max-m std