In [1]:
import numpy as np
import pandas as pd
import keras

Using TensorFlow backend.


In [2]:
# Notebook-wide settings, unpacked in one statement:
#   subset - number of leading rows of train.csv kept when the cache is rebuilt
#   resave - when True, the subset pickles are regenerated from the raw CSV files
#   window - number of most recent transactions kept per card
subset, resave, window = 100, False, 10

In [3]:
# Rebuild the cached subset pickles from the raw CSV files (only when `resave` is set).
if resave:
    # The first `subset` training rows decide which cards are kept everywhere else.
    train = pd.read_csv('./raw_data/train.csv').head(subset)
    train_card_ids = train['card_id']

    # Both transaction files are streamed in 100k-row pieces; only the rows whose
    # card_id is in the training subset are retained, then stitched back together.
    hist_trans = pd.concat([
        piece[piece['card_id'].isin(train_card_ids)]
        for piece in pd.read_csv('./raw_data/historical_transactions.csv', chunksize=100000)
    ])
    new_merch_trans = pd.concat([
        piece[piece['card_id'].isin(train_card_ids)]
        for piece in pd.read_csv('./raw_data/new_merchant_transactions.csv', chunksize=100000)
    ])

    # Persist the three reduced tables for the loading cell.
    train.to_pickle('./raw_data/train_subset.pkl')
    hist_trans.to_pickle('./raw_data/hist_trans_subset.pkl')
    new_merch_trans.to_pickle('./raw_data/new_merch_subset.pkl')

In [4]:
# Load the three cached subset tables (written by the `resave` branch) in one pass.
train, hist_trans, new_merch_trans = [
    pd.read_pickle(pkl_path)
    for pkl_path in (
        './raw_data/train_subset.pkl',
        './raw_data/hist_trans_subset.pkl',
        './raw_data/new_merch_subset.pkl',
    )
]

In [5]:
# Keep the `window` most recent historical transactions of every card.
#
# One multi-column sort (card ascending, purchase_date descending) followed by
# GroupBy.head selects the same rows as sorting inside groupby().apply(), but:
#   * `card_id` always stays an ordinary column (apply() on the grouping column is
#     deprecated in newer pandas, and reset_index(drop=True) would then lose the key);
#   * the sort runs once instead of once per card.
# purchase_date is compared as stored; the 'YYYY-MM-DD HH:MM:SS' text shown in the
# preview orders correctly as plain strings.
window_df = (
    hist_trans
    .sort_values(['card_id', 'purchase_date'], ascending=[True, False])
    .groupby('card_id', sort=False)
    .head(window)
    .reset_index(drop=True)
)
window_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_022b48f3ac,292,N,0,A,307,M_ID_10f895765a,0,-0.520263,2017-12-30 17:00:57,3.0,1,19
1,Y,C_ID_022b48f3ac,69,N,0,A,879,M_ID_00a6ca8a8a,0,-0.698072,2017-12-30 16:53:51,1.0,9,29
2,Y,C_ID_022b48f3ac,69,N,0,A,879,M_ID_00a6ca8a8a,0,-0.608904,2017-12-30 16:42:46,1.0,9,29
3,Y,C_ID_022b48f3ac,69,N,0,A,879,M_ID_00a6ca8a8a,0,-0.521991,2017-12-30 16:40:26,1.0,9,29
4,N,C_ID_022b48f3ac,69,N,0,A,879,M_ID_00a6ca8a8a,0,-0.32854,2017-12-30 16:38:10,1.0,9,29


In [6]:
# Columns carried forward: the card key first, then the per-transaction features.
COL_SUBSET = [
    'card_id',
    'authorized_flag',
    'category_1',
    'purchase_amount',
    'month_lag',
]

In [7]:
# Restrict to the modelling columns. The explicit copy makes the result an independent
# frame, so the column assignments in the following cells do not raise
# SettingWithCopyWarning for writing into a selection of another frame.
window_df = window_df[COL_SUBSET].copy()

In [8]:
# Encode the Y/N `authorized_flag` labels as 1/0.
authorized_flag_dict = {'Y': 1,
                        'N': 0}
# Only map while the column still holds the raw labels: on a re-run the values are
# already 1/0, which are not keys of the dict, and .map() would turn them all into NaN.
if not pd.api.types.is_numeric_dtype(window_df['authorized_flag']):
    # assign() returns a new frame instead of writing into the existing one in place.
    window_df = window_df.assign(
        authorized_flag=window_df['authorized_flag'].map(authorized_flag_dict)
    )
window_df.head()

Unnamed: 0,card_id,authorized_flag,category_1,purchase_amount,month_lag
0,C_ID_022b48f3ac,1,N,-0.520263,0
1,C_ID_022b48f3ac,1,N,-0.698072,0
2,C_ID_022b48f3ac,1,N,-0.608904,0
3,C_ID_022b48f3ac,1,N,-0.521991,0
4,C_ID_022b48f3ac,0,N,-0.32854,0


In [9]:
# Encode the Y/N `category_1` labels as 1/0.
category_1_dict = {'Y': 1,
                   'N': 0}
# Only map while the column still holds the raw labels: on a re-run the values are
# already 1/0, which are not keys of the dict, and .map() would turn them all into NaN.
if not pd.api.types.is_numeric_dtype(window_df['category_1']):
    # assign() returns a new frame instead of writing into the existing one in place.
    window_df = window_df.assign(
        category_1=window_df['category_1'].map(category_1_dict)
    )
window_df.head()

Unnamed: 0,card_id,authorized_flag,category_1,purchase_amount,month_lag
0,C_ID_022b48f3ac,1,0,-0.520263,0
1,C_ID_022b48f3ac,1,0,-0.698072,0
2,C_ID_022b48f3ac,1,0,-0.608904,0
3,C_ID_022b48f3ac,1,0,-0.521991,0
4,C_ID_022b48f3ac,0,0,-0.32854,0


In [10]:
def reshape_df(df_grp,key,window=10):
    """Flatten the rows of one group into a single wide record.

    Every column of ``df_grp`` except ``key`` is unrolled into entries named
    ``<column>_<i>``, where ``i`` is the row position within the group. A column
    with fewer than ``window`` rows is right-padded with zeros up to ``window``
    entries.

    Parameters
    ----------
    df_grp : pandas.DataFrame
        Rows of a single group.
    key : str
        Name of the grouping column; it is skipped and produces no entries.
    window : int, default 10
        Number of entries each column is padded up to.

    Returns
    -------
    pandas.Series
        One value per ``<column>_<i>`` label.

    Notes
    -----
    Columns longer than ``window`` are not truncated; callers are expected to
    limit each group to at most ``window`` rows beforehand.
    """

    def reshape_series(series):
        # Work on a flat numpy view of the column's values.
        np_series = series.values.reshape(-1)
        # Pad against the requested window, not a fixed 10, so any window size works
        # and a series longer than the window never produces a negative pad length.
        pad_zeros = window - len(np_series)
        if pad_zeros > 0:
            np_series = np.append(np_series, np.zeros(pad_zeros))
        return {'{}_{}'.format(series.name, i): s for i, s in enumerate(np_series)}

    full_dict = {}
    for col in df_grp.columns:
        if col == key:
            # The grouping key identifies the record; it is not a feature.
            continue
        full_dict.update(reshape_series(df_grp[col]))
    return pd.Series(full_dict)

In [11]:
# Collapse each card's transactions into one wide row. The notebook-level `window`
# is forwarded explicitly so the padded width follows the setting instead of
# silently falling back to reshape_df's default of 10.
window_df_flattened = window_df.groupby('card_id').apply(
    lambda grp: reshape_df(grp, key='card_id', window=window)
)
window_df_flattened.head()

Unnamed: 0_level_0,authorized_flag_0,authorized_flag_1,authorized_flag_2,authorized_flag_3,authorized_flag_4,authorized_flag_5,authorized_flag_6,authorized_flag_7,authorized_flag_8,authorized_flag_9,...,purchase_amount_0,purchase_amount_1,purchase_amount_2,purchase_amount_3,purchase_amount_4,purchase_amount_5,purchase_amount_6,purchase_amount_7,purchase_amount_8,purchase_amount_9
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_022b48f3ac,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,-0.520263,-0.698072,-0.608904,-0.521991,-0.32854,-0.32854,-0.584621,-0.589129,-0.682294,-0.629551
C_ID_04dd9152c7,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.735037,-0.735037,-0.735037,-0.735037,-0.745405,0.0,0.0,0.0,0.0,0.0
C_ID_0894217f2f,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,-0.353213,-0.353664,-0.36854,-0.47042,-0.611819,-0.416625,-0.611669,-0.611669,-0.509789,-0.22699
C_ID_092d98ce80,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,-0.735097,-0.739395,-0.734887,-0.743902,-0.739395,-0.709342,-0.73956,-0.738448,-0.743752,-0.736389
C_ID_09da8a2858,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-0.716855,-0.731881,-0.731881,-0.70829,-0.691325,-0.689807,-0.378909,-0.145847,-0.596643,-0.596643


In [12]:
# Move `card_id` from the index back into a regular column. The check keeps the cell
# safe to re-run: a second reset_index() would otherwise add a spurious `index` column.
if 'card_id' not in window_df_flattened.columns:
    window_df_flattened = window_df_flattened.reset_index()
# Every column other than the key is a flattened feature; selecting by name does not
# depend on `card_id` sitting in the first position.
feature_vectors = window_df_flattened.columns.drop('card_id')

In [14]:
# Features in the order they are declared in COL_SUBSET (the card key is not a feature).
feature_names = [col for col in COL_SUBSET if col != 'card_id']
# Name the flattened columns explicitly, feature by feature and step by step, instead of
# trusting the frame's column order: when the labels come back sorted as text, `_10`
# precedes `_2` and the time steps would be scrambled for any window above 10.
# NOTE: the feature axis therefore follows COL_SUBSET order.
ordered_cols = ['{}_{}'.format(name, step) for name in feature_names for step in range(window)]
X = window_df_flattened[ordered_cols].values.reshape(len(window_df_flattened), len(feature_names), window)
# (cards, features, window) -> (cards, window, features), matching the LSTM input_shape.
X = np.swapaxes(X, 1, 2)
# Targets aligned with the rows of X through a left merge on card_id.
Y = window_df_flattened[['card_id']].merge(train[['card_id', 'target']], how='left', on='card_id')['target'].values
# A duplicated card_id in `train` would multiply rows in the merge and misalign X and Y.
assert len(Y) == len(X), 'target vector and feature tensor have different lengths'

In [15]:
from keras import Sequential
from keras.layers import Dense,TimeDistributed, LSTM
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

In [20]:
# Sequence regressor: a 16-unit LSTM over (window, n_features) inputs feeding a single
# linear output, trained with mean squared error and the Adam optimizer.
model = Sequential([
    LSTM(16, input_shape=(window, len(COL_SUBSET) - 1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 16)                1344      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 1,361
Trainable params: 1,361
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train on the full subset for 100 epochs; the returned History object is the cell output.
model.fit(x=X, y=Y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x11b73b00>