# import library

In [1]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from glob import glob
from datetime import datetime, date
from collections import defaultdict
from multiprocessing import cpu_count, Pool
from tqdm import tqdm

import eli5
from eli5.lightgbm import explain_weights_lightgbm
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots
import shap 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [34]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# params

In [None]:
# Directory holding the train CSV read by this section.
PATH = os.path.join('..', 'remove_outlier_data')

# LightGBM parameters, handed unchanged to lgb.train() further down.
params = dict(
    num_leaves=31,
    min_data_in_leaf=30,
    objective='regression',
    max_depth=-1,
    learning_rate=0.01,
    boosting='gbdt',
    feature_fraction=0.8,
    bagging_freq=1,
    bagging_fraction=0.8,
    bagging_seed=11,
    metric='rmse',
    lambda_l1=0.1,
    verbosity=-1,
    nthread=cpu_count(),  # one thread per CPU reported by multiprocessing
    random_state=6,
)

# features

In [None]:
# File names of the feature pickles to merge, grouped by their numeric prefix.
# The order is the order in which the files are merged onto the train frame.
features = [
    *(f'f10{i}.pkl' for i in (2, 3)),
    *(f'f11{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),
    *(f'f12{i}.pkl' for i in (1, 2)),

    *(f'f20{i}.pkl' for i in (2, 3)),
    *(f'f21{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),

    *(f'f40{i}.pkl' for i in (2, 3)),
    *(f'f41{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),
    *(f'f42{i}.pkl' for i in (1, 2)),
]

# read csv

In [None]:
# Load the training rows, then left-join every feature pickle on card_id so
# that each training row is kept whether or not a feature file covers it.
train = pd.read_csv(os.path.join(PATH, 'train.csv'))

for fname in tqdm(features):
    feat = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', fname))
    train = train.merge(feat, on='card_id', how='left')

# data to int

In [None]:
# Purchase-date columns that may be present after the merges (including the
# _x/_y variants pandas produces when two merged frames share a column name).
date_columns = (
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max_x', 'Y_new_auth_purchase_date_min_x',
    'N_new_auth_purchase_date_max_x', 'N_new_auth_purchase_date_min_x',
    'Y_new_auth_purchase_date_max_y', 'Y_new_auth_purchase_date_min_y',
    'N_new_auth_purchase_date_max_y', 'N_new_auth_purchase_date_min_y',
)

cols = train.columns.values
for f in date_columns:
    if f not in cols:
        continue
    # Reinterpret as int64 and scale by 1e-9.
    # presumably the columns are datetime64[ns], making this epoch seconds -- TODO confirm
    train[f] = train[f].astype(np.int64) * 1e-9

# preprocess

In [None]:
# Keep the target aside before it is removed from the feature frame.
y = train['target']

col_not_to_use = ['first_active_month', 'card_id', 'target']
col_to_use = [c for c in train.columns if c not in col_not_to_use]

# Take an explicit copy of the selection. Assigning columns into the bare
# selection is chained assignment, which pandas flags with
# SettingWithCopyWarning (silenced in this notebook by the warnings filter).
train = train[col_to_use].copy()
train['feature_3'] = train['feature_3'].astype(int)

categorical_features = ['feature_1', 'feature_2', 'feature_3']

for col in categorical_features:
    # Encode the string form of each value as an integer label.
    lbl = LabelEncoder()
    train[col] = lbl.fit_transform(list(train[col].values.astype('str')))

gc.collect()

# model

In [None]:
# Feature matrix used by the folds below. This binds a second name to the
# same DataFrame object as `train`; it is not a copy.
X = train

In [None]:
# Cross-validation settings.
N_SPLITS = 5
NUM_BOOST_ROUND = 20000        # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 20     # stop when the validation metric stalls this long

folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=6)

models = []                    # one trained booster per fold
fold_rmse = []                 # validation RMSE of each fold
oof = np.zeros(len(X))         # out-of-fold predictions, positionally aligned with X

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]
    dtrain = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(
        params,
        dtrain,
        NUM_BOOST_ROUND,
        valid_sets=[dtrain, dvalid],
        verbose_eval=200,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS)

    # Score the held-out rows with the best iteration found by early stopping.
    oof[valid_index] = model.predict(X_valid, num_iteration=model.best_iteration)
    fold_rmse.append(np.sqrt(mean_squared_error(y_valid, oof[valid_index])))
    models.append(model)
    print(f'fold {fold_n}: rmse = {fold_rmse[-1]:.5f}')

# `model` and `valid_index` still refer to the final fold after the loop.
print(f'CV rmse = {np.sqrt(mean_squared_error(y, oof)):.5f}')

# SHAP

In [None]:
# Explain `model` (the booster left bound after the fold loop) on the rows
# selected by `valid_index`, and plot up to 300 features.
explainer = shap.TreeExplainer(model)

X_valid_last = X.iloc[valid_index]
shap_values = explainer.shap_values(X_valid_last)
shap.summary_plot(shap_values, X_valid_last, max_display=300)

# libFFM

In [None]:
import os
import gc

import pandas as pd
import numpy as np
import pickle as pkl
from datetime import date
from tqdm import tqdm

from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

#==============================================================================
# Stem of the pickle file name this section writes its result to.
PREF = 'f501'

# Name of the card identifier column.
KEY = 'card_id'

# =============================================================================
# def
# =============================================================================

def get_embed(x_input, x_size, k_latent):
    """Map one input tensor to a ``k_latent``-wide representation.

    When ``x_size`` is positive the input goes through an ``Embedding`` with
    ``x_size`` rows whose output is flattened; otherwise it goes through a
    ``Dense`` layer. Both are L2-regularised with the module-level
    ``embedding_reg``, which is looked up when this function is called.
    """
    if not x_size > 0:
        return Dense(k_latent, kernel_regularizer=l2(embedding_reg))(x_input)

    lookup = Embedding(x_size, k_latent, input_length=1,
                       embeddings_regularizer=l2(embedding_reg))
    return Flatten()(lookup(x_input))


def build_model_1(X, fsize):
    """Build the factorisation network and a companion feature-extraction model.

    ``fsize`` holds one size per input field; one ``(1,)``-shaped input is
    created for each. ``X`` is accepted but not used. The latent width and the
    output-layer regulariser come from the module-level ``k_latent`` and
    ``kernel_reg``.

    Returns ``(model, model_features)``: ``model`` is compiled with MSE loss
    and Adam(clipnorm=0.5); ``model_features`` shares the inputs and outputs
    all factor tensors followed by all bias tensors.
    """
    field_inputs = [Input(shape=(1,)) for _ in range(len(fsize))]

    # Width-1 representation per field (bias), created before the factors.
    bias_terms = []
    for tensor, size in zip(field_inputs, fsize):
        bias_terms.append(get_embed(tensor, size, 1))

    # Width-k_latent representation per field (factor).
    latent = []
    for tensor, size in zip(field_inputs, fsize):
        latent.append(get_embed(tensor, size, k_latent))

    total = Add()(latent)

    # For each field: (sum of all factors - own factor) . own factor.
    others = [Subtract()([total, vec]) for vec in latent]
    interactions = [Dot(axes=1)([rest, vec]) for rest, vec in zip(others, latent)]

    merged = Concatenate()(bias_terms + interactions)
    merged = BatchNormalization()(merged)
    prediction = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(merged)

    model = Model(inputs=field_inputs, outputs=[prediction])
    model.compile(optimizer=Adam(clipnorm=0.5), loss='mean_squared_error')

    model_features = Model(inputs=field_inputs, outputs=latent + bias_terms)
    return model, model_features

# =============================================================================
# main
# =============================================================================


train = pd.read_csv('../remove_outlier_data/train.csv')
test = pd.read_csv('../remove_outlier_data/test.csv')

# Feature pickles to merge, in merge order.
features = [
    *(f'f10{i}.pkl' for i in (2, 3)),
    *(f'f11{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),
    *(f'f12{i}.pkl' for i in (1, 2)),

    *(f'f20{i}.pkl' for i in (2, 3)),
    *(f'f21{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),

    *(f'f40{i}.pkl' for i in (2, 3)),
    *(f'f41{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')),
    *(f'f42{i}.pkl' for i in (1, 2)),
]

# Left-join each feature file onto both frames by card_id.
for fname in tqdm(features):
    feat = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', fname))
    train = train.merge(feat, on='card_id', how='left')
    test = test.merge(feat, on='card_id', how='left')

# Stack train on top of test, then release the two separate frames.
df = pd.concat([train, test], axis=0, sort=False)
del train, test
gc.collect()

In [None]:
# Preview the first rows of the concatenated train + test frame.
df.head()

In [None]:
# List every column available after the feature merges.
df.columns

In [None]:
# Print the name of every column whose dtype string contains 'int'.
for name, dtype in df.dtypes.items():
    if 'int' in str(dtype):
        print('int:', name)

In [None]:
# Seed NumPy's global random generator for this section.
SEED = 18
np.random.seed(SEED)

# Columns whose value combinations are counted and modelled below.
# The commented-out entries are currently excluded.
features = [
#     'feature_1', 'feature_2', 'feature_3', 
#     'days_feature_1', 'days_feature_2', 'days_feature_3',
    'hist_category_1_sum', 'hist_category_2_nunique', 'hist_category_3_nunique',
    'Y_new_auth_category_1_sum', 'Y_new_auth_category_2_nunique', 'Y_new_auth_category_3_nunique',
    'union_category_1_sum', 'union_category_2_nunique', 'union_category_3_nunique',
]

# One size per feature: the largest value in df plus one. Passed to
# build_model_1 as `fsize`.
fsize = [int(df[f].max()) + 1 for f in features]

In [None]:
# Number of cards for each observed combination of the selected features.
X = df.groupby(features)['card_id'].count()

# Pivot the last grouping level out and back so that combinations missing
# along that level appear with a count of 0, then use log1p(count) as target.
X = X.unstack().fillna(0)
X = X.stack().astype('float32')
X = np.log1p(X).reset_index()
X.columns = features + ['num']

# One 1-D array per feature: the model takes a list of inputs.
X_train = [X[f].values for f in features]
y_train = (X[['num']].values).astype('float32')

# Hyper-parameters. get_embed / build_model_1 read k_latent, embedding_reg
# and kernel_reg as module-level names, so they must be set before the build.
k_latent = 3
embedding_reg = 0.0002
kernel_reg = 0.1

n_epochs = 20
batch_size = 2 ** 17

# Build the network a single time; each build allocates a full set of layers.
model, model_features = build_model_1(X_train, fsize)
earlystopper = EarlyStopping(patience=0, verbose=50)

# NOTE(review): validation_data is the training data itself, so early stopping
# monitors the training fit rather than held-out error -- confirm this is intended.
history = model.fit(
    X_train,  y_train,
    epochs=n_epochs, batch_size=batch_size, verbose=1, shuffle=True,
    validation_data=(X_train, y_train),
    callbacks=[earlystopper],
)

X_pred = model_features.predict(X_train, batch_size=batch_size)

# model_features outputs all factor tensors first, then all bias tensors.
factors = X_pred[:len(features)]
biases = X_pred[len(features):2*len(features)]

# Store each latent dimension of each feature's factor as its own column.
for f, X_p in zip(features, factors):
    for i in range(k_latent):
        X['%s_fm_factor_%d' % (f, i)] = X_p[:, i]

In [None]:
# Rows = feature combinations, columns = features + 'num' + factor columns.
X.shape

In [None]:
# Size of the concatenated train + test frame.
df.shape

In [None]:
# Preview the combination table with its factor columns.
X.head()

In [None]:
# Attach the factor columns to every card. X is keyed by the columns listed
# in `features` (X.columns = features + ['num'] + factor columns), so those
# are the join keys; feature_1..3 are not columns of X.
df = pd.merge(df[['card_id'] + features], X, on=features, how='left')

# The join keys already exist in the other feature files; keep only card_id
# plus the columns derived here.
df = df.drop(features, axis=1)
df.to_pickle(f'../remove_outlier_feature/{PREF}.pkl')

# FFM

In [None]:
import numpy as np
from sklearn.base import BaseEstimator
from keras.layers import Input, Embedding, Dense,Flatten, merge,Activation
from keras.models import Model
from keras.regularizers import l2 as l2_reg
from keras import initializers
import itertools


def make_batches(size, batch_size):
    """Return consecutive ``(start, stop)`` index pairs that cover ``range(size)``.

    Every pair spans ``batch_size`` indices except possibly the last, whose
    ``stop`` is clipped to ``size``. An empty list is returned when ``size``
    is 0.
    """
    n_batches = int(np.ceil(size / float(batch_size)))
    bounds = []
    for k in range(n_batches):
        bounds.append((k * batch_size, min(size, (k + 1) * batch_size)))
    return bounds


def batch_generator(X,y,batch_size=128,shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs forever, one pass over the rows at a time.

    ``X`` is a sequence of arrays sharing their first dimension and ``y`` is
    indexed by the same row ids. With ``shuffle`` the row order is reshuffled
    in place (via ``np.random.shuffle``) at the start of every pass.
    """
    n_rows = X[0].shape[0]
    order = np.arange(n_rows)
    while True:
        if shuffle:
            np.random.shuffle(order)
        for start, stop in make_batches(n_rows, batch_size):
            rows = order[start:stop]
            yield [column[rows] for column in X], y[rows]


def test_batch_generator(X,y,batch_size=128):
    """Yield ``(X_batch, y_batch)`` pairs for a single in-order pass over the rows.

    ``X`` is a sequence of arrays sharing their first dimension and ``y`` is
    indexed by the same row ids.
    """
    n_rows = X[0].shape[0]
    order = np.arange(n_rows)
    for start, stop in make_batches(n_rows, batch_size):
        rows = order[start:stop]
        yield [column[rows] for column in X], y[rows]


def predict_batch(model,X_t,batch_size=128):
    """Predict ``X_t`` batch by batch and return the results as one flat array.

    A zero-filled array stands in for the targets that the batch generator
    expects; it is not used for the prediction.
    """
    placeholder_y = np.zeros(X_t[0].shape[0])
    chunks = [
        model.predict(X_batch, batch_size=batch_size)
        for X_batch, _ in test_batch_generator(X_t, placeholder_y, batch_size=batch_size)
    ]
    return np.concatenate(chunks).ravel()



def build_model(max_features,K=8,solver='adam',l2=0.0,l2_fm = 0.0):
    """Build and compile a second-order factorisation machine over integer inputs.

    Parameters
    ----------
    max_features : sequence of int
        Embedding size of each field; one int32 input of shape ``(1,)`` named
        ``input_<c>`` is created per entry.
    K : int
        Width of the latent embedding of every field.
    solver : str or optimizer
        Passed to ``Model.compile`` as the optimizer.
    l2 : float
        L2 penalty on the width-1 (linear) embeddings.
    l2_fm : float
        L2 penalty on the latent embeddings.

    Returns
    -------
    A compiled model whose output is the sigmoid of the sum of all pairwise
    latent dot products and all linear terms, trained with binary
    cross-entropy.

    Raises
    ------
    ValueError
        If ``max_features`` is empty.
    """
    # Imported here so the function is self-contained: the surrounding cell
    # only imports the Keras 1 `merge` helper, which Keras 2 no longer offers
    # as a callable.
    from keras.layers import Add, Dot

    if len(max_features) == 0:
        raise ValueError('max_features must describe at least one field')

    columns = range(len(max_features))

    inputs = []
    flatten_layers = []
    for c in columns:
        inputs_c = Input(shape=(1,), dtype='int32', name='input_%s' % c)
        embed_c = Embedding(
            max_features[c],
            K,
            input_length=1,
            name='embed_%s' % c,
            embeddings_regularizer=l2_reg(l2_fm),
        )(inputs_c)
        inputs.append(inputs_c)
        flatten_layers.append(Flatten()(embed_c))

    # Second-order terms: one dot product per unordered pair of fields.
    fm_layers = [
        Dot(axes=1)([emb1, emb2])
        for emb1, emb2 in itertools.combinations(flatten_layers, 2)
    ]

    # First-order terms: a width-1 embedding per field.
    for c in columns:
        linear_c = Embedding(
            max_features[c],
            1,
            input_length=1,
            name='linear_%s' % c,
            embeddings_regularizer=l2_reg(l2),
        )(inputs[c])
        fm_layers.append(Flatten()(linear_c))

    # Add needs at least two tensors; a single field has only its linear term.
    flatten = fm_layers[0] if len(fm_layers) == 1 else Add()(fm_layers)
    outputs = Activation('sigmoid', name='outputs')(flatten)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=solver, loss='binary_crossentropy')
    return model

class KerasFM(BaseEstimator):
    """Estimator-style wrapper around the network returned by ``build_model``."""

    def __init__(self,max_features=[],K=8,solver='adam',l2=0.0,l2_fm = 0.0):
        # Keep every constructor argument under its own name so that
        # BaseEstimator.get_params() and repr() can read them back.
        # max_features is only read, never mutated, so the shared default
        # list is not modified.
        self.max_features = max_features
        self.K = K
        self.solver = solver
        self.l2 = l2
        self.l2_fm = l2_fm
        self.model = build_model(max_features,K,solver,l2=l2,l2_fm = l2_fm)

    def fit(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None):
        """Train on in-memory arrays for ``nb_epoch`` epochs and return ``self``.

        ``validation_data`` is forwarded to the underlying model unchanged.
        """
        self.model.fit(
            X, y,
            batch_size=batch_size,
            epochs=nb_epoch,
            shuffle=shuffle,
            verbose=verbose,
            validation_data=validation_data,
        )
        return self

    def fit_generator(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None,callbacks=None):
        """Train from ``batch_generator`` for ``nb_epoch`` epochs and return ``self``.

        Each epoch is one pass over ``X``. If ``validation_data`` is given as
        ``(X_test, y_test)`` it is evaluated, unshuffled, in one pass as well.
        """
        tr_gen = batch_generator(X,y,batch_size=batch_size,shuffle=shuffle)
        # Keras counts generator epochs in batches, not in samples.
        steps_per_epoch = len(make_batches(X[-1].shape[0], batch_size))

        if validation_data:
            X_test,y_test = validation_data
            te_gen = batch_generator(X_test,y_test,batch_size=batch_size,shuffle=False)
            validation_steps = len(make_batches(X_test[-1].shape[0], batch_size))
        else:
            te_gen = None
            validation_steps = None

        self.model.fit_generator(
            tr_gen,
            steps_per_epoch=steps_per_epoch,
            epochs=nb_epoch,
            verbose=verbose,
            callbacks=callbacks,
            validation_data=te_gen,
            validation_steps=validation_steps,
            max_queue_size=10,
        )
        return self

    def predict(self,X,batch_size=128):
        """Return flat predictions for ``X``, computed batch by batch."""
        return predict_batch(self.model,X,batch_size=batch_size)

In [2]:
# Directory the train/test CSVs are read from.
PATH = os.path.join('..', 'remove_outlier_data')

# Column the feature files are joined on.
KEY = 'card_id'

In [3]:
# Only the fX02 / fX03 files of groups 1, 2 and 4 are merged here; the
# f11*, f12*, f21*, f41* and f42* files used earlier in the notebook are
# left out of this run.
features = [f'f{group}0{i}.pkl' for group in (1, 2, 4) for i in (2, 3)]

In [4]:
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

# Left-join each feature pickle onto both frames by KEY, keeping every row.
for fname in tqdm(features):
    feat = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', fname))
    train = train.merge(feat, on=KEY, how='left')
    test = test.merge(feat, on=KEY, how='left')

100%|██████████| 6/6 [00:03<00:00,  1.74it/s]


In [5]:
# Purchase-date columns that may be present after the merges (including the
# _x/_y variants pandas produces when two merged frames share a column name).
date_columns = (
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max_x', 'Y_new_auth_purchase_date_min_x',
    'N_new_auth_purchase_date_max_x', 'N_new_auth_purchase_date_min_x',
    'Y_new_auth_purchase_date_max_y', 'Y_new_auth_purchase_date_min_y',
    'N_new_auth_purchase_date_max_y', 'N_new_auth_purchase_date_min_y',
)

# Presence is checked against train only; test is converted for the same names.
cols = train.columns.values
for f in date_columns:
    if f not in cols:
        continue
    # Reinterpret as int64 and scale by 1e-9.
    # presumably the columns are datetime64[ns], making this epoch seconds -- TODO confirm
    train[f] = train[f].astype(np.int64) * 1e-9
    test[f] = test[f].astype(np.int64) * 1e-9

In [6]:
# Target column of the training frame.
y = train['target']

In [7]:
# Names of the train columns whose dtype string starts with 'int'.
int_cols = [name for name, dtype in train.dtypes.items() if str(dtype).startswith('int')]
len(int_cols)

39

In [10]:
# Show the integer column names as a one-column table.
pd.DataFrame(data=int_cols, columns=['ffm_columns'])

Unnamed: 0,ffm_columns
0,feature_1
1,feature_2
2,feature_3
3,first_active_month_year
4,first_active_month_weekday
5,first_active_month_month
6,first_active_month_weekofyear
7,first_active_month_quarter
8,first_active_month_is_month_start
9,elapsed_time


In [None]:
# Largest training value of each integer column, plus one.
# NOTE(review): sizes come from train only; confirm test never holds a larger value.
max_features = []
for col in int_cols:
    max_features.append(train[col].max() + 1)
len(max_features)

In [None]:
# Per integer column: number of distinct values in train and in test.
for col in int_cols:
    print(col, train[col].nunique(dropna=False), test[col].nunique(dropna=False))

In [None]:
# Integer columns and target of train as plain NumPy arrays.
# Note that `y` is rebound here from a Series to an array.
x = train[int_cols].values
y = train['target'].values

In [None]:
# The export further down reads a 'target' value from every row of test, so
# give test a constant placeholder column. DataFrame.insert raises if the
# column already exists; the guard makes the cell safe to run again.
if 'target' not in test.columns:
    test.insert(1,'target',0)

In [None]:
# Check that the placeholder target column is in place.
test.head()

In [None]:
def _write_ffm(frame, path, catdict, catcodes, currentcode):
    """Write ``frame`` to ``path`` as ``label field:index:value`` lines.

    Parameters
    ----------
    frame : DataFrame holding a 'target' column and every column in ``catdict``.
    path : output file name (overwritten).
    catdict : maps column name -> 0 (numeric) or 1 (categorical); the position
        of a column in this dict is its field number.
    catcodes : maps column name -> {value: code}. Updated in place so that a
        later call gives the same value the same code.
    currentcode : last code handed out so far.

    Returns the last code handed out after writing ``frame``.
    """
    with open(path, "w") as text_file:
        for n in range(frame.shape[0]):
            if n % 100000 == 0:
                print('Row', n)
            datarow = frame.iloc[n].to_dict()

            # NOTE(review): int() truncates the label; if 'target' is a
            # continuous score this loses information -- confirm intended.
            tokens = [str(int(datarow['target']))]

            for i, col in enumerate(catdict):
                if catdict[col] == 0:
                    # Numeric column: field and feature index are both i.
                    tokens.append(str(i) + ":" + str(i) + ":" + str(datarow[col]))
                else:
                    # Categorical column: hand out a new code on first sight.
                    codes = catcodes.setdefault(col, {})
                    if datarow[col] not in codes:
                        currentcode += 1
                        codes[datarow[col]] = currentcode
                    tokens.append(str(i) + ":" + str(int(codes[datarow[col]])) + ":1")

            text_file.write(" ".join(tokens) + '\n')
    return currentcode


categories = int_cols
numerics = []

# Numeric columns come first so they occupy the lowest field numbers.
catdict = {}
for col in numerics:
    catdict[col] = 0
for col in categories:
    catdict[col] = 1

# Codes are shared between both files: train is written first, and test
# continues numbering from where train stopped.
catcodes = {}
currentcode = len(numerics)
currentcode = _write_ffm(train, "alltrainffm.txt", catdict, catcodes, currentcode)
currentcode = _write_ffm(test, "alltestffm.txt", catdict, catcodes, currentcode)

In [None]:
import numpy as np
import xlearn as xl
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Build the design matrix from the integer columns of train.
# Note that X and y are rebound here.
X = train[int_cols].values
y = train['target'].values

# Hold out 30% of the rows for evaluation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

# param:
#  0. task: regression ('reg'), evaluated with rmse
#  1. model scale (init): 0.1
#  2. epoch number: 10 (auto early-stop)
#  3. learning rate: 0.1
#  4. regular lambda: 1.0
#  5. use sgd optimization method
linear_model = xl.LRModel(task='reg', init=0.1,
                          epoch=10, lr=0.1,
                          reg_lambda=1.0, opt='sgd', metric='rmse')

# Start to train
linear_model.fit(X_train, y_train,
                 eval_set=[X_val, y_val],
                 is_lock_free=False)

# Generate predictions
y_pred = linear_model.predict(X_val)

In [None]:
# Inspect the predictions for the held-out rows.
y_pred

In [None]:
import xlearn as xl

# Training task
ffm_model = xl.create_ffm()                # Use field-aware factorization machine (ffm)
# Must match the file name written by the export cell ("alltrainffm.txt").
ffm_model.setTrain("./alltrainffm.txt")    # Path of training data

# param:
#  0. task: binary classification
#  1. learning rate : 0.2
#  2. regular lambda : 0.002
# NOTE(review): the linear model above is trained with task='reg' on the same
# target; confirm 'binary' is the intended task here.
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}

# Train model
ffm_model.fit(param, "./model.out")

In [12]:
# Load the saved list of FFM columns.
ffm_cols = pd.read_csv('../remove_outlier_py/ffm/ffm_cols.csv')

In [16]:
# Show the 'ffm_cols' column as a plain Python list.
list(ffm_cols['ffm_cols'].values)

In [18]:
import os
import sys
import gc
import numpy as np
import pandas as pd

from tqdm import tqdm
import datetime
from sklearn.preprocessing import LabelEncoder
from multiprocessing import cpu_count, Pool

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
NTHREAD = cpu_count()
PREF = 'f103'
KEY = 'card_id'
PATH = os.path.join('..', 'remove_outlier_data')

prefix = 'hist_'
key = ['card_id', 'month_lag']
stats = ['min', 'max', 'mean', 'std']
# Per-column aggregations applied to each (card_id, month_lag) group.
num_aggregations = dict(
    purchase_amount=stats,
    installments=['mean', 'sum', 'std'],
)

# ---------------------------------------------------------------------------
# Load the historical transactions, cast `installments` to int and keep only
# the rows whose installments value lies in the closed range [0, 12].
# ---------------------------------------------------------------------------
historical_transactions = (
    pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))
    .assign(installments=lambda frame: frame['installments'].astype(int))
    .loc[lambda frame: frame['installments'].between(0, 12)]
)

# Lazy groupby handle; the aggregation itself runs in the next cell.
grouped = historical_transactions.groupby(key)

In [None]:
# Stage 1: per (card_id, month_lag) statistics. The two-level column index
# (source column, statistic) is flattened to "<column>_<statistic>".
agg = grouped.agg(num_aggregations)
agg.columns = ['_'.join(parts).strip() for parts in agg.columns.tolist()]
agg = agg.reset_index()

# Stage 2: summarise the monthly statistics per card with mean and std.
# month_lag is an ordinary column at this point, so it is summarised as well.
agg = agg.groupby('card_id').agg(['mean', 'std'])
agg.columns = [prefix + '_'.join(parts).strip() for parts in agg.columns.tolist()]

# reset_index() restores the grouping key as a column named after the index;
# the rename only has an effect if a prefixed key column (prefix + KEY) exists.
agg = agg.reset_index().rename(columns={prefix + KEY: KEY})

In [68]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_diff
0,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,-1
1,1,C_ID_4e6213e9bc,88,1,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,-1
2,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,-1
3,1,C_ID_4e6213e9bc,88,1,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,0
4,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,-1


In [19]:
agg.head()

Unnamed: 0,card_id,hist_month_lag_mean,hist_month_lag_std,hist_purchase_amount_min_mean,hist_purchase_amount_min_std,hist_purchase_amount_max_mean,hist_purchase_amount_max_std,hist_purchase_amount_mean_mean,hist_purchase_amount_mean_std,hist_purchase_amount_std_mean,hist_purchase_amount_std_std,hist_installments_mean_mean,hist_installments_mean_std,hist_installments_sum_mean,hist_installments_sum_std,hist_installments_std_mean,hist_installments_std_std
0,C_ID_00007093c1,-6.0,3.89444,-0.704152,0.041908,-0.027161,0.523539,-0.507947,0.097299,0.228582,0.162266,1.34848,0.357031,14.769231,6.431094,0.625616,0.609378
1,C_ID_0001238066,-2.5,1.870829,-0.730351,0.002699,-0.160977,0.485869,-0.61078,0.031274,0.145298,0.082009,1.684496,0.328656,33.5,25.137621,1.216746,0.509988
2,C_ID_0001506ef0,-6.230769,4.225988,-0.725724,0.011212,0.161333,0.748701,-0.461749,0.259737,0.478896,0.392598,0.012821,0.046225,0.076923,0.27735,0.034021,0.117851
3,C_ID_0001793786,-4.5,3.02765,-0.647028,0.166706,2.190924,1.290775,-0.042211,0.277565,0.85143,0.227191,0.014621,0.024394,0.5,0.849837,0.065065,0.105721
4,C_ID_000183fdda,-3.0,2.160247,-0.687901,0.107543,1.034983,1.018223,-0.425236,0.2176,0.468297,0.296189,2.061905,0.539997,38.285714,15.934389,2.154613,0.616657


In [71]:
# Parse the purchase timestamps so the .dt accessor is available.
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])

# month_diff = whole days from the purchase day to the fixed reference date
# 2018-02-01, floor-divided into 30-day buckets, then shifted by month_lag.
#
# The day difference is taken on datetime64 values (normalize() drops the time
# of day, so the result equals a date-to-date difference) rather than on Python
# date objects. This avoids the slow object-dtype path and, more importantly,
# no longer depends on the name `datetime` being bound to the module: the
# notebook's first cell does `from datetime import datetime, date`, under which
# `datetime.date(2018, 2, 1)` fails unless a later `import datetime` was run.
historical_transactions['month_diff'] = (
    pd.Timestamp('2018-02-01') - historical_transactions['purchase_date'].dt.normalize()
).dt.days // 30  # TODO: change today
historical_transactions['month_diff'] += historical_transactions['month_lag']

# One row per card, one column per (aggfunc, 'purchase_amount', month_lag).
pt = historical_transactions.pivot_table(
    index='card_id', 
    columns=['month_lag'], 
    values=['purchase_amount'], aggfunc=['sum', 'mean', 'count'])

In [72]:
# Replace the NaN cells left by the pivot with 0 and move the card_id index
# into a regular column.
# NOTE(review): this rebinds `pt` from itself, so re-running the cell applies
# reset_index() a second time -- re-create `pt` first if the cell must be rerun.
pt = pt.fillna(0).reset_index()

In [73]:
# Flatten the three column levels into "<level0>_<level1>_<level2>".
# strip('_') removes the underscores left by empty levels (e.g. the card_id
# column), and replace('-', '') drops the minus sign, so a month_lag of -13
# becomes the suffix "13" and lags differing only in sign would share a name.
pt.columns = [f'{c[0]}_{c[1]}_{c[2]}'.strip('_').replace('-', '') for c in pt.columns]

In [74]:
pt.head()

Unnamed: 0,card_id,sum_purchase_amount_13,sum_purchase_amount_12,sum_purchase_amount_11,sum_purchase_amount_10,sum_purchase_amount_9,sum_purchase_amount_8,sum_purchase_amount_7,sum_purchase_amount_6,sum_purchase_amount_5,sum_purchase_amount_4,sum_purchase_amount_3,sum_purchase_amount_2,sum_purchase_amount_1,sum_purchase_amount_0,mean_purchase_amount_13,mean_purchase_amount_12,mean_purchase_amount_11,mean_purchase_amount_10,mean_purchase_amount_9,mean_purchase_amount_8,mean_purchase_amount_7,mean_purchase_amount_6,mean_purchase_amount_5,mean_purchase_amount_4,mean_purchase_amount_3,mean_purchase_amount_2,mean_purchase_amount_1,mean_purchase_amount_0,count_purchase_amount_13,count_purchase_amount_12,count_purchase_amount_11,count_purchase_amount_10,count_purchase_amount_9,count_purchase_amount_8,count_purchase_amount_7,count_purchase_amount_6,count_purchase_amount_5,count_purchase_amount_4,count_purchase_amount_3,count_purchase_amount_2,count_purchase_amount_1,count_purchase_amount_0
0,C_ID_00007093c1,0.0,-1.334414,-5.712629,-6.508688,-3.425248,-9.868677,-9.238465,-6.402827,-2.946293,-10.423035,-3.068579,-5.9798,-5.686047,-6.250338,0.0,-0.333604,-0.571263,-0.542391,-0.342525,-0.519404,-0.615898,-0.376637,-0.589259,-0.61312,-0.51143,-0.498317,-0.568605,-0.520861,0.0,4.0,10.0,12.0,10.0,19.0,15.0,17.0,5.0,17.0,6.0,12.0,10.0,12.0
1,C_ID_0001238066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.302784,-6.798941,-14.426698,-21.442877,-13.060248,-14.894008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.651392,-0.618086,-0.627248,-0.579537,-0.567837,-0.620584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,23.0,37.0,23.0,24.0
2,C_ID_0001506ef0,-1.41718,-3.45733,-5.544708,0.0,-0.701828,-2.203352,0.034381,-0.217313,-0.336023,-0.872199,-3.591547,-11.071218,-3.176785,-2.046776,-0.70859,-0.691466,-0.693088,0.0,-0.701828,-0.734451,0.01719,-0.108657,-0.168011,-0.290733,-0.513078,-0.615068,-0.453826,-0.341129,2.0,5.0,8.0,0.0,1.0,3.0,2.0,2.0,2.0,3.0,7.0,18.0,7.0,6.0
3,C_ID_0001793786,0.0,0.0,0.0,0.0,0.414296,1.147469,-0.846067,-10.712039,2.091705,-5.934751,-0.66593,-10.843852,-5.938868,-5.497975,0.0,0.0,0.0,0.0,0.207148,0.191245,-0.052879,-0.357068,0.522926,-0.160399,-0.022963,-0.318937,-0.156286,-0.274899,0.0,0.0,0.0,0.0,2.0,6.0,16.0,30.0,4.0,37.0,29.0,34.0,38.0,20.0
4,C_ID_000183fdda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.22684,-13.915016,-15.177363,-3.359763,-12.340417,-6.583821,-15.638962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.037807,-0.632501,-0.607095,-0.373307,-0.514184,-0.253224,-0.558534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,22.0,25.0,9.0,24.0,26.0,28.0


In [75]:
pt.shape

(325540, 43)

In [76]:
# Tag every column with 'hist_', then restore the join key's original name.
pt = pt.add_prefix('hist_').rename(columns={'hist_card_id': 'card_id'})

In [77]:
pt.head()

Unnamed: 0,card_id,hist_sum_purchase_amount_13,hist_sum_purchase_amount_12,hist_sum_purchase_amount_11,hist_sum_purchase_amount_10,hist_sum_purchase_amount_9,hist_sum_purchase_amount_8,hist_sum_purchase_amount_7,hist_sum_purchase_amount_6,hist_sum_purchase_amount_5,hist_sum_purchase_amount_4,hist_sum_purchase_amount_3,hist_sum_purchase_amount_2,hist_sum_purchase_amount_1,hist_sum_purchase_amount_0,hist_mean_purchase_amount_13,hist_mean_purchase_amount_12,hist_mean_purchase_amount_11,hist_mean_purchase_amount_10,hist_mean_purchase_amount_9,hist_mean_purchase_amount_8,hist_mean_purchase_amount_7,hist_mean_purchase_amount_6,hist_mean_purchase_amount_5,hist_mean_purchase_amount_4,hist_mean_purchase_amount_3,hist_mean_purchase_amount_2,hist_mean_purchase_amount_1,hist_mean_purchase_amount_0,hist_count_purchase_amount_13,hist_count_purchase_amount_12,hist_count_purchase_amount_11,hist_count_purchase_amount_10,hist_count_purchase_amount_9,hist_count_purchase_amount_8,hist_count_purchase_amount_7,hist_count_purchase_amount_6,hist_count_purchase_amount_5,hist_count_purchase_amount_4,hist_count_purchase_amount_3,hist_count_purchase_amount_2,hist_count_purchase_amount_1,hist_count_purchase_amount_0
0,C_ID_00007093c1,0.0,-1.334414,-5.712629,-6.508688,-3.425248,-9.868677,-9.238465,-6.402827,-2.946293,-10.423035,-3.068579,-5.9798,-5.686047,-6.250338,0.0,-0.333604,-0.571263,-0.542391,-0.342525,-0.519404,-0.615898,-0.376637,-0.589259,-0.61312,-0.51143,-0.498317,-0.568605,-0.520861,0.0,4.0,10.0,12.0,10.0,19.0,15.0,17.0,5.0,17.0,6.0,12.0,10.0,12.0
1,C_ID_0001238066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.302784,-6.798941,-14.426698,-21.442877,-13.060248,-14.894008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.651392,-0.618086,-0.627248,-0.579537,-0.567837,-0.620584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,23.0,37.0,23.0,24.0
2,C_ID_0001506ef0,-1.41718,-3.45733,-5.544708,0.0,-0.701828,-2.203352,0.034381,-0.217313,-0.336023,-0.872199,-3.591547,-11.071218,-3.176785,-2.046776,-0.70859,-0.691466,-0.693088,0.0,-0.701828,-0.734451,0.01719,-0.108657,-0.168011,-0.290733,-0.513078,-0.615068,-0.453826,-0.341129,2.0,5.0,8.0,0.0,1.0,3.0,2.0,2.0,2.0,3.0,7.0,18.0,7.0,6.0
3,C_ID_0001793786,0.0,0.0,0.0,0.0,0.414296,1.147469,-0.846067,-10.712039,2.091705,-5.934751,-0.66593,-10.843852,-5.938868,-5.497975,0.0,0.0,0.0,0.0,0.207148,0.191245,-0.052879,-0.357068,0.522926,-0.160399,-0.022963,-0.318937,-0.156286,-0.274899,0.0,0.0,0.0,0.0,2.0,6.0,16.0,30.0,4.0,37.0,29.0,34.0,38.0,20.0
4,C_ID_000183fdda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.22684,-13.915016,-15.177363,-3.359763,-12.340417,-6.583821,-15.638962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.037807,-0.632501,-0.607095,-0.373307,-0.514184,-0.253224,-0.558534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,22.0,25.0,9.0,24.0,26.0,28.0


In [78]:
pt.columns

Index(['card_id', 'hist_sum_purchase_amount_13', 'hist_sum_purchase_amount_12',
       'hist_sum_purchase_amount_11', 'hist_sum_purchase_amount_10',
       'hist_sum_purchase_amount_9', 'hist_sum_purchase_amount_8',
       'hist_sum_purchase_amount_7', 'hist_sum_purchase_amount_6',
       'hist_sum_purchase_amount_5', 'hist_sum_purchase_amount_4',
       'hist_sum_purchase_amount_3', 'hist_sum_purchase_amount_2',
       'hist_sum_purchase_amount_1', 'hist_sum_purchase_amount_0',
       'hist_mean_purchase_amount_13', 'hist_mean_purchase_amount_12',
       'hist_mean_purchase_amount_11', 'hist_mean_purchase_amount_10',
       'hist_mean_purchase_amount_9', 'hist_mean_purchase_amount_8',
       'hist_mean_purchase_amount_7', 'hist_mean_purchase_amount_6',
       'hist_mean_purchase_amount_5', 'hist_mean_purchase_amount_4',
       'hist_mean_purchase_amount_3', 'hist_mean_purchase_amount_2',
       'hist_mean_purchase_amount_1', 'hist_mean_purchase_amount_0',
       'hist_count_purchase_a

In [82]:
# Columns that will make up the exported feature frame; starts with the key.
use_cols = ['card_id']

# Monthly purchase-amount sums, in the order in which they are accumulated.
cols = [ 
    'hist_sum_purchase_amount_13', 'hist_sum_purchase_amount_12', 
    'hist_sum_purchase_amount_11', 'hist_sum_purchase_amount_10',
    'hist_sum_purchase_amount_9', 'hist_sum_purchase_amount_8',
    'hist_sum_purchase_amount_7', 'hist_sum_purchase_amount_6',
    'hist_sum_purchase_amount_5', 'hist_sum_purchase_amount_4',
    'hist_sum_purchase_amount_3', 'hist_sum_purchase_amount_2',
    'hist_sum_purchase_amount_1', 'hist_sum_purchase_amount_0',
]

# Running total across the month columns, computed in a single vectorised pass.
# The previous version called a row-wise apply(np.sum, axis=1) on a growing
# column subset for every step, i.e. one full Python-level pass over all rows
# per output column. fillna(0) keeps the old NaN handling (NaN counted as 0).
cumsum_cols = list(cols)
running_sum = pt[cumsum_cols].fillna(0).cumsum(axis=1)

# Output column e holds the sum of cols[0..e].
# The 'cumusum' spelling is kept on purpose: the f131.pkl listing printed later
# in this notebook shows the columns were saved under exactly these names.
for e, c in enumerate(cols):
    name = 'hist_cumusum_sum_purchase_amount' + str(e)
    pt[name] = running_sum[c]
    use_cols.append(name)

In [86]:
pt.head()

Unnamed: 0,card_id,hist_sum_purchase_amount_13,hist_sum_purchase_amount_12,hist_sum_purchase_amount_11,hist_sum_purchase_amount_10,hist_sum_purchase_amount_9,hist_sum_purchase_amount_8,hist_sum_purchase_amount_7,hist_sum_purchase_amount_6,hist_sum_purchase_amount_5,hist_sum_purchase_amount_4,hist_sum_purchase_amount_3,hist_sum_purchase_amount_2,hist_sum_purchase_amount_1,hist_sum_purchase_amount_0,hist_mean_purchase_amount_13,hist_mean_purchase_amount_12,hist_mean_purchase_amount_11,hist_mean_purchase_amount_10,hist_mean_purchase_amount_9,hist_mean_purchase_amount_8,hist_mean_purchase_amount_7,hist_mean_purchase_amount_6,hist_mean_purchase_amount_5,hist_mean_purchase_amount_4,hist_mean_purchase_amount_3,hist_mean_purchase_amount_2,hist_mean_purchase_amount_1,hist_mean_purchase_amount_0,hist_count_purchase_amount_13,hist_count_purchase_amount_12,hist_count_purchase_amount_11,hist_count_purchase_amount_10,hist_count_purchase_amount_9,hist_count_purchase_amount_8,hist_count_purchase_amount_7,hist_count_purchase_amount_6,hist_count_purchase_amount_5,hist_count_purchase_amount_4,hist_count_purchase_amount_3,hist_count_purchase_amount_2,hist_count_purchase_amount_1,hist_count_purchase_amount_0,hist_cumusum_purchase_amount0,hist_cumusum_purchase_amount1,hist_cumusum_purchase_amount2,hist_cumusum_purchase_amount3,hist_cumusum_purchase_amount4,hist_cumusum_purchase_amount5,hist_cumusum_purchase_amount6,hist_cumusum_purchase_amount7,hist_cumusum_purchase_amount8,hist_cumusum_purchase_amount9,hist_cumusum_purchase_amount10,hist_cumusum_purchase_amount11,hist_cumusum_purchase_amount12,hist_cumusum_purchase_amount13
0,C_ID_00007093c1,0.0,-1.334414,-5.712629,-6.508688,-3.425248,-9.868677,-9.238465,-6.402827,-2.946293,-10.423035,-3.068579,-5.9798,-5.686047,-6.250338,0.0,-0.333604,-0.571263,-0.542391,-0.342525,-0.519404,-0.615898,-0.376637,-0.589259,-0.61312,-0.51143,-0.498317,-0.568605,-0.520861,0.0,4.0,10.0,12.0,10.0,19.0,15.0,17.0,5.0,17.0,6.0,12.0,10.0,12.0,0.0,-1.334414,-7.047043,-13.555732,-16.98098,-26.849657,-36.088121,-42.490948,-45.437241,-55.860276,-58.928855,-64.908655,-70.594702,-76.84504
1,C_ID_0001238066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.302784,-6.798941,-14.426698,-21.442877,-13.060248,-14.894008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.651392,-0.618086,-0.627248,-0.579537,-0.567837,-0.620584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,23.0,37.0,23.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.302784,-8.101724,-22.528423,-43.9713,-57.031547,-71.925555
2,C_ID_0001506ef0,-1.41718,-3.45733,-5.544708,0.0,-0.701828,-2.203352,0.034381,-0.217313,-0.336023,-0.872199,-3.591547,-11.071218,-3.176785,-2.046776,-0.70859,-0.691466,-0.693088,0.0,-0.701828,-0.734451,0.01719,-0.108657,-0.168011,-0.290733,-0.513078,-0.615068,-0.453826,-0.341129,2.0,5.0,8.0,0.0,1.0,3.0,2.0,2.0,2.0,3.0,7.0,18.0,7.0,6.0,-1.41718,-4.87451,-10.419218,-10.419218,-11.121046,-13.324399,-13.290018,-13.507331,-13.843354,-14.715553,-18.3071,-29.378318,-32.555103,-34.601879
3,C_ID_0001793786,0.0,0.0,0.0,0.0,0.414296,1.147469,-0.846067,-10.712039,2.091705,-5.934751,-0.66593,-10.843852,-5.938868,-5.497975,0.0,0.0,0.0,0.0,0.207148,0.191245,-0.052879,-0.357068,0.522926,-0.160399,-0.022963,-0.318937,-0.156286,-0.274899,0.0,0.0,0.0,0.0,2.0,6.0,16.0,30.0,4.0,37.0,29.0,34.0,38.0,20.0,0.0,0.0,0.0,0.0,0.414296,1.561765,0.715698,-9.996342,-7.904636,-13.839387,-14.505317,-25.349169,-31.288037,-36.786013
4,C_ID_000183fdda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.22684,-13.915016,-15.177363,-3.359763,-12.340417,-6.583821,-15.638962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.037807,-0.632501,-0.607095,-0.373307,-0.514184,-0.253224,-0.558534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,22.0,25.0,9.0,24.0,26.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.22684,-14.141856,-29.319218,-32.678981,-45.019399,-51.60322,-67.242182


In [None]:
# Monthly transaction counts, in the order in which they are accumulated.
cols = [  
    'hist_count_purchase_amount_13', 'hist_count_purchase_amount_12',
    'hist_count_purchase_amount_11', 'hist_count_purchase_amount_10',
    'hist_count_purchase_amount_9', 'hist_count_purchase_amount_8',
    'hist_count_purchase_amount_7', 'hist_count_purchase_amount_6',
    'hist_count_purchase_amount_5', 'hist_count_purchase_amount_4',
    'hist_count_purchase_amount_3', 'hist_count_purchase_amount_2',
    'hist_count_purchase_amount_1', 'hist_count_purchase_amount_0']

# Running count across the month columns in one vectorised pass, replacing a
# per-step row-wise apply(np.sum, axis=1). fillna(0) keeps the old NaN handling
# (NaN counted as 0).
cumsum_cols = list(cols)
running_count = pt[cumsum_cols].fillna(0).cumsum(axis=1)

# Output column e holds the sum of cols[0..e]; each new column is also
# registered in use_cols (created by the purchase-amount cell).
for e, c in enumerate(cols):
    name = 'hist_cumsum_count_purchase_amount' + str(e)
    pt[name] = running_count[c]
    use_cols.append(name)

# The former trailing `use_cols += ['']` was removed: it registered an empty
# column name, which makes any later `pt[use_cols]` selection raise KeyError.

In [93]:
# Load the new-merchant transactions and parse the purchase timestamps.
new_merchant_transactions = pd.read_csv(os.path.join(PATH, 'new_merchant_transactions.csv'))
new_merchant_transactions['purchase_date'] = pd.to_datetime(new_merchant_transactions['purchase_date'])

# month_diff = whole days from the purchase day to the fixed reference date
# 2018-02-01, floor-divided into 30-day buckets, then shifted by month_lag.
#
# The difference is taken on datetime64 values (normalize() drops the time of
# day, so the result equals a date-to-date difference) rather than on Python
# date objects. This avoids the slow object-dtype path and removes the
# dependency on `datetime` being bound to the module: the notebook's first cell
# does `from datetime import datetime, date`, under which
# `datetime.date(2018, 2, 1)` fails unless a later `import datetime` was run.
new_merchant_transactions['month_diff'] = (
    pd.Timestamp('2018-02-01') - new_merchant_transactions['purchase_date'].dt.normalize()
).dt.days // 30  # TODO: change today
new_merchant_transactions['month_diff'] += new_merchant_transactions['month_lag']

# One row per card, one column per (aggfunc, value column, month_lag).
pt2 = new_merchant_transactions.pivot_table(
    index='card_id', 
    columns=['month_lag'], 
    values=['purchase_amount', 'installments'], aggfunc=['sum', 'mean', 'count'])

In [94]:
# Fill the NaN cells left by the pivot with 0 and expose card_id as a column.
pt2 = pt2.fillna(0).reset_index()

# Collapse the three column levels into one underscore-separated name, trimming
# the underscores left by empty levels and dropping any '-' sign.
pt2.columns = [
    '_'.join(str(level) for level in c).strip('_').replace('-', '')
    for c in pt2.columns
]

# Apply the shared prefix, then give the key column its plain name back.
# NOTE(review): `prefix` is 'hist_' here although pt2 is built from
# new_merchant_transactions, so names such as hist_sum_purchase_amount_1 also
# exist in `pt` -- confirm whether a distinct prefix was intended before merging.
pt2 = pt2.add_prefix(prefix).rename(columns={prefix + KEY: KEY})

In [95]:
pt2.head()

Unnamed: 0,card_id,hist_sum_installments_1,hist_sum_installments_2,hist_sum_purchase_amount_1,hist_sum_purchase_amount_2,hist_mean_installments_1,hist_mean_installments_2,hist_mean_purchase_amount_1,hist_mean_purchase_amount_2,hist_count_installments_1,hist_count_installments_2,hist_count_purchase_amount_1,hist_count_purchase_amount_2
0,C_ID_00007093c1,0.0,2.0,0.0,-1.3287,0.0,1.0,0.0,-0.66435,0.0,2.0,0.0,2.0
1,C_ID_0001238066,19.0,23.0,-9.7551,-5.0955,1.117647,2.555556,-0.573829,-0.566167,17.0,9.0,17.0,9.0
2,C_ID_0001506ef0,0.0,0.0,-1.4473,0.0,0.0,0.0,-0.72365,0.0,2.0,0.0,2.0,0.0
3,C_ID_0001793786,0.0,0.0,1.675014,-1.9055,0.0,0.0,0.079763,-0.19055,21.0,10.0,21.0,10.0
4,C_ID_000183fdda,15.0,1.0,-5.1108,-1.47967,1.875,0.333333,-0.63885,-0.493223,8.0,3.0,8.0,3.0


In [96]:
# Feature pickles to inspect, grouped by family. Order is preserved because the
# loop below prints them in sequence.
features = (
    # historical-transaction families
    [f'f10{i}.pkl' for i in (2, )]
    + [f'f11{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')]
    + [f'f12{i}.pkl' for i in (1, 2)]
    + [f'f13{i}.pkl' for i in (1, 2)]
    # new-merchant families
    + [f'f20{i}.pkl' for i in (2, 3)]
    + [f'f21{i}_{j}.pkl' for i in (1, 2) for j in ('Y', 'N')]
    + [f'f23{i}.pkl' for i in (1, 2)]
)

# Currently disabled groups, kept for reference:
# features += [f'f40{i}.pkl' for i in (2, 3)]
# features += [f'f41{i}_{j}.pkl' for i in (1, 2)
#                                for j in ('Y', 'N')]
# features += [f'f42{i}.pkl' for i in (1, 2)]

# features += [f'f50{i}.pkl' for i in (2, )]

# features = os.listdir('../remove_outlier_feature')

# =============================================================================
# read data and features
# =============================================================================
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

# Inspection only: each pickle is loaded and its column index printed; nothing
# is merged into train/test in this cell.
for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    print(f, t.columns)

 12%|█▏        | 2/17 [00:00<00:02,  5.93it/s]

f102.pkl Index(['card_id', 'hist_transactions_count', 'hist_category_1_sum',
       'hist_category_1_mean', 'hist_category_2_nunique',
       'hist_category_3_nunique', 'hist_merchant_id_nunique',
       'hist_state_id_nunique', 'hist_subsector_id_nunique',
       'hist_city_id_nunique', 'hist_merchant_category_id_nunique',
       'hist_installments_nunique', 'hist_installments_mean',
       'hist_installments_std', 'hist_purchase_amount_sum',
       'hist_purchase_amount_mean', 'hist_purchase_amount_max',
       'hist_purchase_amount_min', 'hist_purchase_amount_std',
       'hist_purchase_month_median', 'hist_purchase_month_max',
       'hist_purchase_month_min', 'hist_purchase_month_std',
       'hist_purchase_date_max', 'hist_purchase_date_min',
       'hist_month_diff_median', 'hist_month_diff_max', 'hist_month_diff_min',
       'hist_month_diff_std', 'hist_purchase_date_diff',
       'hist_purchase_date_average', 'hist_purchase_date_uptonow'],
      dtype='object')
f111_Y.pkl Inde

 35%|███▌      | 6/17 [00:00<00:01, 10.96it/s]

f112_Y.pkl Index(['card_id', 'hist_Y_month_lag_mean', 'hist_Y_month_lag_std',
       'hist_Y_purchase_amount_min_mean', 'hist_Y_purchase_amount_min_std',
       'hist_Y_purchase_amount_max_mean', 'hist_Y_purchase_amount_max_std',
       'hist_Y_purchase_amount_mean_mean', 'hist_Y_purchase_amount_mean_std',
       'hist_Y_purchase_amount_std_mean', 'hist_Y_purchase_amount_std_std',
       'hist_Y_installments_mean_mean', 'hist_Y_installments_mean_std',
       'hist_Y_installments_sum_mean', 'hist_Y_installments_sum_std',
       'hist_Y_installments_std_mean', 'hist_Y_installments_std_std'],
      dtype='object')
f112_N.pkl Index(['card_id', 'hist_N_month_lag_mean', 'hist_N_month_lag_std',
       'hist_N_purchase_amount_min_mean', 'hist_N_purchase_amount_min_std',
       'hist_N_purchase_amount_max_mean', 'hist_N_purchase_amount_max_std',
       'hist_N_purchase_amount_mean_mean', 'hist_N_purchase_amount_mean_std',
       'hist_N_purchase_amount_std_mean', 'hist_N_purchase_amount_std_std

 59%|█████▉    | 10/17 [00:00<00:00, 11.01it/s]

f131.pkl Index(['card_id', 'hist_cumusum_sum_purchase_amount0',
       'hist_cumusum_sum_purchase_amount1',
       'hist_cumusum_sum_purchase_amount2',
       'hist_cumusum_sum_purchase_amount3',
       'hist_cumusum_sum_purchase_amount4',
       'hist_cumusum_sum_purchase_amount5',
       'hist_cumusum_sum_purchase_amount6',
       'hist_cumusum_sum_purchase_amount7',
       'hist_cumusum_sum_purchase_amount8',
       'hist_cumusum_sum_purchase_amount9',
       'hist_cumusum_sum_purchase_amount10',
       'hist_cumusum_sum_purchase_amount11',
       'hist_cumusum_sum_purchase_amount12',
       'hist_cumusum_sum_purchase_amount13',
       'hist_cumsum_count_purchase_amount0',
       'hist_cumsum_count_purchase_amount1',
       'hist_cumsum_count_purchase_amount2',
       'hist_cumsum_count_purchase_amount3',
       'hist_cumsum_count_purchase_amount4',
       'hist_cumsum_count_purchase_amount5',
       'hist_cumsum_count_purchase_amount6',
       'hist_cumsum_count_purchase_amount7',


100%|██████████| 17/17 [00:01<00:00, 14.72it/s]

f211_Y.pkl Index(['card_id', 'Y_new_auth_category_1_sum', 'Y_new_auth_category_1_mean',
       'Y_new_auth_category_2_nunique', 'Y_new_auth_category_3_nunique',
       'Y_new_auth_merchant_id_nunique', 'Y_new_auth_state_id_nunique',
       'Y_new_auth_subsector_id_nunique', 'Y_new_auth_city_id_nunique',
       'Y_new_auth_merchant_category_id_nunique',
       'Y_new_auth_installments_nunique', 'Y_new_auth_installments_mean',
       'Y_new_auth_installments_std', 'Y_new_auth_purchase_amount_sum',
       'Y_new_auth_purchase_amount_mean', 'Y_new_auth_purchase_amount_max',
       'Y_new_auth_purchase_amount_min', 'Y_new_auth_purchase_amount_std',
       'Y_new_auth_purchase_month_median', 'Y_new_auth_purchase_month_max',
       'Y_new_auth_purchase_month_min', 'Y_new_auth_purchase_month_std',
       'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
       'Y_new_auth_month_diff_median', 'Y_new_auth_month_diff_max',
       'Y_new_auth_month_diff_min', 'Y_new_auth_month_diff_s


