In [192]:
import os
import sys
import gc
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from glob import glob
from tqdm import tqdm
from collections import defaultdict
from multiprocessing import cpu_count, Pool

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter('ignore')

In [193]:
pd.options.display.max_columns = None

In [194]:
KEY = 'card_id'

SEED = 6

NTHREAD = cpu_count()

NFOLD = 11

In [195]:
PATH = os.path.join('..', 'input')
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [196]:
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['target_raw'] = 2**train['target']

test['target'] = np.nan
test['target_raw'] = np.nan

df = pd.concat([train, test], axis=0)

del train, test
gc.collect()

df['first_active_month'] = pd.to_datetime(df['first_active_month'])
df['elapsed_time'] = (datetime.date(2018, 5, 1) - df['first_active_month'].dt.date).dt.days

features = ['feature_1', 'feature_2', 'feature_3']
for f in features:
    map_mean = df.groupby(f)['outliers'].mean()
    map_sum = df.groupby(f)['outliers'].sum()
    df[f + '_outliers_mean'] = df[f].map(map_mean)
    df[f + '_outliers_sum'] = df[f].map(map_sum)

train = df[df['target'].notnull()]
test = df[df['target'].isnull()]

del df
gc.collect()

113

In [197]:
train.head()

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,target_raw,elapsed_time,feature_1_outliers_mean,feature_1_outliers_sum,feature_2_outliers_mean,feature_2_outliers_sum,feature_3_outliers_mean,feature_3_outliers_sum
0,C_ID_92a2005557,5,2,1,2017-06-01,0.0,-0.820283,0.566331,334.0,0.013145,534.0,0.008752,655.0,0.011428,1305.0
1,C_ID_3d0044924f,4,1,0,2017-01-01,0.0,0.392913,1.313042,485.0,0.010712,213.0,0.011385,1016.0,0.010283,902.0
2,C_ID_d639edf6cd,2,2,0,2016-08-01,0.0,0.688056,1.611111,638.0,0.01061,592.0,0.008752,655.0,0.010283,902.0
3,C_ID_186d6a6901,4,3,0,2017-09-01,0.0,0.142495,1.103813,242.0,0.010712,213.0,0.014166,536.0,0.010283,902.0
4,C_ID_cdbd2c0db2,1,3,0,2017-11-01,0.0,-0.159749,0.895181,181.0,0.008058,97.0,0.014166,536.0,0.010283,902.0


In [198]:
historical_transactions = pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))
new_merchant_transactions = pd.read_csv(os.path.join(PATH, 'new_merchant_transactions.csv'))

In [199]:
new_merchant_transactions['purchase_amount_new'] = np.round(new_merchant_transactions['purchase_amount'] / 0.00150265118 + 497.06,2)
historical_transactions['purchase_amount_new'] = np.round(historical_transactions['purchase_amount'] / 0.00150265118 + 497.06,2)

In [200]:
def hist_aggregate():
    hist_df = pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))

    hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
    hist_df['year'] = hist_df['purchase_date'].dt.year
    hist_df['month'] = hist_df['purchase_date'].dt.month
    hist_df['weekofyear'] = hist_df['purchase_date'].dt.weekofyear
    hist_df['day'] = hist_df['purchase_date'].dt.day
    hist_df['weekend'] = (hist_df['purchase_date'].dt.weekday >= 5).astype(int)
    hist_df['hour'] = hist_df['purchase_date'].dt.hour
    
    hist_df['authorized_flag'] = hist_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    
    hist_df['category_1'] = hist_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    hist_df['category_2'] = hist_df['category_2'].fillna(0).astype(int)
    hist_df['category_3'] = hist_df['category_3'].map({'A': 3, 'B': 2, 'C': 1, np.nan: 0})
    
    hist_df['installments_999'] = hist_df['installments'].apply(lambda x: np.where(x == 999, 1, 0))
    hist_df['installments'] = hist_df['installments'].apply(lambda x: np.where((x == 999) or (x == -1), np.nan, x))
    
    hist_df['merchant_id'] = hist_df['merchant_id'].fillna('ONLINE')
    
    hist_df['city_minus_one'] = hist_df['city_id'].apply(lambda x: np.where(x == -1, 1, 0))
    hist_df['merchant_category_minus_one'] =  hist_df['merchant_category_id'].apply(lambda x: np.where(x == -1, 1, 0))
    hist_df['subsector_minus_one'] = hist_df['subsector_id'].apply(lambda x: np.where(x == -1, 1, 0))

    hist_df['purchase_amount_new'] = np.round(hist_df['purchase_amount'] / 0.00150265118 + 497.06, 2)
    
    pt1 = pd.pivot_table(
        hist_df, 
        index='card_id', columns='month_lag', values=['purchase_amount_new'], 
        aggfunc=['sum']).reset_index()
    pt1.columns = [f'{c[0]}_{c[1]}_{c[2]}'.replace('-', '').strip('_') for c in pt1.columns]
    
    pt2 = pd.pivot_table(
        hist_df, 
        index='card_id', columns='month_lag', values=['merchant_id'], 
        aggfunc=['count']).reset_index()
    pt2.columns = [f'{c[0]}_{c[1]}_{c[2]}'.replace('-', '').strip('_') for c in pt2.columns]
        
    num_aggregations = {
        'authorized_flag': ['sum', 'mean'],
        
        'purchase_amount_new': ['sum'],
        'purchase_date': ['max', 'min'],
        
        'year': ['nunique'],
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        'day': ['nunique', 'mean'],
        'weekend': ['sum', 'mean'],
        'hour': ['nunique', 'mean'],
        
        'category_1': ['sum', 'mean'],
        'category_2': ['sum', 'mean'],
        'category_3': ['sum', 'mean'],
        
        'city_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'merchant_id': ['nunique'],
        'state_id': ['nunique'],
        'subsector_id': ['nunique'],
        
        'city_minus_one': ['sum', 'mean'],
        'merchant_category_minus_one': ['sum', 'mean'],
        'subsector_minus_one': ['sum', 'mean'],
        
        'installments_999': ['sum', 'mean'],

        'month_lag': ['max', 'min', 'sum', 'mean', 'std']        
    }
    
    g1 = hist_df.groupby(['card_id']).agg(num_aggregations).reset_index()
    g1.columns = [f'{c[0]}_{c[1]}'.strip('_') for c in g1.columns]
    g1 = g1.add_prefix('hist_')
    g1 = g1.rename(columns={'hist_card_id': 'card_id'})
    
    num_aggregations = {
        'purchase_amount_new': ['sum'],
        'purchase_date': ['max', 'min'],

        'city_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'merchant_id': ['nunique'],
        'state_id': ['nunique'],
        'subsector_id': ['nunique'],
    }
    g2 = hist_df.groupby(['card_id', 'authorized_flag']).agg(num_aggregations).reset_index()
    g2.columns = [f'{c[0]}_{c[1]}'.strip('_') for c in g2.columns]
    
    g2_Y = g2.query('authorized_flag == 1')
    g2_Y = g2_Y.add_prefix('Y_hist_')
    g2_Y = g2_Y.rename(columns={'Y_hist_card_id': 'card_id'})
    g2_N = g2.query('authorized_flag == 0')
    g2_N = g2_N.add_prefix('N_hist_')
    g2_N = g2_N.rename(columns={'N_hist_card_id': 'card_id'})
    g2 = pd.merge(g2_Y, g2_N, on='card_id', how='left') 
    g2 = g2.drop(['Y_hist_authorized_flag', 'N_hist_authorized_flag'], axis=1)
    
    pt = pd.merge(pt1, pt2, on='card_id', how='left')
    g = pd.merge(g1, g2, on='card_id', how='left')
    feature = pd.merge(pt, g, on='card_id', how='left')

    return feature

In [201]:
def new_aggregate():
    new_df = pd.read_csv(os.path.join(PATH, 'new_merchant_transactions.csv'))

    new_df['purchase_date'] = pd.to_datetime(new_df['purchase_date'])
    new_df['year'] = new_df['purchase_date'].dt.year
    new_df['month'] = new_df['purchase_date'].dt.month
    new_df['weekofyear'] = new_df['purchase_date'].dt.weekofyear
    new_df['day'] = new_df['purchase_date'].dt.day
    new_df['weekend'] = (new_df['purchase_date'].dt.weekday >= 5).astype(int)
    new_df['hour'] = new_df['purchase_date'].dt.hour
        
    new_df['category_1'] = new_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    new_df['category_2'] = new_df['category_2'].fillna(0).astype(int)
    new_df['category_3'] = new_df['category_3'].map({'A': 3, 'B': 2, 'C': 1, np.nan: 0})
    
    new_df['installments_999'] = new_df['installments'].apply(lambda x: np.where(x == 999, 1, 0))
    new_df['installments'] = new_df['installments'].apply(lambda x: np.where((x == 999) or (x == -1), np.nan, x))
    new_df['merchant_id'] = new_df['merchant_id'].fillna('ONLINE')
    
        
    new_df['city_minus_one'] = new_df['city_id'].apply(lambda x: np.where(x == -1, 1, 0))
    new_df['merchant_category_minus_one'] =  new_df['merchant_category_id'].apply(lambda x: np.where(x == -1, 1, 0))
    new_df['subsector_minus_one'] = new_df['subsector_id'].apply(lambda x: np.where(x == -1, 1, 0))
    
    new_df['purchase_amount_new'] = np.round(new_df['purchase_amount'] / 0.00150265118 + 497.06, 2)
    
    pt1 = pd.pivot_table(
        new_df, 
        index='card_id', columns='month_lag', values=['purchase_amount_new'], 
        aggfunc=['sum']).reset_index()
    pt1.columns = [f'{c[0]}_{c[1]}_{c[2]}'.replace('-', '').strip('_') for c in pt1.columns]
    
    pt2 = pd.pivot_table(
        new_df, 
        index='card_id', columns='month_lag', values=['merchant_id'], 
        aggfunc=['count']).reset_index()
    pt2.columns = [f'{c[0]}_{c[1]}_{c[2]}'.replace('-', '').strip('_') for c in pt2.columns]
        
    num_aggregations = { 
        'purchase_amount_new': ['sum'],
        'purchase_date': ['max', 'min'],
        'year': ['nunique'],
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        'day': ['nunique', 'mean'],
        'weekend': ['sum', 'mean'],
        'hour': ['nunique', 'mean'],
        
        'category_1': ['sum', 'mean'],
        'category_2': ['sum', 'mean'],
        'category_3': ['sum', 'mean'],
        
        'city_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'merchant_id': ['nunique'],
        'state_id': ['nunique'],
        'subsector_id': ['nunique'],
        
        'installments_999': ['sum', 'mean'],

        'month_lag': ['max', 'min', 'sum', 'mean', 'std']        
    }
    
    g = new_df.groupby(['card_id']).agg(num_aggregations).reset_index()
    g.columns = [f'{c[0]}_{c[1]}'.strip('_') for c in g.columns]
    g = g.add_prefix('new_')
    g = g.rename(columns={'new_card_id': 'card_id'})
    
    feature = pd.merge(pt1, pt2, on='card_id', how='left')
    feature = pd.merge(feature, g, on='card_id', how='left')
    
    return feature

In [167]:
hist_df = hist_aggregate()

In [168]:
new_df = new_aggregate()

In [169]:
hist_df.head()

Unnamed: 0,card_id,sum_purchase_amount_new_13,sum_purchase_amount_new_12,sum_purchase_amount_new_11,sum_purchase_amount_new_10,sum_purchase_amount_new_9,sum_purchase_amount_new_8,sum_purchase_amount_new_7,sum_purchase_amount_new_6,sum_purchase_amount_new_5,sum_purchase_amount_new_4,sum_purchase_amount_new_3,sum_purchase_amount_new_2,sum_purchase_amount_new_1,sum_purchase_amount_new_0,count_merchant_id_13,count_merchant_id_12,count_merchant_id_11,count_merchant_id_10,count_merchant_id_9,count_merchant_id_8,count_merchant_id_7,count_merchant_id_6,count_merchant_id_5,count_merchant_id_4,count_merchant_id_3,count_merchant_id_2,count_merchant_id_1,count_merchant_id_0,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_purchase_amount_new_sum,hist_purchase_date_max,hist_purchase_date_min,hist_year_nunique,hist_month_nunique,hist_month_mean,hist_weekofyear_nunique,hist_weekofyear_mean,hist_day_nunique,hist_day_mean,hist_weekend_sum,hist_weekend_mean,hist_hour_nunique,hist_hour_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_sum,hist_category_2_mean,hist_category_3_sum,hist_category_3_mean,hist_city_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_state_id_nunique,hist_subsector_id_nunique,hist_city_minus_one_sum,hist_city_minus_one_mean,hist_merchant_category_minus_one_sum,hist_merchant_category_minus_one_mean,hist_subsector_minus_one_sum,hist_subsector_minus_one_mean,hist_installments_999_sum,hist_installments_999_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_std,Y_hist_purchase_amount_new_sum,Y_hist_purchase_date_max,Y_hist_purchase_date_min,Y_hist_city_id_nunique,Y_hist_merchant_category_id_nunique,Y_hist_merchant_id_nunique,Y_hist_state_id_nunique,Y_hist_subsector_id_nunique,N_hist_purchase_amount_new_sum,N_hist_purchase_date_max,N_hist_purchase_date_min,N_hist_city_id_nunique,N_hist_merchant_category_id_nunique,N_hist_merchant_id_nunique,N_hist_state_id_nunique,N_hist_subsector_id_nunique
0,C_ID_00007093c1,,1100.2,1168.9,1633.25,2691.13,2876.63,1307.79,4189.0,524.57,1513.59,940.25,1985.22,1186.59,1805.18,,4.0,10.0,12.0,10.0,19.0,15.0,17.0,5.0,17.0,6.0,12.0,10.0,12.0,114,0.765101,22922.3,2018-02-27 05:14:57,2017-02-14 14:00:43,2,12,6.375839,39,25.550336,28,13.248322,25,0.167785,18,14.416107,28,0.187919,365,2.449664,274,1.838926,4,18,29,3,13,28,0.187919,0,0.0,0,0.0,0,0.0,0,-12,-872,-5.852349,3.453114,15109.57,2018-02-27 05:14:57,2017-02-14 14:00:43,4,18,28,3,13,7812.73,2018-02-08 16:37:19,2017-03-11 16:28:46,3.0,8.0,11.0,2.0,7.0
1,C_ID_0001238066,,,,,,,,,127.13,943.03,1831.55,4121.19,3884.94,2017.62,,,,,,,,,2.0,11.0,23.0,37.0,26.0,24.0,120,0.97561,12925.46,2018-02-27 16:18:59,2017-09-28 22:25:14,2,6,7.308943,23,29.96748,30,16.138211,52,0.422764,20,14.739837,2,0.01626,195,1.585366,208,1.691057,18,29,65,6,17,8,0.065041,0,0.0,0,0.0,0,0.0,0,-5,-223,-1.813008,1.28898,12625.46,2018-02-27 16:18:59,2017-09-28 22:25:14,18,29,65,6,17,300.0,2018-02-10 13:22:21,2017-12-30 18:49:51,2.0,1.0,2.0,2.0,1.0
2,C_ID_0001506ef0,51.0,184.48,286.53,,30.0,24.87,1017.0,849.5,770.5,910.74,1089.28,1579.29,1365.3,1620.25,2.0,5.0,8.0,,1.0,3.0,2.0,2.0,2.0,3.0,7.0,18.0,7.0,6.0,62,0.939394,9778.74,2018-02-17 12:33:56,2017-01-14 16:16:01,2,11,6.80303,24,27.090909,25,12.0,32,0.484848,15,12.606061,0,0.0,194,2.939394,197,2.984848,3,19,28,2,12,0,0.0,0,0.0,0,0.0,0,0.0,0,-13,-319,-4.833333,4.2375,9724.27,2018-02-17 12:33:56,2017-01-14 16:16:01,3,19,28,2,12,54.47,2018-02-17 12:33:27,2017-06-06 17:58:52,1.0,4.0,4.0,1.0,4.0
3,C_ID_0001793786,,,,,1269.83,3745.99,7389.91,7783.04,3380.25,14441.7,13971.57,9683.56,14936.02,6282.35,,,,,2.0,6.0,16.0,30.0,4.0,37.0,29.0,34.0,38.0,20.0,189,0.875,82884.22,2017-10-31 20:20:18,2017-01-21 10:15:21,1,10,6.671296,33,27.134259,31,16.115741,37,0.171296,21,15.606481,2,0.009259,208,0.962963,643,2.976852,10,48,119,4,24,2,0.009259,0,0.0,0,0.0,0,0.0,0,-9,-719,-3.328704,2.306373,77333.25,2017-10-31 20:20:18,2017-01-21 10:15:21,9,45,114,4,22,5550.97,2017-09-14 19:07:17,2017-03-04 13:56:41,8.0,14.0,17.0,4.0,11.0
4,C_ID_000183fdda,,,,,,,,2831.4,1675.01,2326.11,2237.65,4312.91,8872.47,3510.1,,,,,,,,6.0,22.0,25.0,9.0,26.0,28.0,28.0,137,0.951389,25765.65,2018-02-25 20:57:08,2017-08-07 09:49:14,2,7,6.881944,27,27.881944,30,14.034722,33,0.229167,19,16.465278,4,0.027778,407,2.826389,243,1.6875,9,36,73,7,21,4,0.027778,0,0.0,0,0.0,0,0.0,0,-6,-353,-2.451389,1.895264,22912.71,2018-02-25 20:57:08,2017-09-03 13:00:12,9,34,71,7,20,2852.94,2017-10-23 23:22:23,2017-08-07 09:49:14,1.0,4.0,4.0,1.0,4.0


In [170]:
new_df.head()

Unnamed: 0,card_id,sum_purchase_amount_new_1,sum_purchase_amount_new_2,count_merchant_id_1,count_merchant_id_2,new_purchase_amount_new_sum,new_purchase_date_max,new_purchase_date_min,new_year_nunique,new_month_nunique,new_month_mean,new_weekofyear_nunique,new_weekofyear_mean,new_day_nunique,new_day_mean,new_weekend_sum,new_weekend_mean,new_hour_nunique,new_hour_mean,new_category_1_sum,new_category_1_mean,new_category_2_sum,new_category_2_mean,new_category_3_sum,new_category_3_mean,new_city_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_state_id_nunique,new_subsector_id_nunique,new_installments_999_sum,new_installments_999_mean,new_month_lag_max,new_month_lag_min,new_month_lag_sum,new_month_lag_mean,new_month_lag_std
0,C_ID_00007093c1,,110.0,,2.0,110.0,2018-04-09 16:23:59,2018-04-03 11:13:35,1,1,4.0,2,14.5,2,6.0,0,0.0,2,13.5,0,0.0,4,2.0,4,2.0,2,2,2,2,2,0,0.0,2,2,4,2.0,0.0
1,C_ID_0001238066,1957.98,1083.01,17.0,9.0,3040.99,2018-04-30 19:57:30,2018-03-01 16:48:27,1,2,3.346154,9,12.846154,14,18.230769,12,0.461538,16,15.192308,2,0.076923,35,1.346154,46,1.769231,8,15,26,4,9,0,0.0,2,1,35,1.346154,0.485165
2,C_ID_0001506ef0,30.92,,2.0,,30.92,2018-03-22 09:14:30,2018-03-16 22:21:58,1,1,3.0,2,11.5,2,19.0,0,0.0,2,15.5,0,0.0,6,3.0,6,3.0,1,2,2,1,2,0,0.0,1,1,2,1.0,0.0
3,C_ID_0001793786,11552.9,3703.15,21.0,10.0,15256.05,2017-12-31 17:35:56,2017-11-15 15:44:20,1,2,11.322581,6,48.387097,13,23.612903,14,0.451613,10,11.419355,0,0.0,51,1.645161,93,3.0,7,21,31,5,14,0,0.0,2,1,41,1.322581,0.475191
4,C_ID_000183fdda,575.28,506.28,8.0,3.0,1081.56,2018-04-30 14:59:53,2018-03-02 12:26:26,1,2,3.272727,7,11.818182,9,11.727273,2,0.181818,8,15.454545,0,0.0,33,3.0,16,1.454545,2,9,11,2,6,0,0.0,2,1,14,1.272727,0.467099


In [171]:
hist_df.shape, new_df.shape

((325540, 85), (290001, 37))

In [202]:
for df in (hist_df, new_df):
    train = pd.merge(train, df, on='card_id', how='left')
    test = pd.merge(test, df, on='card_id', how='left')

In [203]:
df = pd.concat([train, test], axis=0)
df['first_active_month'] = pd.to_datetime(df['first_active_month'])

df['hist_first_buy'] = (df['hist_purchase_date_min'].dt.date - df['first_active_month'].dt.date).dt.days
df['hist_last_buy'] = (df['hist_purchase_date_max'].dt.date - df['first_active_month'].dt.date).dt.days
df['new_first_buy'] = (df['new_purchase_date_min'].dt.date - df['first_active_month'].dt.date).dt.days
df['new_last_buy'] = (df['new_purchase_date_max'].dt.date - df['first_active_month'].dt.date).dt.days

date_features = [
    'hist_purchase_date_max','hist_purchase_date_min',
    'Y_hist_purchase_date_min', 'N_hist_purchase_date_min',
    'Y_hist_purchase_date_max', 'N_hist_purchase_date_max',
    'new_purchase_date_max', 'new_purchase_date_min',
]

for f in date_features:
    df[f] = df[f].astype(np.int64) * 1e-9

df['nans'] = df.isnull().sum(axis=1)

train = df[df['target'].notnull()]
test = df[df['target'].isnull()]

categorical_features = ['feature_1', 'feature_2', 'feature_3']
pca = PCA(n_components=1)
pca.fit(train[categorical_features])
pca_train_values = pca.transform(train[categorical_features])
pca_test_values = pca.transform(test[categorical_features])

pca_train_values = np.transpose(pca_train_values, (1, 0))
pca_test_values = np.transpose(pca_test_values, (1, 0))

for e, (pca_train, pca_test) in enumerate(zip(pca_train_values, pca_test_values)):
    train[f'pca_feature_{e}'] = pca_train
    test[f'pca_feature_{e}'] = pca_test

del df
gc.collect()

7

In [204]:
train.head()

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,target_raw,elapsed_time,feature_1_outliers_mean,feature_1_outliers_sum,feature_2_outliers_mean,feature_2_outliers_sum,feature_3_outliers_mean,feature_3_outliers_sum,sum_purchase_amount_new_13,sum_purchase_amount_new_12,sum_purchase_amount_new_11,sum_purchase_amount_new_10,sum_purchase_amount_new_9,sum_purchase_amount_new_8,sum_purchase_amount_new_7,sum_purchase_amount_new_6,sum_purchase_amount_new_5,sum_purchase_amount_new_4,sum_purchase_amount_new_3,sum_purchase_amount_new_2_x,sum_purchase_amount_new_1_x,sum_purchase_amount_new_0,count_merchant_id_13,count_merchant_id_12,count_merchant_id_11,count_merchant_id_10,count_merchant_id_9,count_merchant_id_8,count_merchant_id_7,count_merchant_id_6,count_merchant_id_5,count_merchant_id_4,count_merchant_id_3,count_merchant_id_2_x,count_merchant_id_1_x,count_merchant_id_0,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_purchase_amount_new_sum,hist_purchase_date_max,hist_purchase_date_min,hist_year_nunique,hist_month_nunique,hist_month_mean,hist_weekofyear_nunique,hist_weekofyear_mean,hist_day_nunique,hist_day_mean,hist_weekend_sum,hist_weekend_mean,hist_hour_nunique,hist_hour_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_sum,hist_category_2_mean,hist_category_3_sum,hist_category_3_mean,hist_city_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_state_id_nunique,hist_subsector_id_nunique,hist_city_minus_one_sum,hist_city_minus_one_mean,hist_merchant_category_minus_one_sum,hist_merchant_category_minus_one_mean,hist_subsector_minus_one_sum,hist_subsector_minus_one_mean,hist_installments_999_sum,hist_installments_999_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_std,Y_hist_purchase_amount_new_sum,Y_hist_purchase_date_max,Y_hist_purchase_date_min,Y_hist_city_id_nunique,Y_hist_merchant_category_id_nunique,Y_hist_merchant_id_nunique,Y_hist_state_id_nunique,Y_hist_subsector_id_nunique,N_hist_purchase_amount_new_sum,N_hist_purchase_date_max,N_hist_purchase_date_min,N_hist_city_id_nunique,N_hist_merchant_category_id_nunique,N_hist_merchant_id_nunique,N_hist_state_id_nunique,N_hist_subsector_id_nunique,sum_purchase_amount_new_1_y,sum_purchase_amount_new_2_y,count_merchant_id_1_y,count_merchant_id_2_y,new_purchase_amount_new_sum,new_purchase_date_max,new_purchase_date_min,new_year_nunique,new_month_nunique,new_month_mean,new_weekofyear_nunique,new_weekofyear_mean,new_day_nunique,new_day_mean,new_weekend_sum,new_weekend_mean,new_hour_nunique,new_hour_mean,new_category_1_sum,new_category_1_mean,new_category_2_sum,new_category_2_mean,new_category_3_sum,new_category_3_mean,new_city_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_state_id_nunique,new_subsector_id_nunique,new_installments_999_sum,new_installments_999_mean,new_month_lag_max,new_month_lag_min,new_month_lag_sum,new_month_lag_mean,new_month_lag_std,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,nans,pca_feature_0
0,C_ID_92a2005557,5,2,1,2017-06-01,0.0,-0.820283,0.566331,334.0,0.013145,534.0,0.008752,655.0,0.011428,1305.0,,,,,,112.75,2510.0,1252.17,1432.91,1344.69,1444.36,8178.19,1425.84,1084.08,,,,,,3.0,49.0,44.0,20.0,22.0,21.0,57.0,21.0,23.0,247,0.95,18784.99,1519551000.0,1498573000.0,2,9,8.057692,35,33.073077,31,15.511538,90,0.346154,23,13.315385,0,0.0,272,1.046154,776,2.984615,7,41,95,3,21,0,0.0,0,0.0,0,0.0,0,0.0,0,-8,-1017,-3.911538,2.397687,18027.61,1519551000.0,1498573000.0,7,41,94,3,21,757.38,1514385000.0,1500131000.0,2.0,10.0,13.0,1.0,7.0,1512.57,1105.92,12.0,11.0,2618.49,1525001000.0,1520259000.0,1.0,2.0,3.478261,7.0,13.304348,17.0,16.434783,6.0,0.26087,8.0,12.869565,0.0,0.0,23.0,1.0,69.0,3.0,3.0,14.0,23.0,1.0,10.0,0.0,0.0,2.0,1.0,34.0,1.478261,0.510754,26.0,269.0,277.0,332.0,10,-1.901802
1,C_ID_3d0044924f,4,1,0,2017-01-01,0.0,0.392913,1.313042,485.0,0.010712,213.0,0.011385,1016.0,0.010283,902.0,,5511.95,2633.93,2729.94,2050.8,1429.77,1897.7,3300.85,1674.26,5669.88,2137.24,901.55,2256.74,2019.18,,21.0,22.0,13.0,18.0,15.0,34.0,49.0,14.0,24.0,26.0,16.0,47.0,51.0,339,0.968571,34213.79,1517438000.0,1483720000.0,2,12,6.22,50,25.22,31,16.665714,132,0.377143,24,14.717143,31,0.088571,319,0.911429,624,1.782857,9,57,142,3,24,31,0.088571,0,0.0,0,0.0,0,0.0,0,-12,-1761,-5.031429,3.804934,29493.4,1517438000.0,1483720000.0,9,57,141,3,24,4720.39,1514467000.0,1488576000.0,2.0,9.0,9.0,2.0,9.0,23.99,59.67,3.0,3.0,83.66,1522393000.0,1517505000.0,1.0,2.0,2.5,4.0,9.0,4.0,13.5,0.0,0.0,5.0,11.166667,0.0,0.0,6.0,1.0,12.0,2.0,1.0,5.0,6.0,1.0,4.0,0.0,0.0,2.0,1.0,9.0,1.5,0.547723,5.0,395.0,396.0,453.0,2,-0.79638
2,C_ID_d639edf6cd,2,2,0,2016-08-01,0.0,0.688056,1.611111,638.0,0.01061,592.0,0.008752,655.0,0.010283,902.0,206.42,253.81,116.22,273.73,158.74,,66.38,71.34,29.86,263.86,,26.02,439.6,56.98,6.0,6.0,5.0,7.0,4.0,,2.0,2.0,1.0,6.0,,1.0,2.0,1.0,41,0.953488,1962.96,1519759000.0,1484123000.0,2,10,4.55814,22,18.372093,19,19.325581,11,0.255814,14,17.906977,0,0.0,199,4.627907,129,3.0,5,8,13,2,7,0,0.0,0,0.0,0,0.0,0,0.0,0,-13,-370,-8.604651,3.842987,1859.91,1519759000.0,1484123000.0,5,8,13,2,7,103.05,1492801000.0,1487878000.0,1.0,1.0,1.0,1.0,1.0,,31.0,,1.0,31.0,1524937000.0,1524937000.0,1.0,1.0,4.0,1.0,17.0,1.0,28.0,1.0,1.0,1.0,17.0,0.0,0.0,5.0,5.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,,163.0,575.0,635.0,635.0,7,1.235037
3,C_ID_186d6a6901,4,3,0,2017-09-01,0.0,0.142495,1.103813,242.0,0.010712,213.0,0.014166,536.0,0.010283,902.0,,,,,,,,,243.72,1254.03,2228.44,362.29,453.84,795.27,,,,,,,,,11.0,31.0,7.0,6.0,6.0,16.0,77,1.0,5337.59,1519818000.0,1506443000.0,2,6,7.74026,20,32.012987,25,16.87013,11,0.142857,16,14.441558,12,0.155844,224,2.909091,143,1.857143,7,25,50,5,13,12,0.155844,0,0.0,0,0.0,0,0.0,0,-5,-218,-2.831169,1.802065,5337.59,1519818000.0,1506443000.0,7,25,50,5,13,,-9223372000.0,-9223372000.0,,,,,,140.88,241.1,2.0,5.0,381.98,1524049000.0,1520424000.0,1.0,2.0,3.714286,5.0,13.857143,7.0,13.142857,3.0,0.428571,5.0,13.0,1.0,0.142857,24.0,3.428571,12.0,1.714286,2.0,6.0,7.0,2.0,5.0,0.0,0.0,2.0,1.0,12.0,1.714286,0.48795,25.0,180.0,187.0,229.0,22,-0.572408
4,C_ID_cdbd2c0db2,1,3,0,2017-11-01,0.0,-0.159749,0.895181,181.0,0.008058,97.0,0.014166,536.0,0.010283,902.0,,,,,,,,,,,3866.92,2161.79,3656.8,24022.3,,,,,,,,,,,21.0,31.0,46.0,35.0,128,0.962406,33707.81,1519850000.0,1510445000.0,2,4,5.406015,17,21.781955,30,16.097744,42,0.315789,22,13.045113,15,0.112782,443,3.330827,259,1.947368,6,26,67,6,17,15,0.112782,0,0.0,0,0.0,0,0.0,0,-3,-171,-1.285714,1.0267,17677.91,1519850000.0,1510445000.0,6,26,66,6,17,16029.9,1519759000.0,1516485000.0,2.0,2.0,2.0,2.0,2.0,1560.33,3073.11,16.0,20.0,4633.44,1524941000.0,1519992000.0,1.0,2.0,3.555556,8.0,13.361111,22.0,14.583333,12.0,0.333333,14.0,14.722222,2.0,0.055556,123.0,3.416667,69.0,1.916667,5.0,17.0,36.0,5.0,10.0,0.0,0.0,2.0,1.0,56.0,1.555556,0.503953,11.0,119.0,121.0,178.0,20,2.306738


In [205]:
test.head()

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,target_raw,elapsed_time,feature_1_outliers_mean,feature_1_outliers_sum,feature_2_outliers_mean,feature_2_outliers_sum,feature_3_outliers_mean,feature_3_outliers_sum,sum_purchase_amount_new_13,sum_purchase_amount_new_12,sum_purchase_amount_new_11,sum_purchase_amount_new_10,sum_purchase_amount_new_9,sum_purchase_amount_new_8,sum_purchase_amount_new_7,sum_purchase_amount_new_6,sum_purchase_amount_new_5,sum_purchase_amount_new_4,sum_purchase_amount_new_3,sum_purchase_amount_new_2_x,sum_purchase_amount_new_1_x,sum_purchase_amount_new_0,count_merchant_id_13,count_merchant_id_12,count_merchant_id_11,count_merchant_id_10,count_merchant_id_9,count_merchant_id_8,count_merchant_id_7,count_merchant_id_6,count_merchant_id_5,count_merchant_id_4,count_merchant_id_3,count_merchant_id_2_x,count_merchant_id_1_x,count_merchant_id_0,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_purchase_amount_new_sum,hist_purchase_date_max,hist_purchase_date_min,hist_year_nunique,hist_month_nunique,hist_month_mean,hist_weekofyear_nunique,hist_weekofyear_mean,hist_day_nunique,hist_day_mean,hist_weekend_sum,hist_weekend_mean,hist_hour_nunique,hist_hour_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_sum,hist_category_2_mean,hist_category_3_sum,hist_category_3_mean,hist_city_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_state_id_nunique,hist_subsector_id_nunique,hist_city_minus_one_sum,hist_city_minus_one_mean,hist_merchant_category_minus_one_sum,hist_merchant_category_minus_one_mean,hist_subsector_minus_one_sum,hist_subsector_minus_one_mean,hist_installments_999_sum,hist_installments_999_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_std,Y_hist_purchase_amount_new_sum,Y_hist_purchase_date_max,Y_hist_purchase_date_min,Y_hist_city_id_nunique,Y_hist_merchant_category_id_nunique,Y_hist_merchant_id_nunique,Y_hist_state_id_nunique,Y_hist_subsector_id_nunique,N_hist_purchase_amount_new_sum,N_hist_purchase_date_max,N_hist_purchase_date_min,N_hist_city_id_nunique,N_hist_merchant_category_id_nunique,N_hist_merchant_id_nunique,N_hist_state_id_nunique,N_hist_subsector_id_nunique,sum_purchase_amount_new_1_y,sum_purchase_amount_new_2_y,count_merchant_id_1_y,count_merchant_id_2_y,new_purchase_amount_new_sum,new_purchase_date_max,new_purchase_date_min,new_year_nunique,new_month_nunique,new_month_mean,new_weekofyear_nunique,new_weekofyear_mean,new_day_nunique,new_day_mean,new_weekend_sum,new_weekend_mean,new_hour_nunique,new_hour_mean,new_category_1_sum,new_category_1_mean,new_category_2_sum,new_category_2_mean,new_category_3_sum,new_category_3_mean,new_city_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_state_id_nunique,new_subsector_id_nunique,new_installments_999_sum,new_installments_999_mean,new_month_lag_max,new_month_lag_min,new_month_lag_sum,new_month_lag_mean,new_month_lag_std,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,nans,pca_feature_0
0,C_ID_0ab67a22ab,3,3,1,2017-04-01,,,,395.0,0.010479,771.0,0.014166,536.0,0.011428,1305.0,,,,,,930.85,214.0,2178.46,51.66,1172.31,338.92,709.7,329.88,766.39,,,,,,4.0,6.0,11.0,2.0,11.0,9.0,11.0,4.0,10.0,44,0.647059,6692.17,1514510000.0,1491330000.0,1,9,8.367647,24,34.544118,24,16.352941,12,0.176471,17,14.367647,23,0.338235,45,0.661765,114,1.676471,7,16,24,3,12,23,0.338235,0,0.0,0,0.0,0,0.0,0,-8,-247,-3.632353,2.454994,3211.27,1514510000.0,1491330000.0,7,15,23,3,12,3480.9,1513186000.0,1493303000.0,4.0,7.0,7.0,2.0,7.0,,308.5,,3.0,308.5,1519845000.0,1517651000.0,1.0,1.0,2.0,3.0,7.0,3.0,15.666667,1.0,0.333333,3.0,13.666667,0.0,0.0,3.0,1.0,5.0,1.666667,3.0,3.0,3.0,1.0,3.0,0.0,0.0,2.0,2.0,6.0,2.0,0.0,3.0,272.0,308.0,333.0,15,0.129615
1,C_ID_130fd0cbdd,2,3,0,2017-01-01,,,,485.0,0.01061,592.0,0.014166,536.0,0.010283,902.0,741.77,712.48,1285.97,2266.72,916.86,,,,,,,,,147.0,11.0,11.0,14.0,23.0,17.0,,,,,,,,,2.0,77,0.987179,6070.8,1518989000.0,1484321000.0,2,5,3.282051,20,12.435897,27,16.512821,17,0.217949,18,14.423077,2,0.025641,303,3.884615,154,1.974359,4,16,27,3,12,2,0.025641,0,0.0,0,0.0,0,0.0,0,-13,-812,-10.410256,2.164866,6060.8,1518989000.0,1484321000.0,4,16,27,3,12,10.0,1494079000.0,1494079000.0,1.0,1.0,1.0,1.0,1.0,228.5,288.9,5.0,4.0,517.4,1524247000.0,1520080000.0,1.0,2.0,3.444444,6.0,12.444444,7.0,11.333333,3.0,0.333333,8.0,15.222222,2.0,0.222222,28.0,3.111111,17.0,1.888889,2.0,8.0,9.0,2.0,6.0,0.0,0.0,2.0,1.0,13.0,1.444444,0.527046,12.0,413.0,426.0,474.0,19,1.347023
2,C_ID_b709037bc5,5,1,1,2017-08-01,,,,273.0,0.013145,534.0,0.011385,1016.0,0.011428,1305.0,,,,,,,,149.76,,349.5,1089.67,1529.5,3678.0,2679.25,,,,,,,,1.0,,1.0,3.0,3.0,2.0,3.0,9,0.692308,9475.68,1517598000.0,1503673000.0,2,6,7.307692,7,29.0,7,8.538462,0,0.0,7,17.0,1,0.076923,56,4.307692,11,0.846154,4,8,9,4,6,1,0.076923,0,0.0,0,0.0,0,0.0,0,-6,-27,-2.076923,1.754116,3878.23,1517598000.0,1503673000.0,4,6,7,4,5,5597.45,1517593000.0,1515681000.0,1.0,2.0,2.0,1.0,1.0,1114.0,,2.0,,1114.0,1520947000.0,1519916000.0,1.0,1.0,3.0,2.0,10.0,2.0,7.0,0.0,0.0,2.0,13.5,1.0,0.5,5.0,2.5,3.0,1.5,2.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,24.0,185.0,212.0,224.0,21,-2.013788
3,C_ID_d27d835a9f,2,1,0,2017-12-01,,,,151.0,0.01061,592.0,0.011385,1016.0,0.010283,902.0,,,,,,,,,,,,2569.1,652.43,590.99,,,,,,,,,,,,15.0,2.0,9.0,26,1.0,3812.52,1519127000.0,1512392000.0,2,3,7.692308,6,31.115385,11,11.423077,7,0.269231,9,17.923077,0,0.0,26,1.0,41,1.576923,1,18,23,1,11,0,0.0,0,0.0,0,0.0,0,0.0,0,-2,-32,-1.230769,0.951113,3812.52,1519127000.0,1512392000.0,1,18,23,1,11,,-9223372000.0,-9223372000.0,,,,,,830.74,317.5,7.0,3.0,1148.24,1524000000.0,1520162000.0,1.0,2.0,3.3,7.0,12.1,8.0,13.6,3.0,0.3,7.0,18.2,1.0,0.1,13.0,1.3,15.0,1.5,3.0,10.0,10.0,3.0,8.0,0.0,0.0,2.0,1.0,13.0,1.3,0.483046,3.0,81.0,93.0,137.0,31,1.123051
4,C_ID_2b5e3df5c2,5,1,1,2015-12-01,,,,882.0,0.013145,534.0,0.011385,1016.0,0.011428,1305.0,8429.39,2105.43,1165.33,2098.6,1371.89,483.0,1240.9,4271.54,11560.78,10502.19,6300.0,16011.5,537.8,5328.27,9.0,2.0,18.0,8.0,8.0,2.0,8.0,5.0,7.0,10.0,1.0,5.0,4.0,23.0,87,0.790909,71406.62,1519728000.0,1483444000.0,2,12,4.827273,34,18.836364,27,13.272727,21,0.190909,21,14.436364,0,0.0,423,3.845455,214,1.945455,5,31,47,4,15,0,0.0,0,0.0,0,0.0,0,0.0,0,-13,-685,-6.227273,4.530547,70330.62,1519728000.0,1483554000.0,4,30,45,3,15,1076.0,1514992000.0,1483444000.0,3.0,4.0,5.0,3.0,4.0,10186.5,825.0,5.0,1.0,11011.5,1523535000.0,1520132000.0,1.0,2.0,3.166667,3.0,10.5,5.0,7.333333,2.0,0.333333,5.0,8.0,0.0,0.0,23.0,3.833333,9.0,1.5,2.0,5.0,6.0,2.0,4.0,0.0,0.0,2.0,1.0,7.0,1.166667,0.408248,399.0,819.0,824.0,863.0,3,-2.013788


In [206]:
col_not_to_use = ['first_active_month', 'card_id', 'target', 'target_raw', 'outliers']
col_to_use = [c for c in train.columns if c not in col_not_to_use]

In [207]:
y = train['target']
y_raw = train['target_raw']

In [208]:
X = train[col_to_use]
X_test = test[col_to_use]

In [209]:
folds = KFold(n_splits=NFOLD, shuffle=True, random_state=SEED)

prediction = np.zeros(len(X_test))
scores = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    dtrain = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[valid_index], label=y.iloc[valid_index])

    params = {
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'subsample': 0.9855232997390695,
        'max_depth': 7,
        'top_rate': 0.9064148448434349,
        'num_leaves': 63,
        'min_child_weight': 41.9612869171337,
        'other_rate': 0.0721768246018207,
        'reg_alpha': 9.677537745007898,
        'colsample_bytree': 0.5665320670155495,
        'min_split_gain': 9.820197773625843,
        'reg_lambda': 8.2532317400459,
        'min_data_in_leaf': 21,
        'verbose': -1,
        'seed':int(2**fold_n),
        'bagging_seed':int(2**fold_n),
        'drop_seed':int(2**fold_n)
    }

    model = lgb.train(
        params,
        dtrain,
        20000,          
        valid_sets=[dtrain, dvalid],
        verbose_eval=200,
        early_stopping_rounds=200)
    
    y_pred_valid = model.predict(X.iloc[valid_index], num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    scores.append(np.sqrt(mean_squared_error(y.iloc[valid_index], y_pred_valid)))
    
    prediction += y_pred / folds.n_splits   

    del model

print('shape:', X.shape)
print('CV {0:} mean score: {1:.4f}, std: {2:.4f}, max: {3:.4f}, min: {4:.4f}.'.format(NFOLD, np.mean(scores), np.std(scores), np.max(scores), np.min(scores)))

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 3.6019	valid_1's rmse: 3.62655
[400]	training's rmse: 3.52512	valid_1's rmse: 3.60909
[600]	training's rmse: 3.48121	valid_1's rmse: 3.60587
[800]	training's rmse: 3.44902	valid_1's rmse: 3.60458
[1000]	training's rmse: 3.42003	valid_1's rmse: 3.60351
[1200]	training's rmse: 3.39315	valid_1's rmse: 3.60317
[1400]	training's rmse: 3.36559	valid_1's rmse: 3.60333
Early stopping, best iteration is:
[1367]	training's rmse: 3.36995	valid_1's rmse: 3.603
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 3.60558	valid_1's rmse: 3.58704
[400]	training's rmse: 3.52976	valid_1's rmse: 3.56793
[600]	training's rmse: 3.48643	valid_1's rmse: 3.56325
[800]	training's rmse: 3.45353	valid_1's rmse: 3.5617
[1000]	training's rmse: 3.42355	valid_1's rmse: 3.56182
[1200]	training's rmse: 3.39472	valid_1's rmse: 3.56169
Early stopping, best iteration is:
[1044]	training's rmse: 3.41671	

In [None]:
folds = KFold(n_splits=NFOLD, shuffle=True, random_state=SEED)

prediction = np.zeros(len(X_test))
scores = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    dtrain = lgb.Dataset(X.iloc[train_index], label=y_new.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[valid_index], label=y_new.iloc[valid_index])

    params = {
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'subsample': 0.9855232997390695,
        'max_depth': 7,
        'top_rate': 0.9064148448434349,
        'num_leaves': 63,
        'min_child_weight': 41.9612869171337,
        'other_rate': 0.0721768246018207,
        'reg_alpha': 9.677537745007898,
        'colsample_bytree': 0.5665320670155495,
        'min_split_gain': 9.820197773625843,
        'reg_lambda': 8.2532317400459,
        'min_data_in_leaf': 21,
        'verbose': -1,
        'seed':int(2**fold_n),
        'bagging_seed':int(2**fold_n),
        'drop_seed':int(2**fold_n)
    }

    model = lgb.train(
        params,
        dtrain,
        20000,          
        valid_sets=[dtrain, dvalid],
        verbose_eval=200,
        early_stopping_rounds=200)
    
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    scores.append(np.sqrt(mean_squared_error(y.iloc[valid_index], y_pred_valid)))
    
    prediction += y_pred / folds.n_splits   

    del model

print('shape:', X.shape)
print('CV {0:} mean score: {1:.4f}, std: {2:.4f}, max: {3:.4f}, min: {4:.4f}.'.format(NFOLD, np.mean(scores), np.std(scores), np.max(scores), np.min(scores)))
print(features)

In [153]:
submission = pd.read_csv(os.path.join('..', 'input', 'sample_submission.csv'))
submission['target'] = prediction
submission.to_csv(os.path.join('..', 'submission', 'lightgbm_outlier_{}.csv'.format(str(datetime.datetime.today().date()).replace('-', ''))), index=False)

In [183]:
for i, j in zip(X.columns, X.dtypes):
    print(i, j)

feature_1 int64
feature_2 int64
feature_3 int64
elapsed_time float64
feature_1_outliers_mean float64
feature_1_outliers_sum float64
feature_2_outliers_mean float64
feature_2_outliers_sum float64
feature_3_outliers_mean float64
feature_3_outliers_sum float64
sum_purchase_amount_new_13_x float64
sum_purchase_amount_new_12_x float64
sum_purchase_amount_new_11_x float64
sum_purchase_amount_new_10_x float64
sum_purchase_amount_new_9_x float64
sum_purchase_amount_new_8_x float64
sum_purchase_amount_new_7_x float64
sum_purchase_amount_new_6_x float64
sum_purchase_amount_new_5_x float64
sum_purchase_amount_new_4_x float64
sum_purchase_amount_new_3_x float64
sum_purchase_amount_new_2_x float64
sum_purchase_amount_new_2_x float64
sum_purchase_amount_new_2_x float64
sum_purchase_amount_new_1_x float64
sum_purchase_amount_new_1_x float64
sum_purchase_amount_new_1_x float64
sum_purchase_amount_new_0_x float64
count_merchant_id_13_x float64
count_merchant_id_12_x float64
count_merchant_id_11_x float

In [211]:
historical_transactions[['installments', 'month_lag', 'purchase_date']]

Unnamed: 0,installments,month_lag,purchase_date
0,0,-8,2017-06-25 15:33:07
1,0,-7,2017-07-15 12:10:45
2,0,-6,2017-08-09 22:04:29
3,0,-5,2017-09-02 10:06:26
4,0,-11,2017-03-10 01:14:19
5,0,0,2018-02-24 08:45:05
6,0,-11,2017-03-21 00:10:51
7,0,-3,2017-11-18 20:05:55
8,0,-8,2017-06-01 22:02:56
9,0,-11,2017-03-16 15:41:22
