In [1]:
%matplotlib inline
import os
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange, tqdm

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import lightgbm as lgb

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats

from fastai.datasets import Config

import warnings
warnings.filterwarnings("ignore")

base_path = Config.data_path()

# Competiton files setup

In [2]:
data_path = base_path/'LANL_Earthquake_Prediction'
competition_name = 'LANL-Earthquake-Prediction'
data_path

PosixPath('/home/jupyter/.fastai/data/LANL_Earthquake_Prediction')

In [5]:
train_prcessed_segments_path = data_path/'train'/'processed_df'
test_prcessed_segments_path = data_path/'test_processed_df'

[PosixPath('/home/jupyter/.fastai/data/LANL_Earthquake_Prediction/test'),
 PosixPath('/home/jupyter/.fastai/data/LANL_Earthquake_Prediction/train.csv'),
 PosixPath('/home/jupyter/.fastai/data/LANL_Earthquake_Prediction/sample_submission.csv')]

In [None]:
train_prcessed_segments_path.ls()

In [None]:
test_prcessed_segments_path.ls()

In [73]:
path = data_path/'train'/'processed_df'
a.to_csv(path/'no_overlap_only_time_features.csv')

In [None]:
path = data_path/'train'/'processed_df'
training_set_step_10000 = create_train_set(training_segments_10000)
training_set_step_10000.to_csv(path/'step_10000_only_time_features.csv')

 38%|███▊      | 23805/62660 [1:43:03<3:02:01,  3.56it/s] 

In [3]:
path = data_path/'train'/'processed_df'/'step_10000_only_time_features.csv'
training_set_step_10000 = pd.read_csv(path)

In [9]:
# training_set_step_10000.drop(['Unnamed: 0'], axis=1)
training_set_step_10000.head()

Unnamed: 0.1,Unnamed: 0,target,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,0,1.430797,4.884113,5.101106,104.0,-98.0,-8e-05,74836.577199,104.0,0.0,...,0.295715,5.629,3.896,4.072,4.379,5.338,5.484,-1.704698e-06,74222.343443,5.629
1,1,1.4276,4.857127,4.324007,52.0,-56.0,7e-06,74890.988734,56.0,0.0,...,0.290901,5.629,3.896,4.072,4.375,5.32,5.483,-1.90604e-06,74333.053867,5.629
2,2,1.425498,4.837627,5.334216,181.0,-154.0,-2.7e-05,74986.731761,181.0,0.0,...,0.298291,5.667,3.412,4.069,4.36,5.32,5.483,-6.107383e-07,74474.087926,5.667
3,3,1.423396,4.81116,5.322245,181.0,-154.0,3.3e-05,74989.309697,181.0,0.0,...,0.303731,5.667,3.412,4.069,4.36,5.32,5.483,-3.402685e-06,74346.540137,5.667
4,4,1.420198,4.792853,5.418203,181.0,-154.0,2.7e-05,75145.482449,181.0,0.0,...,0.30277,5.667,3.412,4.069,4.359,5.32,5.483,-3.221477e-07,74551.37712,5.667


In [8]:
print(f"Train length: {len(df_train)}")
print(f"Test length: {len(df_test)}")

Train length: 200000
Test length: 200000


In [122]:
# ID_code won't be used as a feature
df_train = df_train_original.drop(['ID_code'], axis=1)
df_test = df_test_original.drop(['ID_code'], axis=1)

We will use LightGBM as a model to predict target for test dataset. This is a tree-based model, which cannot see frequency of the feature (number of occurrences of particular value in a feature). It turns out that this is important in this case, so frequencies needs to be added as separate feature. 

# Model Training
## 1st Layer of ensamble

In [36]:
param = {
    'learning_rate': 0.04,
    'num_leaves': 3,
    'metric':'mean_absolute_error',
    'boost_from_average':'false',
    'feature_fraction': 0.8, # This is important; we must use ALL features in every iteration to make sure that feature freq will be used
    'max_depth': -1,
    'objective': 'regression',
    'verbosity': -10
}

In [20]:
df_train = training_set_step_10000
df_test = test_set_only_time_features

In [38]:
%%time

from sklearn.model_selection import KFold

nbr_of_folds = 5
nbr_of_rounds = 10000

folds = KFold(
    n_splits=nbr_of_folds,
    shuffle=True,
    random_state=2019
)

out_of_fold_predictions = np.zeros((len(df_train), 1))
test_predictions = np.zeros(len(df_test))

train_data = df_train.drop('target', axis=1)
targets = df_train['target']
    
folds_predictions = np.zeros(len(df_test))
training_results = dict()
    
for fold_nbr, (train_idx, valid_idx) in enumerate(folds.split(train_data, targets)):
    fold_train_data = train_data.iloc[train_idx]
    fold_train_target = targets.iloc[train_idx]
        
    fold_valid_data = train_data.iloc[valid_idx]
    fold_valid_target = targets.iloc[valid_idx]

    lgb_fold_train = lgb.Dataset(fold_train_data, label=fold_train_target)
    lgb_fold_valid = lgb.Dataset(fold_valid_data, label=fold_valid_target, reference=lgb_fold_train)

    model = lgb.train(
        param, 
        lgb_fold_train, 
        nbr_of_rounds, 
        valid_sets=[lgb_fold_train, lgb_fold_valid], 
#        early_stopping_rounds=500,
        verbose_eval=False, 
        evals_result=training_results
#        verbose_eval=5000
    )
    
#     print(training_results)
    scores = training_results['valid_1']['l1']
    best_score = min(scores)
    best_round = scores.index(min(scores))
    print(f"Fold #{fold_nbr}: Best l1: {best_score} ({best_round} iteration)")

    out_of_fold_predictions[valid_idx, 0] = model.predict(fold_valid_data, num_iteration=best_round)
    folds_predictions += model.predict(df_test, num_iteration=best_round)

test_predictions = folds_predictions/nbr_of_folds

# print(f"Final score: {roc_auc_score(targets, out_of_fold_predictions[:, feature_nbr])}")

Fold #0: Best l1: 0.515371526094659 (9999 iteration)
Fold #1: Best l1: 0.5272989277800292 (9999 iteration)
Fold #2: Best l1: 0.5066278580699147 (9999 iteration)
Fold #3: Best l1: 0.4844142458256674 (9999 iteration)
Fold #4: Best l1: 0.47267541668625046 (9999 iteration)
CPU times: user 11min 28s, sys: 1.26 s, total: 11min 29s
Wall time: 2min 52s


In [39]:
test_predictions

array([-10.848564, -11.95273 , -11.030486, -11.354133, ..., -10.981439, -10.721504, -10.321516, -11.120937])

In [98]:
%%time

nbr_of_original_features = 200
nbr_of_folds = 5
nbr_of_rounds = 750

folds = StratifiedKFold(
    n_splits=nbr_of_folds,
    shuffle=True,
    random_state=2019
)

out_of_fold_predictions = np.zeros((len(df_train), nbr_of_original_features))
test_predictions = np.zeros((len(df_test), nbr_of_original_features))

train_data = df_train.drop('target', axis=1)
targets = df_train['target']

for feature_nbr in range(nbr_of_original_features):
    print(f"Processing feature {feature_nbr}/{nbr_of_original_features}")
    # Use only original feature and frequencies of this feature 
    features = [f"var_{feature_nbr}", f"var_{feature_nbr}_freq"]
    
    folds_predictions = np.zeros(len(df_test))
    training_results = dict()
    
    for fold_nbr, (train_idx, valid_idx) in enumerate(folds.split(train_data, targets)):
        fold_train_data = train_data[features].iloc[train_idx]
        fold_train_target = targets.iloc[train_idx]
        
        fold_valid_data = train_data[features].iloc[valid_idx]
        fold_valid_target = targets.iloc[valid_idx]

        lgb_fold_train = lgb.Dataset(fold_train_data, label=fold_train_target)
        lgb_fold_valid = lgb.Dataset(fold_valid_data, label=fold_valid_target, reference=lgb_fold_train)

        model = lgb.train(
            param, 
            lgb_fold_train, 
            nbr_of_rounds, 
            valid_sets=[lgb_fold_train, lgb_fold_valid], 
#             early_stopping_rounds=500,
            verbose_eval=False, 
            evals_result=training_results
#             verbose_eval=1000
        )
        
        scores = training_results['valid_1']['auc']
        best_score = max(scores)
        best_round = scores.index(max(scores))
        print(f"Fold #{fold_nbr}: Best auc: {best_score} ({best_round} iteration)")

        out_of_fold_predictions[valid_idx, feature_nbr] = model.predict(fold_valid_data, num_iteration=best_round)
        folds_predictions += model.predict(df_test, num_iteration=best_round)

    test_predictions[:, feature_nbr] = folds_predictions/nbr_of_folds

    print(f"Final feature auc score: {roc_auc_score(targets, out_of_fold_predictions[:, feature_nbr])}")
    print("-" * 20)

Processing feature 0/200
Fold #0: Best auc: 0.5416882300097301 (464 iteration)
Fold #1: Best auc: 0.5486695852883107 (106 iteration)
Fold #2: Best auc: 0.5566815208283209 (229 iteration)
Fold #3: Best auc: 0.5451479361305063 (165 iteration)
Fold #4: Best auc: 0.5514666783583979 (278 iteration)
Final feature auc score: 0.5466116484750508
--------------------
Processing feature 1/200
Fold #0: Best auc: 0.5519179933411512 (188 iteration)
Fold #1: Best auc: 0.553960143558354 (202 iteration)
Fold #2: Best auc: 0.5398148778066311 (164 iteration)
Fold #3: Best auc: 0.5438991983741486 (267 iteration)
Fold #4: Best auc: 0.5443476691662352 (252 iteration)
Final feature auc score: 0.5459670911053918
--------------------
Processing feature 2/200
Fold #0: Best auc: 0.5462121523230682 (237 iteration)
Fold #1: Best auc: 0.5587087733285436 (271 iteration)
Fold #2: Best auc: 0.5504921404649902 (172 iteration)
Fold #3: Best auc: 0.5481189647949339 (159 iteration)
Fold #4: Best auc: 0.5545247760740706 (3

Fold #0: Best auc: 0.5202315076185179 (187 iteration)
Fold #1: Best auc: 0.5249154957543236 (192 iteration)
Fold #2: Best auc: 0.5255242029153842 (68 iteration)
Fold #3: Best auc: 0.526682928131398 (165 iteration)
Fold #4: Best auc: 0.5248938546628362 (120 iteration)
Final feature auc score: 0.5179572344514115
--------------------
Processing feature 24/200
Fold #0: Best auc: 0.5238930586775967 (2 iteration)
Fold #1: Best auc: 0.5311106774014643 (194 iteration)
Fold #2: Best auc: 0.5348139963052995 (179 iteration)
Fold #3: Best auc: 0.529849297687015 (663 iteration)
Fold #4: Best auc: 0.5251476069547913 (749 iteration)
Final feature auc score: 0.5196775647964788
--------------------
Processing feature 25/200
Fold #0: Best auc: 0.5102167347581594 (716 iteration)
Fold #1: Best auc: 0.5086061452278365 (4 iteration)
Fold #2: Best auc: 0.5144387290894057 (55 iteration)
Fold #3: Best auc: 0.5082088090187507 (78 iteration)
Fold #4: Best auc: 0.5133224707652547 (20 iteration)
Final feature auc 

Fold #0: Best auc: 0.5091463522552878 (48 iteration)
Fold #1: Best auc: 0.5044542683597105 (44 iteration)
Fold #2: Best auc: 0.5022368908652955 (12 iteration)
Fold #3: Best auc: 0.5032604750835422 (0 iteration)
Fold #4: Best auc: 0.5027952273947222 (729 iteration)
Final feature auc score: 0.5009402064147664
--------------------
Processing feature 47/200
Fold #0: Best auc: 0.5094433926639834 (369 iteration)
Fold #1: Best auc: 0.5142363866446373 (85 iteration)
Fold #2: Best auc: 0.503099856470842 (53 iteration)
Fold #3: Best auc: 0.5045736164834601 (749 iteration)
Fold #4: Best auc: 0.5079058325095872 (597 iteration)
Final feature auc score: 0.5033557103306272
--------------------
Processing feature 48/200
Fold #0: Best auc: 0.5295861614912569 (80 iteration)
Fold #1: Best auc: 0.5279623740058497 (163 iteration)
Fold #2: Best auc: 0.5396152194834609 (141 iteration)
Fold #3: Best auc: 0.5515084926642916 (452 iteration)
Fold #4: Best auc: 0.5245767049261976 (517 iteration)
Final feature auc

Fold #0: Best auc: 0.5041536536488785 (8 iteration)
Fold #1: Best auc: 0.5088348037749608 (79 iteration)
Fold #2: Best auc: 0.5085112168451793 (7 iteration)
Fold #3: Best auc: 0.513499603260278 (156 iteration)
Fold #4: Best auc: 0.5088865963383212 (84 iteration)
Final feature auc score: 0.5032579099336686
--------------------
Processing feature 70/200
Fold #0: Best auc: 0.535051466493994 (152 iteration)
Fold #1: Best auc: 0.523472134477829 (121 iteration)
Fold #2: Best auc: 0.5326693727029113 (328 iteration)
Fold #3: Best auc: 0.5183377048237106 (735 iteration)
Fold #4: Best auc: 0.52760976523271 (102 iteration)
Final feature auc score: 0.5242927340935641
--------------------
Processing feature 71/200
Fold #0: Best auc: 0.533517510139749 (124 iteration)
Fold #1: Best auc: 0.53138373127 (107 iteration)
Fold #2: Best auc: 0.5285986479498007 (210 iteration)
Fold #3: Best auc: 0.5350435694486764 (99 iteration)
Fold #4: Best auc: 0.5323458707326967 (739 iteration)
Final feature auc score: 0

Fold #0: Best auc: 0.5487059678124759 (98 iteration)
Fold #1: Best auc: 0.5423293574925738 (93 iteration)
Fold #2: Best auc: 0.5477910959377653 (233 iteration)
Fold #3: Best auc: 0.5391923175920492 (13 iteration)
Fold #4: Best auc: 0.5421921525892643 (107 iteration)
Final feature auc score: 0.5286699704748197
--------------------
Processing feature 93/200
Fold #0: Best auc: 0.529361851563173 (272 iteration)
Fold #1: Best auc: 0.535789767291499 (167 iteration)
Fold #2: Best auc: 0.5247912432003408 (78 iteration)
Fold #3: Best auc: 0.5310672028819196 (72 iteration)
Fold #4: Best auc: 0.5342352321470237 (173 iteration)
Final feature auc score: 0.5247042201631036
--------------------
Processing feature 94/200
Fold #0: Best auc: 0.5439995970786683 (573 iteration)
Fold #1: Best auc: 0.5429245133660233 (205 iteration)
Fold #2: Best auc: 0.5509732708055055 (118 iteration)
Fold #3: Best auc: 0.537483477246282 (394 iteration)
Fold #4: Best auc: 0.5448308762947982 (116 iteration)
Final feature au

Fold #0: Best auc: 0.5493535732858456 (113 iteration)
Fold #1: Best auc: 0.5407809518318195 (62 iteration)
Fold #2: Best auc: 0.5416393262979157 (327 iteration)
Fold #3: Best auc: 0.5479493182812436 (80 iteration)
Fold #4: Best auc: 0.5500843927696969 (174 iteration)
Final feature auc score: 0.5356488354531971
--------------------
Processing feature 116/200
Fold #0: Best auc: 0.5261989640469452 (124 iteration)
Fold #1: Best auc: 0.5220040434552178 (66 iteration)
Fold #2: Best auc: 0.5209651817344627 (1 iteration)
Fold #3: Best auc: 0.5189280531151295 (691 iteration)
Fold #4: Best auc: 0.5322857857915314 (240 iteration)
Final feature auc score: 0.5115128317686399
--------------------
Processing feature 117/200
Fold #0: Best auc: 0.5098412913061772 (395 iteration)
Fold #1: Best auc: 0.5134244669761445 (500 iteration)
Fold #2: Best auc: 0.5165865088122478 (89 iteration)
Fold #3: Best auc: 0.5087986524818673 (154 iteration)
Fold #4: Best auc: 0.5056118442954609 (749 iteration)
Final featur

Fold #0: Best auc: 0.521242395620353 (121 iteration)
Fold #1: Best auc: 0.5151856300333192 (286 iteration)
Fold #2: Best auc: 0.523914059496846 (217 iteration)
Fold #3: Best auc: 0.5179258479144575 (98 iteration)
Fold #4: Best auc: 0.5186174108227719 (157 iteration)
Final feature auc score: 0.5168053378336757
--------------------
Processing feature 139/200
Fold #0: Best auc: 0.5781767215173403 (341 iteration)
Fold #1: Best auc: 0.5768971732040445 (303 iteration)
Fold #2: Best auc: 0.5797014095724822 (251 iteration)
Fold #3: Best auc: 0.5628114150945875 (139 iteration)
Fold #4: Best auc: 0.578126055903718 (192 iteration)
Final feature auc score: 0.5743137778258922
--------------------
Processing feature 140/200
Fold #0: Best auc: 0.5133657813597309 (95 iteration)
Fold #1: Best auc: 0.5114693513616432 (36 iteration)
Fold #2: Best auc: 0.5184073863589225 (55 iteration)
Fold #3: Best auc: 0.5131098619799421 (26 iteration)
Fold #4: Best auc: 0.5086087125619677 (0 iteration)
Final feature au

Fold #0: Best auc: 0.5094643752693689 (64 iteration)
Fold #1: Best auc: 0.5063563916611047 (4 iteration)
Fold #2: Best auc: 0.5073366284198795 (36 iteration)
Fold #3: Best auc: 0.5054809519982971 (71 iteration)
Fold #4: Best auc: 0.5071444788173353 (338 iteration)
Final feature auc score: 0.5007857748878723
--------------------
Processing feature 162/200
Fold #0: Best auc: 0.5333543643335253 (281 iteration)
Fold #1: Best auc: 0.53224902695328 (84 iteration)
Fold #2: Best auc: 0.5405602995306956 (134 iteration)
Fold #3: Best auc: 0.5246181942056499 (210 iteration)
Fold #4: Best auc: 0.5225401964349163 (94 iteration)
Final feature auc score: 0.5252755717448975
--------------------
Processing feature 163/200
Fold #0: Best auc: 0.5372335468373924 (180 iteration)
Fold #1: Best auc: 0.531444812429335 (536 iteration)
Fold #2: Best auc: 0.5397530551799092 (112 iteration)
Fold #3: Best auc: 0.5288622615395105 (258 iteration)
Fold #4: Best auc: 0.5315617202390922 (54 iteration)
Final feature auc

Fold #0: Best auc: 0.5600685152929663 (525 iteration)
Fold #1: Best auc: 0.5521591792296128 (501 iteration)
Fold #2: Best auc: 0.5497067642609631 (243 iteration)
Fold #3: Best auc: 0.5508626409214376 (502 iteration)
Fold #4: Best auc: 0.535501708048526 (137 iteration)
Final feature auc score: 0.5488549590126964
--------------------
Processing feature 185/200
Fold #0: Best auc: 0.500334121200783 (201 iteration)
Fold #1: Best auc: 0.5064990733777266 (370 iteration)
Fold #2: Best auc: 0.5043378507683927 (14 iteration)
Fold #3: Best auc: 0.4964019261758454 (6 iteration)
Fold #4: Best auc: 0.503181590474706 (10 iteration)
Final feature auc score: 0.5006699090997564
--------------------
Processing feature 186/200
Fold #0: Best auc: 0.5322847630611015 (69 iteration)
Fold #1: Best auc: 0.5307158863972016 (69 iteration)
Fold #2: Best auc: 0.5315594968459537 (391 iteration)
Fold #3: Best auc: 0.5311601604441162 (649 iteration)
Fold #4: Best auc: 0.5268271845476621 (355 iteration)
Final feature a

In [103]:
np.save(data_path/'oof', out_of_fold_predictions)
np.save(data_path/'ensemble_1st_lvl', test_predictions)

## 2nd Layer of ensamble

In [109]:
from sklearn.linear_model import LogisticRegression

X_train, X_valid, Y_train, Y_valid = train_test_split(
    out_of_fold_predictions, 
    targets.values, 
    test_size=0.15, 
    random_state=2019
)

ensamble_second_layer = LogisticRegression(n_jobs=-1)
ensamble_second_layer.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [118]:
probs = ensamble_second_layer.predict_proba(X_valid)
roc_auc_score(Y_valid, probs[:,1])

0.9040947053429751

In [115]:
import statsmodels.api as sm

logr = sm.Logit(targets, out_of_fold_predictions)
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(out_of_fold_predictions)
ensemble_auc = roc_auc_score(targets, ensemble_preds)  
print('##################')
print('Combined Model with magic Val_AUC=',round(ensemble_auc,5))

##################
Combined Model with magic Val_AUC= 0.91402


In [117]:
lgb_ensemble_train = lgb.Dataset(X_train, label=Y_train)
lgb_ensemble_valid = lgb.Dataset(X_valid, label=Y_valid, reference=lgb_ensemble_train)

model = lgb.train(
    param, 
    lgb_ensemble_train, 
    50000, 
    valid_sets=[lgb_ensemble_train, lgb_ensemble_valid], 
    early_stopping_rounds=500,
#     verbose_eval=False, 
#             evals_result=training_results
    verbose_eval=1000
    )

# ensemble_predicions = model.predict(fold_valid_data)
# folds_predictions += model.predict(df_test, num_iteration=best_round)


Training until validation scores don't improve for 500 rounds.
[1000]	training's auc: 0.893676	valid_1's auc: 0.878561
[2000]	training's auc: 0.912475	valid_1's auc: 0.895864
[3000]	training's auc: 0.921207	valid_1's auc: 0.902603
[4000]	training's auc: 0.926821	valid_1's auc: 0.906273
[5000]	training's auc: 0.931289	valid_1's auc: 0.908226
[6000]	training's auc: 0.935035	valid_1's auc: 0.909387
[7000]	training's auc: 0.938219	valid_1's auc: 0.910122
[8000]	training's auc: 0.94106	valid_1's auc: 0.910847
[9000]	training's auc: 0.943547	valid_1's auc: 0.911536
[10000]	training's auc: 0.945812	valid_1's auc: 0.912033
[11000]	training's auc: 0.947898	valid_1's auc: 0.912439
[12000]	training's auc: 0.949817	valid_1's auc: 0.912743
[13000]	training's auc: 0.951595	valid_1's auc: 0.912847
Early stopping, best iteration is:
[12584]	training's auc: 0.950872	valid_1's auc: 0.91289


# Final prediction and submission to Kaggle

In [28]:
test_predictions

array([[-4.455415],
       [-4.660307],
       [-4.560704],
       [-4.508143],
       ...,
       [-4.597897],
       [-4.691847],
       [-4.623425],
       [-4.809001]])

In [119]:
predictions = model.predict(test_predictions)

In [123]:
submission_df = pd.DataFrame({'ID_code': df_test_original['ID_code'], 'target': predictions})

In [124]:
submission_df.head()

Unnamed: 0,ID_code,target
0,test_0,1.0
1,test_1,0.99999
2,test_2,0.999898
3,test_3,0.99999
4,test_4,0.999979


In [14]:
submission_file = data_path/'submission_lgbm_augmentation.csv'
submission_df.to_csv(submission_file, index=False)

In [15]:
# Only 3 submission allowed per day!
!kaggle competitions submit {competition_name} -f {submission_file} -m "LightGBM, 11 folds,augmentation"

100%|██████████████████████████████████████| 6.03M/6.03M [00:02<00:00, 2.44MB/s]
Successfully submitted to Santander Customer Transaction Prediction