# Santander Customer Transaction Prediction - Ensemble
## Ensemble of Random Forest, GBM, XGBoost and LightGBM

In the Kaggle competition, the objective is to identify which customer will make a transaction in the future.

**Link to the competition**: https://www.kaggle.com/c/santander-customer-transaction-prediction/  
**Type of Problem**: Classification  
**Metric for evalution**: AOC (Area Under Curve)

This Python 3 environment comes with many helpful analytics libraries installed
It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm
import xgboost
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv
/kaggle/input/santander-customer-transaction-prediction/train.csv
/kaggle/input/santander-customer-transaction-prediction/test.csv


## 1. Read Train CSV

In [3]:
input_dir = '/kaggle/input/santander-customer-transaction-prediction/'
df_train = pd.read_csv(input_dir + 'train.csv')
df_train

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,train_199995,0,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
199996,train_199996,0,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,train_199997,0,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,train_199998,0,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


In [4]:
var_columns = [c for c in df_train.columns if c not in ['ID_code','target']]

X = df_train.loc[:, var_columns]
y = df_train.loc[:, 'target']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((160000, 200), (40000, 200), (160000,), (40000,))

## 3. Create Models
#### 3.a Random Forest

In [5]:
model_rf = RandomForestClassifier(class_weight='balanced',
                                 criterion='gini',
                                 max_depth=55,
                                 max_features='log2',
                                 min_samples_leaf=0.005,
                                 min_samples_split=0.005,
                                 n_estimators=190)

model_rf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=55,
                       max_features='log2', min_samples_leaf=0.005,
                       min_samples_split=0.005, n_estimators=190)

#### 3.b GBM

In [6]:
model_gbm = GradientBoostingClassifier(n_estimators=5000,
                                       learning_rate=0.05,
                                       max_depth=3,
                                       subsample=0.5,
                                       validation_fraction=0.1,
                                       n_iter_no_change=20,
                                       max_features='log2',
                                       verbose=0)
model_gbm.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.05, max_features='log2',
                           n_estimators=5000, n_iter_no_change=20,
                           subsample=0.5)

#### 3.c LightGBM

In [7]:
lgbm_train_data = lightgbm.Dataset(X_train, label=y_train)
lgbm_valid_data = lightgbm.Dataset(X_valid, label=y_valid)

parameters = {'objective': 'binary',
              'metric': 'auc',
              'is_unbalance': 'true',
              'boosting': 'gbdt',
              'num_leaves': 63,
              'feature_fraction': 0.5,
              'bagging_fraction': 0.5,
              'bagging_freq': 20,
              'learning_rate': 0.01,
              'verbose': 0
             }

model_lgbm = lightgbm.train(parameters,
                            lgbm_train_data,
                            valid_sets=lgbm_valid_data,
                            num_boost_round=5000,
                            early_stopping_rounds=50)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.666089
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.686498
[3]	valid_0's auc: 0.714094
[4]	valid_0's auc: 0.719307
[5]	valid_0's auc: 0.730011
[6]	valid_0's auc: 0.740441
[7]	valid_0's auc: 0.748778
[8]	valid_0's auc: 0.75823
[9]	valid_0's auc: 0.762986
[10]	valid_0's auc: 0.768532
[11]	valid_0's auc: 0.772095
[12]	valid_0's auc: 0.773468
[13]	valid_0's auc: 0.777078
[14]	valid_0's auc: 0.781336
[15]	valid_0's auc: 0.782714
[16]	valid_0's auc: 0.7831
[17]	valid_0's auc: 0.782315
[18]	valid_0's auc: 0.783168
[19]	valid_0's auc: 0.782984
[20]	valid_0's auc: 0.783888
[21]	valid_0's auc: 0.786273
[22]	valid_0's auc: 0.786601
[23]	valid_0's auc: 0.786714
[24]	valid_0's auc: 0.787442
[25]	valid_0's auc: 0.788749
[26]	valid_0's auc: 0.790635
[27]	valid_0's auc: 0.791018
[28]	valid_0's auc: 0.791687
[29]	valid_0's auc: 0.792741
[30]	valid_0's auc: 0.793356
[31]	valid_0's auc: 0.7

#### 3.d XGBoost

In [8]:
model_xgboost = xgboost.XGBClassifier(learning_rate=0.05,
                                      max_depth=2,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.25,
                                      eval_metric='auc',
                                      verbosity=0,
                                      use_label_encoder=False)

eval_set = [(X_valid, y_valid)]

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=20,
                  eval_set=eval_set,
                  verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.25, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5000, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.5, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=0)

## 4. Combine scores

In [9]:
y_train_pred_rf = model_rf.predict_proba(X_train)[:,1]
y_train_pred_gbm = model_gbm.predict_proba(X_train)[:,1]
y_train_pred_lgbm = model_lgbm.predict(X_train)
y_train_pred_xgboost = model_xgboost.predict_proba(X_train)[:,1]

y_valid_pred_rf = model_rf.predict_proba(X_valid)[:,1]
y_valid_pred_gbm = model_gbm.predict_proba(X_valid)[:,1]
y_valid_pred_lgbm = model_lgbm.predict(X_valid)
y_valid_pred_xgboost = model_xgboost.predict_proba(X_valid)[:,1]

y_train_pred_all = np.mean([y_train_pred_rf, y_train_pred_gbm, y_train_pred_lgbm, y_train_pred_xgboost], axis=0)
y_valid_pred_all = np.mean([y_valid_pred_rf, y_valid_pred_gbm, y_valid_pred_lgbm, y_valid_pred_xgboost], axis=0)

  "because it will generate extra copies and increase " +
  "because it will generate extra copies and increase " +


In [10]:
train_auc_list = [roc_auc_score(y_train, y_train_pred_rf),
                  roc_auc_score(y_train, y_train_pred_gbm),
                  roc_auc_score(y_train, y_train_pred_lgbm),
                  roc_auc_score(y_train, y_train_pred_xgboost),
                  roc_auc_score(y_train, y_train_pred_all)]

valid_auc_list = [roc_auc_score(y_valid, y_valid_pred_rf),
                  roc_auc_score(y_valid, y_valid_pred_gbm),
                  roc_auc_score(y_valid, y_valid_pred_lgbm),
                  roc_auc_score(y_valid, y_valid_pred_xgboost),
                  roc_auc_score(y_valid, y_valid_pred_all)]

pd.DataFrame({"Technique": ["Random Forest", "GBM", "LightGBM", "XGBoost", "All"],
              "Train_AUC": train_auc_list,
              "Valid_AUC": valid_auc_list
})

Unnamed: 0,Technique,Train_AUC,Valid_AUC
0,Random Forest,0.878057,0.835756
1,GBM,0.927486,0.891064
2,LightGBM,0.987944,0.893027
3,XGBoost,0.920518,0.89412
4,All,0.965769,0.892887


## 5. Score the test data
First let us import test.csv


In [11]:
df_test = pd.read_csv(input_dir + 'test.csv')
df_sample_submissions = pd.read_csv(input_dir + 'sample_submission.csv')
df_test.shape, df_sample_submissions.shape

((200000, 201), (200000, 2))

In [12]:
X_test = df_test.loc[:, var_columns]

y_test_pred_rf = model_rf.predict_proba(X_test)[:,1]
y_test_pred_gbm = model_gbm.predict_proba(X_test)[:,1]
y_test_pred_lgbm = model_lgbm.predict(X_test)
y_test_pred_xgboost = model_xgboost.predict_proba(X_test)[:,1]

df_sample_submissions['target'] = np.mean([y_test_pred_rf, y_test_pred_gbm, y_test_pred_lgbm, y_test_pred_xgboost], axis=0)
df_sample_submissions

  "because it will generate extra copies and increase " +


Unnamed: 0,ID_code,target
0,test_0,0.257581
1,test_1,0.392748
2,test_2,0.380964
3,test_3,0.306213
4,test_4,0.178095
...,...,...
199995,test_199995,0.163926
199996,test_199996,0.103438
199997,test_199997,0.097664
199998,test_199998,0.252314


In [13]:
output_dir = '/kaggle/working/'
df_sample_submissions.to_csv(output_dir + "07_ensemble_scores.csv", index=False)