# Santander Customer Satisfaction Prediction Challenge: IE MBD-O1 Group G

Team members: A.Olivier, J.Kim, I.Ruperez, P.Viland, D.Istiartomo, S.Arumugan

# 0. Introduction

## Challenge Description

* Goal: Predicting the probability of how unsatisfied a customer is based on numerous anonymized variabales

# 1. Importing and Cleaning data

In [None]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline
pd.set_option("max_columns", None)

In [None]:
# read data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
# Shape of data
print("training set has {} observations and {} columns.".format(train.shape[0],train.shape[1]))
print("test set has {} observations and {} columns.".format(test.shape[0],test.shape[1]))

In [None]:
# check for missing data
print("missing data for training set:",len(train.isnull().sum()[train.isnull().sum() > 0]))
print("missing data for test set:",len(test.isnull().sum()[test.isnull().sum() > 0]))

In [None]:
# view first few rows of training data
train.head(2)

In [None]:
# check if IDs are unique
print("check if training IDs are unique:", len(train.ID.unique()) == len(train))
print("check if test IDs are unique:", len(test.ID.unique()) == len(test))

# EDA

In [None]:
# counts of satisfied vs not satisfied customers
# where 1 is unsatisfied customer
# and 0 is satisfied customer
plt.figure(figsize = (8,6))
sns.countplot(train.TARGET)
plt.title("Satisfied vs not-satisfied customer")
plt.show()

train.TARGET.value_counts() / len(train)

The target variable is highly imbalanced, with more than 96% of the customers recording as 'satisfied', while merely a little less than 4% saying they are not. This means that we need to be careful when predicting class, as class 0 may dominate the prediction. We will be using stratified corss validation to ensure that this isn't the case.

In [None]:
# look for columns with a single unique value
# remove these columns because they add no value to the training set
cols_single = [i for i in train.columns if len(np.unique(train[i])) == 1]
train = train.drop(cols_single, axis = 1)

# also remove from test
test = test.drop(cols_single, axis = 1)

In [None]:
# correlation matrix to see relationship among variables
plt.figure(figsize = (8,8))
plt.matshow(train.corr())
plt.show()

In [None]:
# correlation with target variable
from scipy.stats import pearsonr
corr_df = pd.DataFrame()
for i,v in enumerate(train.columns):
    corr_df.loc[i,'col'] = v
    corr_df.loc[i,'corr_score'] = pearsonr(train.TARGET, train[v])[0]
    
corr_df.sort_values('corr_score', ascending = False)

The correlation coefficient seems low, but that may be because the relationship may not be strictly linear.

In [None]:
# the columns can be identified with different names in the front
# let's check that 3 alphabet code
col_code = np.unique([i[:3] for i in train.columns if i not in ['TARGET', 'ID']])
col_code

# count the number of unique values
for i in col_code:
    print('----------'+i+'----------')
    unique = []
    count = 0
    for j in train.columns:
        if j[:3] == i:
            unique.append(len(np.unique(train[j])))
            count = count + 1
    print(np.unique(unique))
    print('total number of columns:', count)
    print('\n')

It's easy to see that columns with 'ind' are binary (only 2 unique values). Columns with 'num' are discrete values (values are integers, possibly multi labels categorical variable). 'sal' is actually short for 'saldo', which means balance in Spanish. 'del' is short for 'delta', which may mean hedge ratio. Note that there are negative values for these columns. 'var' and 'imp' are other float values.

In [None]:
# check if there are duplicate columns
# meaning columns with exact same values
from itertools import combinations
cols = train.columns.tolist()
cols.remove('ID')
cols.remove('TARGET')


col_combinations = list(combinations(cols,2))
same_col_vals = []

for i in col_combinations:
    if (train[i[0]] == train[i[1]]).all():
        same_col_vals.append(i)

In [None]:
# make sure that all the columns in the same_col_vals are unique
# compare length to do that
u = []
for i in same_col_vals:
    u.append(i[0])
    u.append(i[1])
print((len(same_col_vals) * 2) == len(u))

# remove one of the column pairs with same 
cols_to_remove = [i[0] for i in same_col_vals]
train = train.drop(cols_to_remove, axis = 1)
test = test.drop(cols_to_remove, axis = 1)

In [None]:
# current shape of dataset
print("training set has {} observations and {} columns after cleaning.".format(train.shape[0],train.shape[1]))
print("test set has {} observations and {} columns after cleaning.".format(test.shape[0],test.shape[1]))

63 columns were deleted from both the training and test set because they added no value to the dataset, either because they only had a single value for all rows, or because they are duplicate columns.

# 2. Initial Model Training

Since there are still 300+ columns and all are anonymized, there isn't much room for feature engineering at this stage. Therefore, we will first create some baseline classification model.

The score metric for this challenge is AUC area, and the actual submission value for the challenge is the probability of belonging to class 1 (Unsatisfied customers).

In [None]:
# import models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# import pipeline and other preprocessing/evaluation tools
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [None]:
# divide features (x) and target (y)
# store ID separately
y = train.TARGET
train_id = train.ID
train_x = train.drop(['TARGET','ID'], axis = 1)

# define stratifiedkfold fdor cross validation
# with random state 42
skf = StratifiedKFold(n_splits = 10, random_state = 42)

In [None]:
# build pipeline for baseline random forest classification model
pipeline_baseline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
])

scores_baseline = cross_val_score(pipeline_baseline, train_x, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 10)

In [None]:
print(scores_baseline.mean())

In [None]:
# build pipeline for baseline XGBoost classification model
pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier())
]) 

scores_baseline = cross_val_score(pipeline_xgb, train_x, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 10)

In [None]:
scores_baseline.mean()

# 3. Feature Engineering I

The baseline XGBoost model's cross validation score is decent, but we can probably do better through more feature engineering and hyperparameter tuning.

Since there are still 300+ columns, there is a high possibility that there are columns that are highly correlated and are adding noise to the model rather than value. Let's try to reduce the number of features used for training process by:
- PCA: feature extraction
- Feature reduction based on feature importance calculated by an ensemble tree model (XGBoost and Adaboost in this case)

In [None]:
# PCA: feature extraction
# selecting bset n_components by 
# comparing variance of different n
scaled_x = StandardScaler().fit_transform(train_x)
pca = PCA()

pca.fit(scaled_x)

plt.figure(1, figsize = (8,6))
plt.clf()
plt.axes([.2,.2,.7,.7])
plt.plot(pca.explained_variance_[:200], linewidth = 2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance')

It seems that components seem to give little or almost no added value when explaning the variance of the dataset at around 150. We can make a cumulative explained variance graph to figure out the best n_components number.

In [None]:
# try cumulative explained variance with different pca n values
for i in [100,120,140,160]:
    pca = PCA(n_components = i)
    pca.fit(scaled_x)
    cum_explained_var = []
    for i in range(0, len(pca.explained_variance_ratio_)):
        if i == 0:
            cum_explained_var.append(pca.explained_variance_ratio_[i])
        else:
            cum_explained_var.append(pca.explained_variance_ratio_[i] + 
                                     cum_explained_var[i-1])
    plt.plot(range(len(pca.explained_variance_ratio_)),cum_explained_var, linewidth = 2)
    plt.title("n = {}: total explained variance {}".format(i, max(cum_explained_var)))
    plt.show()

In [185]:
# use 160 as pca number and build pipeline for xgb
pipeline_pca_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components = 160)),
    ('clf', XGBClassifier())
])

scores_pca_xgb = cross_val_score(pipeline_pca_xgb, train_x, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 10)

[CV] ....................... , score=0.8312393833093451, total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.0min remaining:    0.0s


[CV] ....................... , score=0.8148105593326542, total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  9.6min remaining:    0.0s


[CV] ....................... , score=0.8322466180166463, total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 11.2min remaining:    0.0s


[CV] ....................... , score=0.8416643876663689, total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 12.7min remaining:    0.0s


[CV] ....................... , score=0.8292144911655939, total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.3min remaining:    0.0s


[CV] ........................ , score=0.811150527325024, total= 1.6min


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 15.9min finished


In [186]:
scores_pca_xgb.mean()

0.822802418642088

The scores don't seem to improve much. Let's try tuning the hyperparameters, to see if there are any improvements in the score.

# 4. Hyperparameter tuning I

In [None]:
# xgboost parameter tuning 1
# tuning max depth and min child weight
xgb_hp_1 = XGBClassifier(seed=42)

xgb_param_1 = {
 'clf__max_depth':range(3,10,2),
 'clf__min_child_weight':range(1,6,2)
}

pipeline_xgb_hp_1 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_1)
])

xgb_rand_1 = RandomizedSearchCV(pipeline_xgb_hp_1, param_distributions = xgb_param_1,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

xgb_rand_1.fit(train_x,y)

In [None]:
# view the best parameters based on the randomizedsearchCV
xgb_hp_2 = xgb_rand_1.best_estimator_
xgb_hp_2.get_params

In [None]:
# Use the best estimator from above to tune gamma this time
xgb_hp_2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1,learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=5, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=1)

xgb_param_2 = {
 'clf__gamma':[i/10.0 for i in range(0,5)]}

pipeline_xgb_hp_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_2)
])

xgb_grid_2 = GridSearchCV(pipeline_xgb_hp_2, param_grid = xgb_param_2,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

xgb_grid_2.fit(train_x,y)

In [None]:
# get the best estimator from above 
# to tune subsample and colsample_bytree this time
xgb_hp_3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       learning_rate=0.1, max_delta_step=0,max_depth=3, min_child_weight=5,missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True, gamma = 0.4)

xgb_param_3 = {
    'clf__subsample': [i/10.0 for i in range(6,10)],
    'clf__colsample_bytree': [i/10.0 for i in range(6,10)]
}

pipeline_xgb_hp_3 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_3)
])

xgb_rand_3 = RandomizedSearchCV(pipeline_xgb_hp_3, param_distributions = xgb_param_3,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

xgb_rand_3.fit(train_x,y)

In [None]:
# get the best estimator from above 
# to tune reg_alpha this time
xgb_hp_4 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       learning_rate=0.1, max_delta_step=0,max_depth=3, min_child_weight=5,missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True, gamma = 0.4,
        colsample_bytree= 0.6, subsample= 0.8)

xgb_param_4 = {
    'clf__reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}

pipeline_xgb_hp_4 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_4)
])

xgb_grid_4 = GridSearchCV(pipeline_xgb_hp_4, param_grid = xgb_param_4,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

xgb_grid_4.fit(train_x,y)

In [None]:
# use the best parameters identified for XGboost
# and define the best XGboost estimator
xgb_hp_best = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=5, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.001,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

scaled_x = StandardScaler().fit_transform(train_x)
xgb_hp_best.fit(scaled_x, y)

# 5. Feature Engineering II

Tree models like XGBoost have a feature called "feature importance", which meausres how important individual feature is for the model. We can use this feature to 'filter' features that do not add as much value to the classification task by dropping all columns with 0.0 feature importance.

In [None]:
# create dataframe of feature importance for the best xgb model
xgb_feature_imp = pd.DataFrame({'feature': train_x.columns.tolist(),
             'feature_importance': xgb_hp_best.feature_importances_})

# store columns with feature importance = 0
cols_del = xgb_feature_imp[xgb_feature_imp.feature_importance == 0.0].feature.tolist()

In [None]:
# drop colums with 0 importance from the training and test set
train_x_2 = train_x.drop(cols_del, axis = 1)
test_ID = test.ID
test_2 = test.drop(cols_del, axis = 1)
test_2 = test_2.drop('ID', axis = 1)

In [None]:
# scale x2 and test 2
scaler = StandardScaler()
train_x_2_scaled = scaler.fit_transform(train_x_2)
test_2_scaled = scaler.transform(test_2)

In [None]:
# re-train the xgboost model with predefine parameters
# using the new dataset
xgb_hp_best = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=5, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.001,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

pipeline_xgb_x2 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_best)
]) 

scores_xgb_x2 = cross_val_score(pipeline_xgb_x2, train_x_2, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

# 6. Hyperparameter Tuning II

We will repeat the process above for XGBoost with Adaboost.

In [None]:
# adaboost classifier
ada_x2 = AdaBoostClassifier(random_state = 42)

pipeline_ada_x2 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', ada_x2)
]) 

scores_ada_x2 = cross_val_score(pipeline_ada_x2, train_x_2, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

In [None]:
# adaboost classifier
# get the best estimator from above
ada_hp_1 = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, random_state=42)

ada_param_1 = {
    'clf__n_estimators': [50, 100, 500, 1000],
    'clf__learning_rate': [1.0, 0.1, 0.05]
}

pipeline_ada_hp_1 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', ada_hp_1)
])

ada_grid_1 = GridSearchCV(pipeline_ada_hp_1, param_grid = ada_param_1,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

ada_grid_1.fit(train_x2,y)

In [None]:
# define adaboost estimator with best parameters
ada_grid_1.best_estimator_
ada_hp_best = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=500, random_state=42)

# 7. Feature Engineering III

We can try and reduce further columns by also filtering features based on importance of adaboost classifier.

In [None]:
# feature importance of adaboosts
ada_imp = pd.DataFrame({'features': train_x_2.columns.tolist(),'feat_imp': ada_hp_best.feature_importances_})
cols_drop_2 = ada_imp[ada_imp.feat_imp == 0.0].features.tolist()

# drop features from training and test set
train_x_3 = train_x_2.drop(cols_drop_2, axis = 1)
test_3 = test_2.drop(cols_drop_2, axis = 1)

In [None]:
# scale x3 and test 3
scaler = StandardScaler()
train_x_3_scaled = scaler.fit_transform(train_x_3)
test_3_scaled = scaler.transform(test_3)

In [None]:
# xgb with new features
xgb_hp_best = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=5, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.001,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

pipeline_xgb_x3 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', xgb_hp_best)
]) 

scores_xgb_x3 = cross_val_score(pipeline_xgb_x3, train_x_3, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

In [None]:
# adaboost classifier
ada_hp_best = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=500, random_state=42)

pipeline_ada_x3 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', ada_hp_best)
])

scores_ada_x3 = cross_val_score(pipeline_ada_x3, train_x_3, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

# 8. Hyperparameter Tuning III

We will try one more parameter tuning for gradientboosting classifier, using the new features set.

In [None]:
# gradient boost model
gbm = GradientBoostingClassifier(random_state = 42)

pipeline_gbm_x3 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', gbm)
])

scores_gbm_x3 = cross_val_score(pipeline_gbm_x3, train_x_3, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

In [None]:
# gbm tuning
gbm_hp_1 = GradientBoostingClassifier(random_state=42)

gbm_param_1 = {
    'clf__max_depth': range(5,16,2),
    'clf__min_samples_split': range(200,1001,200)
}

pipeline_gbm_hp_1 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', gbm_hp_1)
])

gbm_rand_1 = RandomizedSearchCV(pipeline_gbm_hp_1, param_distributions = gbm_param_1,
                             cv = 5, scoring = 'roc_auc', verbose = 5)

gbm_rand_1.fit(train_x,y)

In [None]:
# best gbm with new features
gbm_hp_best = GradientBoostingClassifier(min_samples_split= 1000,
                            max_depth= 5,
                           random_state = 42)

pipeline_gbm_x3 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', gbm_hp_best)
]) 

scores_gbm_hp_x3 = cross_val_score(pipeline_gbm_x3, train_x_3, y,
                                 cv = skf, scoring = 'roc_auc', verbose = 5)

# 9. Ensembles

While all xgboost, adaboost, and gbm are ensemble models of their own (tree ensembles), we can create another ensemble using the defined models above. Ensemble models are more robust (since the prediction is based on multiple models), and thus are less vulnerable to variance errors. We will use the votingclassifier function to create different ensembles.

In [None]:
from sklearn.ensemble import VotingClassifier
estimators_2 = [('xgb', xgb_hp_best),('ada', ada_hp_best)]

#Voting Classifier
vc_2 = VotingClassifier(estimators= estimators_2,
                          voting='soft',            #soft for probability
                          weights=None,
                          n_jobs=1,
                          flatten_transform=None
                          )

#Voting Classifier fit
vc_2 = vc_2.fit(train_x_3_scaled, y)

#Voting Classifier predict
VC_pred_2 = vc_2.predict_proba(test_3_scaled)

In [None]:
from sklearn.ensemble import VotingClassifier
estimators_3 = [('xgb', xgb_hp_best),('ada', ada_hp_best), ('gbm', gbm_hp_best)]

#Voting Classifier
vc_3 = VotingClassifier(estimators= estimators_3,
                          voting='soft',            #soft for probability
                          weights=None,
                          n_jobs=1,
                          flatten_transform=None
                          )

#Voting Classifier fit
vc_3 = vc_3.fit(train_x_3_scaled, y)

#Voting Classifier predict
VC_pred_3 = vc_3.predict_proba(test_3_scaled)

In [None]:
from sklearn.ensemble import VotingClassifier
estimators_4 = [('xgb', xgb_hp_best),('gbm', gbm_hp_best)]

#Voting Classifier
vc_4 = VotingClassifier(estimators= estimators_4,
                          voting='soft',            #soft for probability
                          weights=None,
                          n_jobs=1,
                          flatten_transform=None
                          )

#Voting Classifier fit
vc_4 = vc_3.fit(train_x_3_scaled, y)

#Voting Classifier predict
VC_pred_4 = vc_4.predict_proba(test_3_scaled)

In [None]:
from sklearn.ensemble import VotingClassifier
estimators_5 = [('xgb', xgb_hp_best),('gbm', gbm_hp_best)]

#Voting Classifier
vc_5 = VotingClassifier(estimators= estimators_5,
                          voting='soft',            #soft for probability
                          weights=None,
                          n_jobs=1,
                          flatten_transform=None
                          )

#Voting Classifier fit
vc_5 = vc_5.fit(train_x_2_scaled, y)

#Voting Classifier predict
VC_pred_5 = vc_5.predict_proba(test_2_scaled)

In [None]:
from sklearn.ensemble import VotingClassifier
estimators_6 = [('xgb', xgb_hp_best),('ada', ada_hp_best), ('gbm', gbm_hp_best)]

#Voting Classifier
vc_6 = VotingClassifier(estimators= estimators_6,
                          voting='soft',            #soft for probability
                          weights=None,
                          n_jobs=1,
                          flatten_transform=None
                          )

#Voting Classifier fit
vc_6 = vc_6.fit(train_x_2_scaled, y)

#Voting Classifier predict
VC_pred_6 = vc_6.predict_proba(test_2_scaled)

# 10. Predictions & Results

The challenge had 2 different types of score systems: a private, and a public one. Our results had different scores for the best private score and the best public score, so here we display both.

In [None]:
# best private score model: 
# XGBoost model using 68 variables (training set 2)
# private score: 0.824089
# public score: 0.836971
pipeline_xgb_x2.fit(train_x_2, y)
predictions = pipeline_xgb_x2.predict_proba(test_2)

submission_xgb = pd.DataFrame({'ID': test_ID, 'TARGET': [i[1] for i in predictions]})
submission_xgb.to_csv("submission_xgb_20180620_2.csv", index = False)

In [None]:
# best public score model: 
# Ensembled model using XGboost and GBM
# with 68 variables (training set 2)
# private score: 0.824038
# public score: 0.837158
vc_5.fit(train_x_2, y)
VC_pred_5 = vc_5.predict_proba(test_2)

submission_vc = pd.DataFrame({'ID': test_ID, 'TARGET': [i[1] for i in VC_pred_5]})
submission_xgb.to_csv("submission_vc5_20180622_10.csv", index = False)