In [63]:
import pandas as pd
import numpy as np

In [64]:
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
import os

### load data

In [66]:
main_folder = r"C:\Users\Alexandre Boulenger\santransakt_main"
data_folder = os.path.join(main_folder, 'data')

In [67]:
train = pd.read_csv(os.path.join(data_folder,"train.csv"))
test = pd.read_csv(os.path.join(data_folder,"test.csv"))

### prep data

In [68]:
test['target'] = np.nan
cols = test.columns[:-1].tolist()
cols = [cols[0], 'target']+cols[1:]
test = test.loc[:,cols]

In [69]:
traintest = pd.concat([train, test])
traintest.reset_index(drop=True, inplace=True)
traintest['set'] = 'test'
traintest.loc[train.index,'set'] = 'train'
traintest.set.value_counts()

test     200000
train    200000
Name: set, dtype: int64

In [70]:
X_tr = train.drop(columns=['target', 'ID_code']).copy()
X_te = test.drop(columns=['target', 'ID_code']).copy()
X = traintest.drop(columns=['target', 'ID_code']).copy()

In [71]:
y_tr = train.target.copy()
y_te = test.target.copy()
y = traintest.target.copy()

### select part of the training data for a less greedy exploration

In [72]:
from sklearn.model_selection import train_test_split

In [10]:
#X_tr, X_tr_2, y_tr, y_tr_2 = train_test_split(X_tr, y_tr, test_size=0.9, random_state=42)
#X_te, X_tr_2, y_te, y_tr_2 = train_test_split(X_tr_2, y_tr_2, test_size=0.9, random_state=42)

In [73]:
X_tr, X_te, y_tr, y_te = train_test_split(X_tr, y_tr, test_size=0.1, random_state=42)

### train light model

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV

In [75]:
roc_auc_weighted = make_scorer(roc_auc_score, average='weighted')

#### create model

In [76]:
# set default parameters
cv = 5
n_estimators = 100
max_depth = 20
max_features = np.sqrt(200)
param_grid = {
    'max_depth':[4,10,20]
    ,'n_estimators':[25,100,150]
}

In [77]:
# instanciate model with default parameters
clf = RandomForestClassifier(
    n_estimators=n_estimators
    ,max_depth=max_depth
    #,max_features=max_features
    ,class_weight='balanced'
    ,random_state=0
)

#### hyperparameter tuning and cross-validated test error

In [78]:
# instanciate CV object
gscv = GridSearchCV(
    clf
    ,param_grid=param_grid
    ,scoring=roc_auc_weighted
    ,cv=cv
)

In [80]:
%%time
# run CV
gscv.fit(X_tr, y_tr)

Wall time: 2h 6min 8s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [4, 10, 20], 'n_estimators': [25, 100, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score, average=weighted), verbose=0)

In [81]:
# display out-of-sample performance 
print("estimated test ROC AUC score: {:.2%}".format(pred_score))
best_params = gscv.best_params_
print("best parameters: {}".format(best_params))

estimated test ROC AUC score: 69.63%
best parameters: {'max_depth': 4, 'n_estimators': 150}


#### performance on test set

In [82]:
y_te_pr = gscv.predict(X_te)
print("ROC AUC score on test set: {:.2%}".format(roc_auc_score(y_te, y_te_pr)))
y_te_pr = gscv.predict(X_tr_2)
print("ROC AUC score on test set: {:.2%}".format(roc_auc_score(y_tr_2, y_te_pr)))

ROC AUC score on test set: 70.75%
ROC AUC score on test set: 71.44%
