In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context(rc={
       "figure.figsize": (16, 10),
       "axes.titlesize": 14})

from IPython.display import Image, display
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

from os.path import expanduser
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda, munge
from datsci import kaggle as kg

In [2]:
import santander

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier as SGDClf
from sklearn.cross_validation import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer

import xgboost as xgb

In [173]:
# TODO delete after comparing w new files - already implemented in santander.py
def get_sizes(train_csv, test_csv):
    df = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv, index_col='ID')
    train_rows, train_cols = df.shape
    test_rows, test_cols = df_test.shape
    return train_rows, train_cols, test_rows, test_cols


data_shapes = []
for s, train_csv, test_csv in [
    ('raw',           FILE_TRAIN,                                 FILE_TEST),
    ('dedup',         FILE_TRAIN_DEDUP,                           FILE_TEST_DEDUP),
    ('bin onehot',    FILE_TRAIN_DEDUP_ONEHOT,                    FILE_TEST_DEDUP_ONEHOT),
    ('NaN',           FILE_TRAIN_DEDUP_ONEHOT_NA,                 FILE_TEST_DEDUP_ONEHOT_NA),
    ('impute mean',   FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEAN,     FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_MEAN),
    ('impute median', FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEDIAN,   FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_MEDIAN),
    ('impute freq',   FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_FREQ,     FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_FREQ),
    ('onehot int',    FILE_TRAIN_DEDUP_ONEHOT_NA_ONEHOTINT,       FILE_TEST_DEDUP_ONEHOT_NA_ONEHOTINT),
    ('rm test const', FILE_TRAIN_DEDUP_ONEHOT_NA_ONEHOTINT_1TEST, FILE_TEST_DEDUP_ONEHOT_NA_ONEHOTINT_1TEST),]:
    data_shapes.append((s,) + get_sizes(train_csv, test_csv))
pd.DataFrame(data_shapes, columns=['stage', 'train rows', 'train cols', 'test rows', 'test cols'])

Unnamed: 0,stage,train rows,train cols,test rows,test cols
0,raw,76020,371,75818,369
1,dedup,71213,307,75818,306
2,bin onehot,71213,363,75818,362
3,,71213,357,75818,356
4,impute mean,71213,357,75818,356
5,impute median,71179,357,75818,356
6,impute freq,71179,357,75818,356
7,onehot int,71213,398,75818,397
8,rm test const,71213,390,75818,389


In [3]:
def cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=None):
    
    # Train cv
    xgb_param = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values, missing=missing)
    cv_result = xgb.cv(
        xgb_param, dtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_nfold,
        metrics=['auc'], early_stopping_rounds=early_stopping_rounds, show_progress=False)
    best_n_estimators = cv_result.shape[0]
    model.set_params(n_estimators=best_n_estimators)
    
    # Train model
    model.fit(X_train, y_train, eval_metric='auc')
        
    # Predict training data
    y_hat_train = model.predict(X_train)

    # Predict test data
    y_hat_test = model.predict(X_test)
    
    # Print model report:
    print("\nModel Report")
    print("best n_estimators: {}".format(best_n_estimators))
    print("AUC Score (Train): %f" % roc_auc_score(y_train, y_hat_train))
    print("AUC Score (Test) : %f" % roc_auc_score(y_test,  y_hat_test))
                    
#     feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=1000,
    objective= 'binary:logistic',
    nthread=4,
    seed=55,
)

## Var3 null values

In [71]:
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                         santander.FILE_TEST)

df_train, df_test = santander.set_var3_null(df_train, df_test)
X_train, X_test = santander.set_var3_null(X_train, X_test)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
Stopping. Best iteration: 65



Model Report
best n_estimators: 66
AUC Score (Train): 0.873196
AUC Score (Test) : 0.835522


## Fix 'delta' cols that contain 9999999999

In [44]:
# df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                         santander.FILE_TEST)

In [5]:
ratio_cols = []
for c in df_train:
    if 9999999999 in df_train[c].unique():
        ratio_cols.append(c)
        
delta_cols = []
for c in df_train:
    if c.find('delta') == 0:
        delta_cols.append(c)

In [6]:
len(ratio_cols), len(delta_cols), ratio_cols == delta_cols

(26, 26, True)

In [7]:
ratio_cols

['delta_imp_amort_var18_1y3',
 'delta_imp_amort_var34_1y3',
 'delta_imp_aport_var13_1y3',
 'delta_imp_aport_var17_1y3',
 'delta_imp_aport_var33_1y3',
 'delta_imp_compra_var44_1y3',
 'delta_imp_reemb_var13_1y3',
 'delta_imp_reemb_var17_1y3',
 'delta_imp_reemb_var33_1y3',
 'delta_imp_trasp_var17_in_1y3',
 'delta_imp_trasp_var17_out_1y3',
 'delta_imp_trasp_var33_in_1y3',
 'delta_imp_trasp_var33_out_1y3',
 'delta_imp_venta_var44_1y3',
 'delta_num_aport_var13_1y3',
 'delta_num_aport_var17_1y3',
 'delta_num_aport_var33_1y3',
 'delta_num_compra_var44_1y3',
 'delta_num_reemb_var13_1y3',
 'delta_num_reemb_var17_1y3',
 'delta_num_reemb_var33_1y3',
 'delta_num_trasp_var17_in_1y3',
 'delta_num_trasp_var17_out_1y3',
 'delta_num_trasp_var33_in_1y3',
 'delta_num_trasp_var33_out_1y3',
 'delta_num_venta_var44_1y3']

In [33]:
x = 21
c = ratio_cols[x]
df_train[c].value_counts()

 0             76014
 9999999999        4
-1                 2
Name: delta_num_trasp_var17_in_1y3, dtype: int64

In [34]:
df_test[c].value_counts()

 0             75804
 9999999999        8
-1                 6
Name: delta_num_trasp_var17_in_1y3, dtype: int64

In [35]:
df_train[df_train[c] != 9999999999][c].describe()

count    76016.000000
mean        -0.000026
std          0.005129
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.000000
Name: delta_num_trasp_var17_in_1y3, dtype: float64

In [36]:
df_test[df_test[c] != 9999999999][c].describe()

count    75810.000000
mean        -0.000079
std          0.008896
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.000000
Name: delta_num_trasp_var17_in_1y3, dtype: float64

In [66]:
df.shape, df_test.shape

((76020, 372), (75818, 370))

In [68]:
df.shape, df_test.shape

((76020, 373), (75818, 371))

In [None]:
(a - b) / b

In [None]:
a b c
0 0 0
0 1 1
1 0 
1 1 0

In [37]:
len(ratio_cols)

26

In [78]:
# Check performance of ratio columns w/o any modifications
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                         santander.FILE_TEST)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
Stopping. Best iteration: 115



Model Report
best n_estimators: 116
AUC Score (Train): 0.863213
AUC Score (Test) : 0.833875


In [80]:
# change 999999 to very negative numbers, i.e. -10

# df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                        santander.FILE_TEST)

replace_val = -10
for c in ratio_cols:
    if X_train[c].describe()['max'] == 9999999999:
        X_train[c] = X_train[c].replace(9999999999, replace_val)
    if X_test[c].describe()['max'] == 9999999999:
        X_test[c] = X_test[c].replace(9999999999, replace_val)
        
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 116
AUC Score (Train): 0.863213
AUC Score (Test) : 0.833875


In [81]:
# change 999999 to very positive numbers, i.e. +10

# df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                        santander.FILE_TEST)


replace_val = +10
for c in ratio_cols:
    if X_train[c].describe()['max'] == 9999999999:
        X_train[c] = X_train[c].replace(9999999999, replace_val)
    if X_test[c].describe()['max'] == 9999999999:
        X_test[c] = X_test[c].replace(9999999999, replace_val)
        
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 116
AUC Score (Train): 0.863213
AUC Score (Test) : 0.833875


In [82]:
# change 999999 to 1

# df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                        santander.FILE_TEST)


replace_val = 1
for c in ratio_cols:
    if X_train[c].describe()['max'] == 9999999999:
        X_train[c] = X_train[c].replace(9999999999, replace_val)
    if X_test[c].describe()['max'] == 9999999999:
        X_test[c] = X_test[c].replace(9999999999, replace_val)
        
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 116
AUC Score (Train): 0.863213
AUC Score (Test) : 0.833875


## Remove duplicates and constant columns

In [3]:
if not os.path.exists(santander.FILE_TRAIN_DEDUP):
    santander.read_process_write(santander.FILE_TRAIN,
                                 santander.FILE_TEST,
                                 santander.FILE_TRAIN_DEDUP,
                                 santander.FILE_TEST_DEDUP,
                                 santander.remove_duplicates_const)

## One-hot encode binary features - cols that start with 'ind_'

In [20]:
if not os.path.exists(santander.FILE_TRAIN_DEDUP_ONEHOT):
    santander.read_process_write(santander.FILE_TRAIN_DEDUP,
                                 santander.FILE_TEST_DEDUP,
                                 santander.FILE_TRAIN_DEDUP_ONEHOT,
                                 santander.FILE_TEST_DEDUP_ONEHOT,
                                 santander.one_hot_encode_binary_features)

## Process known NaNs

https://www.kaggle.com/c/santander-customer-satisfaction/forums/t/19291/data-dictionary/111360#post111360

In [5]:
# def process_known_nans_bak():
#     df = pd.read_csv(FILE_TRAIN_DEDUP_ONEHOT)
#     feature_cols = list(df.columns)
#     feature_cols.remove(TARGET_COL)
#     df_test = pd.read_csv(FILE_TEST_DEDUP_ONEHOT, index_col='ID')
    
#     # Var3
#     df['var3'] = df.var3.replace(-999999, np.nan)
#     df_test['var3'] = df_test.var3.replace(-999999, np.nan)
    
#     # Find integer features with null values
#     for c in feature_cols:
#         if df[c].describe()['max'] == 9999999999:
#             df[c] = df[c].replace(9999999999, np.nan)
#             df_test[c] = df_test[c].replace(9999999999, np.nan)
    
#     # Remove constant columns
#     df.drop(eda.find_const_cols(df), axis=1, inplace=True)

#     # Remove duplicate columns and then rows again
#     df = munge.remove_duplicates(df.T).T.drop_duplicates()
    
#     # Write to file
#     df.to_csv(FILE_TRAIN_DEDUP_ONEHOT_NA, index=False)
#     feature_cols = list(df.columns)
#     feature_cols.remove(TARGET_COL)
#     df_test[feature_cols].to_csv(FILE_TEST_DEDUP_ONEHOT_NA)
    
    
# if not os.path.exists(FILE_TRAIN_DEDUP_ONEHOT_NA):
#     process_known_nans()

## Fill in null values

In [25]:
if not os.path.exists(santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEAN):
    santander.read_process_write(santander.FILE_TRAIN_DEDUP_ONEHOT_NA,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA,
                                 santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEAN,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_MEAN,
                                 santander.impute_null_vals,
                                 pass_features=True,
                                 process_kwargs={'strategy': 'mean'})
    
if not os.path.exists(santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEDIAN):
    santander.read_process_write(santander.FILE_TRAIN_DEDUP_ONEHOT_NA,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA,
                                 santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_MEDIAN,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_MEDIAN,
                                 santander.impute_null_vals,
                                 pass_features=True,
                                 process_kwargs={'strategy': 'median'})
    
if not os.path.exists(santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_FREQ):
    santander.read_process_write(santander.FILE_TRAIN_DEDUP_ONEHOT_NA,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA,
                                 santander.FILE_TRAIN_DEDUP_ONEHOT_NA_IMPUTE_FREQ,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA_IMPUTE_FREQ,
                                 santander.impute_null_vals,
                                 pass_features=True,
                                 process_kwargs={'strategy': 'most_frequent'})

## Turn some of the integer columns to categorical features

In [38]:
if not os.path.exists(santander.FILE_TRAIN_DEDUP_ONEHOT_NA_ONEHOTINT):
    santander.read_process_write(santander.FILE_TRAIN_DEDUP_ONEHOT_NA,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA,
                                 santander.FILE_TRAIN_DEDUP_ONEHOT_NA_ONEHOTINT,
                                 santander.FILE_TEST_DEDUP_ONEHOT_NA_ONEHOTINT,
                                 santander.one_hot_int,
                                 pass_features=True)

## Check step by step processing

In [4]:
# Read data from file
df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 115



Model Report
best n_estimators: 116
AUC Score (Train): 0.863213
AUC Score (Test) : 0.833875


In [5]:
# Dedup and const
df_train, df_test = santander.remove_duplicates_const(df_train, df_test)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]



Model Report
best n_estimators: 116
AUC Score (Train): 0.860659
AUC Score (Test) : 0.841680


In [6]:
# Set var 3 null values
df_train, df_test = santander.set_var3_null(df_train, df_test)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]



Model Report
best n_estimators: 116
AUC Score (Train): 0.861306
AUC Score (Test) : 0.842895


In [7]:
# Fix 99999999 values in delta columns
df_train, df_test = santander.fix_delta_cols(df_train, df_test, replace_with=1)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]



Model Report
best n_estimators: 116
AUC Score (Train): 0.861306
AUC Score (Test) : 0.842895


In [4]:
# # Fix 99999999 values in delta columns

# df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)
# df_train, df_test = santander.remove_duplicates_const(df_train, df_test)
# df_train, df_test = santander.set_var3_null(df_train, df_test)
# df_train, df_test = santander.fix_delta_cols(df_train, df_test, replace_with=2)

# feature_cols = list(df_train.columns)
# feature_cols.remove(santander.TARGET_COL)

# # Split up the data
# X_all = df_train[feature_cols]  # feature values for all students
# y_all = df_train[santander.TARGET_COL]
# test_size = 0.3 # 30 percent
# X_train, X_test, y_train, y_test = santander.train_test_split(
#     X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

# cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 107



Model Report
best n_estimators: 108
AUC Score (Train): 0.860285
AUC Score (Test) : 0.842551


In [8]:
# one hot encode binary
df_train, df_test = santander.one_hot_encode_binary_features(df_train, df_test)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]



Model Report
best n_estimators: 116
AUC Score (Train): 0.861306
AUC Score (Test) : 0.842895


In [9]:
# Save data to file
santander.write_data(df_train,
                     df_test, 
                     santander.FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT,
                     santander.FILE_TEST_DEDUP_VAR3_DELTA_1HOT)

In [8]:
# Load data from file
df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT,
                                                      santander.FILE_TEST_DEDUP_VAR3_DELTA1_1HOT)

In [None]:
# One hot encode int
df_train, df_test = santander.one_hot_int(df_train, df_test, feature_cols)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [50]:
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 100 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]



Model Report
best n_estimators: 116
AUC Score (Train): 0.861654
AUC Score (Test) : 0.842732


In [51]:
# Save data to file
santander.write_data(df_train,
                     df_test, 
                     santander.FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT_1HOTINT,
                     santander.FILE_TEST_DEDUP_VAR3_DELTA1_1HOT_1HOTINT)

In [57]:
df_train[santander.TARGET_COL].value_counts()

0.0    68398
1.0     2815
Name: TARGET, dtype: int64

In [5]:
%%time

# With null values

# Read from file
df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN, santander.FILE_TEST)

# Dedup and const
df_train, df_test = santander.remove_duplicates_const(df_train, df_test)

# Set var 3 null values
df_train, df_test = santander.set_var3_null(df_train, df_test)

# Fix 99999999 values in delta columns
df_train, df_test = santander.fix_delta_cols(df_train, df_test, replace_with=np.nan)

# One hot encode binary
df_train, df_test = santander.one_hot_encode_binary_features(df_train, df_test)

# Save data to file
santander.write_data(df_train,
                     df_test, 
                     santander.FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT,
                     santander.FILE_TEST_DEDUP_VAR3_DELTANAN_1HOT)

CPU times: user 11min 38s, sys: 32.6 s, total: 12min 11s
Wall time: 12min 11s


In [6]:
# Load data from file
df_train, df_test, feature_cols = santander.read_data(santander.FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT,
                                                      santander.FILE_TEST_DEDUP_VAR3_DELTANAN_1HOT)

In [7]:
# One hot encode int
df_train, df_test = santander.one_hot_int(df_train, df_test, feature_cols, delta_nulltype=np.nan)

feature_cols = list(df_train.columns)
feature_cols.remove(santander.TARGET_COL)

# Split up the data
X_all = df_train[feature_cols]  # feature values for all students
y_all = df_train[santander.TARGET_COL]
test_size = 0.3 # 30 percent
X_train, X_test, y_train, y_test = santander.train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [8]:
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 108



Model Report
best n_estimators: 109
AUC Score (Train): 0.860122
AUC Score (Test) : 0.842721


In [9]:
# Save data to file
santander.write_data(df_train,
                     df_test, 
                     santander.FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT_1HOTINT,
                     santander.FILE_TEST_DEDUP_VAR3_DELTANAN_1HOT_1HOTINT)

In [162]:
saldo_cols = []
for c in df:
    if c.find('saldo') > -1:
        saldo_cols.append(c)

In [164]:
for c in df:
    if c.find('var13') > -1:
        print(c)

num_var13_0
num_var13_largo_0
num_var13_largo
num_var13_medio_0
num_var13
saldo_var13_corto
saldo_var13_largo
saldo_var13_medio
saldo_var13
delta_imp_aport_var13_1y3
delta_num_aport_var13_1y3
imp_aport_var13_hace3
imp_aport_var13_ult1
imp_reemb_var13_ult1
num_aport_var13_hace3
num_aport_var13_ult1
saldo_medio_var13_corto_hace2
saldo_medio_var13_corto_hace3
saldo_medio_var13_corto_ult1
saldo_medio_var13_corto_ult3
saldo_medio_var13_largo_hace2
saldo_medio_var13_largo_hace3
saldo_medio_var13_largo_ult1
saldo_medio_var13_largo_ult3
saldo_medio_var13_medio_hace2
saldo_medio_var13_medio_ult3
onehot_ind_var13_0_0
onehot_ind_var13_0_1
onehot_ind_var13_corto_0_0
onehot_ind_var13_corto_0_1
onehot_ind_var13_corto_0
onehot_ind_var13_corto_1
onehot_ind_var13_largo_0_0
onehot_ind_var13_largo_0_1
onehot_ind_var13_largo_0
onehot_ind_var13_largo_1
onehot_ind_var13_medio_0_0
onehot_ind_var13_medio_0_1
onehot_ind_var13_0
onehot_ind_var13_1
onehot_num_reemb_var13_ult1_0
onehot_num_reemb_var13_ult1_3
oneh

In [163]:
saldo_cols

['saldo_var1',
 'saldo_var5',
 'saldo_var6',
 'saldo_var8',
 'saldo_var12',
 'saldo_var13_corto',
 'saldo_var13_largo',
 'saldo_var13_medio',
 'saldo_var13',
 'saldo_var14',
 'saldo_var17',
 'saldo_var18',
 'saldo_var20',
 'saldo_var24',
 'saldo_var26',
 'saldo_var25',
 'saldo_var30',
 'saldo_var31',
 'saldo_var32',
 'saldo_var33',
 'saldo_var34',
 'saldo_var37',
 'saldo_var40',
 'saldo_var42',
 'saldo_var44',
 'saldo_medio_var5_hace2',
 'saldo_medio_var5_hace3',
 'saldo_medio_var5_ult1',
 'saldo_medio_var5_ult3',
 'saldo_medio_var8_hace2',
 'saldo_medio_var8_hace3',
 'saldo_medio_var8_ult1',
 'saldo_medio_var8_ult3',
 'saldo_medio_var12_hace2',
 'saldo_medio_var12_hace3',
 'saldo_medio_var12_ult1',
 'saldo_medio_var12_ult3',
 'saldo_medio_var13_corto_hace2',
 'saldo_medio_var13_corto_hace3',
 'saldo_medio_var13_corto_ult1',
 'saldo_medio_var13_corto_ult3',
 'saldo_medio_var13_largo_hace2',
 'saldo_medio_var13_largo_hace3',
 'saldo_medio_var13_largo_ult1',
 'saldo_medio_var13_largo_ult

In [169]:
np.array_equal(df[['saldo_var13_corto', 'saldo_var13_largo', 'saldo_var13_medio']].sum(axis=1).values, df['saldo_var13'].values)

True

In [170]:
df[['saldo_var13_corto', 'saldo_var13_largo', 'saldo_var13_medio']].head()

Unnamed: 0,saldo_var13_corto,saldo_var13_largo,saldo_var13_medio
0,0.0,0.0,0.0
1,300.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
