
## <a id="Supervised-Learning-Model">Part 2: Supervised Learning Model<br></a>

In [1]:
import numpy as np
import pandas as pd
import pickle
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib

import lightgbm as lgb

import dsp

  import pandas.util.testing as tm


In [2]:
mailout_train = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load feature info
feat_info = pd.read_csv('data/features.csv')
feat_info.set_index('attribute', inplace=True)

feat_info['missing_or_unknown'] = feat_info['missing_or_unknown'].apply(dsp.parse_missing)

In [4]:
mailout_train_clean = dsp.clean_data_mailout(mailout_train, feat_info)

In [5]:
# mailout_train_clean = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN_clean.csv', sep=';')
mailout_train_clean.shape

(42962, 384)

In [6]:
mailout_train.shape

(42962, 368)

In [8]:
lnr = mailout_train_clean.LNR
y = mailout_train_clean.RESPONSE
X = mailout_train_clean.drop(['RESPONSE', 'LNR'], axis=1)
X.shape

(42962, 382)

In [9]:
# Load feature info
feat_info = pd.read_csv('data/features.csv')
feat_info.set_index('attribute', inplace=True)

In [13]:
# Make the preprocessor
numerical, categorical = dsp.get_columns(X.columns, feat_info)
preprocessor = dsp.make_preprocessor(numerical, categorical)

X_processed = preprocessor.fit_transform(X)

In [14]:
numerical

['KBA13_ANZAHL_PKW',
 'ANZ_HH_TITEL',
 'ANZ_STATISTISCHE_HAUSHALTE',
 'ANZ_PERSONEN',
 'MIN_GEBAEUDEJAHR',
 'ANZ_KINDER',
 'ANZ_TITEL',
 'ANZ_HAUSHALTE_AKTIV']

In [15]:
categorical

['KBA13_CCM_1200',
 'BALLRAUM',
 'D19_TELKO_REST',
 'KONSUMNAEHE',
 'KBA05_HERST5',
 'KBA13_HALTER_66',
 'KBA13_KMH_140',
 'D19_BANKEN_GROSS',
 'ORTSGR_KLS9',
 'D19_LETZTER_KAUF_BRANCHE_D19_BILDUNG',
 'KBA13_HERST_ASIEN',
 'CAMEO_DEUG_2015',
 'KBA13_SEG_OBEREMITTELKLASSE',
 'D19_KONSUMTYP_MAX',
 'SEMIO_ERL',
 'KBA13_SEG_KLEINWAGEN',
 'PLZ8_GBZ',
 'KBA13_MOTOR',
 'SEMIO_DOM',
 'GEBAEUDETYP_RASTER',
 'KBA13_ANTG3',
 'VK_DISTANZ',
 'FIRMENDICHTE',
 'KBA13_KRSHERST_BMW_BENZ',
 'KBA05_ANHANG',
 'CJT_GESAMTTYP',
 'KBA05_KRSHERST1',
 'ARBEIT',
 'D19_BIO_OEKO',
 'KBA05_MAXAH',
 'KBA13_ANTG1',
 'KBA13_SEG_GROSSRAUMVANS',
 'KBA13_FAB_ASIEN',
 'D19_LETZTER_KAUF_BRANCHE_D19_BANKEN_REST',
 'UMFELD_ALT',
 'KBA05_SEG5',
 'KBA13_HALTER_30',
 'KBA05_KRSHERST3',
 'FINANZTYP',
 'KBA13_HALTER_65',
 'D19_SOZIALES',
 'KBA13_SEG_KOMPAKTKLASSE',
 'SEMIO_TRADV',
 'KBA13_SITZE_5',
 'SEMIO_KAEM',
 'D19_LETZTER_KAUF_BRANCHE_D19_TELKO_REST',
 'KBA13_SEG_SONSTIGE',
 'KBA05_ZUL3',
 'KBA13_KRSZUL_NEU',
 'KBA13_PEUGEO

In [16]:
processed_cols = numerical+categorical
processed_cols

['KBA13_ANZAHL_PKW',
 'ANZ_HH_TITEL',
 'ANZ_STATISTISCHE_HAUSHALTE',
 'ANZ_PERSONEN',
 'MIN_GEBAEUDEJAHR',
 'ANZ_KINDER',
 'ANZ_TITEL',
 'ANZ_HAUSHALTE_AKTIV',
 'KBA13_CCM_1200',
 'BALLRAUM',
 'D19_TELKO_REST',
 'KONSUMNAEHE',
 'KBA05_HERST5',
 'KBA13_HALTER_66',
 'KBA13_KMH_140',
 'D19_BANKEN_GROSS',
 'ORTSGR_KLS9',
 'D19_LETZTER_KAUF_BRANCHE_D19_BILDUNG',
 'KBA13_HERST_ASIEN',
 'CAMEO_DEUG_2015',
 'KBA13_SEG_OBEREMITTELKLASSE',
 'D19_KONSUMTYP_MAX',
 'SEMIO_ERL',
 'KBA13_SEG_KLEINWAGEN',
 'PLZ8_GBZ',
 'KBA13_MOTOR',
 'SEMIO_DOM',
 'GEBAEUDETYP_RASTER',
 'KBA13_ANTG3',
 'VK_DISTANZ',
 'FIRMENDICHTE',
 'KBA13_KRSHERST_BMW_BENZ',
 'KBA05_ANHANG',
 'CJT_GESAMTTYP',
 'KBA05_KRSHERST1',
 'ARBEIT',
 'D19_BIO_OEKO',
 'KBA05_MAXAH',
 'KBA13_ANTG1',
 'KBA13_SEG_GROSSRAUMVANS',
 'KBA13_FAB_ASIEN',
 'D19_LETZTER_KAUF_BRANCHE_D19_BANKEN_REST',
 'UMFELD_ALT',
 'KBA05_SEG5',
 'KBA13_HALTER_30',
 'KBA05_KRSHERST3',
 'FINANZTYP',
 'KBA13_HALTER_65',
 'D19_SOZIALES',
 'KBA13_SEG_KOMPAKTKLASSE',
 'SEMI

## Model 1: Gradient Boost

In [17]:
clf = GradientBoostingClassifier()
clf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [18]:
param_grid = {'learning_rate': [.001],
              'max_depth': [5],
              'random_state': [42]}

start_time = time.time()

grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs = 3)
grid.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))

Elapsed computation time: 9.132 mins


In [19]:
print(grid.best_score_)
print(grid.best_estimator_)

0.7141489049167998
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.001, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


0.7652356030850433<br>
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.001, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [20]:
pickle.dump(grid.best_estimator_, open('models/gb_model4.pkl', 'wb'))

## Model 2: AdaBoost

In [21]:
clf = AdaBoostClassifier(DecisionTreeClassifier())
clf.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator__ccp_alpha': 0.0,
 'base_estimator__class_weight': None,
 'base_estimator__criterion': 'gini',
 'base_estimator__max_depth': None,
 'base_estimator__max_features': None,
 'base_estimator__max_leaf_nodes': None,
 'base_estimator__min_impurity_decrease': 0.0,
 'base_estimator__min_impurity_split': None,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_weight_fraction_leaf': 0.0,
 'base_estimator__presort': 'deprecated',
 'base_estimator__random_state': None,
 'base_estimator__splitter': 'best',
 'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
               

In [22]:
param_grid = {'learning_rate': [0.1],
              'n_estimators':[50],
              'random_state': [42],
              'base_estimator__max_depth': [1, 3],
              'base_estimator__max_features': [20, None]}

start_time = time.time()

ada = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs = 3)
ada_result =  ada.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))

Elapsed computation time: 3.159 mins


In [23]:
print(ada.best_score_)
print(ada.best_estimator_)

0.7636689583386754
AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
       

In [24]:
means = ada_result.cv_results_['mean_test_score']
stds = ada_result.cv_results_['std_test_score']
params = ada_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.700693 (0.026409) with: {'base_estimator__max_depth': 1, 'base_estimator__max_features': 20, 'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}
0.763669 (0.022653) with: {'base_estimator__max_depth': 1, 'base_estimator__max_features': None, 'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}
0.651994 (0.038511) with: {'base_estimator__max_depth': 3, 'base_estimator__max_features': 20, 'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}
0.687117 (0.041849) with: {'base_estimator__max_depth': 3, 'base_estimator__max_features': None, 'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}


In [25]:
pickle.dump(ada.best_estimator_, open('models/ada_model4.pkl', 'wb'))

## Model 3: LightGBM

In [26]:
lgbm_clf = lgb.LGBMClassifier(objective='binary', metric='auc')
lgbm_clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'auc'}

In [27]:
param_grid = {'learning_rate': [0.01],
              'num_iterations': [200],
              'boosting_type': ['gbdt','dart'],
              'num_leaves': [62],
              'random_state': [42]}

start_time = time.time()

lgbm = GridSearchCV(estimator=lgbm_clf, param_grid=param_grid, scoring='roc_auc', cv=5)
lgbm_result = lgbm.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))



Elapsed computation time: 0.876 mins


In [28]:
print(lgbm.best_score_)
print(lgbm.best_estimator_)

0.7598203476237045
LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_iterations=200, num_leaves=62, objective='binary',
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [29]:
means = lgbm_result.cv_results_['mean_test_score']
stds = lgbm_result.cv_results_['std_test_score']
params = lgbm_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.752557 (0.027327) with: {'boosting_type': 'gbdt', 'learning_rate': 0.01, 'num_iterations': 200, 'num_leaves': 62, 'random_state': 42}
0.759820 (0.022884) with: {'boosting_type': 'dart', 'learning_rate': 0.01, 'num_iterations': 200, 'num_leaves': 62, 'random_state': 42}


In [30]:
pickle.dump(lgbm.best_estimator_, open('models/lgbm_model4.pkl', 'wb'))

## <a id="Kaggle-Competition">Part 3: Kaggle Competition<br></a>


In [61]:
# Load feature info
feat_info = pd.read_csv('data/features.csv')
feat_info.set_index('attribute', inplace=True)

feat_info['missing_or_unknown'] = feat_info['missing_or_unknown'].apply(dsp.parse_missing)

In [62]:
mailout_test = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [63]:
mailout_test_clean = dsp.clean_data_mailout(mailout_test, feat_info)

In [64]:
mailout_test_clean.to_csv('data/Udacity_MAILOUT_052018_TEST_clean.csv', sep=';')

In [65]:
mailout_test_clean = pd.read_csv('data/Udacity_MAILOUT_052018_TEST_clean.csv', sep=';')
print(mailout_test_clean.shape)
lnr = mailout_test_clean.LNR
mailout_test_clean.drop(['LNR'], axis=1, inplace=True)

(42833, 384)


In [66]:
# Preprocess
test_processed = preprocessor.transform(mailout_test_clean)



In [67]:
# Gradient Boost
gb_model = joblib.load('models/gb_model4.pkl')
preds = gb_model.predict_proba(test_processed)
submission = pd.DataFrame({'LNR':lnr, 'RESPONSE':preds[:,0]})
submission.to_csv('submission/gb_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.985616
1,1770,0.985616
2,1465,0.988564
3,1470,0.988564
4,1478,0.988564


In [68]:
# AdaBoost
ada_model = joblib.load('models/ada_model4.pkl')
submission = make_submission(ada_model, test_processed, lnr)
submission.to_csv('submission/ada_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.676132
1,1770,0.67117
2,1465,0.756437
3,1470,0.755163
4,1478,0.739913


In [69]:
# LightGBM
lgbm_model = joblib.load('models/lgbm_model4.pkl')
submission = make_submission(lgbm_model, test_processed, lnr)
submission.to_csv('submission/lgbm_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.807328
1,1770,0.829305
2,1465,0.848008
3,1470,0.848012
4,1478,0.848018
