In [58]:
import pandas as pd
import numpy as np

### GOAL: Create boilerplate code that can easily be placed in to evaluate models

In [59]:
data = pd.read_csv('data/titanic/train.csv')
print(data.shape)
print("Number of Null Values:", np.sum(data.isnull().sum()))
data.head()

(891, 12)
Number of Null Values: 866


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Data Pre-Processing

In [60]:
# Identify columns with missing values
for column in data.columns:
    print("Column {} has {} NaN values".format(column, data[column].isnull().sum()))
    
# Impute accordingly
data.loc[data['Age'].isnull(), 'Age'] = data['Age'].median()
data.loc[data['Embarked'].isnull(), 'Embarked'] = data['Embarked'].mode()

Column PassengerId has 0 NaN values
Column Survived has 0 NaN values
Column Pclass has 0 NaN values
Column Name has 0 NaN values
Column Sex has 0 NaN values
Column Age has 177 NaN values
Column SibSp has 0 NaN values
Column Parch has 0 NaN values
Column Ticket has 0 NaN values
Column Fare has 0 NaN values
Column Cabin has 687 NaN values
Column Embarked has 2 NaN values


In [62]:
# Train-test prep
keeper_columns = ['Age', 'Fare', 'Sex', 'Embarked']
train_x = data[keeper_columns]
train_y = data['Survived']

# Convert to one-hot encoding
train_x = pd.get_dummies(train_x)
train_x.head()

Unnamed: 0,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,1,0,0,1
1,38.0,71.2833,1,0,1,0,0
2,26.0,7.925,1,0,0,0,1
3,35.0,53.1,1,0,0,0,1
4,35.0,8.05,0,1,0,0,1


#### Data Modeling

In a pipeline, scalers are applied before each cross validation so there isn't any information leaking to the test set versus applying the transform for the entire train-set and then doing cross-validation.

In [99]:
from scipy.stats import randint, uniform, expon
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [131]:
# Pipelines
common_steps1 = [('scl', StandardScaler())]
common_steps2 = [('scl', StandardScaler()),
                 ('pca', PCA(n_components=2))]
pipe_lr = Pipeline(common_steps1 + [('clf', LogisticRegression(random_state=42))])
pipe_svm = Pipeline(common_steps1 + [('clf', SVC(random_state=42))])
pipe_rf = Pipeline(common_steps1 + [('clf', RandomForestClassifier(random_state=42))])
pipe_xgb = Pipeline(common_steps1 + [('clf', XGBClassifier(random_state=42))])
pipe_lr_pca = Pipeline(common_steps2 + [('clf', LogisticRegression(random_state=42))])
pipe_svm_pca = Pipeline(common_steps2 + [('clf', SVC(random_state=42))])
pipe_rf_pca = Pipeline(common_steps2 + [('clf', RandomForestClassifier(random_state=42))])
pipe_xgb_pca = Pipeline(common_steps2 + [('clf', XGBClassifier(random_state=42))])

# Hyperparameters
params_lr = {'clf__penalty':['l1', 'l2'],
             'clf__C': [1.0, 0.5, 0.1],
             'clf__solver': ['liblinear']}
params_svm = {'clf__kernel': ['linear', 'rbf'],
              'clf__C':[0.001, 0.1, 10], 
              'clf__gamma':[0.1, 0.01]}
params_rf = {'clf__criterion': ['gini', 'entropy'],
             'clf__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             'clf__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             'clf__min_samples_split': [3, 5, 7, 9]}
params_xgb = {'clf__n_estimators': [10, 50, 100],
              'clf__max_depth': [1, 2, 3, 4, 5],
              'clf__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]}

pipe_dict_gs = {'LR': {'pipe': pipe_lr,
                    'params': params_lr},
                'LR_PCA': {'pipe': pipe_lr_pca,
                        'params': params_lr},
             'SVM': {'pipe': pipe_svm,
                     'params': params_svm},
             'SVM_PCA': {'pipe': pipe_svm_pca,
                         'params': params_svm},
             'RF': {'pipe': pipe_dt,
                    'params': params_dt},
             'RF_PCA': {'pipe': pipe_dt_pca,
                        'params': params_dt},
             'XGB': {'pipe': pipe_xgb,
                     'params': params_xgb},
             'XGB_PCA': {'pipe': pipe_xgb_pca,
                         'params': params_xgb},}

# Hyperparameters
params_lr = {'clf__penalty':['l1', 'l2'],
             'clf__C': uniform(0, 1),
             'clf__solver': ['liblinear']}
params_svm = {'clf__kernel': ['linear', 'rbf'],
              'clf__C': expon(0, 5), 
              'clf__gamma': uniform(0, 0.1)}
params_rf = {'clf__criterion': ['gini', 'entropy'],
             'clf__min_samples_leaf': randint(3, 10),
             'clf__max_depth': randint(3, 10),
             'clf__min_samples_split': randint(3, 10)}
params_xgb = {'clf__n_estimators': randint(10, 100),
              'clf__max_depth': randint(1, 5),
              'clf__learning_rate': expon(0, 0.5)}

pipe_dict_rs = {'LR': {'pipe': pipe_lr,
                    'params': params_lr},
                'LR_PCA': {'pipe': pipe_lr_pca,
                        'params': params_lr},
             'SVM': {'pipe': pipe_svm,
                     'params': params_svm},
             'SVM_PCA': {'pipe': pipe_svm_pca,
                         'params': params_svm},
             'RF': {'pipe': pipe_dt,
                    'params': params_dt},
             'RF_PCA': {'pipe': pipe_dt_pca,
                        'params': params_dt},
             'XGB': {'pipe': pipe_xgb,
                     'params': params_xgb},
             'XGB_PCA': {'pipe': pipe_xgb_pca,
                         'params': params_xgb},}

In [141]:
%%time
# Grid Search Cross-Validation
print('--Grid Search--')
gs_df = pd.DataFrame()
for key in pipe_dict_gs.keys():
    print("Processing:", key)
    grid = GridSearchCV(pipe_dict_gs[key]['pipe'],
                        param_grid=pipe_dict_gs[key]['params'],
                        cv=5,
                        n_jobs=-1)
    # Perform CV
    grid.fit(train_x, train_y)
    print('Best Parameters:', grid.best_params_)
    print('Best Score:', grid.best_score_)
    print('-----')
    # Collect results
    results = pd.DataFrame(grid.cv_results_)
    results['algo'] = key
    gs_df = gs_df.append(results, ignore_index=True, sort=False)
    
# Random Search Cross-Validation 
print('--Random Search--')
rs_df = pd.DataFrame()
for key in pipe_dict_rs.keys():
    print("Processing:", key)
    rs = RandomizedSearchCV(pipe_dict_rs[key]['pipe'],
                            param_distributions=pipe_dict_rs[key]['params'],
                            n_iter=10,
                            cv=5,
                            n_jobs=-1)
    # Perform CV
    rs.fit(train_x, train_y)
    print('Best Parameters:', rs.best_params_)
    print('Best Score:', rs.best_score_)
    print('-----')
    # Collect results
    results = pd.DataFrame(rs.cv_results_)
    results['algo'] = key
    rs_df = rs_df.append(results, ignore_index=True, sort=False)

--Grid Search--
Processing: LR
Best Parameters: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best Score: 0.7811447811447811
-----
Processing: LR_PCA
Best Parameters: {'clf__C': 1.0, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best Score: 0.7878787878787878
-----
Processing: SVM
Best Parameters: {'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
Best Score: 0.792368125701459
-----
Processing: SVM_PCA
Best Parameters: {'clf__C': 0.1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
Best Score: 0.7890011223344556
-----
Processing: RF
Best Parameters: {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 9}
Best Score: 0.8002244668911336
-----
Processing: RF_PCA
Best Parameters: {'clf__criterion': 'entropy', 'clf__max_depth': 3, 'clf__min_samples_leaf': 10, 'clf__min_samples_split': 3}
Best Score: 0.7968574635241302
-----
Processing: XGB
Best Parameters: {'clf__learning_rate': 0.2, 'clf__max_depth': 4, 'clf__n_estimato

In [142]:
# Sort by top performing algorithm
gs_df.sort_values(by=['mean_test_score'], ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__penalty,param_clf__solver,params,split0_test_score,split1_test_score,...,rank_test_score,algo,param_clf__gamma,param_clf__kernel,param_clf__criterion,param_clf__max_depth,param_clf__min_samples_leaf,param_clf__min_samples_split,param_clf__learning_rate,param_clf__n_estimators
1707,0.109202,0.003278,0.003456,0.000443,,,,"{'clf__learning_rate': 0.2, 'clf__max_depth': ...",0.787709,0.815642,...,1,XGB,,,,4,,,0.2,100
1722,0.08729,0.019579,0.002786,0.000444,,,,"{'clf__learning_rate': 0.3, 'clf__max_depth': ...",0.782123,0.815642,...,2,XGB,,,,4,,,0.3,100
1719,0.085444,0.009064,0.002759,0.000319,,,,"{'clf__learning_rate': 0.3, 'clf__max_depth': ...",0.793296,0.826816,...,3,XGB,,,,3,,,0.3,100
1710,0.125366,0.00658,0.00336,0.0001,,,,"{'clf__learning_rate': 0.2, 'clf__max_depth': ...",0.776536,0.815642,...,4,XGB,,,,5,,,0.2,100
1709,0.061872,0.004582,0.002683,0.000276,,,,"{'clf__learning_rate': 0.2, 'clf__max_depth': ...",0.776536,0.804469,...,4,XGB,,,,5,,,0.2,50
1724,0.045118,0.005421,0.002025,0.000212,,,,"{'clf__learning_rate': 0.3, 'clf__max_depth': ...",0.77095,0.821229,...,4,XGB,,,,5,,,0.3,50
1721,0.04674,0.006451,0.002389,0.000275,,,,"{'clf__learning_rate': 0.3, 'clf__max_depth': ...",0.776536,0.815642,...,7,XGB,,,,4,,,0.3,50
1695,0.127023,0.005423,0.00338,5.8e-05,,,,"{'clf__learning_rate': 0.1, 'clf__max_depth': ...",0.776536,0.804469,...,8,XGB,,,,5,,,0.1,100
1680,0.133507,0.008141,0.002937,6.6e-05,,,,"{'clf__learning_rate': 0.01, 'clf__max_depth':...",0.804469,0.804469,...,9,XGB,,,,5,,,0.01,100
1783,0.0163,0.000387,0.002442,0.000119,,,,"{'clf__learning_rate': 0.1, 'clf__max_depth': ...",0.793296,0.821229,...,1,XGB_PCA,,,,5,,,0.1,10


In [143]:
# Sort by top performing algorithm
rs_df.sort_values(by=['mean_test_score'], ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__penalty,param_clf__solver,params,split0_test_score,split1_test_score,...,rank_test_score,algo,param_clf__gamma,param_clf__kernel,param_clf__min_samples_split,param_clf__min_samples_leaf,param_clf__max_depth,param_clf__criterion,param_clf__learning_rate,param_clf__n_estimators
68,0.02761,0.005625,0.001921,0.000362,,,,"{'clf__learning_rate': 0.48337032988825857, 'c...",0.804469,0.821229,...,1,XGB,,,,,3.0,,0.48337,41.0
63,0.054555,0.012431,0.002147,0.00031,,,,"{'clf__learning_rate': 0.5267185581206723, 'cl...",0.75419,0.821229,...,2,XGB,,,,,4.0,,0.526719,69.0
72,0.017173,0.002339,0.002664,0.000463,,,,"{'clf__learning_rate': 0.17698738194518285, 'c...",0.810056,0.821229,...,1,XGB_PCA,,,,,2.0,,0.176987,24.0
69,0.041368,0.000898,0.001821,8.5e-05,,,,"{'clf__learning_rate': 0.14560913828943942, 'c...",0.782123,0.804469,...,3,XGB,,,,,4.0,,0.145609,65.0
27,0.029949,0.009068,0.002853,0.000671,18.8333,,,"{'clf__C': 18.833334168287344, 'clf__gamma': 0...",0.804469,0.826816,...,1,SVM,0.0972203,rbf,,,,,,
66,0.02848,0.000816,0.00169,9.9e-05,,,,"{'clf__learning_rate': 0.3703016896980694, 'cl...",0.798883,0.815642,...,4,XGB,,,,,2.0,,0.370302,67.0
51,0.006571,0.001013,0.001862,0.000436,,,,"{'clf__min_samples_split': 9, 'clf__min_sample...",0.810056,0.815642,...,1,RF_PCA,,,9.0,8.0,4.0,entropy,,
47,0.003565,0.000274,0.001126,0.000131,,,,"{'clf__min_samples_split': 5, 'clf__min_sample...",0.804469,0.804469,...,1,RF,,,5.0,4.0,4.0,gini,,
60,0.019215,0.001757,0.00237,4.4e-05,,,,"{'clf__learning_rate': 0.800800813876257, 'clf...",0.782123,0.810056,...,5,XGB,,,,,3.0,,0.800801,20.0
46,0.003658,0.000447,0.001144,0.000228,,,,"{'clf__min_samples_split': 7, 'clf__min_sample...",0.821229,0.798883,...,2,RF,,,7.0,2.0,5.0,entropy,,


In [57]:
# Save best grid search pipeline to file
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

'parameters'

### TPOT - AutoML
- Optimization with genetic programming
- Performs feature preprocessing, feature selection, feature construction, model selection, and HPO

In [144]:
%%time
from tpot import TPOTClassifier

# Construct and fit TPOT classifier
tpot = TPOTClassifier(generations=10, verbosity=2)
tpot.fit(train_x, train_y)

# Results
print('Best pipeline test accuracy: %.3f' % tpot.score(train_x, train_y))

# Save best pipeline as Python script file
tpot.export('tpot_titanic_pipeline.py')



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=1100, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: 0.8091872187509198
Generation 2 - Current best internal CV score: 0.8091872187509198
Generation 3 - Current best internal CV score: 0.8091872187509198
Generation 4 - Current best internal CV score: 0.8148178213460804
Generation 5 - Current best internal CV score: 0.8148178213460804
Generation 6 - Current best internal CV score: 0.8148178213460804
Generation 7 - Current best internal CV score: 0.8148178213460804
Generation 8 - Current best internal CV score: 0.817071218499837
Generation 9 - Current best internal CV score: 0.817071218499837
Generation 10 - Current best internal CV score: 0.817071218499837

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=8, min_samples_split=4, n_estimators=100, subsample=0.9500000000000001)
Best pipeline test accuracy: 0.907
CPU times: user 16min 19s, sys: 4.86 s, total: 16min 24s
Wall time: 13min 31s
