In [1]:
from pathlib import Path
import pandas as pd
# import seaborn as sns

# preprocessing / pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# clfs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

# cross_validation
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import RepeatedStratifiedKFold, HalvingGridSearchCV

## IO

In [2]:
DATA_DIR = Path("../data/processed/")
train = pd.read_csv(DATA_DIR / "final_train.csv")
X, y = train.drop(columns='Survived'), train['Survived']

In [3]:
train['Survived'].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

## ML pipeline

In [4]:
ct = ColumnTransformer(
    transformers=[("numeric", StandardScaler(), ['Age', 'Fare']),
                 ], remainder='passthrough'
)

model = Pipeline(
    steps=[
        ("ct", ct),
        ("clf", DummyClassifier(strategy='constant', constant=1))
    ]
)

In [5]:
model.fit(X, y)

In [6]:
X

Unnamed: 0,Age,Fare,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male,IsMinor
0,22.0,7.2500,0,0,0,1,0,0,1,1,0
1,38.0,71.2833,0,1,0,0,1,0,0,0,0
2,26.0,7.9250,1,0,0,1,0,0,1,0,0
3,35.0,53.1000,0,1,0,0,0,0,1,0,0
4,35.0,8.0500,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,13.0000,1,0,1,0,0,0,1,1,0
887,19.0,30.0000,1,1,0,0,0,0,1,0,0
888,28.0,23.4500,0,0,0,1,0,0,1,0,0
889,26.0,30.0000,1,1,0,0,1,0,0,1,0


## Hyperparameter tuning

- just using classifier parameters but all components of a pipeline can be part of this

In [18]:

clfs = [
    {'clf': (LogisticRegression(), ), # convention list of tuples for each
     'clf__C': (1,5,10),
     'clf__class_weight': ({0:1, 1:5},
                          'balanced',
                          {0:1, 1:50})
    },
    {'clf' : (DecisionTreeClassifier(), ),
     'clf__max_depth': (2,3,5),
     'clf__class_weight': ({0:1, 1:5},
                          'balanced',
                          {0:1, 1:50})},
    {'clf': (RandomForestClassifier(), ),
     'clf__max_depth': (1,3,5),
     'clf__class_weight': ({0:1, 1:5},
                           'balanced',
                          {0:1, 1:50})}
]

In [19]:
# cross validation strategy 
cv = RepeatedStratifiedKFold(n_repeats=3, 
                             n_splits=3,
                             random_state=42) # seed required
# Search strategy
grid = HalvingGridSearchCV(estimator=model,
                           param_grid=clfs,
                           cv=cv,
                           verbose=1)

In [20]:
grid.fit(X,y)

n_iterations: 3
n_required_iterations: 4
n_possible_iterations: 3
min_resources_: 36
max_resources_: 891
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 27
n_resources: 36
Fitting 9 folds for each of 27 candidates, totalling 243 fits
----------
iter: 1
n_candidates: 9
n_resources: 108
Fitting 9 folds for each of 9 candidates, totalling 81 fits
----------
iter: 2
n_candidates: 3
n_resources: 324
Fitting 9 folds for each of 3 candidates, totalling 27 fits


## Results 

- eval scores both test and train are available; good for checking model generalisation to unseen data

In [21]:
cv_results = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', 
                                                        ascending=False)

In [22]:
cv_results.head().T

Unnamed: 0,32,34,33,38,37
iter,1,1,1,2,2
n_resources,108,108,108,324,324
mean_fit_time,0.01226,0.021966,0.025531,0.013107,0.027352
std_fit_time,0.001342,0.001837,0.004334,0.000796,0.003355
mean_score_time,0.006738,0.008968,0.008983,0.008441,0.010494
std_score_time,0.000854,0.001216,0.001114,0.001261,0.004109
param_clf,DecisionTreeClassifier(class_weight='balanced'...,LogisticRegression(),LogisticRegression(),DecisionTreeClassifier(class_weight='balanced'...,LogisticRegression()
param_clf__C,,1,5,,1
param_clf__class_weight,balanced,balanced,balanced,balanced,balanced
param_clf__max_depth,3,,,3,


In [None]:
cv_results['params'].values[0]


## Retrieve params


In [23]:
grid.best_estimator_

In [24]:
grid.best_params_

{'clf': DecisionTreeClassifier(class_weight='balanced', max_depth=3),
 'clf__class_weight': 'balanced',
 'clf__max_depth': 3}

In [25]:
# or if the def of "best" varies you can pull any params from cv_results
best_params = cv_results.query("rank_test_score == 3")['params'].values[0] 
best_params

{'clf': LogisticRegression(), 'clf__C': 5, 'clf__class_weight': 'balanced'}

In [26]:
model.set_params(**best_params)

## Predict unseen data

In [31]:
X_test = pd.read_csv(DATA_DIR / "final_test.csv")
# X, y = train.drop(columns='Survived'), train['Survived']

In [32]:
y_pred = grid.best_estimator_.predict(X_test.drop(columns='PassengerId'))
test_submit = pd.Series(data= y_pred, 
                         index=X_test['PassengerId']).reset_index().rename(columns={0:"Survived"})

In [33]:
test_submit.to_csv(DATA_DIR / "test_submit_fgs.csv", index=False)

In [34]:
! kaggle competitions submit -c titanic -f ../data/processed/test_submit_fgs.csv -m "Message"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 26.5kB/s]
100%|##########| 3.18k/3.18k [00:01<00:00, 2.05kB/s]


In [35]:
! ls

00_DataPrep.ipynb
Analysis.ipynb
Logistic regression.ipynb
Modelling.ipynb
XGBoost example.ipynb
titanic_eda.ipynb
titanic_modeling.ipynb
tools.py
