In [1]:
from pathlib import Path
import pandas as pd
# import seaborn as sns

# preprocessing / pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# clfs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

# cross_validation
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import RepeatedStratifiedKFold, HalvingGridSearchCV

## IO

In [2]:
DATA_DIR = Path("../data/processed/")
train = pd.read_csv(DATA_DIR / "final_train.csv")
X, y = train.drop(columns='Survived'), train['Survived']

In [3]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## ML pipeline

In [4]:
ct = ColumnTransformer(
    transformers=[("numeric", StandardScaler(), ['Age', 'Fare']),
                 ], remainder='passthrough'
)

model = Pipeline(
    steps=[
        ("ct", ct),
        ("clf", DummyClassifier(strategy='constant', constant=1))
    ]
)

In [5]:
model.fit(X, y)

## Hyperparameter tuning

- just using classifier parameters but all components of a pipeline can be part of this

In [6]:

clfs = [
    {'clf': (LogisticRegression(), ), # convention list of tuples for each
     'clf__C': (1,5,10)},
    {'clf' : (DecisionTreeClassifier(), ),
     'clf__max_depth': (1,3,5)},
    {'clf': (RandomForestClassifier(), ),
     'clf__max_depth': (1,3,5)}
]
     

In [7]:
# cross validation strategy 
cv = RepeatedStratifiedKFold(n_repeats=3, 
                             n_splits=3,
                             random_state=42) # seed required
# Search strategy
grid = HalvingGridSearchCV(estimator=model,
                           param_grid=clfs,
                           cv=cv)

In [8]:
grid.fit(X,y)

## Results 

- eval scores both test and train are available; good for checking model generalisation to unseen data

In [9]:
cv_results = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', 
                                                        ascending=False)

In [10]:
cv_results.head().T

Unnamed: 0,7,3,4,0,1
iter,0,0,0,0,0
n_resources,99,99,99,99,99
mean_fit_time,0.134224,0.004294,0.004706,0.008841,0.008976
std_fit_time,0.009039,0.000419,0.000644,0.001618,0.000911
mean_score_time,0.014074,0.002881,0.002989,0.00351,0.003442
std_score_time,0.000936,0.000368,0.000815,0.000485,0.000483
param_clf,RandomForestClassifier(),DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=3),LogisticRegression(),LogisticRegression()
param_clf__C,,,,1,5
param_clf__max_depth,3,1,3,,
params,"{'clf': RandomForestClassifier(), 'clf__max_de...","{'clf': DecisionTreeClassifier(max_depth=3), '...","{'clf': DecisionTreeClassifier(max_depth=3), '...","{'clf': LogisticRegression(), 'clf__C': 1}","{'clf': LogisticRegression(), 'clf__C': 5}"



## Retrieve params


In [11]:
grid.best_estimator_

In [12]:
grid.best_params_

{'clf': DecisionTreeClassifier(max_depth=3), 'clf__max_depth': 3}

In [17]:
# or if the def of "best" varies you can pull any params from cv_results
best_params = cv_results.query("rank_test_score == 1")['params'].values[0] 
best_params

{'clf': RandomForestClassifier(max_depth=3), 'clf__max_depth': 3}

In [18]:
model.set_params(**best_params)

## Predict unseen data

In [19]:
X_test = pd.read_csv(DATA_DIR / "final_test.csv")
# X, y = train.drop(columns='Survived'), train['Survived']

In [20]:
y_pred = grid.best_estimator_.predict(X_test.drop(columns='PassengerId'))
pd.Series(data= y_pred, 
          index=X_test['PassengerId'])

PassengerId
892     0
893     1
894     0
895     0
896     1
       ..
1305    0
1306    1
1307    0
1308    0
1309    0
Length: 418, dtype: int64