In [1]:
from pathlib import Path

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Seed NumPy's global random generator so NumPy-based randomness below is repeatable.
np.random.seed(0)

In [2]:
# Fetch version 1 of the "titanic" dataset from OpenML, returned as a
# pandas feature frame (X) and a separate target (y).
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)

In [3]:
# Cast the target to integer labels (re-running this cell is harmless).
y = y.astype(int)

In [4]:
# Sanity-check the dimensions of the feature frame and the target.
print(X.shape, y.shape)

(1309, 13) (1309,)


In [5]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Class balance of the target: number of rows per label.
y.value_counts()

0    809
1    500
Name: survived, dtype: int64

In [7]:
# Count missing values per feature column.
X.isna().sum()

pclass          0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

### Features to train on
- age: numerical
- fare: numerical
- embarked (C, S, Q): category
- sex (female, male): category
- pclass (1,2,3): category

In [8]:
# Column-wise preprocessing, split by feature type:
#   numeric     -> fill missing values with the median, then standardize
#   categorical -> one-hot encode, then keep the top 50% of the encoded
#                  columns as ranked by the chi2 score
# The step names ('imputer', 'scaler', 'encoder', 'selector') and the
# transformer names ('num', 'cat') form the parameter paths used when tuning,
# so renaming them requires updating the search grid as well.

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features = ['embarked', 'sex', 'pclass']
# NOTE(review): this branch has no imputer although `embarked` contains
# missing values; the encoder is relied on to cope with them - confirm
# that this is intended.
categorical_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories unseen during fit do not raise at transform time
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('selector', SelectPercentile(score_func=chi2, percentile=50)),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [9]:
# End-to-end estimator: the preprocessing stage followed by a
# logistic-regression classifier with default settings.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [10]:
# Display the assembled pipeline.
clf

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit the untuned pipeline on the training split and score it on the held-out split.
clf.fit(X_train, y_train)
print(f'model score: {clf.score(X_test, y_test):.3f}')

model score: 0.798


## Pipeline with grid search

#### We will run a grid search over the parameters of the ColumnTransformer steps and the hyperparameters of the classifier, both addressed as steps in the pipeline.

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Search space for the grid search. Each key is a double-underscore path that
# walks the nested pipeline: <pipeline step>__<transformer>__<step>__<parameter>.
param_grid = {
    # imputation strategy of the numeric branch
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    # percentile of one-hot encoded categorical columns kept by the selector
    "preprocessor__cat__selector__percentile": [10, 30, 50, 70],
    # values tried for the C parameter of the logistic-regression classifier
    "classifier__C": [0.1, 1.0, 10, 100],
}

In [14]:
# Exhaustive search over every combination in param_grid using 3-fold
# cross-validation; refit=True is requested so the search object can be used
# directly for predict/score after fitting.
search_cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    refit=True,
)
search_cv

In [15]:
# Run the hyperparameter search on the training split only.
search_cv.fit(X_train, y_train)
# The selected hyperparameters are available as search_cv.best_params_;
# the print below is left disabled.
#print(search_cv.best_params_)
# Score the fitted search object on the held-out test split.
print(f'model score: {search_cv.score(X_test, y_test):.3f}')

model score: 0.798


In [16]:
from sklearn.metrics import classification_report

# Predict the held-out rows with the tuned search object. The name `y_` is
# kept because a later cell reuses these predictions.
y_ = search_cv.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       162
           1       0.78      0.65      0.71       100

    accuracy                           0.80       262
   macro avg       0.79      0.77      0.78       262
weighted avg       0.80      0.80      0.79       262



## Saving the tuned model and test data

In [17]:
import joblib

# Persist the fitted search object so it can be reloaded for inference.
# The target directory is created first: on a fresh checkout 'model/' may not
# exist, and open(..., 'wb') would then fail with FileNotFoundError.
model_path = Path('model/classifier.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as file:
    joblib.dump(search_cv, file)

In [18]:
# Export the held-out split next to the saved model so it can be re-evaluated
# later. The output directory is created first: to_csv does not create missing
# parent directories and would fail if 'data/' does not exist.
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)

X_test.to_csv(data_dir / 'x_test_data.csv')
y_test.to_csv(data_dir / 'y_test_data.csv')

In [20]:
from sklearn.metrics import accuracy_score
# Accuracy of the tuned model's predictions (`y_`) on the held-out split.
accuracy_score(y_true=y_test, y_pred=y_)

0.7977099236641222