<a href="https://colab.research.google.com/github/kyalan/CUHK-STAT5106-2020/blob/main/Week_11_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: [sklearn tutorial: Column Transformer with Mixed Types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html)

# Modelling Stage

In [None]:
import numpy as np
import pandas as pd
import os

# Raise pandas' display limits so long / wide frames are shown with less truncation.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Seed NumPy's global random number generator so a fresh top-to-bottom run is repeatable.
np.random.seed(0)

## Import data

In [None]:
from sklearn.datasets import fetch_openml

# Titanic passenger dataset hosted on OpenML: https://www.openml.org/d/40945
# return_X_y=True yields the (features, target) pair directly; as_frame=True asks for
# pandas objects instead of NumPy arrays.
X, y = fetch_openml(name="titanic", version=1, return_X_y=True, as_frame=True)

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called directly -- wrapping it in print() would append a stray "None" line.
X.info()

# Preview the first rows; as the cell's last expression it is rendered as a rich table.
X.head()

In [None]:
# Display the target returned alongside X by fetch_openml
# (a cell's last expression is rendered as its output).
y

## Splitting Train-Dev-Test data

[Reference](https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then 20% of what remains as the dev set
# (roughly 64% / 16% / 20% train / dev / test).
# random_state is passed explicitly so this cell produces the same split every time it
# is run, instead of depending on how much of the global NumPy RNG was consumed before it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Sanity check: the three subsets together account for every row of the dataset.
assert len(X_train) + len(X_dev) + len(X_test) == len(X)
assert len(y_train) + len(y_dev) + len(y_test) == len(y)

print('Train-Dev-Test data has been splited: ')
print(f'Train set: {X_train.shape} , {y_train.shape}')
print(f'Dev set: {X_dev.shape} , {y_dev.shape}')
print(f'Test set: {X_test.shape} , {y_test.shape}')

## Set Regressor and Response Variables

In [None]:
# Columns routed to the numeric preprocessing pipeline.
regressors_num = [
    'age',
    'fare',
]

# Columns routed to the categorical preprocessing pipeline.
regressors_cat = [
    'embarked',
    'sex',
    'pclass',
]

# Name of the response variable.
response = 'survived'

## Preprocessing Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: fill missing values with the median, apply log(1 + x),
# then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('logtransformer', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
]
transformers_num = Pipeline(steps=numeric_steps)

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
transformers_cat = Pipeline(steps=categorical_steps)

# Send each group of columns through its own pipeline; columns not listed in
# either group are not passed to any transformer here.
preprocessor = ColumnTransformer(transformers=[
    ('num', transformers_num, regressors_num),
    ('cat', transformers_cat, regressors_cat),
])

In [None]:
# Display the (not yet fitted) ColumnTransformer assembled above to inspect its structure.
preprocessor

In [None]:
# fit / transform with preprocessor
# The preprocessor is fitted on the training split only; the dev split is then
# transformed with those already-fitted steps.
# NOTE(review): X_train / X_dev are rebound to the transformed output, so re-running
# this cell without first re-running the split cell feeds already-transformed data
# back into the preprocessor -- consider separate names for raw and transformed sets.
X_train = preprocessor.fit_transform(X_train)
X_dev = preprocessor.transform(X_dev)

In [None]:
# Report the feature / target shapes of each transformed split.
print('Train-Dev data has been transformed: ')
for split_label, features, target in (
    ('Train', X_train, y_train),
    ('Dev', X_dev, y_dev),
):
    print(f'{split_label} set: {features.shape} , {target.shape}')

## Modelling

In [None]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the transformed training data
# (fit() returns the estimator itself, so construction and fitting can be chained).
model_logReg = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out dev split.
logreg_accuracy = {
    'train': model_logReg.score(X_train, y_train),
    'dev': model_logReg.score(X_dev, y_dev),
}
for split_name, accuracy in logreg_accuracy.items():
    print(f'Logistic Model Accuracy ({split_name} set) = {accuracy}')

In [None]:
# Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default hyper-parameters on the transformed training data.
# random_state is pinned because the tree's construction is randomised (features are
# permuted at each split, which decides ties between equally good splits); without it
# the fitted tree -- and the accuracies below -- depend on the state of the global
# NumPy RNG at the moment this cell happens to run.
model_dtree = DecisionTreeClassifier(random_state=0)
model_dtree.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out dev split.
print(f'Decision Tree Model Accuracy (train set) = {model_dtree.score(X_train, y_train)}')
print(f'Decision Tree Model Accuracy (dev set) = {model_dtree.score(X_dev, y_dev)}')

The logistic regression model performs better on the dev set, so we keep it as the final model.

# Saving Model

In [None]:
# Keep the fitted logistic regression as the model to be saved and used for prediction.
model = model_logReg

In [None]:
from joblib import dump, load
# Persist the fitted preprocessor and the chosen model so the prediction stage can
# reload them from disk.
for artifact, filename in (
    (preprocessor, 'preprocessor.joblib'),
    (model, 'model.joblib'),
):
    dump(artifact, filename)

# List the working directory to confirm the files were written.
print(os.listdir())

# Predicting Stage

## Load the saved preprocessor and model

In [None]:
# Reload the preprocessor and model that were saved with joblib.dump above.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted
# source -- here they were written by this notebook itself.
preprocessor_test = load('preprocessor.joblib')
model_test = load('model.joblib')

In [None]:
# Apply the reloaded (already fitted) preprocessor to the held-out test split, then
# predict with the reloaded model.
# NOTE(review): X_test is rebound to its transformed form, so this cell cannot be
# re-run on its own without re-creating the raw test split first.
X_test = preprocessor_test.transform(X_test)
y_test_pred = model_test.predict(X_test)