<a href="https://colab.research.google.com/github/kessingtonosazee/GCP_Project_1/blob/master/mlc_2324_w10_lec_pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLC 23/24 Week 10: `sklearn` Pipelines



## Importing and Configuring

In [None]:
import sklearn
# Make transformers return pandas DataFrames instead of NumPy arrays; later
# cells rely on this when they call `.head()` on the result of `.transform()`.
sklearn.set_config(transform_output="pandas")

In [None]:
!pip install category_encoders -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(
    { "figure.figsize": (6, 4) },
    style='ticks',
    color_codes=True,
    font_scale=0.8
)
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, ParameterGrid

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

## Dataset

In [None]:
# Load the Titanic passenger table directly from the course repository.
tc = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/gerberl/6G7V0015-2324/main/datasets/titanic.csv',
)
# Peek at the first record to check the columns that were read.
tc.head(n=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


## Feature Selection and Train/Test Split

In [None]:
# Column to predict.
target = 'Survived'

# Features treated as categories (one-hot encoded further down).
cat_feat = [
    'Pclass',
    'Sex',
    'Embarked',
]

# Features treated as numbers (scaled further down).
num_feat = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [None]:
# Feature matrix: the categorical columns first, followed by the numeric ones.
X = tc[[*cat_feat, *num_feat]]
# Target vector: the survival label.
y = tc[target]

In [None]:
# Show the first row to confirm that only the selected feature columns remain.
X.head(1)

Unnamed: 0,Pclass,Sex,Embarked,Age,SibSp,Parch,Fare
0,3,male,S,22.0,1,0,7.25


In [None]:
# Show the first target value (an integer survival flag).
y.head(1)

0    0
Name: Survived, dtype: int64

In [None]:
# Hold out a test split; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Data Preparation (No Pipelines)

### Categorical Encoding

In [None]:
# Dense (non-sparse) output; drop='if_binary' keeps a single indicator column
# for two-category features such as Sex.
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')

In [None]:
# Learn the category levels from the training split only.
ohe.fit(X_train[cat_feat])

In [None]:
# Encode the training categoricals with the fitted encoder.
X_train_enc = ohe.transform(X_train[cat_feat])

In [None]:
# Inspect the first encoded training row and the generated indicator columns.
X_train_enc.head(1)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
105,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Reuse the encoder fitted on the training split; it is not refitted on test data.
X_test_enc = ohe.transform(X_test[cat_feat])

In [None]:
# Inspect the first encoded test row.
X_test_enc.head(1)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
495,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0


### Scaling of Numeric Data

In [None]:
# Min-max scaler for the numeric features, created with default settings.
scaler = MinMaxScaler()

In [None]:
# Learn the scaling parameters from the training split only.
scaler.fit(X_train[num_feat])

In [None]:
# Scale the numeric training columns with the fitted scaler.
X_train_scaled = scaler.transform(X_train[num_feat])

In [None]:
# Inspect the first scaled training row.
X_train_scaled.head(1)

Unnamed: 0,Age,SibSp,Parch,Fare
105,0.34451,0.0,0.0,0.015412


In [None]:
# Scale the test columns with the parameters learned from the training split.
X_test_scaled = scaler.transform(X_test[num_feat])

In [None]:
# Inspect the first scaled test row; values that were missing (e.g. Age) are
# still missing at this stage.
X_test_scaled.head(1)

Unnamed: 0,Age,SibSp,Parch,Fare
495,,0.0,0.0,0.028221


### Putting It Back Together

In [None]:
# Join the encoded and scaled training blocks column-wise (rows are matched on
# their shared index).
X_train_tr = pd.concat([X_train_enc, X_train_scaled], axis=1)

In [None]:
# Same column-wise join for the test split.
X_test_tr = pd.concat([X_test_enc, X_test_scaled], axis=1)

In [None]:
# Inspect the first row of the recombined training frame.
X_train_tr.head(1)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Age,SibSp,Parch,Fare
105,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.34451,0.0,0.0,0.015412


### A Late Value Imputation

In [None]:
# Imputer with default settings, applied after encoding and scaling to fill the
# values that are still missing.
imp = SimpleImputer()

In [None]:
# Learn the fill values from the transformed training split only.
imp.fit(X_train_tr)

In [None]:
# Fill missing values in the transformed training split.
X_train_imp = imp.transform(X_train_tr)

In [None]:
# Inspect the first imputed training row.
X_train_imp.head(1)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Age,SibSp,Parch,Fare
105,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.34451,0.0,0.0,0.015412


In [None]:
# Fill missing test values with the statistics learned from the training split.
X_test_imp = imp.transform(X_test_tr)

In [None]:
# Inspect the first imputed test row; the previously missing Age is now filled.
X_test_imp.head(1)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Age,SibSp,Parch,Fare
495,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.368461,0.0,0.0,0.028221


## Data Preparation (With Pipelines)

### First Iteration: kNN Classifier with Categorical Features Only

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline

In [None]:
# `Survived` is a binary class label, so the final step is a classifier
# (matching the `clf` pipeline later in this notebook); `score` and
# `cross_val_score` then report accuracy rather than a regressor's R^2.
# Steps: fill missing categories with the most frequent value, one-hot encode,
# then classify with 11 nearest neighbours.
knn_pp_cat = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, drop='if_binary'),
    KNeighborsClassifier(n_neighbors=11)
)

In [None]:
# Fit every step of the pipeline on the categorical training columns only.
knn_pp_cat.fit(X_train[cat_feat], y_train)

In [None]:
# Evaluate on the held-out test split using the final estimator's default score.
knn_pp_cat.score(X_test[cat_feat], y_test)

0.3530014241263423

In [None]:
# Cross-validate on the training split; the whole pipeline (preprocessing
# included) is refitted for each fold.
cross_val_score(knn_pp_cat, X_train[cat_feat], y_train)

array([0.291489  , 0.41579848, 0.28306572, 0.29098121, 0.31396062])

In [None]:
# Plain-text listing of the pipeline steps.
print(knn_pp_cat)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder',
                 OneHotEncoder(drop='if_binary', sparse_output=False)),
                ('kneighborsregressor', KNeighborsRegressor(n_neighbors=11))])


### Second Iteration: kNN Pipeline with Feature Selector

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Numeric branch: fill gaps with the column median, then min-max scale.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

In [None]:
# Display the numeric sub-pipeline using the notebook's rich representation.
numeric_transformer

In [None]:
# Plain-text listing of the numeric sub-pipeline steps.
print(numeric_transformer)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler())])


In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (dense output, single column for binary features).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, drop='if_binary')),
])

In [None]:
# Display the categorical sub-pipeline using the notebook's rich representation.
categorical_transformer

In [None]:
# Plain-text listing of the categorical sub-pipeline steps.
print(categorical_transformer)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder',
                 OneHotEncoder(drop='if_binary', sparse_output=False))])


In [None]:
# Route each group of columns to its own sub-pipeline: numeric features go
# through `numeric_transformer`, categorical ones through `categorical_transformer`.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_feat),
    ("cat", categorical_transformer, cat_feat),
])

In [None]:
# Display the column transformer using the notebook's rich representation.
preprocessor

In [None]:
# End-to-end model: column-wise preprocessing followed by an 11-nearest-neighbour
# classifier, so raw feature frames can be passed straight to fit/score.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=11)),
])

In [None]:
# Display the full pipeline using the notebook's rich representation.
clf

In [None]:
# Fit preprocessing and classifier together on the raw training features.
clf.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test split; the pipeline applies the fitted
# preprocessing to X_test before the classifier predicts.
clf.score(X_test, y_test)

0.7757847533632287

In [None]:
# Cross-validate on the training split; the whole pipeline (preprocessing
# included) is refitted for each fold.
cross_val_score(clf, X_train, y_train)

array([0.79850746, 0.82835821, 0.7761194 , 0.80451128, 0.81954887])