# Import Common Libraries

In [1]:
import pandas as pd

# Import Dataset

In [2]:
df = pd.read_csv("../datasets/titanic.csv", index_col="PassengerId")
df.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)
df.Pclass = df.Pclass.astype("object")
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


# Build Model

## Dataset Spliting

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop(columns="Survived")
y = df.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print(f"X_train shape : {X_train.shape}")
print(f"X_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")
print(f"y_test shape  : {y_test.shape}")

X_train shape : (712, 7)
X_train shape : (712,)
X_test shape  : (179, 7)
y_test shape  : (179,)


# Pipeline

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 62.6+ KB


## Preprocessor Pipeline

In [6]:
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ],
    verbose=True
)

categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ],
    verbose=True
)

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, make_column_selector(dtype_include="number")),
        ("categorical_pipeline", categorical_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=True
)

## Model Pipeline

In [7]:
model_pipeline = Pipeline([
    ("preprocess_pipeline", preprocess_pipeline),
    ("algo", KNeighborsClassifier())
])

In [8]:
model_pipeline.fit(X_train, y_train)

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


In [9]:
model_pipeline.score(X_train, y_train)

0.8553370786516854

In [10]:
model_pipeline.score(X_test, y_test)

0.8100558659217877

# Grid Search CV

In [11]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "algo__n_neighbors": range(1, 51, 2),
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2]
}

model = GridSearchCV(model_pipeline, parameters, cv=5, scoring="f1", n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


In [12]:
pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").iloc[0, :]

mean_fit_time                                                       0.053389
std_fit_time                                                        0.003664
mean_score_time                                                     0.045572
std_score_time                                                      0.003591
param_algo__n_neighbors                                                    5
param_algo__p                                                              1
param_algo__weights                                                  uniform
params                     {'algo__n_neighbors': 5, 'algo__p': 1, 'algo__...
split0_test_score                                                   0.734694
split1_test_score                                                   0.674157
split2_test_score                                                   0.803571
split3_test_score                                                   0.776699
split4_test_score                                                    0.72549

In [13]:
model.best_params_

{'algo__n_neighbors': 5, 'algo__p': 1, 'algo__weights': 'uniform'}

In [14]:
model.best_score_

0.7429223669395761

In [15]:
model.score(X_train, y_train)

0.7976424361493124

In [16]:
model.score(X_test, y_test)

0.71875

# Save Model

In [17]:
from joblib import dump, load

dump(model, '../pretrained_models/knn_model.joblib')

['../pretrained_models/knn_model.joblib']

In [18]:
model = load("../pretrained_models/knn_model.joblib")

In [19]:
model.score(X_train, y_train)

0.7976424361493124

In [20]:
model.score(X_test, y_test)

0.71875