# Import Common Libraries

In [1]:
import pandas as pd

# Import Dataset

In [2]:
# Load the Titanic data keyed by PassengerId, discard the free-text columns,
# and store Pclass as object dtype so it is handled as a categorical feature.
df = (
    pd.read_csv("../datasets/titanic.csv", index_col="PassengerId")
    .drop(columns=["Name", "Ticket", "Cabin"])
    .astype({"Pclass": "object"})
)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


# Build Model

## Dataset Splitting

In [3]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target column.
X = df.drop(columns="Survived")
y = df.Survived

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Report the shape of each split; every label names the object it describes.
print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")
print(f"y_test shape  : {y_test.shape}")

X_train shape : (712, 7)
X_train shape : (712,)
X_test shape  : (179, 7)
y_test shape  : (179,)


# Pipeline

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [5]:
# Summarize column dtypes and non-null counts before building the preprocessing steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 62.6+ KB


## Preprocessor Pipeline

In [21]:
# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler so no single feature dominates KNN's distance computation.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

# Object-dtype columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" encodes a category that was not
# seen during fit as all zeros instead of raising, so the persisted model does
# not fail at prediction time on an unexpected category value.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column to the matching sub-pipeline based on its dtype.
preprocess_pipeline = ColumnTransformer([
    ("numerical_pipeline", numerical_pipeline, make_column_selector(dtype_include="number")),
    ("categorical_pipeline", categorical_pipeline, make_column_selector(dtype_include="object"))
], verbose=True)

## Model Pipeline

In [22]:
# Full estimator: preprocessing followed by a k-nearest-neighbours classifier.
# The step name "algo" is the prefix used when addressing the classifier's
# hyperparameters (e.g. "algo__n_neighbors").
pipeline_steps = [
    ("preprocess_pipeline", preprocess_pipeline),
    ("algo", KNeighborsClassifier()),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [23]:
# Fit the preprocessing steps and the KNN classifier on the training split.
model_pipeline.fit(X_train, y_train)

[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


In [24]:
# Baseline score on the training split (mean accuracy for a classifier).
model_pipeline.score(X_train, y_train)

0.8553370786516854

In [25]:
# Baseline score on the held-out test split, for comparison with the training score.
model_pipeline.score(X_test, y_test)

0.8100558659217877

# Grid Search CV

In [26]:
from sklearn.model_selection import GridSearchCV

# Search space for the KNN step ("algo") of the pipeline:
#   n_neighbors - odd values from 1 to 49
#   weights     - uniform vs. distance-weighted voting
#   p           - Minkowski power (1 = Manhattan, 2 = Euclidean)
parameters = dict(
    algo__n_neighbors=range(1, 51, 2),
    algo__weights=["uniform", "distance"],
    algo__p=[1, 2],
)

# Exhaustive 5-fold cross-validated search over every combination,
# using all available CPU cores.
model = GridSearchCV(
    estimator=model_pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


In [12]:
# Tabulate the cross-validation results for every candidate, best-ranked first.
pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.035875,0.001313,0.047728,0.010219,5,1,uniform,"{'algo__n_neighbors': 5, 'algo__p': 1, 'algo__...",0.818182,0.797203,0.845070,0.838028,0.802817,0.820260,0.018823,1
52,0.046343,0.002356,0.044768,0.004053,27,1,uniform,"{'algo__n_neighbors': 27, 'algo__p': 1, 'algo_...",0.832168,0.762238,0.838028,0.852113,0.809859,0.818881,0.031424,2
56,0.052841,0.002875,0.056805,0.011329,29,1,uniform,"{'algo__n_neighbors': 29, 'algo__p': 1, 'algo_...",0.832168,0.762238,0.838028,0.852113,0.802817,0.817473,0.031950,3
48,0.059937,0.013528,0.053862,0.008503,25,1,uniform,"{'algo__n_neighbors': 25, 'algo__p': 1, 'algo_...",0.825175,0.769231,0.838028,0.845070,0.809859,0.817473,0.026940,4
50,0.048614,0.007115,0.052133,0.006740,25,2,uniform,"{'algo__n_neighbors': 25, 'algo__p': 2, 'algo_...",0.825175,0.762238,0.838028,0.845070,0.809859,0.816074,0.029471,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,0.058079,0.010347,0.033372,0.008386,11,1,distance,"{'algo__n_neighbors': 11, 'algo__p': 1, 'algo_...",0.790210,0.748252,0.823944,0.823944,0.767606,0.790791,0.030152,96
3,0.048088,0.006034,0.038413,0.009643,1,2,distance,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.741259,0.720280,0.753521,0.809859,0.732394,0.751463,0.031159,97
2,0.045707,0.004698,0.047982,0.009297,1,2,uniform,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.741259,0.720280,0.753521,0.809859,0.732394,0.751463,0.031159,97
1,0.051398,0.010678,0.024904,0.001668,1,1,distance,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.734266,0.720280,0.732394,0.802817,0.725352,0.743022,0.030313,99


In [13]:
# Hyperparameter combination selected by the grid search.
model.best_params_

{'algo__n_neighbors': 5, 'algo__p': 1, 'algo__weights': 'uniform'}

In [14]:
# Score of the grid-search model on the training split.
model.score(X_train, y_train)

0.8553370786516854

In [15]:
# Score of the grid-search model on the held-out test split.
model.score(X_test, y_test)

0.7988826815642458

# Save Model

In [16]:
from joblib import dump, load

# Persist the fitted search object to disk; dump returns the list of files written.
# NOTE(review): assumes the ../pretrained_models directory already exists — confirm.
dump(model, '../pretrained_models/knn_model.joblib')

['../pretrained_models/knn_model.joblib']

In [17]:
# Reload the persisted model, replacing the in-memory `model`.
# joblib files are pickle-based: only load files from a trusted source.
model = load("../pretrained_models/knn_model.joblib")

In [18]:
# Sanity check: the reloaded model should reproduce the training score seen before saving.
model.score(X_train, y_train)

0.8553370786516854

In [19]:
# Sanity check: the reloaded model should reproduce the test score seen before saving.
model.score(X_test, y_test)

0.7988826815642458