# Load Dataset

In [1]:
import pandas as pd

train_df = pd.read_csv("../../datasets/feature_engineering/train.csv", index_col="PassengerId")
test_df = pd.read_csv("../../datasets/feature_engineering/test.csv", index_col="PassengerId")

# Split Dataset (Train and Test)

In [2]:
from sklearn.model_selection import train_test_split

X_train = train_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "no_ticket", "is_alone", "survived"])
y_train = train_df.survived

X_test = test_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "is_alone", "no_ticket"])

print(f"X_train shape : {X_train.shape}")
print(f"X_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")

X_train shape : (891, 10)
X_train shape : (891,)
X_test shape  : (418, 10)


## Preprocessing Data Pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  MinMaxScaler, OneHotEncoder

numerical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ],
    verbose=1
)

categorical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ],
    verbose=1
)

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_prep_pipeline, make_column_selector(dtype_include="number")),
        ("categorical_pipeline", categorical_prep_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=1, verbose_feature_names_out=True
)

### X_train transform

In [4]:
pd.DataFrame(preprocess_pipeline.fit_transform(X_train)[:3])

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.125,0.0,0.014151,0.0,0.1,0.294373,0.007076,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.139136,0.5,0.1,0.167722,0.069568,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.015469,0.0,0.0,0.348652,0.015469,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### X_test transform

In [5]:
pd.DataFrame(preprocess_pipeline.fit_transform(X_test)[:3])

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.015282,0.0,0.0,0.569037,0.02984,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.013663,0.0,0.1,0.776231,0.01334,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.018909,0.0,0.0,0.682303,0.036922,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Model Pipeline

In [6]:
from sklearn.neighbors import KNeighborsClassifier

model_pipeline = Pipeline(
    steps=[
        ("preprocess_pipeline", preprocess_pipeline),
        ("algo", KNeighborsClassifier(n_jobs=-1))
    ],
    verbose=1
)

# Train

In [7]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "algo__n_neighbors": [k for k in range(1, 52, 2)],
    "algo__weights": ["uniform", "distance"],
    "algo__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "algo__leaf_size": [n for n in range(1, 50)]
}

model = RandomizedSearchCV(estimator=model_pipeline, param_distributions=params, cv=3, scoring="accuracy", n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").iloc[:5, :]

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s
[Pipeline]  (step 1 of 2) Processing preprocess_pipeline, total=   0.0s
[Pipeline] .............. (step 2 of 2) Processing algo, total=   0.0s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__weights,param_algo__n_neighbors,param_algo__leaf_size,param_algo__algorithm,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.152025,0.002311,0.126951,0.006353,uniform,3,43,ball_tree,"{'algo__weights': 'uniform', 'algo__n_neighbor...",0.801347,0.804714,0.808081,0.804714,0.002749,1
4,0.092761,0.01354,0.157738,0.005452,distance,45,46,brute,"{'algo__weights': 'distance', 'algo__n_neighbo...",0.801347,0.811448,0.787879,0.800224,0.009655,2
0,0.125188,0.029778,0.124355,0.017896,distance,13,27,auto,"{'algo__weights': 'distance', 'algo__n_neighbo...",0.794613,0.808081,0.777778,0.79349,0.012397,3
1,0.136402,0.044095,0.117386,0.007263,distance,25,17,auto,"{'algo__weights': 'distance', 'algo__n_neighbo...",0.791246,0.808081,0.781145,0.79349,0.011111,3
6,0.15539,0.007327,0.073615,0.00245,distance,23,43,kd_tree,"{'algo__weights': 'distance', 'algo__n_neighbo...",0.784512,0.814815,0.781145,0.79349,0.015141,3


# Save Model and Prediction

## Model

In [8]:
from datetime import datetime
from joblib import dump, load

now = datetime.now()
now = now.strftime("%m_%d_%Y-%H_%M_%S")
model_name = str(model.estimator.named_steps.algo).split("(")[0]

dump(value=model, filename="../../pretrained_models/" + now + "_" + model_name +".joblib")
model = load(filename="../../pretrained_models/" + now + "_" + model_name + ".joblib")

## Prediction

In [9]:
y_test_preds = model.predict(X=X_test)

pred_df = pd.DataFrame({
    "PassengerId": X_test.index,
    "Survived": y_test_preds
})

pred_df.to_csv("../../submissions/" + now + "_" + model_name + ".csv", index=False)