# Load Dataset

In [1]:
import pandas as pd

# Read the engineered train and test CSVs the same way, using the
# "PassengerId" column as the row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (
        "../../datasets/feature_engineering/train.csv",
        "../../datasets/feature_engineering/test.csv",
    )
)

# Select Features and Target (Train and Test)

In [2]:
from sklearn.model_selection import train_test_split

# Columns removed from the feature matrix. The same list is applied to both
# frames so train and test cannot drift apart when it is edited.
dropped_columns = ["age", "pclass", "name", "ticket", "cabin", "deck", "no_ticket", "is_alone"]

# The training frame additionally carries the target column, which must not
# be part of the features.
X_train = train_df.drop(columns=dropped_columns + ["survived"])
y_train = train_df.survived

X_test = test_df.drop(columns=dropped_columns)

# Sanity check of the resulting shapes (the second label previously read
# "X_train" although it reports the target vector).
print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")

X_train shape : (891, 10)
X_train shape : (891,)
X_test shape  : (418, 10)


## Preprocessing Data Pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer, make_column_selector

# Numerical columns: fill missing values with the column mean, then scale
# with RobustScaler using the 5th-80th percentile range.
# NOTE(review): the asymmetric (5, 80) quantile range looks hand-tuned; confirm
# it is intentional.
numerical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", RobustScaler(quantile_range=(5., 80.)))
    ],
    verbose=1
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising. Without it, a category that is absent
# from a training fold but present in a validation fold or in the test set
# makes transform() fail.
categorical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ],
    verbose=1
)

# Route columns to the matching sub-pipeline by dtype: numeric dtypes go to
# the numerical pipeline, object dtype goes to the categorical pipeline.
# Columns matching neither selector are dropped (ColumnTransformer default).
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("step1_numerical_pipeline", numerical_prep_pipeline, make_column_selector(dtype_include="number")),
        ("step2_categorical_pipeline", categorical_prep_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=1, verbose_feature_names_out=True
)

## Model Pipeline

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# End-to-end estimator: preprocessing first, then a k-nearest-neighbours
# classifier. The step names are the prefixes used to address the
# classifier's hyperparameters during the search (e.g. "step2_algo__...").
pipeline_steps = [
    ("step1_preprocess_pipeline", preprocess_pipeline),
    ("step2_algo", KNeighborsClassifier(n_jobs=-1)),
]
model_pipeline = Pipeline(steps=pipeline_steps, verbose=1)

# Train

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the KNN step of the pipeline. Keys are prefixed with the
# pipeline step name ("step2_algo") so the search can route them.
params = {
    # Odd neighbour counts only (1, 3, ..., 51).
    "step2_algo__n_neighbors": list(range(1, 52, 2)),
    "step2_algo__weights": ["uniform", "distance"],
    "step2_algo__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    # leaf_size only affects the tree-based algorithms.
    "step2_algo__leaf_size": list(range(1, 50))
}

# Randomized search samples a subset of the space (n_iter defaults to 10).
# random_state pins which candidates are sampled so the results table below
# is reproducible across kernel restarts.
model = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
model.fit(X_train, y_train);

# Five best candidates by cross-validated accuracy.
pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").head(5)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.1s
[ColumnTransformer]  (1 of 2) Processing step1_numerical_pipeline, total=   0.2s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing step2_categorical_pipeline, total=   0.0s
[Pipeline]  (step 1 of 2) Processing step1_preprocess_pipeline, total=   0.2s
[Pipeline] ........ (step 2 of 2) Processing step2_algo, total=   0.0s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_step2_algo__weights,param_step2_algo__n_neighbors,param_step2_algo__leaf_size,param_step2_algo__algorithm,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,9.489098,1.648664,1.988458,0.166168,uniform,25,44,brute,"{'step2_algo__weights': 'uniform', 'step2_algo...",0.791246,0.814815,0.824916,0.810325,0.014108,1
2,2.428105,0.258325,0.778721,0.102767,uniform,15,39,brute,"{'step2_algo__weights': 'uniform', 'step2_algo...",0.787879,0.801347,0.828283,0.805836,0.016798,2
8,10.313409,1.240175,1.692089,0.201907,uniform,15,17,kd_tree,"{'step2_algo__weights': 'uniform', 'step2_algo...",0.787879,0.801347,0.828283,0.805836,0.016798,2
5,6.793096,0.536703,1.214545,0.100751,uniform,11,32,ball_tree,"{'step2_algo__weights': 'uniform', 'step2_algo...",0.804714,0.784512,0.818182,0.802469,0.013837,4
6,9.635023,2.335202,1.503265,0.302388,distance,9,30,auto,"{'step2_algo__weights': 'distance', 'step2_alg...",0.804714,0.787879,0.801347,0.79798,0.007274,5


# Save Model and Prediction

## Model

In [6]:
import os
from datetime import datetime
from joblib import dump, load

# Timestamp used in the file names of the saved model and of the submission
# written in the next cell.
now = datetime.now().strftime("%m_%d_%Y-%H_%M_%S")

# Class name of the final pipeline step (e.g. "KNeighborsClassifier"), taken
# from the type rather than by parsing the estimator's repr.
model_name = type(model.estimator.named_steps.step2_algo).__name__

# Build the target path once so dump() and load() cannot disagree.
model_path = "../../pretrained_models/" + now + "_" + model_name + ".joblib"

# dump() does not create missing directories; make sure the folder exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

dump(value=model, filename=model_path)

# Reload from disk so the predictions below come from the persisted artifact.
# joblib files are pickle-based: only load files produced by trusted code.
model = load(filename=model_path)

## Prediction

In [7]:
# Predict survival for every passenger in the test set with the fitted search
# object.
y_test_preds = model.predict(X=X_test)

# Submission file shares the timestamp and model name of the saved model.
submission_path = "../../submissions/" + now + "_" + model_name + ".csv"

# Two-column table: passenger id (taken from the test index) and prediction.
submission_columns = {
    "PassengerId": X_test.index,
    "Survived": y_test_preds,
}
pred_df = pd.DataFrame(submission_columns)

pred_df.to_csv(submission_path, index=False)