# Load Dataset

In [1]:
import pandas as pd

train_df = pd.read_csv("../../datasets/feature_engineering/train.csv", index_col="PassengerId")
test_df = pd.read_csv("../../datasets/feature_engineering/test.csv", index_col="PassengerId")

# Split Dataset (Train and Test)

In [2]:
from sklearn.model_selection import train_test_split

X_train = train_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "no_ticket", "is_alone", "survived"])
y_train = train_df.survived

X_test = test_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "is_alone", "no_ticket"])

print(f"X_train shape : {X_train.shape}")
print(f"X_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")

X_train shape : (891, 10)
X_train shape : (891,)
X_test shape  : (418, 10)


## Preprocessing Data Pipeline

In [3]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

numerical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ],
    verbose=1
)

categorical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ],
    verbose=1
)

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_prep_pipeline, make_column_selector(dtype_include="number")),
        ("categorical_pipeline", categorical_prep_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=1, verbose_feature_names_out=True
)

preprocess_pipeline

### X_train transform

In [4]:
preprocess_pipeline.fit(X_train)
columns = [column_name.split("__")[1] for column_name in preprocess_pipeline.get_feature_names_out()]

X_train_transform = pd.DataFrame(
    preprocess_pipeline.fit_transform(X_train),
    columns=columns
)

X_train_transform.head()

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,sibsp,parch,fare,fare_category,family_size,age*class,fare_per_person,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,title_Master,title_Miss,title_Mr,title_Mrs
0,0.125,0.0,0.014151,0.0,0.1,0.294373,0.007076,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.139136,0.5,0.1,0.167722,0.069568,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.015469,0.0,0.0,0.348652,0.015469,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.125,0.0,0.103644,0.5,0.1,0.154152,0.051822,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.015713,0.0,0.0,0.47078,0.015713,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### X_test transform

In [5]:
preprocess_pipeline.fit(X_test)
columns = [column_name.split("__")[1] for column_name in preprocess_pipeline.get_feature_names_out()]

X_test_transform = pd.DataFrame(
    preprocess_pipeline.fit_transform(X_test),
    columns=columns
)

X_test_transform.head()

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,sibsp,parch,fare,fare_category,family_size,age*class,fare_per_person,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,title_Master,title_Miss,title_Mr,title_Mrs
0,0.0,0.0,0.015282,0.0,0.0,0.569037,0.02984,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.013663,0.0,0.1,0.776231,0.01334,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.018909,0.0,0.0,0.682303,0.036922,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.016908,0.0,0.0,0.444721,0.033016,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.125,0.111111,0.023984,0.0,0.2,0.361843,0.015611,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Model Pipeline

In [6]:
from sklearn.neighbors import KNeighborsClassifier

model_pipeline = Pipeline(
    steps=[
        ("preprocess_pipeline", preprocess_pipeline),
        ("algo", KNeighborsClassifier(n_jobs=-1))
    ],
    verbose=1
)

model_pipeline

# Train

In [7]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "algo__n_neighbors": [k for k in range(1, 52, 2)],
    "algo__weights": ["uniform", "distance"],
    "algo__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "algo__leaf_size": [n for n in range(1, 50)]
}

model = RandomizedSearchCV(estimator=model_pipeline, param_distributions=params, cv=3, scoring="accuracy", n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").iloc[:5, :]

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s
[Pipeline]  (step 1 of 2) Processing preprocess_pipeline, total=   0.0s
[Pipeline] .............. (step 2 of 2) Processing algo, total=   0.0s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__weights,param_algo__n_neighbors,param_algo__leaf_size,param_algo__algorithm,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,0.045335,0.002357,0.059665,0.00499,uniform,9,48,kd_tree,"{'algo__weights': 'uniform', 'algo__n_neighbor...",0.804714,0.818182,0.818182,0.813692,0.006349,1
7,0.037336,0.002059,0.046653,0.002616,uniform,17,6,kd_tree,"{'algo__weights': 'uniform', 'algo__n_neighbor...",0.774411,0.838384,0.818182,0.810325,0.026701,2
8,0.034997,0.007117,0.097666,0.001248,uniform,31,25,auto,"{'algo__weights': 'uniform', 'algo__n_neighbor...",0.804714,0.814815,0.787879,0.802469,0.011111,3
5,0.034331,0.001247,0.076666,0.00386,distance,17,13,auto,"{'algo__weights': 'distance', 'algo__n_neighbo...",0.79798,0.814815,0.781145,0.79798,0.013746,4
9,0.044666,0.011814,0.077996,0.01314,uniform,27,4,auto,"{'algo__weights': 'uniform', 'algo__n_neighbor...",0.794613,0.808081,0.787879,0.796857,0.008399,5


# Save Model and Prediction

## Model

In [8]:
from datetime import datetime
from joblib import dump, load

now = datetime.now()
now = now.strftime("%m_%d_%Y-%H_%M_%S")
model_name = str(model.estimator.named_steps.algo).split("(")[0]

dump(value=model, filename="../../pretrained_models/" + now + "_" + model_name +".joblib")
model = load(filename="../../pretrained_models/" + now + "_" + model_name + ".joblib")

## Prediction

In [9]:
y_test_preds = model.predict(X=X_test)

pred_df = pd.DataFrame({
    "PassengerId": X_test.index,
    "Survived": y_test_preds
})

pred_df.to_csv("../../submissions/" + now + "_" + model_name + ".csv", index=False)