# Load Dataset

In [1]:
import pandas as pd

train_df = pd.read_csv("../../datasets/feature_engineering/train.csv", index_col="PassengerId")
test_df = pd.read_csv("../../datasets/feature_engineering/test.csv", index_col="PassengerId")

# Split Dataset (Train and Test)

In [2]:
from sklearn.model_selection import train_test_split

X_train = train_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "no_ticket", "is_alone", "survived"])
y_train = train_df.survived

X_test = test_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "is_alone", "no_ticket"])

print(f"X_train shape : {X_train.shape}")
print(f"X_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")

X_train shape : (891, 10)
X_train shape : (891,)
X_test shape  : (418, 10)


## Preprocessing Data Pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer, make_column_selector

numerical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", RobustScaler(quantile_range=(5., 80.)))
    ],
    verbose=1
)

categorical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ],
    verbose=1
)

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("step1_numerical_pipeline", numerical_prep_pipeline, make_column_selector(dtype_include="number")),
        ("step2_categorical_pipeline", categorical_prep_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=1, verbose_feature_names_out=True
)

## Model Pipeline

In [4]:
from sklearn.ensemble import RandomForestClassifier

model_pipeline = Pipeline(
    steps=[
        ("step1_preprocess_pipeline", preprocess_pipeline),
        ("step2_algo", RandomForestClassifier(n_jobs=-1))
    ],
    verbose=1
)

# Train

In [5]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "step2_algo__n_estimators": [100, 200, 300, 400, 500],
    "step2_algo__criterion": ["gini", "entropy", "log_loss"],
    "step2_algo__max_features": ["sqrt", "log2", None],
    "step2_algo__bootstrap": [True, False],
    "step2_algo__class_weight": ["balanced", "balanced_subsample"]
    
}

model = RandomizedSearchCV(estimator=model_pipeline, param_distributions=params, cv=3, scoring="accuracy", n_jobs=-1, verbose=1)
model.fit(X_train, y_train);

pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").iloc[:5, :]

Fitting 3 folds for each of 10 candidates, totalling 30 fits


9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\joblib\parallel.py", line 1056, in __call__
    self.retriev

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing step1_numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing step2_categorical_pipeline, total=   0.0s
[Pipeline]  (step 1 of 2) Processing step1_preprocess_pipeline, total=   0.1s
[Pipeline] ........ (step 2 of 2) Processing step2_algo, total=   0.7s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_step2_algo__n_estimators,param_step2_algo__max_features,param_step2_algo__criterion,param_step2_algo__class_weight,param_step2_algo__bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,7.830009,1.19148,0.920177,0.063531,100,,entropy,balanced,True,"{'step2_algo__n_estimators': 100, 'step2_algo_...",0.787879,0.818182,0.801347,0.802469,0.012397,1
5,6.446938,1.16149,0.872549,0.312958,200,log2,gini,balanced,True,"{'step2_algo__n_estimators': 200, 'step2_algo_...",0.79798,0.824916,0.781145,0.801347,0.018027,2
8,6.399026,0.349417,0.493044,0.177624,400,log2,gini,balanced_subsample,True,"{'step2_algo__n_estimators': 400, 'step2_algo_...",0.801347,0.814815,0.784512,0.800224,0.012397,3
9,4.224968,0.195018,0.616075,0.070499,200,sqrt,gini,balanced,True,"{'step2_algo__n_estimators': 200, 'step2_algo_...",0.804714,0.801347,0.784512,0.796857,0.008837,4
0,7.997633,0.418959,0.989733,0.15629,300,sqrt,gini,balanced_subsample,False,"{'step2_algo__n_estimators': 300, 'step2_algo_...",0.79798,0.801347,0.781145,0.79349,0.008837,5


# Save Model and Prediction

## Model

In [6]:
from datetime import datetime
from joblib import dump, load

now = datetime.now()
now = now.strftime("%m_%d_%Y-%H_%M_%S")
model_name = str(model.estimator.named_steps.step2_algo).split("(")[0]

dump(value=model, filename="../../pretrained_models/" + now + "_" + model_name +".joblib")
model = load(filename="../../pretrained_models/" + now + "_" + model_name + ".joblib")

## Prediction

In [7]:
y_test_preds = model.predict(X=X_test)

pred_df = pd.DataFrame({
    "PassengerId": X_test.index,
    "Survived": y_test_preds
})

pred_df.to_csv("../../submissions/" + now + "_" + model_name + ".csv", index=False)