# Load Dataset

In [1]:
import pandas as pd

train_df = pd.read_csv("../../datasets/feature_engineering/train.csv", index_col="PassengerId")
test_df = pd.read_csv("../../datasets/feature_engineering/test.csv", index_col="PassengerId")

# Split Dataset (Train and Test)

In [2]:
from sklearn.model_selection import train_test_split

X_train = train_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "no_ticket", "is_alone", "survived"])
y_train = train_df.survived

X_test = test_df.drop(columns=["age", "pclass", "name", "ticket", "cabin", "deck", "is_alone", "no_ticket"])

print(f"X_train shape : {X_train.shape}")
print(f"X_train shape : {y_train.shape}")
print(f"X_test shape  : {X_test.shape}")

X_train shape : (891, 10)
X_train shape : (891,)
X_test shape  : (418, 10)


## Preprocessing Data Pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  MinMaxScaler, OneHotEncoder

numerical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ],
    verbose=1
)

categorical_prep_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ],
    verbose=1
)

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_prep_pipeline, make_column_selector(dtype_include="number")),
        ("categorical_pipeline", categorical_prep_pipeline, make_column_selector(dtype_include="object"))
    ],
    verbose=1, verbose_feature_names_out=True
)

### X_train transform

In [4]:
pd.DataFrame(preprocess_pipeline.fit_transform(X_train)[:3])

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.125,0.0,0.014151,0.0,0.1,0.294373,0.007076,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.139136,0.5,0.1,0.167722,0.069568,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.015469,0.0,0.0,0.348652,0.015469,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### X_test transform

In [5]:
pd.DataFrame(preprocess_pipeline.fit_transform(X_test)[:3])

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.015282,0.0,0.0,0.569037,0.02984,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.125,0.0,0.013663,0.0,0.1,0.776231,0.01334,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.018909,0.0,0.0,0.682303,0.036922,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Model Pipeline

In [6]:
from sklearn.linear_model import LogisticRegression

model_pipeline = Pipeline(
    steps=[
        ("preprocess_pipeline", preprocess_pipeline),
        ("algo", LogisticRegression(n_jobs=-1))
    ],
    verbose=1
)

# Train

In [7]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "algo__penalty": ["l2", "elasticnet", "none"],
    "algo__dual": [True, False],
    "algo__fit_intercept": [True, False],
    "algo__C": [0.125, 0.25, 0.5, 1., 2.],
    "algo__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "algo__l1_ratio": [0.125, 0.25, 0.5, 1., 2., None]
}

model = RandomizedSearchCV(estimator=model_pipeline, param_distributions=params, cv=3, scoring=("accuracy"), n_jobs=-1, verbose=1)
model.fit(X_train, y_train);

pd.DataFrame(model.cv_results_).sort_values(by="rank_test_score").iloc[:5, :]

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer]  (1 of 2) Processing numerical_pipeline, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing encoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing categorical_pipeline, total=   0.0s
[Pipeline]  (step 1 of 2) Processing preprocess_pipeline, total=   0.0s
[Pipeline] .............. (step 2 of 2) Processing algo, total=   0.1s


21 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Miniconda3\envs\ml\lib\site-packages\sklearn\l

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__solver,param_algo__penalty,param_algo__l1_ratio,param_algo__fit_intercept,param_algo__dual,param_algo__C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.126902,0.011526,0.037637,0.001981,saga,l2,0.25,False,False,1.0,"{'algo__solver': 'saga', 'algo__penalty': 'l2'...",0.811448,0.818182,0.83165,0.820426,0.008399,1
3,0.119463,0.008401,0.020419,0.001513,newton-cg,l2,0.125,False,False,0.25,"{'algo__solver': 'newton-cg', 'algo__penalty':...",0.808081,0.808081,0.794613,0.803591,0.006349,2
9,0.075715,0.00808,0.01064,0.000973,newton-cg,l2,2.0,True,False,0.125,"{'algo__solver': 'newton-cg', 'algo__penalty':...",0.801347,0.801347,0.777778,0.79349,0.011111,3
0,0.060776,0.009143,0.0,0.0,liblinear,none,,True,True,0.125,"{'algo__solver': 'liblinear', 'algo__penalty':...",,,,,,4
1,0.053886,0.005415,0.0,0.0,newton-cg,elasticnet,0.25,True,False,2.0,"{'algo__solver': 'newton-cg', 'algo__penalty':...",,,,,,5


# Save Model and Prediction

## Model

In [8]:
from datetime import datetime
from joblib import dump, load

now = datetime.now()
now = now.strftime("%m_%d_%Y-%H_%M_%S")
model_name = str(model.estimator.named_steps.algo).split("(")[0]

dump(value=model, filename="../../pretrained_models/" + now + "_" + model_name +".joblib")
model = load(filename="../../pretrained_models/" + now + "_" + model_name + ".joblib")

## Prediction

In [9]:
y_test_preds = model.predict(X=X_test)

pred_df = pd.DataFrame({
    "PassengerId": X_test.index,
    "Survived": y_test_preds
})

pred_df.to_csv("../../submissions/" + now + "_" + model_name + ".csv", index=False)