# 0. Imports

In [2]:
import sys
import warnings
import os
# Install the blanket "ignore" filter only when sys.warnoptions is empty.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses
    warnings.simplefilter("ignore")

In [38]:
from sklearn.experimental import enable_halving_search_cv

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
import mlflow
from sklearn.model_selection import ParameterGrid, GridSearchCV, HalvingGridSearchCV



# 1. Variables

In [4]:
# Directory the preprocessed train/test parquet files are read from.
preprocessed_data_path = "../data/02_preprocessed"
# Directory the submission CSV is written to at the end of the notebook.
submission_path = "../data/04_submission"

# 1. Load Data

In [5]:
# Preprocessed training frame; it carries the "Survived" target column that is split off below.
df_train = pd.read_parquet(f"{preprocessed_data_path}/train_preprocessed.parquet")

In [6]:
# Split the training frame into the target vector and the feature matrix (both NumPy arrays).
y_train = df_train["Survived"].to_numpy()
x_train = df_train.drop(columns=["Survived"]).to_numpy()

# 2. Modelling

In [62]:
def get_or_create_experiment(name: str) -> str:
    """Return the ID of the MLflow experiment called ``name``, creating it if needed.

    Args:
        name: Experiment name to look up through the tracking client.

    Returns:
        The ``experiment_id`` of the first matching experiment, or the ID
        returned by ``create_experiment`` when the search finds nothing.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    matches = tracking_client.search_experiments(
        filter_string=f"attribute.name = '{name}'"
    )
    # Guard clause: nothing found, so register a new experiment under this name.
    if not matches:
        return tracking_client.create_experiment(name)
    return matches[0].experiment_id

def get_or_create_run(experiment_id: str, name: str) -> str:
    """Return the ID of the run called ``name`` in ``experiment_id``, creating it if needed.

    Args:
        experiment_id: ID of the experiment to search in / create the run under.
        name: Run name matched against ``attribute.run_name``.

    Returns:
        The ``run_id`` of the first matching run, or of a newly created run
        when the search returns no results.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    matching_runs = tracking_client.search_runs(
        experiment_ids=[experiment_id],
        filter_string=f"attribute.run_name = '{name}'",
    )
    if matching_runs:
        run = matching_runs[0]
    else:
        run = tracking_client.create_run(experiment_id=experiment_id, run_name=name)
    return run.info.run_id
    
def track_experiment(run_name,model,train_dataset):
    """Log a fitted hyper-parameter search to MLflow as a new child run.

    The parent run named ``run_name`` is looked up (or created) inside the
    "TitanicSurvivalClassification" experiment and resumed; a fresh nested
    run is then opened beneath it to hold the best estimator, the search
    object's parameters, the training dataset and the cross-validation
    metrics.

    Args:
        run_name: Name of the parent MLflow run the child run is attached to.
        model: Fitted search object exposing ``best_estimator_``,
            ``get_params()``, ``cv_results_`` and ``best_index_``.
        train_dataset: pandas DataFrame holding the features plus the
            "Survived" target column.
    """
    experiment_id = get_or_create_experiment("TitanicSurvivalClassification")
    run_id = get_or_create_run(experiment_id,run_name)

    # Derive the input example from the dataset that was actually passed in,
    # rather than from a notebook-global array that may be stale or unrelated.
    input_example = train_dataset.drop(columns="Survived").to_numpy()

    # Outer context resumes the parent run; inner context opens a new child run.
    with mlflow.start_run(experiment_id=experiment_id, run_id=run_id, nested=True):
        with mlflow.start_run(experiment_id=experiment_id, nested=True):
            mlflow.sklearn.log_model(
                model.best_estimator_,
                input_example=input_example,
                artifact_path="titanic-kaggle-challenge/data/03_model"
            )

            # These are the search object's own parameters (model.get_params()),
            # not only the winning candidate's hyper-parameters.
            mlflow.log_params(
                model.get_params()
            )

            train_pd_dataset = mlflow.data.from_pandas(
                train_dataset, predictions=None, targets="Survived"
            )
            mlflow.log_input(train_pd_dataset, context="Training")

            val_metrics = evaluate_model(model)
            mlflow.log_metrics(
                val_metrics
            )

        # mlflow.evaluate(data=train_pd_dataset, predictions=None, model=model, model_type="classifier")
        # mlflow.evaluate(data=val_pd_dataset, predictions=None, model=model, model_type="classifier")

def fit_model(model, data):
    """Call ``model.fit(data)`` and return whatever that call returns."""
    fitted = model.fit(data)
    return fitted

def evaluate_model(model):
    """Return the cross-validation metrics of the best candidate of a fitted search.

    For a multi-metric search (``mean_test_f1``, ``mean_test_accuracy``,
    ``mean_test_precision`` and ``mean_test_recall`` all present in
    ``cv_results_``) the four scores are returned. Otherwise the single
    ``mean_test_score`` of ``model`` itself is returned under the key
    ``"score"``.

    Args:
        model: Fitted search object exposing ``cv_results_`` (a mapping of
            result arrays) and ``best_index_``.

    Returns:
        dict mapping metric name to the score at ``model.best_index_``.

    Raises:
        KeyError: if neither the multi-metric keys nor ``mean_test_score``
            are present in ``cv_results_``.
    """
    cv_results = model.cv_results_
    best = model.best_index_

    multi_metric_keys = {
        "f1_score": "mean_test_f1",
        "accuracy": "mean_test_accuracy",
        "precision": "mean_test_precision",
        "recall": "mean_test_recall",
    }
    if all(key in cv_results for key in multi_metric_keys.values()):
        return {
            name: cv_results[key][best] for name, key in multi_metric_keys.items()
        }

    # Single-metric search: read the score from the model that was passed in,
    # not from a notebook-global search object.
    return {"score": cv_results["mean_test_score"][best]}

## 2.1 Linear Approach

### 2.1.1 Logistic Regression

In [15]:
# Standardise the feature matrix.
# NOTE(review): the search in this section is fit on x_train, not on
# x_train_scaled — confirm which one was intended.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
scaler_fit = scaler  # fit() returns the scaler itself, so this is the same fitted object

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Every value is wrapped in its own one-element list, so the outer ParameterGrid
# expands into a list of single-candidate grids that GridSearchCV accepts.
param_grid = list(
    ParameterGrid(
        {
            "penalty": [[p] for p in ("l1", "l2", None)],
            "dual": [[flag] for flag in (True, False)],
            "solver": [
                [s]
                for s in ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga")
            ],
            "C": [[c] for c in (0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)],
            "fit_intercept": [[flag] for flag in (True, False)],
            "class_weight": [[None]],
            "random_state": [[None]],
            "max_iter": [[n] for n in (100, 500, 1000, 10000)],
        }
    )
)

In [18]:
# 5-fold grid search scored on four metrics; the candidate with the best
# accuracy is the one that gets refit.
lr_gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=["accuracy","f1","precision","recall"],
    refit="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

In [19]:
# Run the search on the training data.
# NOTE(review): this fits on the unscaled x_train although x_train_scaled was
# computed earlier in this section — confirm which one was intended.
lr_gs.fit(x_train,y_train)

In [20]:
# Mean cross-validated accuracy and F1 of the best candidate, one per line.
print(
    lr_gs.cv_results_["mean_test_accuracy"][lr_gs.best_index_],
    lr_gs.cv_results_["mean_test_f1"][lr_gs.best_index_],
    sep="\n",
)


0.8002385286548239
0.7266612619099048


In [21]:
# Log the logistic-regression search under the "LogisticRegression" parent run.
track_experiment(run_name="LogisticRegression", model=lr_gs, train_dataset=df_train)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



### 2.1.2 Logistic Regression with SGD

Feature scaling is needed when using SGD; without it the optimisation oscillates strongly and the results become unstable.

In [23]:
# Standardise the feature matrix.
# NOTE(review): the SGD search in this section is fit on x_train, not on
# x_train_scaled — confirm which one was intended.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
scaler_fit = scaler  # fit() returns the scaler itself, so this is the same fitted object

In [24]:
from sklearn.linear_model import SGDClassifier

In [19]:
# Most values are wrapped individually so each outer combination becomes its own
# grid; "alpha" and "eta0" keep their full value lists and are therefore
# expanded inside every one of those grids.
param_grid = list(
    ParameterGrid(
        {
            "loss": [["log_loss"]],
            "penalty": [[p] for p in ("l1", "l2", None)],
            "alpha": [[0.01, 0.05, 0.1, 0.2, 0.005, 0.001, 1, 2]],
            "max_iter": [[n] for n in (100, 500, 1000, 10000)],
            "tol": [[t] for t in (0.001, 0.01, 0.05, 0.1, 0.0005, 0.0001)],
            "shuffle": [[True]],
            "learning_rate": [[s] for s in ("optimal", "constant", "adaptive")],
            "eta0": [[0.01, 0.05, 0.1, 0.2, 0.005, 0.001]],
            "early_stopping": [[flag] for flag in (False, True)],
        }
    )
)

In [28]:
# 5-fold grid search over SGDClassifier candidates, refit on best accuracy.
# NOTE(review): the fit below uses the unscaled x_train although x_train_scaled
# was computed for this section — confirm which one was intended.
lr_gs = GridSearchCV(
    estimator=SGDClassifier(),
    param_grid=param_grid,
    scoring=["accuracy","f1","precision","recall"],
    refit="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
lr_gs.fit(x_train,y_train)

In [29]:
# Mean cross-validated accuracy and F1 of the best candidate, one per line.
print(
    lr_gs.cv_results_["mean_test_accuracy"][lr_gs.best_index_],
    lr_gs.cv_results_["mean_test_f1"][lr_gs.best_index_],
    sep="\n",
)

0.8046889711882492
0.738008765556597


In [31]:
# Log the SGD-based search; it is attached to the same "LogisticRegression"
# parent run name as the previous section.
track_experiment(run_name="LogisticRegression", model=lr_gs, train_dataset=df_train)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



### 2.1.3 SVM

In [33]:
# Standardise the feature matrix.
# NOTE(review): the SVM search in this section is fit on x_train, not on
# x_train_scaled — confirm which one was intended.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
scaler_fit = scaler  # fit() returns the scaler itself, so this is the same fitted object

In [34]:
from sklearn.linear_model import SGDClassifier

In [31]:
# Hinge-loss grid. Most values are wrapped individually so each outer
# combination becomes its own grid; "alpha" and "eta0" keep their full value
# lists and are therefore expanded inside every one of those grids.
param_grid = list(
    ParameterGrid(
        {
            "loss": [["hinge"]],
            "penalty": [[p] for p in ("l1", "l2", None)],
            "alpha": [[0.01, 0.05, 0.1, 0.2, 0.005, 0.001, 1, 2, 0.0001]],
            "max_iter": [[n] for n in (100, 500, 1000, 10000)],
            "tol": [[t] for t in (0.001, 0.01, 0.05, 0.1, 0.0005, 0.0001)],
            "shuffle": [[True]],
            "learning_rate": [[s] for s in ("optimal", "constant", "adaptive")],
            "eta0": [[0.01, 0.05, 0.1, 0.2, 0.005, 0.001]],
            "early_stopping": [[flag] for flag in (False, True)],
        }
    )
)

In [38]:
# 5-fold grid search over hinge-loss SGDClassifier candidates, refit on best accuracy.
# NOTE(review): the fit below uses the unscaled x_train although x_train_scaled
# was computed for this section — confirm which one was intended.
svm_gs = GridSearchCV(
    estimator=SGDClassifier(),
    param_grid=param_grid,
    scoring=["accuracy","f1","precision","recall"],
    refit="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
svm_gs.fit(x_train,y_train)

In [41]:
# Mean cross-validated accuracy and F1 of the best candidate, one per line.
print(
    svm_gs.cv_results_["mean_test_accuracy"][svm_gs.best_index_],
    svm_gs.cv_results_["mean_test_f1"][svm_gs.best_index_],
    sep="\n",
)

0.803577929822359
0.7383972932462746


In [39]:
# Log the linear-SVM search under the "SVM" parent run.
track_experiment(run_name="SVM", model=svm_gs, train_dataset=df_train)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



## 2.2 Non-linear approaches

Tests of several linear approaches (logistic regression with different solvers and a linear SVM) showed that a linear decision boundary reaches at most about 80% cross-validated accuracy (F1 ≈ 0.74).

### 2.2.1 Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

(*, loss='log_loss', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

In [47]:
# Every value is wrapped in its own one-element list, so the outer ParameterGrid
# expands into a list of single-candidate grids.
param_grid = list(
    ParameterGrid(
        {
            "loss": [["log_loss"]],
            "learning_rate": [[rate] for rate in (0.01, 0.1, 0.001)],
            "n_estimators": [[n] for n in (100, 500, 1000, 2000)],
            # "subsample": [[0.5],[1]],
            "criterion": [["friedman_mse"]],
            "tol": [[t] for t in (0.001, 0.01)],
            "min_samples_split": [[n] for n in (2, 5)],
            "min_samples_leaf": [[n] for n in (2, 4)],
            "min_weight_fraction_leaf": [[f] for f in (0.1, 0.2, 0.4)],
            "max_depth": [[d] for d in (3, 5, 7)],
            "min_impurity_decrease": [[v] for v in (0.1, 0.3)],
        }
    )
)

In [48]:
# Number of single-candidate grids produced above (i.e. candidates the search will consider).
len(param_grid)

1728

In [49]:
# Halving grid search over the gradient-boosting candidates: single "accuracy"
# metric, 5-fold CV, best candidate refit on the full training data.
gboost_gs = HalvingGridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    scoring="accuracy",
    refit=True,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
gboost_gs.fit(x_train,y_train)

dict_keys(['iter', 'n_resources', 'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_learning_rate', 'param_loss', 'param_max_depth', 'param_min_impurity_decrease', 'param_min_samples_leaf', 'param_min_samples_split', 'param_min_weight_fraction_leaf', 'param_n_estimators', 'param_tol', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [53]:
# Mean cross-validated score (accuracy, per the search's scoring) of the best candidate.
print(gboost_gs.cv_results_["mean_test_score"][gboost_gs.best_index_])

0.8041883004499827


In [56]:
# In-sample metrics: predictions are made on the same x_train the search was
# fit on, so these are training-set scores rather than validation scores.
y_pred = gboost_gs.predict(x_train)
print(
    metrics.f1_score(y_train, y_pred),
    metrics.accuracy_score(y_train, y_pred),
    metrics.precision_score(y_train, y_pred),
    metrics.recall_score(y_train, y_pred),
    sep="\n",
)

0.7611202635914333
0.8372615039281706
0.8716981132075472
0.6754385964912281


In [63]:
# Log the gradient-boosting search under the "GradientBoosting" parent run.
track_experiment(run_name="GradientBoosting", model=gboost_gs, train_dataset=df_train)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



# 3. Generate submission

In [57]:
# Preprocessed test frame that the registered model predicts on below.
df_test = pd.read_parquet(f"{preprocessed_data_path}/test_preprocessed.parquet")

In [64]:
# Registered model name and version; both are also used to name the submission CSV.
model_name = "Submission-240925"
model_version = "latest"

# Load the model from the Model Registry
# NOTE(review): this assumes a model named "Submission-240925" was registered
# outside this notebook — no cell here registers one; confirm.
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri)

In [65]:
# Predict on the test frame, keep only the "Survived" column and turn the
# frame's index into a regular column.
# NOTE(review): presumably the index holds the passenger identifier — confirm.
df_predicted = (
    df_test.copy()
    .assign(Survived=model.predict(df_test))[["Survived"]]
    .reset_index()
)

In [67]:
# Write the predictions to "<model_name>_<model_version>.csv" in the submission
# directory, without pandas' positional index column.
df_predicted.to_csv(
    f"{submission_path}/{model_name}_{model_version}.csv", index=False
)