# 0. Imports

In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
import mlflow
import datetime

# 1. Variables

In [86]:
preprocessed_data_path = "../data/02_preprocessed"
save_data_path = ""

# 1. Load Data

In [87]:
df_train = pd.read_parquet(f"{preprocessed_data_path}/train_preprocessed.parquet")

In [88]:
df_train_set, df_val_set = train_test_split(
    df_train,
    test_size = 0.2,
    stratify = df_train["Survived"]
)

In [89]:
x_train = df_train_set.drop("Survived",axis=1)
y_train = df_train_set["Survived"].to_numpy()
x_val = df_val_set.drop("Survived",axis=1)
y_val = df_val_set["Survived"].to_numpy()

# 2. Modelling

In [90]:
def get_or_create_experiment(name: str) -> str:
    client = mlflow.tracking.MlflowClient()
    searched_experiment = client.search_experiments(
        filter_string=f"attribute.name = '{name}'"
    )
    if searched_experiment:
        return searched_experiment[0].experiment_id
    else:
        return client.create_experiment(
            name
        )
    
def track_experiment(experiment_name,model,x_train,y_train,x_val,y_val):
    experiment_id = get_or_create_experiment("TitanicSurvivalClassification")
    with mlflow.start_run(experiment_id=experiment_id, run_name=experiment_name):
        mlflow.sklearn.log_model(
            model,
            input_example=x_train,
            artifact_path="titanic-kaggle-challenge/data/03_model"
        )

        mlflow.log_params(
            model.get_params()
        )

        with mlflow.start_run(experiment_id=experiment_id,run_name=f"Train",nested=True):
            train_dataset = x_train.copy()
            train_dataset["Survived"] = y_train
            train_pd_dataset = mlflow.data.from_pandas(
                train_dataset, predictions=None, targets="Survived"
            )
            mlflow.log_input(train_pd_dataset, context="Training")
            train_metrics = evaluate_model(model,x_train,y_train)
            mlflow.log_metrics(
                train_metrics
            )


        with mlflow.start_run(experiment_id=experiment_id,run_name=f"Validation",nested=True):
            val_dataset = x_val.copy()
            val_dataset["Survived"] = y_val
            val_pd_dataset = mlflow.data.from_pandas(
                val_dataset, predictions=None, targets="Survived"
            )
            mlflow.log_input(val_pd_dataset, context="Validation")
            val_metrics = evaluate_model(model,x_val,y_val)
            mlflow.log_metrics(
                val_metrics
            )

        # mlflow.evaluate(data=train_pd_dataset, predictions=None, model=model, model_type="classifier")
        # mlflow.evaluate(data=val_pd_dataset, predictions=None, model=model, model_type="classifier")

def fit_model(model, data):
    return model.fit(data)

def evaluate_model(model, x, y_true, prefix = ""):
    y_pred = model.predict(x)
    f1_score = metrics.f1_score(y_true,y_pred)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    precision = metrics.precision_score(y_true,y_pred)
    recall = metrics.recall_score(y_true,y_pred)
    metric_scores = {
        f"{prefix}f1_score": f1_score,
        f"{prefix}accuracy": accuracy,
        f"{prefix}precision": precision,
        f"{prefix}recall": recall
    }
    return metric_scores

## 2.1 Linear Approach

### 2.1.1 Logistic Regression

In [91]:
scaler = preprocessing.StandardScaler()
scaler_fit = scaler.fit(x_train)
x_train_scaled = scaler_fit.transform(x_train)
x_val_scaled = scaler_fit.transform(x_val)

In [92]:
from sklearn.linear_model import LogisticRegression

In [93]:
lr = LogisticRegression(
    penalty="l2",
    dual = False,
    tol = 1e-4,
    C = 0.1, #1/lambda
    fit_intercept = True, #Whether to have B0
    intercept_scaling = 1, #constant value of x0, default to 1 -> x0B0 = B0
    class_weight = None,#Whether a class should be primed or not
    random_state = None, #Seed to shuffle data
    solver = "lbfgs", # For SGD, use SGDClassifier which fits linear models with SGD. Models being controlled by loss
    max_iter = 100, #Number of max iterations for gradient descent
    multi_class = "auto",
    verbose = 0,
    warm_start = False, # Whether to use output of previous fit or not. Ignored for lbfgs,
    n_jobs = None,
    l1_ratio = None #Ratio for l1 when  penalty is elasticnet (w1*l1+w2*l2)
)

In [94]:
lr_fit = lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
track_experiment(
    "LogisticRegression-{ts}".format(ts=datetime.datetime.now().strftime("-%Y%m%d-%H%M%S'")),lr_fit,x_train,y_train,x_val,y_val
)



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



### 2.1.2 Logistic Regression with SGD

Scaling is needed when using SGD to avoid too oscillating results

In [96]:
scaler = preprocessing.StandardScaler()
scaler_fit = scaler.fit(x_train)
x_train_scaled = scaler_fit.transform(x_train)
x_val_scaled = scaler_fit.transform(x_val)

In [97]:
from sklearn.linear_model import SGDClassifier

In [98]:
lr = SGDClassifier(
    loss = "log_loss", # This defines logstic regression as model to fit
    penalty="l2",
    alpha = 0.01,
    fit_intercept = True,
    max_iter=10000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    learning_rate="optimal", #optimal, constant, invscaling, adaptive
    eta0=0.01, #initial LR for constant, invscaling and adaptive
    power_t=0.5, # exponent for inverse scaling
    early_stopping=False, #Whether to stop based on tolerance
    validation_fraction=.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False
)

In [99]:
lr_fit = lr.fit(x_train, y_train)

In [100]:
track_experiment(
    "LogisticRegression-SGD-{ts}".format(ts=datetime.datetime.now().strftime("-%Y%m%d-%H%M%S'")),lr_fit,x_train,y_train,x_val,y_val
)



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



### 2.1.3 SVM

In [101]:
scaler = preprocessing.StandardScaler()
scaler_fit = scaler.fit(x_train)
x_train_scaled = scaler_fit.transform(x_train)
x_val_scaled = scaler_fit.transform(x_val)

In [102]:
from sklearn.linear_model import SGDClassifier

In [103]:
linear_svm = SGDClassifier(
    loss = "hinge", # This defines logstic regression as model to fit
    penalty="l1",
    alpha = 0.001,
    fit_intercept = True,
    max_iter=10000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    learning_rate="optimal", #optimal, constant, invscaling, adaptive
    eta0=0.01, #initial LR for constant, invscaling and adaptive
    power_t=0.5, # exponent for inverse scaling
    early_stopping=False, #Whether to stop based on tolerance
    validation_fraction=.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False
)

In [104]:
linear_svm_fit = linear_svm.fit(x_train,y_train)

In [105]:
track_experiment(
    "SVM-{ts}".format(ts=datetime.datetime.now().strftime("-%Y%m%d-%H%M%S'")),linear_svm_fit,x_train,y_train,x_val,y_val
)



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



## 2.2 Non-linear approaches

Test of several linear approaches (linear SVM and Logistic Regression using different solvers) showed that using a linear boundary achieves at most about 70% accuracy.

### 2.2.1 Gradient Boosting

In [106]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

(*, loss='log_loss', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

In [107]:
gboost =  GradientBoostingClassifier(
    loss="log_loss",
    learning_rate=0.1,
    n_estimators=50,
    subsample=1.0,
    criterion="friedman_mse",
    min_samples_split=10,
    min_samples_leaf=20,
    min_weight_fraction_leaf=0,
    max_depth=5,
    min_impurity_decrease=0.2,
    init=None,
    max_features=None,
    max_leaf_nodes=None,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.0
)

In [108]:
gboost_fit = gboost.fit(x_train,y_train)

In [109]:
track_experiment(
    "GradientBoosting-{ts}".format(ts=datetime.datetime.now().strftime("-%Y%m%d-%H%M%S'")),gboost_fit,x_train,y_train,x_val,y_val
)



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

