# Titanic 

# Todo
 - Plot feature correlation
 - Plot feature importances
 - Analyse Cabin and extract features
 - Read through variable notes (e.g. add is_age_estimated feature)
 - Tune prediction threshold (currently kept at the 0.5 default)
 - EDA in general
 - More intelligent imputation

In [None]:
from pathlib import Path
import os

# The notebook may be started from inside the "Titanic" folder, but every file
# path used below is written relative to its parent, so step up one level then.
# Path.name returns the last path component on every OS; splitting the path
# string on "\\" only ever matched Windows-style paths.
is_titanic_dir = Path.cwd().name == "Titanic"

if is_titanic_dir:
    os.chdir("../")

In [None]:
%%capture

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
from xgboost import XGBClassifier

from mlutils.model.baseline import AutoMLBaseline
from mlutils.preprocess.missing import check_missingness

### Load data

In [None]:
# Name of the label column.
TARGET = "Survived"

# Columns stripped from the raw data before modelling
# ("Ticket" is only removed for now).
DROP_COLS = ["PassengerId", "Name", "Ticket", "Cabin"]

In [None]:
def load_raw_data(data_set="train", drop_cols=None):
    """Load a raw Titanic CSV and drop the requested columns.

    Args:
        data_set: ``"test"`` reads ``Titanic/Data/test.csv``; any other value
            reads ``Titanic/Data/train.csv``.
        drop_cols: Column names to drop. For the train set ``None`` drops
            nothing. For the test set ``None`` falls back to the module-level
            ``DROP_COLS``, and ``"PassengerId"`` is always kept because it is
            needed to label the predictions.

    Returns:
        The loaded DataFrame without the dropped columns.
    """
    if data_set == "test":
        df = pd.read_csv("Titanic/Data/test.csv")
        # Honour the caller's list instead of always overriding it with
        # DROP_COLS; only fall back to DROP_COLS when nothing was passed.
        requested = DROP_COLS if drop_cols is None else drop_cols
        drop_cols = [col for col in requested if col != "PassengerId"]
    else:
        df = pd.read_csv("Titanic/Data/train.csv")

    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    return df

In [None]:
# Read the training set without the DROP_COLS columns and preview the first rows.
df = load_raw_data("train", DROP_COLS)
df.head()

**Missingness**

In [None]:
# Inspect missing values in the training frame.
# NOTE(review): check_missingness comes from mlutils; what it reports is not
# visible from this notebook - confirm against the helper.
check_missingness(df)

In [None]:
def impute_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing Embarked and Age values filled in.

    Embarked is filled with its most frequent value and Age with its median.

    NOTE: both imputers are fit on the frame passed in, so calling this on the
    test set fills gaps with the test set's own statistics rather than the
    training set's.

    Args:
        df: Frame containing "Embarked" and "Age" columns.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Impute embarked with mode for now
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    # fit_transform returns an (n, 1) result; flatten it to a 1-D column.
    df["Embarked"] = np.ravel(mode_imputer.fit_transform(df[["Embarked"]]))

    # Impute age with median
    median_imputer = SimpleImputer(missing_values=np.nan, strategy="median")
    df["Age"] = np.ravel(median_imputer.fit_transform(df[["Age"]]))

    return df
    
# Fill the missing Embarked / Age values in the training frame.
df = impute_data(df)

**Encode categorical variables**

In [None]:
def ohe(df):
    """One-hot encode ``df`` with ``pandas.get_dummies``.

    The first level of every encoded column is dropped (``drop_first=True``).
    """
    encoded = pd.get_dummies(data=df, drop_first=True)
    return encoded
# One-hot encode the training frame.
df = ohe(df)

In [None]:
# Display the preprocessed training frame.
df

**Class imbalance**

In [None]:
# Share of rows whose target equals 1, expressed as a percentage.
n_positive = int((df[TARGET] == 1).sum())
pct_positive = n_positive / len(df) * 100
pct_positive

**X, y & train validate sets**

In [None]:
# Separate the label column from the feature matrix.
y = df[TARGET]
X = df.drop(columns=[TARGET])

### Baseline model

In [None]:
%%capture
# Run the mlutils baseline experiment on the full training data with ROC AUC
# scoring and n_cv=3.
# NOTE(review): presumably n_cv is the number of cross-validation folds -
# confirm against AutoMLBaseline.
baseline_performance = AutoMLBaseline(X, y, scoring="roc_auc", n_cv=3).run_experiment()

In [None]:
# Display the result returned by the baseline experiment.
baseline_performance

### XGBoost

Tune xgboost model

In [None]:
%%capture

# Stratified 3-fold cross-validation used for every grid point.
stratified_kfold = StratifiedKFold(n_splits=3)

# Hyper-parameter grid searched exhaustively.
params = dict(
    max_depth=[6, 9, 12, 15],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
    colsample_bytree=[0.3, 0.5, 0.75],
)

# pct_positive is a percentage, so this is the negative:positive class ratio.
xgb = XGBClassifier(scale_pos_weight=(100 - pct_positive) / pct_positive)

clf = GridSearchCV(
    xgb,
    params,
    scoring="roc_auc",
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X, y)

# NOTE: this cell starts with %%capture, so the printed score is captured
# rather than displayed.
print(clf.best_score_)
xgb_tuned = clf.best_estimator_

### Store best score along with model metadata

In [None]:
class ModelMetadata:
    """Keep the best grid-search result seen so far in a pickle file on disk.

    The challenger's score, parameters and feature list are collected at
    construction time; ``store_if_better`` persists them when they beat the
    stored score (or when nothing has been stored yet).
    """

    # Location of the persisted best-so-far metadata, relative to the working
    # directory.
    METADATA_PATH = 'Titanic/Data/best_scoring_metadata.pickle'

    def __init__(
        self,
        fit_grid_search,
        features
    ):
        """Collect the challenger's metadata.

        Args:
            fit_grid_search: Fitted search object exposing ``best_score_``
                and ``best_params_``.
            features: Features the model was trained on; stored as given.
        """
        self.best_scoring_metadata = {
            "best_score": fit_grid_search.best_score_,
            "params": fit_grid_search.best_params_,
            "features": features
        }

    def load_pickle(self):
        """Load the currently stored best-performing metadata.

        Raises:
            FileNotFoundError: If no metadata has been stored yet.
        """
        # NOTE: pickle.load can execute arbitrary code; only load a file that
        # this notebook wrote itself.
        with open(self.METADATA_PATH, 'rb') as handle:
            metadata = pickle.load(handle)
        return metadata

    def save_pickle(self, metadata: dict):
        """Overwrite the stored best-performing metadata with ``metadata``."""
        with open(self.METADATA_PATH, 'wb') as handle:
            pickle.dump(metadata, handle)

    @staticmethod
    def _extract_best_score(metadata):
        """Return the score stored under the "best_score" key."""
        return metadata["best_score"]

    def store_if_better(self):
        """
        Compares the challenger to the current best and overwrites the
        stored best if the challenger scores strictly higher. When no best
        has been stored yet, the challenger is stored as the first one.
        """
        challenger_score = self._extract_best_score(self.best_scoring_metadata)

        try:
            current_best = self.load_pickle()
        except FileNotFoundError:
            # First run: nothing to compare against, so the challenger wins.
            print(f"No stored metadata found - saving challenger with a score of {challenger_score}")
            self.save_pickle(self.best_scoring_metadata)
            return

        current_best_score = self._extract_best_score(current_best)

        if challenger_score > current_best_score:
            print(f"Challenger is better with a score of {challenger_score} - previous best {current_best_score}")
            print("Saving challenger model metadata...")
            self.save_pickle(self.best_scoring_metadata)
        else:
            print(f"Current best score of {current_best_score} beats the challenger score of {challenger_score}")
        

In [None]:
# Use a descriptive name: IPython reserves `_` for the last cell output, and
# binding it here would shadow that.
model_metadata = ModelMetadata(clf, X.columns)
model_metadata.store_if_better()
print(model_metadata.best_scoring_metadata)

In [None]:
# Run the test set through the same preprocessing steps as the training data.
df_test = load_raw_data("test", DROP_COLS)
df_test = impute_data(df_test)
df_test = ohe(df_test)

# Align the test features with the training columns (same set, same order).
# A dummy column that never occurs in the test set is filled with 0.
test_features = df_test.reindex(columns=X.columns, fill_value=0)

# Copy so that adding the prediction column does not write into a slice of
# df_test (avoids pandas' chained-assignment warning).
y_hats = df_test[["PassengerId"]].copy()
y_hats["Survived"] = xgb_tuned.predict(test_features)

In [None]:
# Write the PassengerId / Survived predictions to disk without the index column.
y_hats.to_csv("Titanic/Data/test_predictions.csv", index=False)