In [1]:
# Importing general libraries
import pandas as pd
import numpy as np
import inspect

from typing import Union
from dataclasses import dataclass, field
from utilities.warnings import *
from utilities.transfrom import data_transform

In [2]:
# Load ./data/train.csv; the path is relative to the notebook's working directory.
raw_data = pd.read_csv("./data/train.csv")
# Run the project's preprocessing helper on the raw frame. The steps it applies
# are described by its own docstring, which is printed right below.
df = data_transform(data_frame=raw_data)
print(data_transform.__doc__)
# Preview the first rows of the transformed frame.
df.head()

Fills Age column with median; Embarked column with first element which is the most frequently-occurring
Creating TravelAlone predictor; Removing ittelevant predictors; Creating dummy variables; Transforming dtypes


Unnamed: 0,Survived,Age,Fare,TravelAlone,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Sex_male
0,0,22,7.25,0,0,1,0,1,1
1,1,38,71.2833,0,0,0,0,0,0
2,1,26,7.925,1,0,1,0,1,0
3,1,35,53.1,0,0,0,0,1,0
4,0,35,8.05,1,0,1,0,1,1


In [3]:
# Show the dtype of every column of the transformed frame.
df.dtypes

Survived         int32
Age              int32
Fare           float64
TravelAlone      int32
Pclass_2         int64
Pclass_3         int64
Embarked_Q       int64
Embarked_S       int64
Sex_male         int64
dtype: object

# Machine Learning Models

In [4]:
# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import StratifiedKFold, cross_validate

In [5]:
# Feature matrix: every column of the transformed frame except the target.
X = df[
    [
        'Age',
        'Fare',
        'TravelAlone',
        'Pclass_2',
        'Pclass_3',
        'Embarked_Q',
        'Embarked_S',
        'Sex_male',
    ]
]

# Target vector.
y = df["Survived"]

# Candidate estimators, kept as classes (not instances) because `Model`
# instantiates a fresh estimator for every cross-validation run.
models_list = [
    GaussianNB,
    DecisionTreeClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    XGBClassifier,
    LogisticRegression,
    SVC,
    KNeighborsClassifier,
]

# Short aliases for the same classes, in the same order as `models_list`.
gnb, dt, rf, ab, gb, xgb, lr, svc, knn = models_list

# Shared splitter: 10 shuffled, stratified folds with a fixed seed so every
# estimator is scored on identical splits.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

@dataclass
class Model:
    """Pair one estimator class with its cross-validated test scores.

    Attributes:
        model: Estimator class (not an instance). A fresh instance is built
            for every cross-validation run.
        model_name: ``model.__name__``; filled in by ``__post_init__``.
        X_domain: Predictors, forwarded as ``X`` to ``cross_validate``.
        y_domain: Target, forwarded as ``y`` to ``cross_validate``.
        cross_validator: Forwarded unchanged as ``cv`` to ``cross_validate``.
        accuracy, precission, roc_auc, f1, recall: An empty list until
            ``cross_validation_`` has run, afterwards the rounded mean test
            score for that metric.
    """

    model: object = field(repr=False)
    model_name: str = field(init=False)
    X_domain: pd.DataFrame = field(repr=False)
    y_domain: pd.DataFrame = field(repr=False)

    cross_validator: Union[object, int] = field(repr=False)

    # Metrics default to an empty list ("not evaluated yet") and are replaced
    # by a single float once `cross_validation_` has been called.
    accuracy: Union[float, list] = field(default_factory=list)
    precission: Union[float, list] = field(default_factory=list)
    roc_auc: Union[float, list] = field(default_factory=list)
    f1: Union[float, list] = field(default_factory=list)
    recall: Union[float, list] = field(default_factory=list)

    # (scorer name passed to cross_validate, attribute that stores the result).
    # The misspelt `precission` attribute is kept because it is part of the
    # public interface (repr and downstream readers use that name).
    _SCORERS = (
        ("accuracy", "accuracy"),
        ("precision", "precission"),
        ("roc_auc", "roc_auc"),
        ("f1", "f1"),
        ("recall", "recall"),
    )

    def model_name_(self) -> None:
        """Store the estimator class name in ``model_name``."""
        self.model_name = self.model.__name__

    def cross_validation_(
        self,
        kwargs: Union[dict, None] = None,
        iterations: int = 5,
        round_: int = 3,
    ) -> None:
        """Cross-validate the estimator and store the rounded mean test scores.

        If the estimator's signature exposes ``random_state`` and the caller
        did not fix one in ``kwargs``, the cross-validation is repeated
        ``iterations`` times with seeds ``0 .. iterations - 1`` and the per-run
        means are averaged. Otherwise a single run is performed.

        Args:
            kwargs: Keyword arguments used to instantiate the estimator.
                ``None`` (default) means no extra arguments. The mapping is
                copied and never mutated.
            iterations: Number of seeded repetitions for estimators that
                accept ``random_state``.
            round_: Number of decimals kept in the stored scores.

        Raises:
            ValueError: If ``iterations`` is smaller than 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        params = dict(kwargs) if kwargs else {}

        # Plain membership test instead of `assert`: assertions are stripped
        # under `python -O`, which would push `random_state` into estimators
        # that do not accept it.
        seedable = "random_state" in inspect.signature(self.model).parameters

        if seedable and "random_state" not in params:
            runs = [{**params, "random_state": seed} for seed in range(iterations)]
        else:
            # Either the estimator takes no seed, or the caller pinned one, in
            # which case repeating the run would only reproduce the same scores.
            runs = [params]

        scoring = [scorer for scorer, _ in self._SCORERS]
        run_means = {scorer: [] for scorer in scoring}

        for run_params in runs:
            results = cross_validate(
                estimator=self.model(**run_params),
                X=self.X_domain,
                y=self.y_domain,
                scoring=scoring,
                cv=self.cross_validator,
            )
            for scorer in scoring:
                # Mean over the folds of this run.
                run_means[scorer].append(results[f"test_{scorer}"].mean())

        for scorer, attribute in self._SCORERS:
            # Mean over the runs, rounded for display.
            setattr(self, attribute, np.mean(run_means[scorer]).round(round_))

    def __post_init__(self):
        """Derive ``model_name`` as soon as the dataclass is constructed."""
        self.model_name_()

def models_comparison(models: list) -> pd.DataFrame:
    """Build a metric table with one row per evaluated model.

    Args:
        models: Objects exposing ``model_name``, ``accuracy``, ``precission``,
            ``roc_auc``, ``f1`` and ``recall`` attributes.

    Returns:
        A DataFrame indexed by model name with the columns ``Accuracy``,
        ``Precission``, ``ROC_AUC``, ``F1_score`` and ``Recall``. The columns
        are present even when ``models`` is empty, and models sharing the same
        name each keep their own row.
    """
    # Output column label -> attribute read from each model. The misspelt
    # "Precission" label and `precission` attribute are kept for compatibility.
    column_sources = {
        "Accuracy": "accuracy",
        "Precission": "precission",
        "ROC_AUC": "roc_auc",
        "F1_score": "f1",
        "Recall": "recall",
    }

    # Names go into the index rather than dict keys, so duplicate names do not
    # overwrite each other.
    names = [getattr(model, "model_name") for model in models]
    data = {
        label: [getattr(model, attribute) for model in models]
        for label, attribute in column_sources.items()
    }
    return pd.DataFrame(data, index=names, columns=list(column_sources))

In [6]:
# Wrap every estimator class in a `Model`; all wrappers share the same predictors,
# target and cross-validator.
models = [Model(model=model, X_domain=X, y_domain=y, cross_validator=skf) for model in models_list]
# Display the wrappers; their metric fields are still empty lists at this point.
models

[Model(model_name='GaussianNB', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='DecisionTreeClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='RandomForestClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='AdaBoostClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='GradientBoostingClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='XGBClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='LogisticRegression', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='SVC', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[]),
 Model(model_name='KNeighborsClassifier', accuracy=[], precission=[], roc_auc=[], f1=[], recall=[])]

In [7]:
# Score every wrapped estimator in place: `cross_validation_` stores the rounded
# mean test metrics on the `Model` instance itself.
for model in models:
    model.cross_validation_()
# Display the wrappers again, now with their metric fields filled in.
models

[Model(model_name='GaussianNB', accuracy=0.777, precission=0.724, roc_auc=0.822, f1=0.697, recall=0.678),
 Model(model_name='DecisionTreeClassifier', accuracy=0.79, precission=0.734, roc_auc=0.781, f1=0.723, recall=0.717),
 Model(model_name='RandomForestClassifier', accuracy=0.815, precission=0.768, roc_auc=0.871, f1=0.753, recall=0.744),
 Model(model_name='AdaBoostClassifier', accuracy=0.802, precission=0.758, roc_auc=0.858, f1=0.738, recall=0.725),
 Model(model_name='GradientBoostingClassifier', accuracy=0.816, precission=0.816, roc_auc=0.866, f1=0.737, recall=0.676),
 Model(model_name='XGBClassifier', accuracy=0.822, precission=0.789, roc_auc=0.868, f1=0.757, recall=0.731),
 Model(model_name='LogisticRegression', accuracy=0.79, precission=0.744, roc_auc=0.851, f1=0.716, recall=0.696),
 Model(model_name='SVC', accuracy=0.677, precission=0.68, roc_auc=0.734, f1=0.41, recall=0.298),
 Model(model_name='KNeighborsClassifier', accuracy=0.707, precission=0.638, roc_auc=0.741, f1=0.592, rec

In [8]:
# Gather the per-model metrics into a single table indexed by model name.
comparison = models_comparison(models)
# Show the table with the highest ROC AUC first.
comparison.sort_values("ROC_AUC", ascending=False)

Unnamed: 0,Accuracy,Precission,ROC_AUC,F1_score,Recall
RandomForestClassifier,0.815,0.768,0.871,0.753,0.744
XGBClassifier,0.822,0.789,0.868,0.757,0.731
GradientBoostingClassifier,0.816,0.816,0.866,0.737,0.676
AdaBoostClassifier,0.802,0.758,0.858,0.738,0.725
LogisticRegression,0.79,0.744,0.851,0.716,0.696
GaussianNB,0.777,0.724,0.822,0.697,0.678
DecisionTreeClassifier,0.79,0.734,0.781,0.723,0.717
KNeighborsClassifier,0.707,0.638,0.741,0.592,0.556
SVC,0.677,0.68,0.734,0.41,0.298
