In [1]:
import pandas as pd

In [2]:
# Load the preprocessed training data from the sibling 03_Outputs directory.
# Forward slashes keep this relative path valid on Windows, macOS, and Linux;
# the previous backslash-separated raw string only resolved on Windows.
preprcossed_csv_path = "../03_Outputs/Preprocessed Train.csv"
df = pd.read_csv(preprcossed_csv_path)

In [3]:
# Print the frame's schema (columns, non-null counts, dtypes) as a sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Survived              891 non-null    int64  
 1   Pclass                891 non-null    int64  
 2   Sex                   891 non-null    int64  
 3   SibSp                 891 non-null    int64  
 4   Parch                 891 non-null    int64  
 5   Fare                  891 non-null    float64
 6   Embarked_C            891 non-null    bool   
 7   Embarked_Q            891 non-null    bool   
 8   Embarked_S            891 non-null    bool   
 9   AgeGroup_Child        891 non-null    bool   
 10  AgeGroup_Teen         891 non-null    bool   
 11  AgeGroup_Adult        891 non-null    bool   
 12  AgeGroup_Middle-Aged  891 non-null    bool   
 13  AgeGroup_Senior       891 non-null    bool   
 14  Title_Master          891 non-null    bool   
 15  Title_Miss            8

In [4]:
# Separate the target column ('Survived') from the remaining feature columns.
y_train = df['Survived']
X_train = df.drop(columns=['Survived'])


In [5]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np


In [6]:
# Candidate classifiers keyed by the display name used in the printed reports.
# Every estimator is constructed with random_state=42.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('XGBoost', xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
])


In [7]:
from sklearn.model_selection import StratifiedKFold

# Splitter handed to cross_val_score: five stratified folds, shuffled with a
# fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [8]:
# Cross-validate each candidate on the training data and report the per-fold
# accuracies followed by their mean and standard deviation.
for name, model in models.items():
    print(f"Model: {name}")
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(
        scores,
        f"{name} Accuracy: {scores.mean():.4f} ± {scores.std():.4f}",
        "---------------------------------------",
        sep="\n",
    )


Model: Logistic Regression
[0.82122905 0.82022472 0.83707865 0.8258427  0.83707865]
Logistic Regression Accuracy: 0.8283 ± 0.0074
---------------------------------------
Model: Random Forest
[0.82681564 0.81460674 0.83707865 0.8258427  0.8258427 ]
Random Forest Accuracy: 0.8260 ± 0.0071
---------------------------------------
Model: XGBoost
[0.8603352  0.83146067 0.83707865 0.82022472 0.83707865]
XGBoost Accuracy: 0.8372 ± 0.0131
---------------------------------------


In [9]:
# Load the preprocessed test data from the sibling 03_Outputs directory.
# Forward slashes keep this relative path valid on Windows, macOS, and Linux;
# the previous backslash-separated raw string only resolved on Windows.
preprcossed_test_csv_path = "../03_Outputs/Preprocessed Test.csv"
df_test = pd.read_csv(preprcossed_test_csv_path)

In [10]:
# Separate the target column ('Survived') from the remaining feature columns
# of the test frame.
y_test = df_test['Survived']
X_test = df_test.drop(columns=['Survived'])

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Fit every candidate on the full training set, score it on the test set, and
# keep the metrics in `results` keyed by model name.
# NOTE(review): in the recorded output Logistic Regression reaches 0.9330
# accuracy here versus 0.8283 under cross-validation — confirm that the
# 'Survived' column of the test CSV holds true labels before trusting this.
results = {}
for name, model in models.items():
    print(f"--- {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Scalar metrics, computed in the order accuracy, precision, recall.
    acc, prec, rec = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    cm = confusion_matrix(y_test, y_pred)
    results[name] = dict(
        accuracy_score=acc,
        precision_score=prec,
        recall_score=rec,
        confusion_matrix=cm,
    )

    # One print call; the trailing empty string yields the blank separator line.
    print(
        f"Accuracy:  {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall:    {rec:.4f}",
        "Confusion Matrix:",
        cm,
        "",
        sep="\n",
    )


--- Logistic Regression ---
Accuracy:  0.9330
Precision: 0.8735
Recall:    0.9539
Confusion Matrix:
[[245  21]
 [  7 145]]

--- Random Forest ---
Accuracy:  0.8493
Precision: 0.7730
Recall:    0.8289
Confusion Matrix:
[[229  37]
 [ 26 126]]

--- XGBoost ---
Accuracy:  0.8612
Precision: 0.7901
Recall:    0.8421
Confusion Matrix:
[[232  34]
 [ 24 128]]



In [12]:
# NOTE(review): both selections below rank models by metrics measured on the
# test set; consider choosing on cross-validation scores instead — confirm intent.

# Model name with the highest test-set precision (the first entry wins ties).
best_model_precision = max(
    results.items(), key=lambda entry: entry[1]['precision_score']
)[0]
print(f"Best model based on precision: {best_model_precision}")

# Model name with the highest test-set recall (the first entry wins ties).
best_model_recall = max(
    results.items(), key=lambda entry: entry[1]['recall_score']
)[0]
print(f"Best model based on recall: {best_model_recall}")


Best model based on precision: Logistic Regression
Best model based on recall: Logistic Regression
