In [23]:
from pathlib import Path

import pandas as pd

In [24]:
# Load the preprocessed training set.
# The path is assembled with pathlib so the separator is correct on every OS;
# a literal "..\03_Outputs\..." string only resolves on Windows.
preprcossed_csv_path = Path("..") / "03_Outputs" / "Preprocessed Train.csv"
df = pd.read_csv(preprcossed_csv_path)

In [25]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Survived              891 non-null    int64  
 1   Pclass                891 non-null    int64  
 2   Sex                   891 non-null    int64  
 3   SibSp                 891 non-null    int64  
 4   Parch                 891 non-null    int64  
 5   Fare                  891 non-null    float64
 6   Embarked_C            891 non-null    bool   
 7   Embarked_Q            891 non-null    bool   
 8   Embarked_S            891 non-null    bool   
 9   AgeGroup_Child        891 non-null    bool   
 10  AgeGroup_Teen         891 non-null    bool   
 11  AgeGroup_Adult        891 non-null    bool   
 12  AgeGroup_Middle-Aged  891 non-null    bool   
 13  AgeGroup_Senior       891 non-null    bool   
 14  Title_Master          891 non-null    bool   
 15  Title_Miss            8

In [26]:
# Split the training frame into the target vector (y) and the feature matrix (X).
y_train = df['Survived']
X_train = df.drop(columns=['Survived'])


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np


In [28]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Every entry carries: a display name, an unfitted estimator, and its grid.
models = [
    # 3 x 2 x 2 = 12 candidates. 'max_iter' is part of the grid as well, so the
    # value given to the constructor is only the estimator's starting setting.
    dict(
        name='Logistic Regression',
        model=LogisticRegression(max_iter=1000, random_state=42),
        param_grid={
            'C': [0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear'],
            'max_iter': [100, 1000],
        },
    ),
    # 3 x 3 x 2 x 2 = 36 candidates.
    dict(
        name='Random Forest',
        model=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        },
    ),
    # 2 x 2 x 2 x 2 = 16 candidates.
    dict(
        name='XGBoost',
        model=xgb.XGBClassifier(eval_metric='logloss', random_state=42),
        param_grid={
            'n_estimators': [50, 100],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1],
            'subsample': [0.8, 1.0],
        },
    ),
]


In [29]:
# Load the preprocessed test set.
# The path is assembled with pathlib so the separator is correct on every OS;
# a literal "..\03_Outputs\..." string only resolves on Windows.
preprcossed_test_csv_path = Path("..") / "03_Outputs" / "Preprocessed Test.csv"
df_test = pd.read_csv(preprcossed_test_csv_path)

In [30]:
# Split the test frame into the target vector (y) and the feature matrix (X).
# NOTE(review): presumably this is the Kaggle Titanic data, whose public test
# split ships without labels — confirm where the 'Survived' column in the test
# CSV comes from before trusting the test metrics. TODO verify.
y_test = df_test['Survived']
X_test = df_test.drop(columns=['Survived'])

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Tune each candidate with 5-fold cross-validation on the training data, then
# score the refitted best estimator on the test set.
# `results` maps model name -> dict of best params, metrics and fitted estimator.
results = {}

for model_dict in models:
    name = model_dict['name']
    model = model_dict['model']
    param_grid = model_dict['param_grid']
    print(f"--- Tuning and Evaluating {name} ---")

    # Exhaustive grid search; n_jobs=-1 uses every available core.
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Best parameter combination, refitted on the whole training set.
    best_model = grid_search.best_estimator_

    # Predict on test set
    y_pred = best_model.predict(X_test)

    # Test-set metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Store results and best params.
    # 'cv_accuracy' keeps the mean cross-validated score of the winning
    # parameter set, so models can be compared without touching the test set.
    results[name] = {
        'best_params': grid_search.best_params_,
        'cv_accuracy': grid_search.best_score_,
        'accuracy_score': acc,
        'precision_score': prec,
        'recall_score': rec,
        'confusion_matrix': cm,
        'best_estimator': best_model
    }

    print(f"{name} best params: {grid_search.best_params_}")
    print(f"{name} accuracy: {acc:.4f}")
    print()


--- Tuning and Evaluating Logistic Regression ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Logistic Regression best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Logistic Regression accuracy: 0.9330

--- Tuning and Evaluating Random Forest ---
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest best params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Random Forest accuracy: 0.8780

--- Tuning and Evaluating XGBoost ---
Fitting 5 folds for each of 16 candidates, totalling 80 fits
XGBoost best params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 50, 'subsample': 0.8}
XGBoost accuracy: 0.8708



In [32]:
# Pick the entry with the highest stored accuracy (swap the key to rank by a
# different metric).
# NOTE(review): 'accuracy_score' is computed on the test set in the tuning
# loop, so choosing the winner with it makes the reported test metrics
# optimistic; consider ranking by a cross-validation score instead.
best_model_name, best_result = max(
    results.items(),
    key=lambda entry: entry[1]['accuracy_score'],
)
print(f"Best model based on accuracy: {best_model_name}")

# Report the winner's hyper-parameters, metrics and confusion matrix.
print(f"Best params: {best_result['best_params']}")
print(f"Accuracy: {best_result['accuracy_score']:.4f}")
print(f"Precision: {best_result['precision_score']:.4f}")
print(f"Recall: {best_result['recall_score']:.4f}")
print("Confusion Matrix:")
print(best_result['confusion_matrix'])

Best model based on accuracy: Logistic Regression
Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Accuracy: 0.9330
Precision: 0.8735
Recall: 0.9539
Confusion Matrix:
[[245  21]
 [  7 145]]
