# Titanic ML
This notebook focuses on building machine learning models for the Titanic survival prediction task.
The data has already been cleaned and preprocessed in the data analysis notebook, ensuring consistent feature engineering for both train and test datasets.

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [15]:
# Load Preprocessed Data
train_df = pd.read_csv("train_preprocessed.csv")
test_df = pd.read_csv("test_preprocessed.csv")

# Prepare features and target
X = train_df.drop(columns=['Survived', 'PassengerId'])
y = train_df['Survived']

# Test set (for Kaggle submission)
X_test = test_df.drop(columns=['PassengerId'])
test_ids = test_df['PassengerId']

# Train/Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# Model Training & Evaluation

def evaluate_model(model, model_name):
    """Train, evaluate, and return accuracy score."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    
    print(f" {model_name}")
    print("Validation Accuracy:", acc)
    print(classification_report(y_valid, y_pred))
    print("-" * 50)
    return acc

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
acc_lr = evaluate_model(lr_model, "Logistic Regression")

# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_split=4,
    random_state=42
)
acc_rf = evaluate_model(rf_model, "Random Forest")

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
acc_xgb = evaluate_model(xgb_model, "XGBoost")



 Logistic Regression
Validation Accuracy: 0.8268156424581006
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       110
           1       0.79      0.75      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

--------------------------------------------------
 Random Forest
Validation Accuracy: 0.8268156424581006
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.80      0.74      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.83      0.83      0.83       179

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 XGBoost
Validation Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.68      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179

--------------------------------------------------


In [None]:
# Compare Model Performances

results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Validation Accuracy': [acc_lr, acc_rf, acc_xgb]
})

print("\n Model Comparison")
print(results)

# Plot results
sns.barplot(data=results, x='Model', y='Validation Accuracy')
plt.ylim(0.7, 1.0)
plt.title("Model Performance Comparison")
plt.show()

In [None]:
# Train Best Model on Full Data

best_model = xgb_model  # Choose based on results
best_model.fit(X, y)

# Predict on test set
final_preds = best_model.predict(X_test)

In [None]:

# Create Kaggle Submission
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': final_preds
})
submission.to_csv("submission.csv", index=False)

print("Submission file created: submission.csv")