# 🚢 Titanic Machine Learning — End-to-End Experiment

This notebook walks you through an end-to-end binary classification workflow:

1. Load & clean the Titanic dataset  
2. Train multiple models: Logistic Regression, SVM, Decision Tree, Random Forest, AdaBoost  
3. Evaluate with ROC and Precision-Recall curves  
4. Compute **Accuracy**, **Sensitivity**, **Specificity**, **Precision**, **F1** across thresholds  
5. Find the best decision threshold (the one that maximizes F1 on the validation set) and evaluate on the test set

**Dataset:** Download `train.csv` from the [Kaggle Titanic competition](https://www.kaggle.com/competitions/titanic/data) and save it as `../data/train.csv` (relative to this notebook).

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc, f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
import joblib, os

## 1️⃣ Load and clean data

In [None]:
# Load the Kaggle training file and keep only the columns used for modelling.
path = '../data/train.csv'
df = pd.read_csv(path)
# .copy() makes the selection an independent frame, so the column assignments
# below cannot raise pandas' SettingWithCopyWarning.
df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()

# Impute missing values: median for the numeric columns, most frequent value
# for the port of embarkation.
# NOTE(review): these fill values are computed on the full dataset, i.e. before
# the train/validation/test split, so held-out rows influence them slightly.
# Consider moving imputation into the preprocessing pipeline.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df.head()

## 2️⃣ Train/validation/test split

In [None]:
# Separate the target from the features.
X, y = df.drop(columns=['Survived']), df['Survived']

# Two stratified splits give 60% train / 20% validation / 20% test:
# first hold out 40% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

## 3️⃣ Preprocessor & models

In [None]:
# Columns that are standardized vs. one-hot encoded.
numeric = ['Pclass','Age','SibSp','Parch','Fare']
categorical = ['Sex','Embarked']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the cell runs on both old and new installations.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

pre = ColumnTransformer([
    ('num', StandardScaler(), numeric),
    ('cat', onehot, categorical)
])

# Candidate classifiers, keyed by the display name used in plots and in the
# saved model's file name.
models = {
    'Logistic': LogisticRegression(max_iter=1000),
    # probability=True fits an internal cross-validated calibration; seeding it
    # keeps predict_proba reproducible between runs.
    'SVM': SVC(probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=200, random_state=42)
}

## 4️⃣ Train and evaluate models

In [None]:
# Fit every candidate on the training split and score it on the validation
# split. `results` maps model name -> fitted pipeline plus its ROC / PR curve
# points and the corresponding areas under the curve.
results = {}
for name, m in models.items():
    # clone(pre) gives each pipeline its own preprocessor; without it every
    # pipeline would share (and refit) the same ColumnTransformer instance.
    pipe = Pipeline([('pre', clone(pre)), ('model', m)])
    pipe.fit(X_train, y_train)
    probs = pipe.predict_proba(X_val)[:,1]  # probability of the positive class
    fpr, tpr, _ = roc_curve(y_val, probs)
    pr_prec, pr_rec, _ = precision_recall_curve(y_val, probs)
    results[name] = {'pipe': pipe, 'fpr': fpr, 'tpr': tpr, 'pr_prec': pr_prec, 'pr_rec': pr_rec,
                     'roc_auc': auc(fpr, tpr), 'pr_auc': auc(pr_rec, pr_prec)}

# ROC curves for all models, with the chance diagonal for reference.
plt.figure(figsize=(6,5))
for n,r in results.items():
    plt.plot(r['fpr'], r['tpr'], label=f"{n} (AUC={r['roc_auc']:.2f})")
plt.plot([0,1],[0,1],'--',c='gray'); plt.xlabel('FPR'); plt.ylabel('TPR'); plt.legend(); plt.title('ROC Curves')
plt.show()

# Precision-Recall curves, drawn from the values already stored in `results`.
plt.figure(figsize=(6,5))
for n,r in results.items():
    plt.plot(r['pr_rec'], r['pr_prec'], label=f"{n} (AUC={r['pr_auc']:.2f})")
plt.xlabel('Recall'); plt.ylabel('Precision'); plt.legend(); plt.title('Precision-Recall Curves')
plt.show()

## 5️⃣ Metrics vs Threshold (for best model)

In [None]:
# Pick the model with the highest validation ROC AUC and sweep its decision
# threshold over 0.01 .. 0.99 in steps of 0.01.
best_name = max(results, key=lambda n: results[n]['roc_auc'])
pipe = results[best_name]['pipe']
probs = pipe.predict_proba(X_val)[:,1]
thresholds = np.linspace(0.01, 0.99, 99)

accs, sens, specs, f1s, precs = [], [], [], [], []
for t in thresholds:
    preds = (probs >= t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking is always valid.
    tn, fp, fn, tp = confusion_matrix(y_val, preds, labels=[0, 1]).ravel()
    accs.append((tp+tn)/(tp+tn+fp+fn))
    sens.append(tp/(tp+fn))
    specs.append(tn/(tn+fp))
    # zero_division=0 keeps the score at 0 (the default value) but silences the
    # UndefinedMetricWarning raised when a threshold yields no positive predictions.
    f1s.append(f1_score(y_val, preds, zero_division=0))
    precs.append(precision_score(y_val, preds, zero_division=0))

# The best threshold is the one that maximizes F1 on the validation split
# (np.argmax returns the first maximum on ties).
best_t = thresholds[np.argmax(f1s)]
print(f"Best threshold = {best_t:.2f}")

plt.figure(figsize=(8,5))
plt.plot(thresholds, accs, label='Accuracy')
plt.plot(thresholds, sens, label='Sensitivity')
plt.plot(thresholds, specs, label='Specificity')
plt.plot(thresholds, precs, label='Precision')
plt.plot(thresholds, f1s, label='F1')
plt.axvline(best_t, ls='--', c='gray', label=f'Best threshold ({best_t:.2f})')
plt.legend(); plt.xlabel('Threshold'); plt.ylabel('Metric'); plt.title(f'{best_name} - Metrics vs Threshold')
plt.show()

## 6️⃣ Final test evaluation using best threshold

In [None]:
# Score the held-out test split once, using the threshold tuned on validation.
probs_test = pipe.predict_proba(X_test)[:, 1]
preds = (probs_test >= best_t).astype(int)

# Report each scalar metric in a fixed order, then the raw confusion matrix.
test_metrics = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
for label, metric_fn in test_metrics:
    print(f'{label}:', metric_fn(y_test, preds))
print('Confusion matrix:\n', confusion_matrix(y_test, preds))

## 7️⃣ Save model

In [None]:
# Persist the selected pipeline (preprocessor + classifier) under ../models,
# naming the file after the winning model.
model_dir = '../models'
model_file = f'model_{best_name.lower()}.pkl'

os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipe, f'{model_dir}/{model_file}')
print(f'Model saved as {model_file}')