# Titanic Survival Prediction
**Goal:** Predict passenger survival on the Titanic using classical machine learning.  
**Dataset:** Titanic (Seaborn)  
**Model:** Random Forest (pipeline with preprocessing)  
**Metrics:** Accuracy, Precision, Recall, F1-score  

---

## 1️⃣ Imports


In [None]:
# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# --- Reproducibility ---
# One shared seed: it initialises NumPy's global RNG here and is reused as the
# random_state of the model, the CV splitter and the train/test split below.
SEED = 42
np.random.seed(SEED)


## 2️⃣ Load and Inspect Dataset


In [None]:
# Load the Titanic example dataset via seaborn
titanic = sns.load_dataset("titanic")
print(titanic.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
titanic.info()

# Basic cleanup: drop rows with missing values in the key features
titanic = titanic.dropna(subset=["age", "embarked", "fare"])

## 3️⃣ Feature Selection


In [None]:
# Column roles consumed by the preprocessing step
num_cols = ["age", "sibsp", "parch", "fare", "pclass"]
cat_cols = ["sex", "embarked"]

# Model inputs (seven passenger attributes) and the survival label
X = titanic.loc[:, ["pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"]]
y = titanic.loc[:, "survived"]


## 4️⃣ Preprocessing + Pipeline

In [None]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones (categories not seen during fit are ignored
# at transform time instead of raising).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Full pipeline: preprocessing followed by a seeded Random Forest
clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(random_state=SEED)),
    ]
)


## 5️⃣ Hyperparameter Tuning


In [None]:
# Hyperparameter grid; the "model__" prefix routes each setting to the
# pipeline step named "model" (the RandomForestClassifier).
param_grid = {
    "model__n_estimators": [50, 100],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 4],
}

# Tune on the training portion only. Fitting the search on the full (X, y)
# would let the held-out test rows influence the chosen hyperparameters and
# make the test-set metrics optimistically biased. The split arguments match
# the evaluation split (same test_size, seed and stratification), so the rows
# held out here are exactly the rows used for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# 5-fold stratified CV over the training rows, scored by accuracy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid_search = GridSearchCV(clf, param_grid, cv=cv, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


## 6️⃣ Train & Evaluate

In [None]:
# Hold out a stratified 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=SEED,
)

# Take the tuned pipeline from the grid search
best_model = grid_search.best_estimator_

# fit() returns the pipeline itself, so refitting on the training rows and
# predicting on the held-out rows can be chained.
y_pred = best_model.fit(X_train, y_train).predict(X_test)

## 7️⃣ Classification Metrics

In [None]:
# Per-class precision / recall / F1 plus overall accuracy
print(classification_report(y_test, y_pred))

# Confusion matrix of true vs. predicted labels, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Not Survived", "Survived"],
)
disp.plot(values_format="d", cmap="Blues")
plt.show()


## 8️⃣ Feature Importance

In [None]:
# Pull the fitted steps out of the pipeline
fitted_forest = best_model.named_steps["model"]
fitted_encoder = best_model.named_steps["preprocess"].named_transformers_["cat"]

# The preprocessor emits the numeric columns first, then the one-hot columns,
# so the labels are the numeric names followed by the encoder's output names.
importances = fitted_forest.feature_importances_
features = [*num_cols, *fitted_encoder.get_feature_names_out()]

sns.barplot(x=importances, y=features)
plt.title("Feature Importances")
plt.show()
