<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-8/day08_trees_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# day08_trees_rf.py
# Run from project root: python day08_trees_rf.py
# Requires: pandas, numpy, scikit-learn, matplotlib, seaborn, joblib

import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
import joblib

# Output locations: everything this script writes lives under day08/,
# with plots and serialized models under day08/assets/.
OUT = Path("day08")
OUT.mkdir(exist_ok=True)

ASSETS = OUT / "assets"
ASSETS.mkdir(exist_ok=True)

# -----------------------
# 0) Load data (use best processed file if available)
# -----------------------
# Candidate inputs, listed in order of preference; the first one found wins.
candidates = [
    Path("day05/day05_titanic_feat.csv"),
    Path("day05_titanic_feat.csv"),
    Path("day02/day02_titanic_clean.csv"),
    Path("day02/day02_titanic_preserved.csv"),
    Path("train.csv")
]

# Pick the first candidate that exists on disk (None when none do).
p = next((path for path in candidates if path.exists()), None)
if p is None:
    raise FileNotFoundError("No input CSV found. Put day05_titanic_feat.csv or day02_titanic_clean.csv or train.csv in the project.")

print("Loading:", p)
data = pd.read_csv(p)

# The rest of the script is supervised learning on 'Survived', so fail early
# if the loaded file does not carry the target column.
if 'Survived' not in data.columns:
    raise KeyError("'Survived' column not found in loaded data.")

# -----------------------
# 1) Prepare X, y (simple safe preprocessing)
# -----------------------
# Work on a copy so the loaded frame stays untouched.
df = data.copy()

# Drop obviously useless columns if present (missing ones are skipped).
df = df.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], errors='ignore')

# If Sex is still string, convert: values starting with "m" -> 1, others -> 0.
if 'Sex' in df.columns and df['Sex'].dtype == object:
    df['Sex'] = (df['Sex'].str.lower().str.startswith('m')).astype(int)

# Coerce Age/Fare to numeric and fill gaps with the column median.
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series under pandas Copy-on-Write and would leave `df` unfilled.
# NOTE: the medians are computed on the full frame, i.e. before the
# train/test split below.
for col in ('Age', 'Fare'):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(df[col].median())

# One-hot encode any leftover categorical columns (never the target).
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols = [c for c in cat_cols if c != 'Survived']
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

X = df.drop(columns=['Survived'])
y = df['Survived']

# Train/test split (we'll keep a holdout test set); stratified on the target
# so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Helper to evaluate and save plots
def evaluate_and_plot(name, model, X_test, y_test, prefix):
    """Score a fitted classifier on held-out data and save diagnostic plots.

    Prints the metric dict and a classification report, then writes
    ``<prefix>_confusion.png`` and, when the model exposes ``predict_proba``,
    ``<prefix>_roc.png`` into ``ASSETS``.

    Args:
        name: Label used in the printed output and in the plot titles.
        model: Fitted estimator providing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.
        prefix: Filename prefix for the saved images.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc`` (``None`` when the model has no ``predict_proba``).
    """
    predictions = model.predict(X_test)

    # Column 1 of predict_proba is used as the score for the ROC/AUC.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = None
    has_scores = scores is not None

    metrics = {
        "accuracy": float(accuracy_score(y_test, predictions)),
        "precision": float(precision_score(y_test, predictions)),
        "recall": float(recall_score(y_test, predictions)),
        "f1": float(f1_score(y_test, predictions)),
        "roc_auc": float(roc_auc_score(y_test, scores)) if has_scores else None,
    }
    print(f"\n== {name} metrics:", metrics)
    print(classification_report(y_test, predictions))

    # Confusion matrix heatmap.
    cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=cm_ax,
    )
    cm_ax.set_title(f"{name} Confusion")
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("Actual")
    cm_fig.tight_layout()
    cm_fig.savefig(ASSETS / f"{prefix}_confusion.png")
    plt.close(cm_fig)

    # ROC curve, only when scores are available.
    if has_scores:
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_fig, roc_ax = plt.subplots(figsize=(5, 4))
        roc_ax.plot(fpr, tpr, label=f"AUC={metrics['roc_auc']:.3f}")
        # Diagonal reference line from (0, 0) to (1, 1).
        roc_ax.plot([0, 1], [0, 1], '--', color='gray')
        roc_ax.set_xlabel("FPR")
        roc_ax.set_ylabel("TPR")
        roc_ax.set_title(f"{name} ROC")
        roc_ax.legend()
        roc_fig.tight_layout()
        roc_fig.savefig(ASSETS / f"{prefix}_roc.png")
        plt.close(roc_fig)

    return metrics

# Collects the metrics of every model trained below; written to JSON at the end.
results = {}

# -----------------------
# 2) Decision Tree baseline
# -----------------------
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
joblib.dump(dt, ASSETS / "dt_baseline.joblib")
results['decision_tree_baseline'] = evaluate_and_plot("DecisionTree (baseline)", dt, X_test, y_test, "dt_baseline")

# Optional: visualize the top levels of the tree (drawing is capped at depth 3).
# This stays best-effort - a plotting problem must not abort the run - but the
# failure is reported instead of being silently swallowed, and the figure is
# always closed so it cannot leak into later plots.
try:
    tree_fig = plt.figure(figsize=(12, 8))
    try:
        plot_tree(
            dt,
            max_depth=3,
            fontsize=8,
            # Pass a plain list: some scikit-learn releases validate
            # feature_names strictly and reject a pandas Index.
            feature_names=list(X.columns),
            class_names=['0', '1'],
            filled=True,
        )
        plt.tight_layout()
        plt.savefig(ASSETS / "dt_tree_plot.png")
    finally:
        plt.close(tree_fig)
except Exception as exc:
    print("Skipping decision tree plot:", repr(exc))

# -----------------------
# 3) Random Forest baseline
# -----------------------
# Fit a 200-tree forest with otherwise default settings, persist it, and score it.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)
joblib.dump(rf, ASSETS / "rf_baseline.joblib")
results['random_forest_baseline'] = evaluate_and_plot(
    "RandomForest (baseline)", rf, X_test, y_test, "rf_baseline"
)

# Feature importances, largest first; only the top 20 are plotted.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_importances = importances.head(20)

imp_fig, imp_ax = plt.subplots(figsize=(6, 8))
sns.barplot(x=top_importances.values, y=top_importances.index, ax=imp_ax)
imp_ax.set_title("RF Top 20 Feature Importances")
imp_fig.tight_layout()
imp_fig.savefig(ASSETS / "rf_feature_importances.png")
plt.close(imp_fig)

# -----------------------
# 4) Quick GridSearch (small) for RF (keeps runtime modest)
# -----------------------
# Small search space: 2 x 3 x 2 = 12 candidates, each scored with 4-fold CV.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [4, 6, None],
    "max_features": ["sqrt", "log2"]
}
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
gs = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Keep the refit winner, persist it, then record both its cross-validated
# score and its metrics on the holdout test set.
best_rf = gs.best_estimator_
joblib.dump(best_rf, ASSETS / "rf_best_grid.joblib")
results['random_forest_grid'] = {
    'best_params': gs.best_params_,
    'cv_best_score': float(gs.best_score_),
    'test_metrics': evaluate_and_plot("RandomForest (tuned)", best_rf, X_test, y_test, "rf_tuned"),
}

# -----------------------
# 5) Save results summary
# -----------------------
# Persist the collected metrics as pretty-printed JSON next to the assets folder.
summary_path = OUT / "day08_results.json"
with summary_path.open("w") as fh:
    fh.write(json.dumps(results, indent=2))

print("\nSaved results to", OUT, "assets to", ASSETS)
print("Summary:", results)
