<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-4/day04_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Day 4 - Train/Test + Metrics (Titanic)
import pandas as pd
import numpy as np
from pathlib import Path

# plotting & sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve, classification_report
)
import joblib

# Setup
ASSETS = Path("assets"); ASSETS.mkdir(exist_ok=True)
df = pd.read_csv("train.csv")  # path relative to day04

# 1) Prepare data
X = df.drop(columns=['Survived'])
y = df['Survived']

# If any non-numeric columns remain, encode or drop (we used Day2 one-hot)
print("Feature columns:", X.columns.tolist())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:,1]  # for ROC-AUC (binary)

dt = DecisionTreeClassifier(random_state=42, max_depth=5)  # max_depth as simple regularizer
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:,1]

def eval_and_plot(y_true, y_pred, y_proba, model_name="Model"):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_proba)

    print(f"--- {model_name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("\nClassification report:\n", classification_report(y_true, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{model_name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(ASSETS/f"{model_name}_confusion.png", bbox_inches='tight')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label=f"{model_name} (AUC={roc_auc:.3f})")
    plt.plot([0,1],[0,1], linestyle='--', color='gray')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{model_name} - ROC Curve")
    plt.legend()
    plt.savefig(ASSETS/f"{model_name}_roc.png", bbox_inches='tight')
    plt.show()

    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": roc_auc}

# Run eval
res_lr = eval_and_plot(y_test, y_pred_lr, y_proba_lr, model_name="LogisticRegression")
res_dt = eval_and_plot(y_test, y_pred_dt, y_proba_dt, model_name="DecisionTree")

# Pick model to save (example: RandomForest if best)
joblib.dump(rf, ASSETS/"best_model_rf.joblib")

# Save results summary
import json
results = {"LogisticRegression": res_lr, "DecisionTree": res_dt, "RandomForest": res_rf}
with open(ASSETS/"results_summary.json","w") as f:
    json.dump(results, f, indent=2)



Feature columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


ValueError: could not convert string to float: 'Lam, Mr. Ali'