<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-5/day05_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from pathlib import Path

# -----------------------------
# Step 1 – Load Clean Dataset
# -----------------------------
# The cleaned CSV is looked up relative to the current working directory
# (no "day02/" prefix).
data_path = Path("day02_titanic_clean.csv")

if data_path.exists():
    df = pd.read_csv(data_path)
else:
    # Fail early with a pointer to the script that produces this file.
    raise FileNotFoundError("❌ day02_titanic_clean.csv not found. Please run your Day 2 script first.")

print("✅ Data loaded:", df.shape)

# -----------------------------
# Step 2 – Additional Feature Engineering
# -----------------------------

# Family size & IsAlone (if not already created in Day 2).
# FamilySize = siblings/spouses + parents/children + the passenger.
if "FamilySize" not in df.columns and {"SibSp", "Parch"}.issubset(df.columns):
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# IsAlone is derived from FamilySize, so it is only built when FamilySize is
# actually available; otherwise it is skipped instead of raising a KeyError.
if "IsAlone" not in df.columns and "FamilySize" in df.columns:
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Extract Title from Name if available
if "Name" in df.columns:
    # Capture the run of letters that follows a space and ends with a period.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse infrequent titles into a single bucket and normalise variants.
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare'
    )
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
else:
    # NOTE: a constant column produces no dummy columns under drop_first=True,
    # so this placeholder contributes no feature to the model.
    df['Title'] = "Unknown"

# One-hot encode categorical columns. Only columns that are present are
# encoded, so a dataset where Sex/Embarked were already encoded (or dropped)
# upstream does not raise a KeyError. Title always exists at this point, so
# the list is never empty.
categorical_cols = [col for col in ('Sex', 'Embarked', 'Title') if col in df.columns]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("✅ Features engineered. New shape:", df.shape)

# -----------------------------
# Step 3 – Train/Test Split
# -----------------------------
# The target must be present before features and labels can be separated.
if "Survived" not in df.columns:
    raise KeyError("❌ Target column 'Survived' not found in dataset.")

# Drop the target and identifier/free-text columns from the features;
# columns that are absent are ignored.
X = df.drop(
    columns=['Survived', 'PassengerId', 'Ticket', 'Cabin', 'Name'],
    errors='ignore',
)
y = df['Survived']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("✅ Train/Test split ->", X_train.shape, X_test.shape)

# -----------------------------
# Step 4 – Hyperparameter Tuning (Decision Tree)
# -----------------------------
# Search space: 6 depths x 3 split thresholds x 2 criteria.
param_grid = {
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy'],
}

# Seeded base estimator; the grid search is run with 5-fold CV on the
# training split and scored by accuracy.
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

print("\n🎯 Best Parameters:", grid_search.best_params_)
print("📊 Best CV Accuracy:", round(grid_search.best_score_, 4))

# -----------------------------
# Step 5 – Evaluate Best Model
# -----------------------------
# Take the best estimator found by the grid search and predict on the
# held-out test split.
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)

# Header and report are emitted on consecutive lines.
print(
    "\n--- Classification Report (Test Set) ---",
    classification_report(y_test, y_pred),
    sep="\n",
)

# -----------------------------
# Step 6 – Save Results
# -----------------------------
# Write the full cross-validation table to day05/, creating the directory
# if it does not exist yet.
out_dir = Path("day05")
out_dir.mkdir(exist_ok=True)
results_path = out_dir / "day05_gridsearch_results.csv"

cv_table = pd.DataFrame(grid_search.cv_results_)
cv_table.to_csv(results_path, index=False)

print("\n✅ Day 5 complete! GridSearch results saved at:", results_path)

✅ Data loaded: (891, 12)
✅ Features engineered. New shape: (891, 13)
✅ Train/Test split -> (712, 11) (179, 11)

🎯 Best Parameters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 10}
📊 Best CV Accuracy: 0.8203

--- Classification Report (Test Set) ---
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       110
           1       0.80      0.58      0.67        69

    accuracy                           0.78       179
   macro avg       0.79      0.74      0.75       179
weighted avg       0.78      0.78      0.77       179


✅ Day 5 complete! GridSearch results saved at: day05/day05_gridsearch_results.csv
