<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-23/day23_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# day23_baseline.py
# Day 23 - Feature Engineering + Baseline Models

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Read the processed train CSV first, then the processed test CSV
# (both paths are relative to the current working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_processed.csv", "test_processed.csv")
)

print("✅ Data Loaded")
# Print the (rows, columns) shape of each frame as a quick sanity check.
for shape_label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(shape_label, frame.shape)

# ---------------------------
# Step 2: Define Features & Target
# ---------------------------
target_column = "Survived"
id_column = "PassengerId"

# Target vector: the "Survived" column of the training frame.
y = train[target_column]

# Feature matrix: every training column except the target and the id.
X = train.drop(columns=[target_column, id_column])

# Test features: the test frame with only the id column removed.
X_test = test.drop(columns=id_column)

# ---------------------------
# Step 3: Train/Validation Split
# ---------------------------
# Hold out 20% of the rows for validation. The seed is fixed so the split
# is repeatable, and y is passed as the stratification labels.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)
print("✅ Train/Validation Split done")

# ---------------------------
# Step 4: Baseline Models
# ---------------------------
# Build the three baseline classifiers, each with a fixed seed.
lr = LogisticRegression(max_iter=1000, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Fit every classifier on the training portion of the split.
for clf in (lr, dt, rf):
    clf.fit(X_train, y_train)

# Predict on the validation portion with each fitted classifier.
lr_preds, dt_preds, rf_preds = (clf.predict(X_val) for clf in (lr, dt, rf))

# Validation accuracy per model, keyed by display name.
# Insertion order: Logistic Regression, Decision Tree, Random Forest.
results = {
    "Logistic Regression": accuracy_score(y_val, lr_preds),
    "Decision Tree": accuracy_score(y_val, dt_preds),
    "Random Forest": accuracy_score(y_val, rf_preds),
}

# ---------------------------
# Step 5: Compare Models
# ---------------------------
print("\n📊 Validation Accuracies:")
for name, score in results.items():
    print(f"{name}: {score:.4f}")

# Pick the name with the highest validation accuracy; when scores tie,
# max() keeps the entry that appears first in `results`.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
print(f"\n🏆 Best Model: {best_model_name}")

# Map the winning name to its fitted estimator. Any name that is not
# listed here resolves to the random forest.
fitted_by_name = {"Logistic Regression": lr, "Decision Tree": dt}
best_model = fitted_by_name.get(best_model_name, rf)

# ---------------------------
# Step 6: Generate Submission
# ---------------------------
# Predict on the test features with the selected model.
test_preds = best_model.predict(X_test)

# Two-column frame: the test ids followed by the predicted "Survived" values.
submission = test[["PassengerId"]].assign(Survived=test_preds)

# Write without the index so the CSV holds only the two columns above.
submission.to_csv("day23_submission.csv", index=False)
print("\n✅ Submission file saved as day23_submission.csv")


✅ Data Loaded
Train shape: (891, 30)
Test shape: (418, 29)
✅ Train/Validation Split done

📊 Validation Accuracies:
Logistic Regression: 0.8436
Decision Tree: 0.7989
Random Forest: 0.7598

🏆 Best Model: Logistic Regression

✅ Submission file saved as day23_submission.csv
