<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-13/day13_stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Day 13 - Titanic Kaggle Submission with Stacking Ensemble

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ---------------------------
# Step 1: Load processed data
# ---------------------------
# Both CSV files are read relative to the notebook's working directory.
train = pd.read_csv("train_processed.csv")
test = pd.read_csv("test_processed.csv")

# Passenger IDs for the submission file, kept as a one-column DataFrame.
# A 0-based counter is only a placeholder and is not the competition's real
# ID sequence, so reuse the real "PassengerId" column whenever the processed
# test file still carries it.
if "PassengerId" in test.columns:
    test_passenger_ids = pd.DataFrame({"PassengerId": test["PassengerId"]})
else:
    # Fallback placeholder: one ID per test row, starting at 0.
    # TODO: load the real IDs from the original test file instead, e.g.
    #   test_passenger_ids = pd.read_csv("original_test.csv")[["PassengerId"]]
    # (assumes the processed rows are in the same order as the original file
    # -- confirm before submitting).
    test_passenger_ids = pd.DataFrame({"PassengerId": range(len(test))})

# ---------------------------
# Step 1.5: Preprocess data (One-Hot Encoding for 'Embarked')
# ---------------------------
# 'Embarked' is a string column (it previously triggered a ValueError), so it
# is one-hot encoded before modelling.
#
# Train and test must be encoded against the SAME set of levels. Encoding each
# frame independently would yield different dummy columns (or drop a different
# "first" level) whenever one split is missing a port or contains an extra one.
# The levels are therefore taken from the training data and applied to both.
embarked_levels = sorted(train["Embarked"].dropna().unique())
train["Embarked"] = pd.Categorical(train["Embarked"], categories=embarked_levels)
# Test values outside the training levels become NaN here and are therefore
# encoded as all-zero dummy rows, exactly like a missing value.
test["Embarked"] = pd.Categorical(test["Embarked"], categories=embarked_levels)

# With a categorical column, get_dummies emits one column per declared level
# (observed or not); drop_first removes the same first level in both frames.
train = pd.get_dummies(train, columns=["Embarked"], drop_first=True)
test = pd.get_dummies(test, columns=["Embarked"], drop_first=True)

# Separate features (X) and target (y) from the training data
X = train.drop(columns="Survived")
y = train["Survived"]

# ---------------------------
# Step 2: Split data for validation
# ---------------------------
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split identical from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# ---------------------------
# Step 3: Define base models
# ---------------------------
# Gradient-boosted trees: 200 shallow (depth-3) trees, log-loss eval metric.
xgb_settings = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb = XGBClassifier(**xgb_settings)

# Random forest with the same tree count and seed.
rf = RandomForestClassifier(random_state=42, n_estimators=200)

# Logistic regression; used as the stacking meta-learner in Step 4.
lr = LogisticRegression(max_iter=1000)

# ---------------------------
# Step 4: Build Stacking model
# ---------------------------
# Level-0 learners whose predictions feed the final estimator.
base_learners = [("rf", rf), ("xgb", xgb)]

# passthrough=False: the final estimator is trained on the base learners'
# predictions only, not on the original feature columns.
stack_model = StackingClassifier(
    estimators=base_learners, final_estimator=lr, passthrough=False
)

# ---------------------------
# Step 5: Train and validate
# ---------------------------
# Fit on the training split, then score the held-out validation split.
y_pred = stack_model.fit(X_train, y_train).predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)

# ---------------------------
# Step 6: Train on full data & Predict Test
# ---------------------------
# Build the test feature matrix first so a schema problem fails fast, before
# the (slower) refit on the full training data.
#
# The processed test file may or may not carry a 'Survived' column; drop it
# only if it is present instead of raising KeyError on an unlabeled test set.
test_features = test.drop(columns=["Survived"], errors="ignore")

# The model must see exactly the training columns, in the training order.
missing_cols = [col for col in X.columns if col not in test_features.columns]
if missing_cols:
    raise ValueError(
        f"Test data is missing training feature columns: {missing_cols}"
    )
# Selecting by X.columns fixes the order and discards any column the model
# was never trained on.
test_features = test_features[X.columns]

# Refit the same stacking model on all labelled rows, then predict the test set.
stack_model.fit(X, y)
test_pred = stack_model.predict(test_features)

# ---------------------------
# Step 7: Create submission file
# ---------------------------
# Two columns: the passenger IDs prepared in Step 1 and the predicted label.
# Both must have one entry per test row.
submission_columns = {
    "PassengerId": test_passenger_ids["PassengerId"],
    "Survived": test_pred,
}
submission = pd.DataFrame(submission_columns)

# The file is written to the current working directory (no sub-directory is
# created); index=False keeps the row index out of the CSV.
submission.to_csv("day13_stacking.csv", index=False)

print("✅ Submission saved at: day13_stacking.csv")


Validation Accuracy: 0.8491620111731844
✅ Submission saved at: day13_stacking.csv
