<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-26/day26_error_analysis_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# day26_pipeline_advanced.py
# Day 26 - Pipelines, Preprocessing & Ensemble Methods

import pandas as pd
import sklearn
from packaging import version
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Read the train file first, then the test file, from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_processed.csv", "test_processed.csv")
)

# Report the (rows, columns) of both frames as a quick sanity check.
print(
    "✅ Data Loaded",
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    sep="\n",
)

# ---------------------------
# Step 2: Features & Target
# ---------------------------
# Target is the "Survived" column; the feature matrices are everything except
# the target and the "PassengerId" identifier.
y = train["Survived"]
X = train.drop(columns=["Survived", "PassengerId"])
X_test = test.drop(columns="PassengerId")

# ---------------------------
# Step 3: Train/Validation Split
# ---------------------------
# Hold out 20% of the rows for validation. The split is stratified on y and
# seeded so it is reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print("✅ Train/Validation Split done")

# ---------------------------
# Step 4: Build Preprocessor
# ---------------------------
# Every column of X must land in exactly one branch: ColumnTransformer's
# default remainder="drop" silently discards any column that is not listed.
# Selecting only int64/float64 would lose bool indicator columns and narrower
# integer dtypes (uint8, int32, ...), so take all numeric + bool columns here
# and route whatever is left (object / string columns) to the categorical
# branch.
num_features = X.select_dtypes(include=["number", "bool"]).columns
cat_features = X.select_dtypes(exclude=["number", "bool"]).columns

# Numeric branch: fill missing values with the column median, then standardise.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# version-safe OneHotEncoder: the dense-output keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2.
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict time
# (handle_unknown="ignore").
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot)
])

# Apply each branch to its own column subset and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", num_pipe, num_features),
    ("cat", cat_pipe, cat_features)
])

# ---------------------------
# Step 5: Define Models
# ---------------------------
# Each base learner is a two-step pipeline: the preprocessor followed by a
# seeded classifier. The random forest is built first, then gradient boosting.
rf, gb = (
    Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", classifier_cls(random_state=42)),
    ])
    for classifier_cls in (RandomForestClassifier, GradientBoostingClassifier)
)

# Ensemble with Voting Classifier (soft voting over the two pipelines)
ensemble = VotingClassifier(
    voting="soft",
    estimators=list(zip(("rf", "gb"), (rf, gb))),
)

# ---------------------------
# Step 6: Train & Evaluate
# ---------------------------
# Fit on the training split, then score the hold-out split with plain accuracy.
val_preds = ensemble.fit(X_train, y_train).predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)

print(f"\n📊 Validation Accuracy (Ensemble): {val_acc:.4f}")

# ---------------------------
# Step 7: Final Predictions
# ---------------------------
# Predict the test set with the fitted ensemble.
test_preds = ensemble.predict(X_test)

# Two-column submission: the passenger id alongside its predicted label.
submission = test[["PassengerId"]].assign(Survived=test_preds)

# Write without the index so the file contains only the two columns.
submission.to_csv("day26_submission.csv", index=False)
print("\n✅ Submission file saved as day26_submission.csv")


✅ Data Loaded
Train shape: (891, 30)
Test shape: (418, 29)
✅ Train/Validation Split done

📊 Validation Accuracy (Ensemble): 0.6536

✅ Submission file saved as day26_submission.csv
