In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from pathlib import Path
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import joblib

In [24]:
# --- Run configuration ------------------------------------------------------
# Single seed reused by the train/validation split, the CV folds and every
# estimator so that a full re-run is reproducible.
SEED = 42
# Fraction of rows held out for validation by load_data().
TEST_SIZE = 0.2

# Input table read with pd.read_parquet (relative to the working directory).
TRAIN_PATH = "data/spaceship-titanic-train.parquet"

# Artifacts written by train_and_evaluate(): the pickled fitted model and a
# plain-text summary of the validation metrics.
MODEL_PATH = Path("../../models/XGB-spaceship-titanic.pkl")
PARAMS_PATH = Path("../../models/sXGB-spaceship-titanic.txt")


def build_stacking_model():
    """Build the (unfitted) stacking ensemble.

    NOTE: this function is defined a second time in the training cell further
    down the notebook, and that later definition is the one in effect after
    "Restart & Run All". This copy is kept identical to it (including
    ``verbose=0`` for CatBoost) so the result does not depend on which cell
    was executed last. Prefer deleting one of the two definitions.

    Returns:
        StackingClassifier: XGBoost, LightGBM and CatBoost base learners
        combined by a logistic-regression final estimator.
    """
    level0 = [
        ('xgb', XGBClassifier(tree_method="hist", enable_categorical=True, eval_metric="logloss", random_state=SEED)),
        ('lgbm', LGBMClassifier(random_state=SEED)),
        # verbose=0 silences CatBoost's per-iteration training log, matching
        # the later definition of this function.
        ('catboost', CatBoostClassifier(verbose=0, random_state=SEED))
    ]
    level1 = LogisticRegression(max_iter=1000, random_state=SEED)
    return StackingClassifier(
        estimators=level0,
        final_estimator=level1,
        # passthrough=True: the final estimator is trained on the original
        # features in addition to the base learners' predictions.
        passthrough=True,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
        n_jobs=-1
    )

In [22]:
# Quick visual sanity check of the training table. Read through the shared
# TRAIN_PATH constant (config cell) rather than a repeated literal so this
# preview and load_data() always look at the same file.
train_df = pd.read_parquet(TRAIN_PATH)
train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_deck,Cabin_num,Cabin_side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0.0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0.0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0.0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0.0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1.0,S


In [23]:
def load_data():
    """Read the training table and split it into train and validation parts.

    The ``Transported`` column is the target; every other column is kept as a
    feature. The split is stratified on the target and controlled by the
    module-level ``TEST_SIZE`` and ``SEED`` constants.

    Returns:
        The four objects produced by ``train_test_split``, in the order
        ``X_train, X_valid, y_train, y_valid``.
    """
    frame = pd.read_parquet(TRAIN_PATH)
    target = frame["Transported"]
    features = frame.drop(columns="Transported")
    return train_test_split(
        features,
        target,
        test_size=TEST_SIZE,
        stratify=target,
        random_state=SEED,
    )

def build_stacking_model():
    """Assemble the unfitted stacking ensemble used by the training step.

    Three gradient-boosting classifiers (XGBoost, LightGBM, CatBoost) serve as
    base learners and a logistic regression combines them. Every component and
    the cross-validation splitter are seeded with ``SEED``.

    Returns:
        StackingClassifier: the configured, not-yet-fitted estimator.
    """
    xgb_clf = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="logloss",
        random_state=SEED,
    )
    lgbm_clf = LGBMClassifier(random_state=SEED)
    # verbose=0 silences CatBoost's per-iteration training log.
    cat_clf = CatBoostClassifier(verbose=0, random_state=SEED)

    meta_learner = LogisticRegression(max_iter=1000, random_state=SEED)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    # passthrough=True feeds the original features to the meta-learner in
    # addition to the base learners' predictions.
    # NOTE(review): LogisticRegression needs numeric input - confirm the frame
    # has no string/categorical columns left, otherwise fitting may fail here.
    return StackingClassifier(
        estimators=[("xgb", xgb_clf), ("lgbm", lgbm_clf), ("catboost", cat_clf)],
        final_estimator=meta_learner,
        passthrough=True,
        cv=folds,
        n_jobs=-1,
    )

def train_and_evaluate():
    """Train the stacking ensemble, report hold-out metrics, persist artifacts.

    Steps:
        1. Split the data with ``load_data()``.
        2. Fit the estimator from ``build_stacking_model()`` on the training part.
        3. Print the accuracy and classification report for the validation part.
        4. Dump the fitted model to ``MODEL_PATH`` (joblib) and write the same
           metrics as text to ``PARAMS_PATH``.

    Returns:
        None. Results are printed and written to disk.
    """
    X_train, X_valid, y_train, y_valid = load_data()
    model = build_stacking_model()
    print("Training stacking ensemble...")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    report = classification_report(y_valid, y_pred)

    print(f"\nValidation Accuracy: {acc:.4f}\n")
    print(report)

    # The two artifact paths are configured independently, so make sure the
    # directory of *each* one exists instead of assuming a shared parent.
    for artifact_path in (MODEL_PATH, PARAMS_PATH):
        artifact_path.parent.mkdir(parents=True, exist_ok=True)

    joblib.dump(model, MODEL_PATH)
    print(f"✅ Model saved to: {MODEL_PATH}")

    # Save model summary. The encoding is pinned so the file does not depend
    # on the platform's locale default.
    with open(PARAMS_PATH, "w", encoding="utf-8") as f:
        f.write(f"Validation Accuracy: {acc:.4f}\n\n")
        f.write(report)