In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# -------------------------
# Load data
# -------------------------
target = "CORRUCYSTIC_DENSITY"
train = pd.read_csv("MiNDAT.csv")
test = pd.read_csv("MiNDAT_UNK.csv")

# Keep only the labelled rows: a row without a target value cannot be
# used for fitting or scoring.
train = train[train[target].notna()]

# Split the labelled frame into the target series and the feature matrix.
y = train[target]
X = train.drop(columns=target)

# -------------------------
# Preprocessing
# -------------------------
# Route columns by dtype: numeric columns go through the numeric branch,
# every other column is handled as categorical.
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(exclude=np.number).columns

# Numeric branch: fill gaps with the median, standardise, then apply a
# Yeo-Johnson power transform to handle skew.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("power", PowerTransformer(method="yeo-johnson")),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches into a single column-wise preprocessor.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# -------------------------
# Model Ensembling
# -------------------------
# Three gradient-boosted regressors configured with the same seed, learning
# rate and number of boosting rounds. Keys are the labels printed next to
# each model's cross-validation score.
models = {
    "XGB": XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
    "LGBM": LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=-1,
        # LightGBM ignores `subsample` unless bagging is switched on with a
        # positive `subsample_freq` (default 0 = disabled). Using 1 draws a
        # fresh 80% row sample at every boosting iteration, which is what
        # the 0.8 setting was meant to achieve.
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
    ),
    "CatBoost": CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=8,
        verbose=0,
        random_state=42,
    ),
}

# Running average of the per-model predictions for the test set.
final_preds = np.zeros(len(test))

# The fold layout does not depend on the model, so build the splitter once
# instead of on every iteration. The fixed seed gives every model the same
# five shuffled folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])

    # Cross-validation score. The scorer returns negated RMSE (higher is
    # better), so flip the sign of the fold mean before reporting it.
    fold_scores = cross_val_score(
        pipe, X, y, scoring="neg_root_mean_squared_error", cv=kf
    )
    cv_score = -fold_scores.mean()
    print(f"{name} CV RMSE: {cv_score:.5f}")

    # Fit on full data, then add this model's equally weighted share of
    # the ensemble prediction.
    pipe.fit(X, y)
    preds = pipe.predict(test)
    final_preds += preds / len(models)  # averaging

# -------------------------
# Submission
# -------------------------
# Two-column frame: the test frame's row index as "Id", followed by the
# averaged predictions stored under the target column name.
submission = pd.DataFrame({"Id": test.index})
submission[target] = final_preds

# Write without the DataFrame's own index so the file holds only the two
# columns above.
submission.to_csv("submission10.csv", index=False)
print("✅ submission10.csv saved!")
