In [None]:
!pip install catboost
from IPython import get_ipython
from IPython.display import display
# %%
# 1. Load data
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.base import clone

# Load the labelled training table and the table that will be scored for the
# submission.
# NOTE(review): both paths are hard-coded and look like Colab upload
# locations; adjust them when running elsewhere.
train_df = pd.read_csv('/content/training_dataset (2).csv')
val_df = pd.read_csv('/content/validation_set (1).csv')

# Every training column except the label and 'customer_number' is used as a
# model input. `features` is reused later to select the same columns from
# val_df.
target = 'berlangganan_deposito'
non_feature_columns = (target, 'customer_number')
features = [col for col in train_df.columns if col not in non_feature_columns]
X, y = train_df[features], train_df[target]

# Stratified 80% subset of (X, y); the held-out 20% is discarded. This subset
# only feeds the standalone preprocessing / SMOTE step, while the CV loop and
# the final fit use the full X and y.
X_train, _, y_train, _ = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2. Preprocessing
# Partition the feature columns by dtype: int64/float64 columns are scaled,
# object/category columns are one-hot encoded.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present when it was fitted.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Feature selector: scored with mutual information, but k='all' means no
# feature is actually removed.
feature_selector = SelectKBest(mutual_info_classif, k='all')

# Fit the preprocessing and selection steps on the 80% training subset.
X_preprocessed = preprocessor.fit_transform(X_train)
X_selected = feature_selector.fit_transform(X_preprocessed, y_train)

# Oversample the transformed training subset with SMOTE.
# NOTE(review): X_balanced / y_balanced are not read by the cross-validation
# loop or the final ensemble fit further down, which both train on the raw
# X and y. If SMOTE is meant to influence the models, it presumably needs to
# become a step inside those pipelines (e.g. imblearn's Pipeline) so that
# resampling happens on the training folds only -- TODO confirm intent.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_selected, y_train)

# Define models
# Candidate gradient-boosting classifiers keyed by display name. The keys are
# used for progress messages and the results table, and to look the models up
# when building the voting ensemble.
#
# `use_label_encoder` is deliberately not passed to XGBClassifier: it is a
# deprecated argument that current XGBoost releases no longer use, and passing
# it triggers a warning on every fit (i.e. once per CV fold).
models = {
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    # verbose=0 silences CatBoost's per-iteration training log.
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
}

# Evaluate with CV
# Shuffled, seeded 5-fold stratified splitter; the same object is reused for
# the ensemble evaluation so all scores come from identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []  # (model name, mean ROC AUC across folds)

for name, model in models.items():
    print(f"Training {name}...")
    # Preprocessing and selection live inside the pipeline, so
    # cross_val_score refits them on each training fold. The classifier is
    # cloned to leave the entry in `models` unfitted.
    base_model = Pipeline([
        ('preprocessor', preprocessor),
        ('selector', feature_selector),
        ('classifier', clone(model)),
    ])
    fold_aucs = cross_val_score(
        base_model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1
    )
    results.append((name, fold_aucs.mean()))

# Voting Ensemble
# Soft-voting ensemble over the three boosted models; each member is looked
# up in `models` by its display name and registered under a short alias.
ensemble_members = [
    (alias, models[model_name])
    for alias, model_name in (
        ('xgb', 'XGBoost'),
        ('lgb', 'LightGBM'),
        ('cat', 'CatBoost'),
    )
]
ensemble = VotingClassifier(estimators=ensemble_members, voting='soft', n_jobs=-1)

# Same preprocessing and selection steps as the per-model pipelines, with the
# ensemble as the final estimator.
ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', feature_selector),
    ('classifier', ensemble),
])

# Fit final model on full data
ensemble_pipeline.fit(X, y)

# Predict
# Score the validation rows using the same feature columns as in training.
X_val = val_df.loc[:, features]
class_probabilities = ensemble_pipeline.predict_proba(X_val)
# Column 1 is taken as the positive-class probability -- assumes the fitted
# classes are ordered (negative, positive); TODO confirm for this target.
y_pred = class_probabilities[:, 1]
val_df['berlangganan_deposito'] = y_pred  # adds/overwrites the column in val_df

# Submission
# Two-column file: 'customer_number' plus the predicted probability.
submission = val_df.loc[:, ['customer_number', 'berlangganan_deposito']]
submission.to_csv('submission.csv', index=False)

# Results
# Per-model mean CV AUC, best first.
results_df = pd.DataFrame(results, columns=['Model', 'CV AUC'])
print("\n✅ Cross-validated AUC Scores:")
print(results_df.sort_values('CV AUC', ascending=False))

# Cross-validate the whole ensemble pipeline on the same folds as the
# individual models. cross_val_score works on clones, so the already-fitted
# ensemble_pipeline used for the submission is left untouched.
ensemble_fold_aucs = cross_val_score(
    ensemble_pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1
)
ensemble_auc = ensemble_fold_aucs.mean()
print(f"\n✅ Ensemble VotingClassifier AUC = {ensemble_auc:.4f}")
print("📁 Submission file 'submission.csv' created.")

Training XGBoost...
Training LightGBM...
Training CatBoost...





✅ Cross-validated AUC Scores:
      Model    CV AUC
2  CatBoost  0.788256
1  LightGBM  0.786552
0   XGBoost  0.771523
