In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


# Load data


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna


# Read the labelled training table and the table that is scored for the
# submission file, in that order.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/training_dataset.csv', '/content/validation_set.csv')
)



# Pisahkan fitur dan target


In [None]:
# The target is 'berlangganan_deposito'; every other column of train_df is
# kept as a feature.
y = train_df.loc[:, 'berlangganan_deposito']
X_all = train_df.drop('berlangganan_deposito', axis=1)

# Work on a copy so later preprocessing never alters val_df itself, which is
# read again when the submission is written.
X_val_all = val_df.copy(deep=True)

# NOTE(review): no identifier column is dropped here. If train_df also carries
# 'customer_number' (val_df does), it is being fed to the models as a feature —
# confirm that is intended.


# Deteksi kolom numerik dan kategorikal

In [None]:

# Group the feature columns by dtype: int64/float64 are treated as numeric and
# object as categorical. Columns of any other dtype end up in neither group.
num_cols, cat_cols = (
    X_all.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)

# Imputasi nilai kosong

In [None]:

# Fill missing values: median for numeric columns, most frequent value for
# categorical columns. Each imputer learns its statistics from X_all and the
# same statistics are then applied to the scoring frame X_val_all.
# Note: the imputers are fitted on all of X_all, i.e. before the train/valid
# split further down, so the hold-out rows contribute to these statistics.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    X_all[cols] = imputer.fit_transform(X_all[cols])
    X_val_all[cols] = imputer.transform(X_val_all[cols])



# Label encoding

In [None]:

# Integer-encode every categorical column. Each encoder is fitted on the
# training frame only and its mapping is then applied to the scoring frame.
for col in cat_cols:
    le = LabelEncoder()
    X_all[col] = le.fit_transform(X_all[col].astype(str))

    # LabelEncoder.transform raises ValueError for a label it did not see
    # during fit, which would abort scoring if X_val_all contains a new
    # category. Apply the learned mapping by lookup instead and send unseen
    # labels to -1, a code that fit_transform never produces.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_val_all[col] = (
        X_val_all[col].astype(str).map(code_by_label).fillna(-1).astype('int64')
    )

# Hold out 20% of the rows (stratified on the target) to score Optuna trials.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all, y, test_size=0.2, stratify=y, random_state=42
)


# Tuning CatBoost dengan Optuna

In [None]:
def objective(trial):
    """Optuna objective: fit CatBoost on the training split and score the hold-out.

    Samples learning_rate, depth and l2_leaf_reg from the trial, fits a
    CatBoostClassifier on (X_train, y_train) with early stopping monitored on
    (X_valid, y_valid), and stores the best iteration on the trial as the
    'best_iteration' user attribute so it is available after tuning.

    Args:
        trial: the optuna trial used to sample hyper-parameters.

    Returns:
        ROC AUC of the fitted model's predict_proba column 1 on (X_valid, y_valid).
    """
    params = {
        'iterations': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'bootstrap_type': 'Bayesian',
        'eval_metric': 'AUC',
        'random_seed': 42,
        # NOTE(review): od_wait=100 here and early_stopping_rounds=200 in fit()
        # both configure the overfitting detector's patience; the fit() argument
        # presumably takes precedence — confirm and keep only one.
        'od_wait': 100,
        'verbose': 0
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
              early_stopping_rounds=200, use_best_model=True)

    # Keep the iteration at which the eval metric peaked; without this the
    # tree count behind the reported score is lost once the trial ends.
    trial.set_user_attr('best_iteration', model.get_best_iteration())

    preds = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)



# Jalankan tuning

In [None]:

# Run 30 tuning trials, maximising the objective's AUC.
# The sampler is seeded so a re-run proposes the same sequence of
# hyper-parameters; with an unseeded sampler the best parameters change
# from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2025-05-29 09:17:09,010] A new study created in memory with name: no-name-e8af75af-2716-4be3-b203-8d86a19252f6
[I 2025-05-29 09:17:19,911] Trial 0 finished with value: 0.7944312899412073 and parameters: {'learning_rate': 0.01549791767321643, 'depth': 5, 'l2_leaf_reg': 1.4114052596040607}. Best is trial 0 with value: 0.7944312899412073.
[I 2025-05-29 09:17:30,462] Trial 1 finished with value: 0.7959760874201882 and parameters: {'learning_rate': 0.04187927378471162, 'depth': 8, 'l2_leaf_reg': 9.790642383328933}. Best is trial 1 with value: 0.7959760874201882.
[I 2025-05-29 09:17:42,303] Trial 2 finished with value: 0.7933512029504172 and parameters: {'learning_rate': 0.01995926444428947, 'depth': 9, 'l2_leaf_reg': 5.997476731197827}. Best is trial 1 with value: 0.7959760874201882.
[I 2025-05-29 09:17:46,209] Trial 3 finished with value: 0.7922117912164539 and parameters: {'learning_rate': 0.032097396168119505, 'depth': 5, 'l2_leaf_reg': 6.836488647536591}. Best is trial 1 with value: 



# Ambil parameter terbaik

In [None]:

# Hyper-parameters of the highest-scoring trial in the study.
best_params = study.best_trial.params
print("✅ Best CatBoost params:", best_params)

✅ Best CatBoost params: {'learning_rate': 0.04187927378471162, 'depth': 8, 'l2_leaf_reg': 9.790642383328933}




# Latih ulang model CatBoost dengan parameter terbaik

In [None]:
# Refit CatBoost on the full training data with the tuned hyper-parameters.
# This fit has no eval_set, so early stopping cannot choose the tree count
# here. If the best trial carries a 'best_iteration' user attribute, reuse it
# (it is a 0-based index, hence +1 trees); otherwise fall back to the fixed
# 2000 iterations.
best_iteration = study.best_trial.user_attrs.get('best_iteration')
final_iterations = 2000 if best_iteration is None else best_iteration + 1

cat_model = CatBoostClassifier(
    iterations=final_iterations,
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    bootstrap_type='Bayesian',
    eval_metric='AUC',
    random_seed=42,
    od_wait=100,
    verbose=100
)
cat_model.fit(X_all, y)

0:	total: 18.7ms	remaining: 37.3s
100:	total: 1.85s	remaining: 34.8s
200:	total: 3.72s	remaining: 33.3s
300:	total: 6.25s	remaining: 35.3s
400:	total: 9.94s	remaining: 39.7s
500:	total: 11.8s	remaining: 35.4s
600:	total: 13.7s	remaining: 31.9s
700:	total: 15.5s	remaining: 28.8s
800:	total: 17.4s	remaining: 26.1s
900:	total: 19.7s	remaining: 24s
1000:	total: 23.7s	remaining: 23.7s
1100:	total: 25.6s	remaining: 20.9s
1200:	total: 27.5s	remaining: 18.3s
1300:	total: 29.4s	remaining: 15.8s
1400:	total: 31.3s	remaining: 13.4s
1500:	total: 33.3s	remaining: 11.1s
1600:	total: 37.5s	remaining: 9.35s
1700:	total: 39.5s	remaining: 6.94s
1800:	total: 41.4s	remaining: 4.58s
1900:	total: 43.4s	remaining: 2.26s
1999:	total: 45.3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7dcb08364b50>

# Latih model LightGBM

In [None]:
# Fit LightGBM on the full training data with fixed hyper-parameters.
lgb_model = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=64,
    reg_alpha=1.0,
    reg_lambda=1.0,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default subsample_freq=0 the subsample=0.8 above is
    # silently ignored. Resample on every iteration so it takes effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)
lgb_model.fit(X_all, y)

[LightGBM] [Info] Number of positive: 2614, number of negative: 20302
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 726
[LightGBM] [Info] Number of data points in the train set: 22916, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114069 -> initscore=-2.049838
[LightGBM] [Info] Start training from score -2.049838



# Ensemble prediksi

In [None]:
# Take predict_proba column 1 from each fitted model on the scoring frame,
# then blend the two with an unweighted average.
cat_preds, lgb_preds = (
    fitted.predict_proba(X_val_all)[:, 1] for fitted in (cat_model, lgb_model)
)
ensemble_preds = 0.5 * (cat_preds + lgb_preds)

# Hitung AUC pada data hold-out (X_valid)

In [None]:

# cat_model and lgb_model were fitted on X_all, which contains every row of
# X_valid, so scoring them on X_valid reports an in-sample AUC. For an honest
# estimate, refit copies with the same hyper-parameters on X_train only and
# score those on the untouched hold-out fold. The full-data models are left
# as they are for the submission.
holdout_cat = CatBoostClassifier(**{**cat_model.get_params(), 'verbose': 0})
holdout_cat.fit(X_train, y_train)

holdout_lgb = LGBMClassifier(**lgb_model.get_params())
holdout_lgb.fit(X_train, y_train)

cat_valid_preds = holdout_cat.predict_proba(X_valid)[:, 1]
lgb_valid_preds = holdout_lgb.predict_proba(X_valid)[:, 1]
ensemble_valid_preds = (cat_valid_preds + lgb_valid_preds) / 2

cat_auc = roc_auc_score(y_valid, cat_valid_preds)
lgb_auc = roc_auc_score(y_valid, lgb_valid_preds)
ensemble_auc = roc_auc_score(y_valid, ensemble_valid_preds)

print(f"CatBoost AUC on validation set: {cat_auc:.4f}")
print(f"LightGBM AUC on validation set: {lgb_auc:.4f}")
print(f"Ensemble AUC on validation set: {ensemble_auc:.4f}")


CatBoost AUC on validation set: 0.9948
LightGBM AUC on validation set: 0.9741
Ensemble AUC on validation set: 0.9897


In [None]:
# Save the submission in the expected format: the customer identifier from
# val_df next to the averaged ensemble probability.
submission = val_df[['customer_number']].assign(berlangganan_deposito=ensemble_preds)
submission.to_csv('submission.csv', index=False)
print('Submission saved to submission.csv')

Submission saved to submission.csv
