In [13]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from xgboost import XGBClassifier

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score
)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                            VotingClassifier)


# Constants
# RANDOM_STATE: shared seed passed to the train/validation split, the models and the CV splitter.
RANDOM_STATE = 42
# LABEL: name of the target column; it is removed from the feature matrix and used as `y`.
LABEL = 'berlangganan_deposito'


In [14]:
# Read the labelled training file and the file that will be scored for submission.
train = pd.read_csv('../training_dataset.csv')
test = pd.read_csv('../validation_set.csv')

# Separate the target first, then strip the target and the two columns that are
# not used as model inputs from the feature matrix.
y = train[LABEL]
X = train.drop(
    columns=[LABEL, 'customer_number', 'pulau'],
)

In [15]:
# Module-level encoder instance. The categorical-encoding loop further down
# rebinds `le` to a fresh LabelEncoder for every column, so this object is never fitted.
le = LabelEncoder()

def feature_engineering(df):
    """Return a copy of ``df`` with the derived contact/employment features added.

    Added columns (in this order):
      * ``is hari_sejak_kontak_sebelumnya > 22`` -- 1 when the days-since-previous-contact
        value is above 22, else 0 (only when the source column exists).
      * ``total_kontak`` -- ``jumlah_kontak_sebelumnya + jumlah_kontak_kampanye_ini``.
      * ``rasio_kontak`` -- ``jumlah_kontak_sebelumnya / total_kontak``; defined as 0.0
        when ``total_kontak`` is 0 so no NaN/inf is produced by a zero denominator.
      * ``is>=5100pekerja`` -- 1 when ``jumlah_pekerja`` is strictly greater than 5100.

    The raw ``hari_sejak_kontak_sebelumnya`` column is dropped from the result when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``jumlah_kontak_sebelumnya``, ``jumlah_kontak_kampanye_ini`` and
        ``jumlah_pekerja``. ``hari_sejak_kontak_sebelumnya`` is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df = df.copy()

    has_days_since_contact = 'hari_sejak_kontak_sebelumnya' in df.columns
    if has_days_since_contact:
        df['is hari_sejak_kontak_sebelumnya > 22'] = (df['hari_sejak_kontak_sebelumnya'] > 22).astype(int)

    df['total_kontak'] = df['jumlah_kontak_sebelumnya'] + df['jumlah_kontak_kampanye_ini']

    # A zero denominator would yield NaN (0/0) or inf, which several of the
    # downstream estimators cannot consume; define the ratio as 0.0 on those rows only.
    ratio = df['jumlah_kontak_sebelumnya'] / df['total_kontak']
    df['rasio_kontak'] = ratio.mask(df['total_kontak'] == 0, 0.0)

    # NOTE(review): the column name says ">=" but the comparison is a strict ">".
    # Left unchanged so existing feature values stay the same -- confirm the intent.
    df['is>=5100pekerja'] = (df['jumlah_pekerja'] > 5100).astype(int)

    if has_days_since_contact:
        df = df.drop(columns=['hari_sejak_kontak_sebelumnya'])

    return df

In [16]:
# Derive the same engineered columns for the training matrix and the scoring frame.
X, test = (feature_engineering(frame) for frame in (X, test))

In [17]:
# Fitted encoder per categorical column, kept so the mapping can be inspected or reused.
encoders = {}

# Encode categorical features
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

    # `le.transform` raises ValueError on any category that was not present when
    # fitting. Encode the scoring frame through an explicit lookup instead, so the
    # codes match the training codes and unseen/missing categories become -1.
    code_of = {category: code for code, category in enumerate(le.classes_)}
    test[col] = test[col].map(code_of).fillna(-1).astype('int64')

    encoders[col] = le


In [18]:
# Hold out 20% of the rows for validation, keeping the class balance of `y` in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [19]:
# Random forest baseline: build and fit in one expression (`fit` returns the estimator).
rf_model = RandomForestClassifier(
    random_state=RANDOM_STATE,
    class_weight='balanced',
    criterion='entropy',
    bootstrap=True,
    n_estimators=128,
    max_features='sqrt',
    max_depth=12,
    min_samples_split=7,
    min_samples_leaf=9,
).fit(X_train, y_train)

# Probability of the positive class on the hold-out split, scored with ROC AUC.
y_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]
roc_rf = roc_auc_score(y_val, y_pred_proba_rf)
print(f"ROC AUC Score: {roc_rf:.4f}")


ROC AUC Score: 0.7916


In [20]:
# Gradient boosting model: build and fit in one expression (`fit` returns the estimator).
gb_model = GradientBoostingClassifier(
    random_state=RANDOM_STATE,
    n_estimators=50,
    learning_rate=0.006606055737608706,
    subsample=0.8660093939561572,
    max_depth=11,
    max_features='log2',
    min_samples_leaf=10,
    min_samples_split=10,
).fit(X_train, y_train)

# Probability of the positive class on the hold-out split, scored with ROC AUC.
# Note: `y_pred_proba` and `roc` are reassigned by the XGBoost cell that follows.
y_pred_proba = gb_model.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_val, y_pred_proba)
print(f"ROC AUC Score: {roc:.4f}")

ROC AUC Score: 0.7934


In [21]:
# XGBoost model.
# The deprecated `use_label_encoder` flag is intentionally not passed: recent XGBoost
# releases no longer use it, and omitting it keeps the library default on older ones.
# The seed comes from the shared RANDOM_STATE constant, like the other models.
xgb_model = XGBClassifier(
    n_estimators=145,
    learning_rate=0.07743387797898427,
    max_depth=12,
    subsample=0.896841632989109,
    colsample_bytree=0.6887680511951783,
    min_child_weight=5,
    gamma=2.6256061694934245,
    reg_alpha=0.5548795607783858,
    reg_lambda=0.7122696697444622,
    random_state=RANDOM_STATE,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

# Probability of the positive class on the hold-out split, scored with ROC AUC.
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_val, y_pred_proba)
print(f"ROC AUC Score: {roc:.4f}")


ROC AUC Score: 0.7958


In [22]:
# Soft-voting ensemble over the three models defined above; built and fitted in one
# expression (`fit` returns the estimator).
voting_soft = VotingClassifier(
    voting='soft',
    estimators=[('rf', rf_model), ('gb', gb_model), ('xgb', xgb_model)],
).fit(X_train, y_train)

# Probability of the positive class on the hold-out split, scored with ROC AUC.
y_pred_proba_voting = voting_soft.predict_proba(X_val)[:, 1]
roc_voting = roc_auc_score(y_val, y_pred_proba_voting)
print(f"ROC AUC Score: {roc_voting:.4f}")

ROC AUC Score: 0.7930


In [23]:
# Validate the ensemble with stratified k-fold cross-validation, scored by ROC AUC.
N_SPLITS = 5
cv = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

print(f"\nEvaluasi model default VotingSoft dengan {N_SPLITS}-fold CV:")
auc_scores = cross_val_score(
    voting_soft,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

# Show the per-fold scores, then the mean and standard deviation across folds.
for i, score in enumerate(auc_scores):
    print(f"Fold {i+1} AUC: {score:.4f}")

print(f"\nRata-rata AUC: {auc_scores.mean():.4f}")
print(f"Standar Deviasi AUC: {auc_scores.std():.4f}")


Evaluasi model default VotingSoft dengan 5-fold CV:
Fold 1 AUC: 0.7799
Fold 2 AUC: 0.7978
Fold 3 AUC: 0.8076
Fold 4 AUC: 0.7956
Fold 5 AUC: 0.7926

Rata-rata AUC: 0.7947
Standar Deviasi AUC: 0.0090


In [24]:
# Score the submission frame using exactly the training columns, in training order.
# NOTE(review): `voting_soft` was last fitted on X_train only; consider refitting on
# all of X, y before producing the final submission -- confirm this is intended.
pred = voting_soft.predict_proba(test.loc[:, X.columns])[:, 1]

# Save predictions to CSV
submission = test[['customer_number']].assign(berlangganan_deposito=pred)
submission.to_csv('submission3.csv', index=False)