In [5]:
!pip install catboost



In [6]:


import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from catboost import CatBoostClassifier


# Read the training and validation CSV files (in that order) into DataFrames.
train_df, val_df = map(
    pd.read_csv,
    ('/content/training_dataset (2).csv', '/content/validation_set (1).csv'),
)

In [7]:

# --- Features and target ----------------------------------------------------
# Every training column except the label and the customer identifier is a model
# input; the validation frame is sliced to exactly the same columns.
target = 'berlangganan_deposito'
features = train_df.columns[~train_df.columns.isin([target, 'customer_number'])].tolist()
X, y = train_df[features], train_df[target]
X_val = val_df[features]

# --- Preprocessing ----------------------------------------------------------
# Columns are routed by dtype: int64/float64 columns are standardised and
# object/category columns are one-hot encoded into a dense array.
# NOTE(review): a column of any other dtype (e.g. bool) ends up in neither list;
# with ColumnTransformer's default `remainder` it would not reach the model --
# confirm that no such columns exist.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# k='all' keeps every feature, so this step currently filters nothing; it is a
# placeholder slot in the pipeline.
feature_selector = SelectKBest(mutual_info_classif, k='all')

# --- Model pipeline ---------------------------------------------------------
# Preprocessing lives inside the pipeline, so it is re-fitted on the training
# part of every cross-validation fold.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', feature_selector),
    ('classifier', CatBoostClassifier(random_state=42, verbose=0)),
])

# --- Hyper-parameter search -------------------------------------------------
# Candidate values for the CatBoost step; the `classifier__` prefix addresses
# the pipeline step of that name.
param_dist = dict(
    classifier__iterations=[300, 500, 1000],
    classifier__learning_rate=[0.01, 0.03, 0.1],
    classifier__depth=[4, 6, 8],
    classifier__l2_leaf_reg=[1, 3, 5],
)

# Seeded 5-fold stratified split; 10 sampled candidates scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# --- Fit ----------------------------------------------------------------------
search.fit(X, y)

# --- Score the validation set -------------------------------------------------
# Column 1 of predict_proba is used as the prediction.
# NOTE(review): assumes the second class is the "subscribed" label -- confirm.
y_pred = search.best_estimator_.predict_proba(X_val)[:, 1]
# The scores are written back onto val_df under the target column name.
val_df['berlangganan_deposito'] = y_pred

# --- Write the submission file --------------------------------------------------
submission = val_df.loc[:, ['customer_number', 'berlangganan_deposito']]
submission.to_csv('submission_catboost_tuned.csv', index=False)
print("✅ Submission saved to 'submission_catboost_tuned.csv'")
print("Best CV AUC:", search.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
✅ Submission saved to 'submission_catboost_tuned.csv'
Best CV AUC: 0.7960838841910659
