In [1]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.preprocessing import LabelEncoder

# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Apply the "fast" matplotlib style sheet to all figures drawn afterwards.
plt.style.use("fast")

In [2]:
# Read the training and test CSV files (in that order) from the working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

In [3]:
# Reload the raw training data so this cell starts from the unmodified file
# every time it is executed.
train = pd.read_csv('train.csv')
label_encoder = LabelEncoder()

# Remove the "id" column and replace every missing value with 0.0.
train = train.drop(columns=["id"]).fillna(0.0)

# Columns that are cast to string and then mapped to integer codes.
encoded_columns = [
    "Name",
    "City",
    "Gender",
    "Age",
    "Working Professional or Student",
    "Profession",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
]

# The single encoder is re-fitted for each column, so after the loop it only
# holds the mapping of the last column in the list.
for column in encoded_columns:
    train[column] = label_encoder.fit_transform(train[column].astype(str))

# Every column is numeric at this point; use one uniform float dtype.
train = train.astype(float)

# Separate the features from the "Depression" target.
X = train.drop(columns=['Depression'])
y = train['Depression']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [4]:
def objective(trial):
    """
    Optuna objective: cross-validated accuracy of a CatBoost classifier.

    Samples one CatBoost hyperparameter configuration from ``trial``, trains it
    with 5-fold stratified cross-validation on the training split
    (``X_train`` / ``y_train``) and scores the pooled out-of-fold predictions.
    The hold-out rows (``X_test`` / ``y_test``) are deliberately not used here,
    so they remain untouched by hyperparameter selection.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy computed over the concatenated out-of-fold predictions
        (the study is expected to maximize this value).
    """
    # Define the hyperparameter space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # supported way to sample on a log scale.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'logging_level': 'Silent',
        'random_seed': 10,
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    y_pred = []
    y_true = []

    # Tune on the training split only: running the folds over the full X / y
    # would let the hold-out rows influence the chosen hyperparameters.
    for fit_idx, val_idx in cv.split(X_train, y_train):
        X_fit, X_val = X_train.iloc[fit_idx], X_train.iloc[val_idx]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        # NOTE(review): the validation fold is used both for early stopping and
        # for scoring, so the reported accuracy is somewhat optimistic.
        model.fit(
            X_fit,
            y_fit,
            eval_set=(X_val, y_val),
            verbose=0,
            early_stopping_rounds=50,
        )
        y_pred.append(model.predict(X_val))
        y_true.append(y_val.values)

    # Pool the out-of-fold predictions and score them in one go.
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    return accuracy_score(y_true, y_pred)

# Number of hyperparameter configurations the study evaluates.
N_TRIALS = 10

# Create an Optuna study that maximizes the objective's accuracy. The sampler
# is seeded so that re-running the notebook proposes the same sequence of
# hyperparameter configurations.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=10),
)
study.optimize(objective, n_trials=N_TRIALS)  # Run N_TRIALS trials

# Best hyperparameters and accuracy
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2024-11-28 10:54:02,021] A new study created in memory with name: no-name-876c8a67-1b2a-4f42-99e9-ca6cff396a85


[0]	validation-logloss:0.45058
[1]	validation-logloss:0.42933
[2]	validation-logloss:0.40960
[3]	validation-logloss:0.39207
[4]	validation-logloss:0.37620
[5]	validation-logloss:0.36186
[6]	validation-logloss:0.35045
[7]	validation-logloss:0.33829
[8]	validation-logloss:0.32701
[9]	validation-logloss:0.31652
[10]	validation-logloss:0.30680
[11]	validation-logloss:0.29794
[12]	validation-logloss:0.28974
[13]	validation-logloss:0.28347
[14]	validation-logloss:0.27655
[15]	validation-logloss:0.26962
[16]	validation-logloss:0.26322
[17]	validation-logloss:0.25719
[18]	validation-logloss:0.25143
[19]	validation-logloss:0.24606
[20]	validation-logloss:0.24106
[21]	validation-logloss:0.23677
[22]	validation-logloss:0.23219
[23]	validation-logloss:0.22789
[24]	validation-logloss:0.22386
[25]	validation-logloss:0.22006
[26]	validation-logloss:0.21664
[27]	validation-logloss:0.21361
[28]	validation-logloss:0.21024
[29]	validation-logloss:0.20723
[30]	validation-logloss:0.20427
[31]	validation-lo

[I 2024-11-28 10:54:44,334] Trial 0 finished with value: 0.024791335582266982 and parameters: {'booster': 'dart', 'learning_rate': 0.04415901993389597, 'max_depth': 10, 'min_child_weight': 6.47417640538773, 'subsample': 0.7025436430309222, 'colsample_bytree': 0.8738762862493525, 'lambda': 2.873599234847058e-06, 'alpha': 3.821210329099404e-05}. Best is trial 0 with value: 0.024791335582266982.


[0]	validation-logloss:0.42916
[1]	validation-logloss:0.39769
[2]	validation-logloss:0.37267
[3]	validation-logloss:0.34697
[4]	validation-logloss:0.32573
[5]	validation-logloss:0.30963
[6]	validation-logloss:0.29671
[7]	validation-logloss:0.28448
[8]	validation-logloss:0.27063
[9]	validation-logloss:0.25958
[10]	validation-logloss:0.24834
[11]	validation-logloss:0.23999
[12]	validation-logloss:0.23136
[13]	validation-logloss:0.22636
[14]	validation-logloss:0.22078
[15]	validation-logloss:0.21364
[16]	validation-logloss:0.20917
[17]	validation-logloss:0.20375
[18]	validation-logloss:0.20009
[19]	validation-logloss:0.19545
[20]	validation-logloss:0.19098
[21]	validation-logloss:0.18801
[22]	validation-logloss:0.18452
[23]	validation-logloss:0.18168
[24]	validation-logloss:0.17874
[25]	validation-logloss:0.17603
[26]	validation-logloss:0.17381
[27]	validation-logloss:0.17250
[28]	validation-logloss:0.17054
[29]	validation-logloss:0.16880
[30]	validation-logloss:0.16754
[31]	validation-lo

[I 2024-11-28 10:55:06,548] Trial 1 finished with value: 0.02434442723208463 and parameters: {'booster': 'dart', 'learning_rate': 0.09056703096601536, 'max_depth': 7, 'min_child_weight': 1.3989287444623546, 'subsample': 0.7403362293040345, 'colsample_bytree': 0.627725389549227, 'lambda': 1.0703104098590831e-07, 'alpha': 0.010328041032163261}. Best is trial 1 with value: 0.02434442723208463.


[0]	validation-logloss:0.46954
[1]	validation-logloss:0.46439
[2]	validation-logloss:0.45993
[3]	validation-logloss:0.45407
[4]	validation-logloss:0.44863
[5]	validation-logloss:0.44391
[6]	validation-logloss:0.43976
[7]	validation-logloss:0.43522
[8]	validation-logloss:0.42991
[9]	validation-logloss:0.42556
[10]	validation-logloss:0.42113
[11]	validation-logloss:0.41695
[12]	validation-logloss:0.41218
[13]	validation-logloss:0.40898
[14]	validation-logloss:0.40536
[15]	validation-logloss:0.40082
[16]	validation-logloss:0.39721
[17]	validation-logloss:0.39311
[18]	validation-logloss:0.38964
[19]	validation-logloss:0.38593
[20]	validation-logloss:0.38190
[21]	validation-logloss:0.37863
[22]	validation-logloss:0.37465
[23]	validation-logloss:0.37212
[24]	validation-logloss:0.36856
[25]	validation-logloss:0.36482
[26]	validation-logloss:0.36182
[27]	validation-logloss:0.35957
[28]	validation-logloss:0.35645
[29]	validation-logloss:0.35305
[30]	validation-logloss:0.35028
[31]	validation-lo

[I 2024-11-28 10:55:15,278] Trial 2 finished with value: 0.023846316824884695 and parameters: {'booster': 'gbtree', 'learning_rate': 0.012311547309179756, 'max_depth': 7, 'min_child_weight': 4.140893541394591, 'subsample': 0.8282046404803096, 'colsample_bytree': 0.5409140711108644, 'lambda': 6.042203664033186e-07, 'alpha': 0.17222928770082935}. Best is trial 2 with value: 0.023846316824884695.


[0]	validation-logloss:0.46347
[1]	validation-logloss:0.45323
[2]	validation-logloss:0.44520
[3]	validation-logloss:0.43469
[4]	validation-logloss:0.42527
[5]	validation-logloss:0.41736
[6]	validation-logloss:0.41056
[7]	validation-logloss:0.40339
[8]	validation-logloss:0.39490
[9]	validation-logloss:0.38778
[10]	validation-logloss:0.37993
[11]	validation-logloss:0.37350
[12]	validation-logloss:0.36636
[13]	validation-logloss:0.36167
[14]	validation-logloss:0.35645
[15]	validation-logloss:0.34998
[16]	validation-logloss:0.34493
[17]	validation-logloss:0.33917
[18]	validation-logloss:0.33452
[19]	validation-logloss:0.32959
[20]	validation-logloss:0.32411
[21]	validation-logloss:0.31988
[22]	validation-logloss:0.31480
[23]	validation-logloss:0.31067
[24]	validation-logloss:0.30619
[25]	validation-logloss:0.30155
[26]	validation-logloss:0.29794
[27]	validation-logloss:0.29528
[28]	validation-logloss:0.29155
[29]	validation-logloss:0.28751
[30]	validation-logloss:0.28430
[31]	validation-lo

[W 2024-11-28 10:56:16,558] Trial 3 failed with parameters: {'booster': 'dart', 'learning_rate': 0.024194025313115874, 'max_depth': 6, 'min_child_weight': 3.691173654792567, 'subsample': 0.6809015873829158, 'colsample_bytree': 0.5780156917211776, 'lambda': 9.48190453406121e-07, 'alpha': 7.89301094812072e-05} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\tkvkh\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\tkvkh\AppData\Local\Temp\ipykernel_12720\3300309344.py", line 21, in objective
    model = xgb.train(
            ^^^^^^^^^^
  File "C:\Users\tkvkh\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\tkvkh\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboos

KeyboardInterrupt: 