In [1]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore", category=FutureWarning)
plt.style.use("fast")

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train = pd.read_csv('train.csv')
label_encoder = LabelEncoder()

# Drop column
train = train.drop(["id"], axis=1)

# fill Na
train.fillna(0.0, inplace=True)

# encoded column:
encoded_columns = ["Name", "City", "Gender", "Age", "Working Professional or Student", "Profession", "Sleep Duration", "Dietary Habits", "Degree", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness"]

for column in encoded_columns:
    train[column] = train[column].astype(str)
    train[column] = label_encoder.fit_transform(train[column])

train = train.astype(float)

y = train['Depression']
X = train.drop(['Depression'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [4]:
def objective(trial):
    """
    Objective function for Optuna. Defines the hyperparameter space and evaluation logic.
    """
    # Define the hyperparameter space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'logging_level': 'Silent',
        'random_seed': 10,
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    y_pred = []
    y_true = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]
        model = CatBoostClassifier(**params)
        model.fit(X_train_f, y_train_f, eval_set=(X_val_f, y_val_f), verbose=0, early_stopping_rounds=50)
        y_pred.append(model.predict(X_val_f))
        y_true.append(y_val_f.values)

    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Create an Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)  # Run 50 trials

# Best hyperparameters and accuracy
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2024-11-28 15:02:43,955] A new study created in memory with name: no-name-3b94767e-f3eb-4f17-aed3-7566785aca3c
[I 2024-11-28 15:03:04,170] Trial 0 finished with value: 0.9396872778962331 and parameters: {'iterations': 541, 'learning_rate': 0.07344186765800421, 'depth': 6, 'l2_leaf_reg': 8.856163502463602, 'bagging_temperature': 0.26105491633544, 'random_strength': 0.8691868927295232, 'border_count': 32}. Best is trial 0 with value: 0.9396872778962331.
[I 2024-11-28 15:03:36,763] Trial 1 finished with value: 0.9352452025586354 and parameters: {'iterations': 441, 'learning_rate': 0.0057598108462355874, 'depth': 11, 'l2_leaf_reg': 7.172587876147276, 'bagging_temperature': 0.2097917636332025, 'random_strength': 0.2492284086628591, 'border_count': 182}. Best is trial 0 with value: 0.9396872778962331.
[I 2024-11-28 15:03:49,211] Trial 2 finished with value: 0.9400355366027008 and parameters: {'iterations': 668, 'learning_rate': 0.22426495292492513, 'depth': 6, 'l2_leaf_reg': 4.46953633222

Best hyperparameters: {'iterations': 1519, 'learning_rate': 0.26009622586966913, 'depth': 4, 'l2_leaf_reg': 1.1996351639774017, 'bagging_temperature': 0.7649603238356025, 'random_strength': 0.20075632651722608, 'border_count': 154}
Best validation accuracy: 0.9400852878464819


In [6]:
catboost_best_params = study.best_params
model = CatBoostClassifier(**catboost_best_params)

model.fit(X, y)

0:	learn: 0.3776642	total: 14ms	remaining: 21.2s
1:	learn: 0.2758258	total: 27.4ms	remaining: 20.8s
2:	learn: 0.2326992	total: 40ms	remaining: 20.2s
3:	learn: 0.2096960	total: 51.5ms	remaining: 19.5s
4:	learn: 0.1981550	total: 63ms	remaining: 19.1s
5:	learn: 0.1889235	total: 73.8ms	remaining: 18.6s
6:	learn: 0.1833894	total: 85.3ms	remaining: 18.4s
7:	learn: 0.1795186	total: 98.9ms	remaining: 18.7s
8:	learn: 0.1742142	total: 110ms	remaining: 18.5s
9:	learn: 0.1719644	total: 122ms	remaining: 18.4s
10:	learn: 0.1684689	total: 133ms	remaining: 18.3s
11:	learn: 0.1669129	total: 144ms	remaining: 18.1s
12:	learn: 0.1650557	total: 160ms	remaining: 18.5s
13:	learn: 0.1638058	total: 172ms	remaining: 18.5s
14:	learn: 0.1627574	total: 183ms	remaining: 18.3s
15:	learn: 0.1616175	total: 194ms	remaining: 18.2s
16:	learn: 0.1609031	total: 206ms	remaining: 18.2s
17:	learn: 0.1596950	total: 222ms	remaining: 18.5s
18:	learn: 0.1582135	total: 233ms	remaining: 18.4s
19:	learn: 0.1576254	total: 245ms	remai

<catboost.core.CatBoostClassifier at 0x1aecd0b98d0>

In [10]:
test = test.drop(["id"], axis=1)

# fill Na
test.fillna(0.0, inplace=True)

# encoded column:
encoded_columns = ["Name", "City", "Gender", "Age", "Working Professional or Student", "Profession", "Sleep Duration", "Dietary Habits", "Degree", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness"]

for column in encoded_columns:
    test[column] = test[column].astype(str)
    test[column] = label_encoder.fit_transform(test[column])

test = test.astype(float)
test.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,288.0,1.0,36.0,67.0,1.0,28.0,0.0,2.0,0.0,0.0,5.0,25.0,13.0,54.0,0.0,9.0,3.0,1.0
1,279.0,0.0,41.0,25.0,1.0,20.0,0.0,2.0,0.0,0.0,4.0,25.0,13.0,17.0,0.0,6.0,4.0,0.0
2,365.0,1.0,36.0,21.0,1.0,57.0,0.0,4.0,0.0,0.0,1.0,16.0,13.0,13.0,1.0,12.0,4.0,0.0
3,191.0,0.0,5.0,47.0,0.0,0.0,5.0,0.0,6.84,1.0,0.0,27.0,13.0,36.0,1.0,10.0,4.0,0.0
4,285.0,1.0,30.0,22.0,1.0,57.0,0.0,5.0,0.0,0.0,5.0,16.0,13.0,29.0,1.0,3.0,4.0,0.0


In [12]:
test_y = model.predict(test)

In [14]:
submission = pd.read_csv("sample_submission.csv")
submission["Depression"] = test_y

submission.to_csv("submission/catboost.csv", index=False)