In [1]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import shap
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore", category=FutureWarning)
plt.style.use("fast")

In [2]:
df = pd.read_csv('train.csv')
label_encoder = LabelEncoder()

# Drop column
df = df.drop(["id"], axis=1)

# fill Na
df.fillna(0.0, inplace=True)

# encoded column:
encoded_columns = ["Name", "City", "Gender", "Age", "Working Professional or Student", "Profession", "Sleep Duration", "Dietary Habits", "Degree", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness"]
df["Profession"] = df["Profession"].astype(str)

for column in encoded_columns:
    df[column] = df[column].astype(str)
    df[column] = label_encoder.fit_transform(df[column])

df = df.astype(float)
df.head(5)

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,11.0,0.0,31.0,50.0,1.0,11.0,0.0,5.0,0.0,0.0,2.0,29.0,8.0,34.0,0.0,1.0,2.0,0.0,0.0
1,407.0,1.0,8.0,93.0,1.0,56.0,0.0,4.0,0.0,0.0,3.0,27.0,21.0,64.0,1.0,7.0,3.0,0.0,1.0
2,417.0,1.0,15.0,97.0,0.0,0.0,5.0,0.0,8.97,2.0,0.0,15.0,8.0,22.0,1.0,3.0,1.0,0.0,1.0
3,417.0,1.0,4.0,64.0,1.0,56.0,0.0,5.0,0.0,0.0,1.0,27.0,16.0,29.0,1.0,10.0,1.0,1.0,1.0
4,286.0,0.0,12.0,37.0,1.0,10.0,0.0,1.0,0.0,0.0,1.0,15.0,21.0,29.0,1.0,9.0,4.0,1.0,0.0


In [3]:
X = df.drop(columns=['Depression'])
y = df['Depression']
SEED = 10

In [8]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1.0),
        'random_state': 10,
        'eval_metric': 'logloss',
        'objective': 'binary:logistic',
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    y_pred = []
    y_true = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_f, y_train_f)
        y_pred.append(model.predict(X_val_f))
        y_true.append(y_val_f.values)
    
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    accuracy = accuracy_score(y_true, y_pred)

    # Optuna minimizes, so we return 1 - auc
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  
best_xgb_params = study.best_params

# best trial
print("Best trial:")
trial = study.best_trial
print(f"Value: {trial.value}")
print("Best Params:")
for key, value in trial.params.items():
    print(f"{key}: {value}")

[I 2024-11-28 14:37:14,718] A new study created in memory with name: no-name-c97f1cf3-2ff1-413d-9460-e47b73e82045
[I 2024-11-28 14:37:27,507] Trial 0 finished with value: 0.8182871357498224 and parameters: {'n_estimators': 701, 'max_depth': 6, 'learning_rate': 0.00015810798048963292, 'subsample': 0.6964119558693622, 'colsample_bytree': 0.7193429410061497, 'gamma': 1.9742824860321044, 'reg_alpha': 0.0015303992146647354, 'reg_lambda': 0.0009153915573194836}. Best is trial 0 with value: 0.8182871357498224.
[I 2024-11-28 14:37:34,982] Trial 1 finished with value: 0.9395735607675906 and parameters: {'n_estimators': 416, 'max_depth': 7, 'learning_rate': 0.023922519155574185, 'subsample': 0.5118661608093226, 'colsample_bytree': 0.8275733415711461, 'gamma': 4.45449454717529, 'reg_alpha': 0.06503670683476363, 'reg_lambda': 0.7586834897731384}. Best is trial 1 with value: 0.9395735607675906.
[I 2024-11-28 14:37:59,949] Trial 2 finished with value: 0.9150319829424307 and parameters: {'n_estimator

Best trial:
Value: 0.9398081023454158
Best Params:
n_estimators: 1280
max_depth: 9
learning_rate: 0.011565541749182971
subsample: 0.7565088322918962
colsample_bytree: 0.45637067465324693
gamma: 4.091091622836834
reg_alpha: 0.07184101166550814
reg_lambda: 0.0045588523124331995


In [10]:
xgboost_model = xgb.XGBClassifier(**best_xgb_params)
xgboost_model.fit(X, y)

In [11]:
test = pd.read_csv("test.csv")
test = test.drop(["id"], axis=1)

# fill Na
test.fillna(0.0, inplace=True)

# encoded column:
encoded_columns = ["Name", "City", "Gender", "Age", "Working Professional or Student", "Profession", "Sleep Duration", "Dietary Habits", "Degree", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness"]

for column in encoded_columns:
    test[column] = test[column].astype(str)
    test[column] = label_encoder.fit_transform(test[column])

test = test.astype(float)
test.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,288.0,1.0,36.0,67.0,1.0,28.0,0.0,2.0,0.0,0.0,5.0,25.0,13.0,54.0,0.0,9.0,3.0,1.0
1,279.0,0.0,41.0,25.0,1.0,20.0,0.0,2.0,0.0,0.0,4.0,25.0,13.0,17.0,0.0,6.0,4.0,0.0
2,365.0,1.0,36.0,21.0,1.0,57.0,0.0,4.0,0.0,0.0,1.0,16.0,13.0,13.0,1.0,12.0,4.0,0.0
3,191.0,0.0,5.0,47.0,0.0,0.0,5.0,0.0,6.84,1.0,0.0,27.0,13.0,36.0,1.0,10.0,4.0,0.0
4,285.0,1.0,30.0,22.0,1.0,57.0,0.0,5.0,0.0,0.0,5.0,16.0,13.0,29.0,1.0,3.0,4.0,0.0


In [12]:
test_y = xgboost_model.predict(test)

submission = pd.read_csv("sample_submission.csv")
submission["Depression"] = test_y

submission.to_csv("submission/xgboost.csv", index=False)