# Preprocess data

In [1]:
import pandas as pd

These columns are either extremely imbalanced or do not have correlation with Depression so we remove them.

In [2]:
imba_cols = [
    # irrelevant
    'Name',
    
    # extremely imbalanced
    'Working Professional or Student',
    'Academic Pressure',
    'CGPA',
    'Study Satisfaction'
]

In [3]:
df = pd.read_csv('data/train-clean.csv')
test_df = pd.read_csv('data/test.csv')

In [4]:
X  = df.drop(['Depression'] + imba_cols, axis=1)
y = df['Depression']

In [5]:
X.head()

Unnamed: 0,Gender,Age,City,Profession,Work Pressure,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,Female,49.0,Ludhiana,Chef,5.0,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No
1,Male,26.0,Varanasi,Teacher,4.0,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No
2,Male,22.0,Mumbai,Teacher,5.0,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes
3,Female,30.0,Kanpur,Business Analyst,1.0,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes
4,Female,59.0,Ahmedabad,Finanancial Analyst,2.0,5.0,5-6 hours,Healthy,MCA,No,7.0,5.0,No


In [6]:
X_sub = test_df[X.columns].fillna(0)
X_sub.head()

Unnamed: 0,Gender,Age,City,Profession,Work Pressure,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,Male,53.0,Visakhapatnam,Judge,2.0,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes
1,Female,58.0,Kolkata,Educational Consultant,2.0,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No
2,Male,53.0,Jaipur,Teacher,4.0,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No
3,Female,23.0,Rajkot,0,0.0,0.0,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No
4,Male,47.0,Kalyan,Teacher,5.0,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No


In [7]:
print(X['Sleep Duration'].unique())
print(X['Dietary Habits'].unique())
print(X['Degree'].value_counts())

['More than 8 hours' 'Less than 5 hours' '5-6 hours' '7-8 hours']
['Healthy' 'Unhealthy' 'Moderate']
Degree
B.Ed       9902
B.Arch     7316
B.Com      6688
B.Pharm    5109
M.Ed       4902
BBA        4404
BCA        4366
MCA        4260
LLM        4230
BSc        4199
M.Pharm    4020
MSc        3752
LLB        3730
M.Tech     3512
ME         3510
BHM        3441
B.Tech     3321
MBA        3319
BA         3214
MHM        3163
MD         2858
PhD        2643
BE         2546
MBBS       2432
MA         2397
M.Com      2396
Name: count, dtype: int64


In [8]:
X.head()

Unnamed: 0,Gender,Age,City,Profession,Work Pressure,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,Female,49.0,Ludhiana,Chef,5.0,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No
1,Male,26.0,Varanasi,Teacher,4.0,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No
2,Male,22.0,Mumbai,Teacher,5.0,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes
3,Female,30.0,Kanpur,Business Analyst,1.0,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes
4,Female,59.0,Ahmedabad,Finanancial Analyst,2.0,5.0,5-6 hours,Healthy,MCA,No,7.0,5.0,No


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

categorical_cols = ['Gender', 'City', 'Profession', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

preprocessor = ColumnTransformer(
    transformers=[
        ('sleep', OrdinalEncoder(
            categories=[[
                'Less than 5 hours',
                '5-6 hours',
                '7-8 hours',
                'More than 8 hours'
            ]],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ), ['Sleep Duration']),
        ('diet', OrdinalEncoder(
            categories=[[
                'Unhealthy',
                'Moderate',
                'Healthy'
            ]],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ), ['Dietary Habits']),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough'
)

X_encoded = preprocessor.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded.toarray(), columns=preprocessor.get_feature_names_out())

In [10]:
X_encoded.head()

Unnamed: 0,sleep__Sleep Duration,diet__Dietary Habits,onehot__Gender_Female,onehot__Gender_Male,onehot__City_Agra,onehot__City_Ahmedabad,onehot__City_Bangalore,onehot__City_Bhopal,onehot__City_Chennai,onehot__City_Delhi,...,onehot__Degree_PhD,onehot__Have you ever had suicidal thoughts ?_No,onehot__Have you ever had suicidal thoughts ?_Yes,onehot__Family History of Mental Illness_No,onehot__Family History of Mental Illness_Yes,remainder__Age,remainder__Work Pressure,remainder__Job Satisfaction,remainder__Work/Study Hours,remainder__Financial Stress
0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,49.0,5.0,2.0,1.0,2.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,26.0,4.0,3.0,7.0,3.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,22.0,5.0,1.0,10.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,30.0,1.0,1.0,9.0,4.0
4,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,59.0,2.0,5.0,7.0,5.0


In [11]:
X_sub = preprocessor.transform(test_df)
X_sub = pd.DataFrame(X_sub.toarray(), columns = preprocessor.get_feature_names_out()).fillna(0)

In [12]:
# calculate class balance
class_balance = y.value_counts(normalize=True)
print(f'Class (im)balance: {class_balance[0]/class_balance[1]:.4f}')

Class (im)balance: 16.3791


We have an extreme class imbalance.

# Metrics

In [13]:
from sklearn.metrics import accuracy_score
def evaluate(model, X_train, X_test, y_train, y_test):
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))

    print(f'Train accuracy: {train_acc:.4f}')
    print(f'Test accuracy: {test_acc:.4f}')

In [14]:
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # use SVD to reduce dimensionality
    # svd = TruncatedSVD(n_components=50)
    # X_train_reduced = svd.fit_transform(X_train_scaled)
    # X_test_reduced = svd.transform(X_test_scaled)
    
    # train_cols = [f'component_{i+1}' for i in range(X_train_reduced.shape[1])]
    # test_cols = [f'component_{i+1}' for i in range(X_test_reduced.shape[1])]
    
    # return pd.DataFrame(X_train_reduced, columns=train_cols), pd.DataFrame(X_test_reduced, columns=test_cols), y_train, y_test
    return scaler, pd.DataFrame(X_train_scaled, columns=X.columns), pd.DataFrame(X_test_scaled, columns=X.columns), y_train, y_test

In [15]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve

def plot_learning_curve(model, X, y):
    train_sizes, train_scores, val_scores = learning_curve(model, X, y, cv=5, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1, 10))
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)

    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, val_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation Accuracy')
    plt.fill_between(train_sizes, val_mean + val_std, val_mean - val_std, alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Training Examples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.show()

# Hyperparameter optimization

We have determined that using class weights gives us the best test accuracy. Now we will optimize hyperparameters to reduce variance.

In [25]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

def random_search(X_train, y_train, n_iter=100):
    param_dist = {
        'n_estimators': randint(50, 200),  # Integer range
        'max_depth': randint(2, 10),        # Integer range
        'learning_rate': uniform(0.01, 0.2), # Float range (0.01 to 0.21)
        'subsample': uniform(0.8, 0.2),    # Float range (0.8 to 1.0)
        'colsample_bytree': uniform(0.8, 0.2), # Float range (0.8 to 1.0)
        'gamma': uniform(0, 0.3),          # Float range (0 to 0.3)
        'reg_alpha': uniform(0, 0.5),      # Float range (0 to 0.5)
        'reg_lambda': uniform(0.1, 9.9),   # Float range (0.1 to 10.0)
        'device': ['cuda'],       # Use GPU acceleration
        'random_state': [42]
    }

    # Create a Random Forest Classifier
    model = XGBClassifier(device='cuda', random_state=42)

    # Set up Stratified K-Fold Cross-Validation
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Use RandomizedSearchn
    # CV with StratifiedKFold
    random_search = RandomizedSearchCV(
        model, 
        param_distributions=param_dist, 
        n_iter=n_iter,  # Number of parameter settings sampled
        cv=stratified_kfold,  # Use StratifiedKFold
        scoring='accuracy', 
        n_jobs=-1,   # Use all available cores
        random_state=42
    )

    # Fit the model to the training data
    random_search.fit(X_train, y_train)

    # Print the best parameters and best score
    print(f'Best parameters: {random_search.best_params_}')
    print(f'Best cross-validation score: {random_search.best_score_:.4f}')

    # Evaluate the model on the test set
    return random_search

In [26]:
from sklearn.model_selection import RandomizedSearchCV

estimators: dict[str, RandomizedSearchCV] = {}

## Undersample majority class

In [27]:
from sklearn.utils import resample
import numpy as np

# Assuming X_encoded is your sparse matrix
X_majority = X_encoded[y==0]
X_minority = X_encoded[y==1]

# Downsample the majority class to match the minority class size
X_majority_downsampled = resample(X_majority, 
                                  replace=False, 
                                  n_samples=X_minority.shape[0], 
                                  random_state=42)

# Stack the arrays vertically
X_resampled = np.vstack((X_majority_downsampled, X_minority))
X_resampled = pd.DataFrame(X_resampled, columns=X_encoded.columns)
y_resampled = np.concatenate([np.zeros(X_majority_downsampled.shape[0]), np.ones(X_minority.shape[0])]).astype('int64')

scaler, X_train, X_test, y_train, y_test = split_data(X_resampled, y_resampled)
estimators['undersampling'] = random_search(X_train, y_train)

Best parameters: {'colsample_bytree': 0.9788434525791851, 'device': 'cuda', 'gamma': 0.03862451948989317, 'learning_rate': 0.07601990266203108, 'max_depth': 4, 'n_estimators': 122, 'random_state': 42, 'reg_alpha': 0.04614529311437304, 'reg_lambda': 4.863339401157303, 'subsample': 0.9375569434192167}
Best cross-validation score: 0.9054


## Oversampling majority class

In [28]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
scaler, X_train, X_test, y_train, y_test = split_data(X_encoded, y)

X_train, y_train = smote.fit_resample(X_train, y_train)

estimators['oversampling'] = random_search(X_train, y_train)

Best parameters: {'colsample_bytree': 0.8321616102834998, 'device': 'cuda', 'gamma': 0.16462013680997584, 'learning_rate': 0.14837903953853868, 'max_depth': 9, 'n_estimators': 117, 'random_state': 42, 'reg_alpha': 0.11862454374840004, 'reg_lambda': 3.321457011776751, 'subsample': 0.9492982810236049}
Best cross-validation score: 0.9837


## Balanced learning

In [29]:
scaler, X_train, X_test, y_train, y_test = split_data(X_encoded, y)

estimators['no_sampling'] = random_search(X_train, y_train) 

Best parameters: {'colsample_bytree': 0.8509281309527528, 'device': 'cuda', 'gamma': 0.20445081666717876, 'learning_rate': 0.16204557197793734, 'max_depth': 4, 'n_estimators': 192, 'random_state': 42, 'reg_alpha': 0.23578809427507919, 'reg_lambda': 4.177225050057959, 'subsample': 0.8697736533085991}
Best cross-validation score: 0.9695


In [30]:
for name, estimator in estimators.items():
    print(name)
    evaluate(estimator.best_estimator_, X_train, X_test, y_train, y_test)
    print()

undersampling
Train accuracy: 0.5039
Test accuracy: 0.5021

oversampling
Train accuracy: 0.9796
Test accuracy: 0.9650

no_sampling
Train accuracy: 0.9732
Test accuracy: 0.9666



# Submission

In [31]:
scaler, _ , _, _, _ = split_data(X_encoded, y)

In [32]:
for name, estimator in estimators.items():
    predictions = estimator.best_estimator_.predict(scaler.transform(X_sub))
    submisison = pd.DataFrame({'id': test_df['id'], 'Depression': predictions})
    submisison.to_csv(f'data/submission-xgboost-{name}.csv', index=False)

# Save params to disk

In [33]:
params = {
    name: estimator.best_params_
    for name, estimator in estimators.items()
}
print(params)

{'undersampling': {'colsample_bytree': 0.9788434525791851, 'device': 'cuda', 'gamma': 0.03862451948989317, 'learning_rate': 0.07601990266203108, 'max_depth': 4, 'n_estimators': 122, 'random_state': 42, 'reg_alpha': 0.04614529311437304, 'reg_lambda': 4.863339401157303, 'subsample': 0.9375569434192167}, 'oversampling': {'colsample_bytree': 0.8321616102834998, 'device': 'cuda', 'gamma': 0.16462013680997584, 'learning_rate': 0.14837903953853868, 'max_depth': 9, 'n_estimators': 117, 'random_state': 42, 'reg_alpha': 0.11862454374840004, 'reg_lambda': 3.321457011776751, 'subsample': 0.9492982810236049}, 'no_sampling': {'colsample_bytree': 0.8509281309527528, 'device': 'cuda', 'gamma': 0.20445081666717876, 'learning_rate': 0.16204557197793734, 'max_depth': 4, 'n_estimators': 192, 'random_state': 42, 'reg_alpha': 0.23578809427507919, 'reg_lambda': 4.177225050057959, 'subsample': 0.8697736533085991}}


In [34]:
import json

with open('models/xgboost.json', 'w') as f:
    json.dump(params, f)