In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import optuna
import pandas as pd

# **PREPROCESSING**

In [4]:
# Load the training data (first CSV column becomes the index) and give the two
# long questionnaire columns short snake_case names.
column_aliases = {
    'Have you ever had suicidal thoughts ?': 'suicidal_thoughts',
    'Family History of Mental Illness': 'family_history',
}
df = pd.read_csv('train.csv', index_col=0).rename(columns=column_aliases)

In [5]:
# Derived features. The original row-wise `df.apply(..., axis=1)` calls are
# replaced by vectorised `Series.where`, which yields the same values without a
# Python-level loop over every row.
role = df['Working Professional or Student']
is_student = role == 'Student'
is_professional = role == 'Working Professional'

# job: students are labelled 'Student'; everyone else keeps their profession,
# and a missing profession is treated as 'Unemployed'.
df['job'] = df['Profession'].where(~is_student, 'Student').fillna('Unemployed')

# pressure / satisfaction: take the work-related column for working
# professionals and the academic/study column for every other row.
df['pressure'] = df['Work Pressure'].where(is_professional, df['Academic Pressure'])
df['satisfaction'] = df['Job Satisfaction'].where(is_professional, df['Study Satisfaction'])

# Collapse the raw degree names into coarse education levels.
degree_mapping = {
    'Class 12': 'High School',
    'B.Ed': 'Undergraduate',
    'B.Arch': 'Undergraduate',
    'B.Com': 'Undergraduate',
    'B.Pharm': 'Undergraduate',
    'BCA': 'Undergraduate',
    'BBA': 'Undergraduate',
    'BSc': 'Undergraduate',
    'B.Tech': 'Undergraduate',
    'LLB': 'Undergraduate',
    'BHM': 'Undergraduate',
    'BA': 'Undergraduate',
    'BE': 'Undergraduate',
    'MBBS': 'Undergraduate',
    'M.Ed': 'Postgraduate',
    'MCA': 'Postgraduate',
    'MSc': 'Postgraduate',
    'LLM': 'Postgraduate',
    'M.Pharm': 'Postgraduate',
    'M.Tech': 'Postgraduate',
    'MBA': 'Postgraduate',
    'ME': 'Postgraduate',
    'MHM': 'Postgraduate',
    'M.Com': 'Postgraduate',
    'MA': 'Postgraduate',
    'PhD': 'Doctorate',
    'MD': 'Doctorate'
}
# Any degree not listed above (including missing values) falls into 'other'.
df['deg'] = df['Degree'].map(degree_mapping).fillna('other')

In [6]:
# Columns routed to the categorical (impute + one-hot) branch.
categorical_features = [
    'Gender', 'job', 'Sleep Duration', 'Dietary Habits',
    'deg', 'suicidal_thoughts', 'family_history',
]

# Columns routed to the numerical (impute + scale) branch.
numerical_features = [
    'Age', 'pressure', 'satisfaction', 'Work/Study Hours', 'Financial Stress',
]

target = 'Depression'

# Stratified 80/20 hold-out split with a fixed seed.
feature_columns = numerical_features + categorical_features
Xtr, Xte, ytr, yte = train_test_split(
    df[feature_columns],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)

for split_name, split_data in (('Xtr', Xtr), ('Xte', Xte), ('ytr', ytr), ('yte', yte)):
    print(f'{split_name} shape {split_data.shape}')

Xtr shape (112560, 12)
Xte shape (28140, 12)
ytr shape (112560,)
yte shape (28140,)


# **SVM Benchmark**

In [5]:
from sklearn.svm import SVC

# Numerical branch: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode. Categories
# below the 10% frequency threshold are grouped as infrequent, and unseen
# categories at transform time are mapped to that group when it exists.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.1)),
])

# Route each feature list to its branch.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full model: preprocessing followed by a default SVC whose hyper-parameters
# are set later through `pipeline.set_params(classifier__...)`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC()),
])

In [None]:
def objective(trial):
    """Optuna objective for the SVM benchmark.

    Samples `C` (log-uniform in [1e-3, 1e3]) and the kernel (currently only
    'linear' is offered), writes them into the module-level `pipeline`, and
    returns the mean 3-fold cross-validated accuracy on `Xtr` / `ytr`.
    """
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear'])

    # gamma is only sampled for the rbf/poly kernels; for any other kernel the
    # SVC default 'scale' is passed through (not used for the linear kernel).
    gamma = (
        trial.suggest_float('gamma', 1e-4, 1e0, log=True)
        if kernel in ('rbf', 'poly')
        else 'scale'
    )

    svc_params = {
        'classifier__C': C,
        'classifier__kernel': kernel,
        'classifier__gamma': gamma,
    }
    pipeline.set_params(**svc_params)

    fold_accuracy = cross_val_score(pipeline, Xtr, ytr, cv=3, scoring='accuracy')
    return fold_accuracy.mean()

# Run the SVM hyper-parameter search. The sampler is seeded so the search is
# repeatable, in line with the fixed random_state used for the data split.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=10)

print(study.best_params)

best_params = study.best_params
pipeline.set_params(
    classifier__C=best_params['C'],
    classifier__kernel=best_params['kernel'],
    # `gamma` is only suggested for rbf/poly kernels, so it is absent from
    # best_params when the linear kernel wins; indexing it directly raised
    # KeyError. Fall back to the same 'scale' default the objective uses.
    classifier__gamma=best_params.get('gamma', 'scale')
)

# Refit on the full training split with the best parameters, then score once
# on the held-out test split.
pipeline.fit(Xtr, ytr)

test_accuracy = pipeline.score(Xte, yte)
print(f"Test accuracy: {test_accuracy:.5f}")

# **XGBoost**

In [None]:
from xgboost import XGBClassifier

# Recreate the stratified 80/20 split (same arguments and seed as the earlier
# split) so this section can be run on its own once `df` is prepared.
feature_columns = numerical_features + categorical_features
Xtr, Xte, ytr, yte = train_test_split(
    df[feature_columns],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)

for split_name, split_data in (('Xtr', Xtr), ('Xte', Xte), ('ytr', ytr), ('yte', yte)):
    print(f'{split_name} shape {split_data.shape}')

# Numerical branch: median imputation followed by standardisation.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation followed by one-hot encoding with
# categories under 10% frequency grouped as infrequent.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.1)),
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full model: preprocessing followed by a seeded XGBoost classifier whose
# hyper-parameters are set later through `pipeline.set_params(classifier__...)`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42)),
])

In [None]:
def objective(trial):
    """Optuna objective for the XGBoost model.

    Samples six XGBoost hyper-parameters, writes them into the module-level
    `pipeline`, and returns the mean 5-fold cross-validated accuracy on
    `Xtr` / `ytr`.

    Note: this redefines the `objective` name used by the SVM section above.
    """
    # The dict is evaluated top to bottom, so parameters are suggested in the
    # same order as before.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    pipeline.set_params(
        **{f'classifier__{name}': value for name, value in sampled.items()}
    )

    fold_accuracy = cross_val_score(
        pipeline, Xtr, ytr, cv=5, scoring='accuracy', n_jobs=-1
    )
    return fold_accuracy.mean()

# Run the XGBoost hyper-parameter search. The split and the classifier are
# already seeded with 42; seeding the sampler as well makes the sequence of
# suggested trials repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=500)

print("Best parameters:", study.best_params)

best_params = study.best_params
# Every key in best_params is a classifier hyper-parameter suggested by the
# objective, so each one is forwarded under the pipeline's 'classifier__' prefix.
pipeline.set_params(
    **{f'classifier__{name}': value for name, value in best_params.items()}
)

# Refit on the full training split with the best parameters, then score once
# on the held-out test split.
pipeline.fit(Xtr, ytr)

test_accuracy = pipeline.score(Xte, yte)
print(f"Test accuracy: {test_accuracy:.5f}")

# **TabPFN**

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tabpfn import TabPFNClassifier

# TabPFN is evaluated on a 1024-row random subsample of the engineered frame
# (presumably to stay within TabPFN's training-size limit; confirm against the
# installed version). The sample is seeded so the reported accuracy is repeatable.
testdf = df.sample(1024, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    testdf[numerical_features + categorical_features],
    testdf[target],
    stratify=testdf[target],
    test_size=0.2,
    random_state=42
)

# N_ensemble_configurations controls the number of model predictions that are ensembled with feature and class rotations (see the TabPFN authors' work for details).
# When N_ensemble_configurations > #features * #classes, no further averaging is applied.

# Same preprocessing as the other models, but without the StandardScaler on
# the numerical branch.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(min_frequency=0.1, handle_unknown='infrequent_if_exist'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)
labenc = LabelEncoder()

# Fit the preprocessing and the label encoding on the training part only.
X_train = preprocessor.fit_transform(X_train)
y_train = labenc.fit_transform(y_train)

classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)
classifier.fit(X_train, y_train, overwrite_warning=True)

X_test = preprocessor.transform(X_test)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)

# The classifier was trained on label-encoded targets, so the reference labels
# must be encoded the same way before scoring. Comparing against the raw
# y_test only worked while the raw labels happened to equal their encoding.
print('Accuracy', accuracy_score(labenc.transform(y_test), y_eval))

  return fn(*args, **kwargs)


Accuracy 0.9317073170731708


# **AutoGluon**

In [2]:
from autogluon.tabular import TabularPredictor
# AutoGluon baseline: train on the raw training CSV with default settings,
# then predict the 'Depression' label for the rows in the test CSV.
predictor = TabularPredictor(label='Depression')
predictor = predictor.fit("train.csv")
predictions = predictor.predict("test.csv")

No path specified. Models will be saved in: "AutogluonModels/ag-20241203_124844"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.0.0: Mon Aug 12 20:51:54 PDT 2024; root:xnu-11215.1.10~2/RELEASE_ARM64_T6000
CPU Count:          8
Memory Avail:       4.70 GB / 16.00 GB (29.3%)
Disk Space Avail:   203.87 GB / 460.43 GB (44.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and

# **MLJar**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

# NOTE: this rebinds `df` to the raw CSV contents, so the engineered columns
# added in the preprocessing section are not present from here on.
df = pd.read_csv(
    'train.csv',
    index_col=0
)

# Hold out 25% for evaluation. The split is seeded and stratified on the
# target, matching the other splits in this notebook, so the hold-out set is
# the same on every run and keeps the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Depression']),
    df['Depression'],
    test_size=0.25,
    stratify=df['Depression'],
    random_state=42
)

# MLJar AutoML in 'Compete' mode, optimising accuracy, with a 4-hour budget.
automl = AutoML(
    eval_metric='accuracy',
    mode='Compete',
    total_time_limit=60*60*4
)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

Linear algorithm was disabled.
AutoML directory: AutoML_5
The task is binary_classification with evaluation metric accuracy
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree accuracy 0.909694 trained in 4.17 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree accuracy 0.910391 trained in 24.05 seconds
2_DecisionTree accuracy 0.919744 trained in 2