Step 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import torch

Step 2: Load the Dataset

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adilshamim8/student-depression-dataset")

print("Path to dataset files:", path)

df = pd.read_csv(path + "/student_depression_dataset.csv")
column_mapping = {
    "id": "id",
    "Gender": "gender",
    "Age": "age",
    "City": "city",
    "Profession": "profession",
    "Academic Pressure": "academic_pressure",
    "Work Pressure": "work_pressure",
    "CGPA": "cgpa",
    "Study Satisfaction": "study_satisfaction",
    "Job Satisfaction": "job_satisfaction",
    "Sleep Duration": "sleep_duration",
    "Dietary Habits": "dietary_habits",
    "Degree": "degree",
    "Have you ever had suicidal thoughts ?": "suicidal_thoughts",
    "Work/Study Hours": "work_study_hours",
    "Financial Stress": "financial_stress",
    "Family History of Mental Illness": "family_mental_illness",
    "Depression": "depression"
}
df = df.rename(columns=column_mapping)
print("Number of example: {}".format(len(df)))

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/jakub/.cache/kagglehub/datasets/adilshamim8/student-depression-dataset/versions/1
Number of example: 27901


Step 3: Data Cleaning & Preprocessing

3.1 Convert and Clean Data Types

In [3]:
# df.info()

In [4]:
# df.describe(include='all')

In [5]:
def extract_hours(s):
    # Find a number (including decimals)
    match = re.search(r"(\d+(\.\d+)?)", str(s))
    return float(match.group(1)) if match else np.nan

df['sleep_duration'] = df['sleep_duration'].apply(extract_hours)

# print(df[['sleep_duration', 'financial_stress']].head())

3.2 Check for Missing Values

In [6]:
missing_values = df.isnull().sum()
# print(missing_values)
df = df.dropna()
df = df[df["dietary_habits"] != "Others"]
df = df[df["financial_stress"] != "?"]


3.3 Drop not useful columns

In [7]:
df = df.drop(columns=['id', 'profession', 'job_satisfaction', 'work_pressure'])

3.4 Deal with categorical features

In [8]:
df['suicidal_thoughts'] = df['suicidal_thoughts'].map({'Yes': 1, 'No': 0})
df['family_mental_illness'] = df['family_mental_illness'].map({'Yes': 1, 'No': 0})

# To check
df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})


In [9]:
display(df["dietary_habits"].unique())
df['dietary_habits'] = df['dietary_habits'].map({'Unhealthy': 0, 'Moderate': 1, 'Healthy': 2})

array(['Healthy', 'Moderate', 'Unhealthy'], dtype=object)

In [10]:
from category_encoders import BinaryEncoder
# To check
# encoder = BinaryEncoder(cols=['city', 'degree'])
# encoder = BinaryEncoder(cols=['degree'])

# df_encoded = encoder.fit_transform(df)
df_encoded = df.copy()

df_encoded = df_encoded.drop(columns=['city', 'degree'])
# df_encoded.info()


In [11]:
# df_encoded.dtypes


In [12]:
# df["financial_stress"].value_counts()

In [13]:
# First convert financial_stress from object to float
# df_encoded['financial_stress'] = df_encoded['financial_stress'].astype(float)
# Now all columns are numeric, we can check


df_encoded = df_encoded.astype(float)

# Verify the conversion
print("All columns converted to float:")
print(df_encoded.dtypes)

All columns converted to float:
gender                   float64
age                      float64
academic_pressure        float64
cgpa                     float64
study_satisfaction       float64
sleep_duration           float64
dietary_habits           float64
suicidal_thoughts        float64
work_study_hours         float64
financial_stress         float64
family_mental_illness    float64
depression               float64
dtype: object


In [14]:
labels = df_encoded['depression'].copy()
# Convert labels to tensor
y = torch.tensor(labels.values, dtype=torch.float32)

X = df_encoded.drop(columns=['depression'])
X = torch.tensor(X.values, dtype=torch.float32)


In [15]:
from sklearn.model_selection import train_test_split

# Using sklearn's train_test_split with torch tensors
X_train_indices, X_test_indices = train_test_split(
    range(len(X)), test_size=0.2, random_state=42
)

# Get the corresponding tensors using the indices
X_train = X[X_train_indices]
X_test = X[X_test_indices]
y_train = y[X_train_indices]
y_test = y[X_test_indices]

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")

Training set: torch.Size([22294, 11]), torch.Size([22294])
Testing set: torch.Size([5574, 11]), torch.Size([5574])


In [16]:
mean = X_train.mean(dim=0, keepdim=True)
std = X_train.std(dim=0, keepdim=True)

# Avoid division by zero
eps = 1e-6
X_train_norm = (X_train - mean) / (std + eps)
X_test_norm  = (X_test  - mean) / (std + eps)

1. XGBoost

In [17]:
X_train_np = X_train_norm.cpu().numpy()
y_train_np = y_train.cpu().numpy()
X_test_np  = X_test_norm.cpu().numpy()
y_test_np  = y_test.cpu().numpy()

In [18]:
from xgboost import XGBClassifier

model = XGBClassifier(
  tree_method="auto",
  gpu_id=0,
  max_depth=4,
  learning_rate=0.1,
  objective="binary:logistic",
  eval_metric="logloss",
  early_stopping_rounds=3,
)

model.fit(
  X_train_np,
  y_train_np,
  eval_set=[(X_test_np, y_test_np)]
)

# Predictions
y_pred_np     = model.predict(X_test_np)
y_pred_tensor = torch.from_numpy(y_pred_np)


[0]	validation_0-logloss:0.63480
[1]	validation_0-logloss:0.59832
[2]	validation_0-logloss:0.56843
[3]	validation_0-logloss:0.54299
[4]	validation_0-logloss:0.52173
[5]	validation_0-logloss:0.50343
[6]	validation_0-logloss:0.48734
[7]	validation_0-logloss:0.47375
[8]	validation_0-logloss:0.46178
[9]	validation_0-logloss:0.45149
[10]	validation_0-logloss:0.44241
[11]	validation_0-logloss:0.43420
[12]	validation_0-logloss:0.42718
[13]	validation_0-logloss:0.42067
[14]	validation_0-logloss:0.41529
[15]	validation_0-logloss:0.41016
[16]	validation_0-logloss:0.40546
[17]	validation_0-logloss:0.40152
[18]	validation_0-logloss:0.39810
[19]	validation_0-logloss:0.39446
[20]	validation_0-logloss:0.39120
[21]	validation_0-logloss:0.38810
[22]	validation_0-logloss:0.38539
[23]	validation_0-logloss:0.38269
[24]	validation_0-logloss:0.38057
[25]	validation_0-logloss:0.37864
[26]	validation_0-logloss:0.37643
[27]	validation_0-logloss:0.37472
[28]	validation_0-logloss:0.37305
[29]	validation_0-loglos



In [19]:
from sklearn.metrics import accuracy_score

# Convert probabilities to binary predictions if needed
y_pred_binary = (y_pred_np > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test_np, y_pred_binary)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 84.89%


Neural Networks (Multi-Layer Perceptron)
