# Installs and Imports

In [None]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import kagglehub

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Dataset 1 -> Depression

## Data Read

In [None]:
# Download latest version
path = kagglehub.dataset_download("hopesb/student-depression-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/student-depression-dataset


In [None]:
for filename in os.listdir(path):
    if filename.endswith(".csv"):  # Check if it's a CSV file
        filepath = os.path.join(path, filename)
        try:
            df_dep = pd.read_csv(filepath)
            print(f"Successfully read {filename} into a pandas DataFrame:")
            # print(df.head()) # Display first few rows of the dataframe
            break # Stop after reading the first CSV file found
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# If no CSV file is found
else:
    print("No CSV files found in the specified directory.")

Successfully read Student Depression Dataset.csv into a pandas DataFrame:


In [None]:
df_dep.drop(columns=['id', 'Profession', 'Job Satisfaction', 'City'], axis=1, inplace=True)

In [None]:
df_dep.dropna(subset=['Financial Stress'], inplace=True)

In [None]:
df_dep.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,0.0,8.97,2.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,0.0,5.9,5.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,0.0,7.03,5.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,0.0,5.59,2.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,0.0,8.13,3.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [None]:
# Gender
df_dep['Gender'] = df_dep['Gender'].map({'Male': 0, 'Female': 1})

# Sleep Duration
df_dep['Sleep Duration'] = df_dep['Sleep Duration'].apply(lambda x: 0 if x == "Less than 5 hours" else 1)

# Dietary Habits
df_dep['Dietary Habits'] = df_dep['Dietary Habits'].map({'Healthy': 1, 'Unhealthy': 0})

# Convert columns to integer type
for col in ['Age', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']:
    df_dep[col] = df_dep[col].astype(int)

# Family History of Mental Illness
df_dep['Family History of Mental Illness'] = df_dep['Family History of Mental Illness'].map({'No': 0, 'Yes': 1})

df_dep['Have you ever had suicidal thoughts ?'] = df_dep['Have you ever had suicidal thoughts ?'].map({'Yes': 1, 'No': 0})

In [None]:
# ─── 2. mappings ───────────────────────────────────────────────────────────
# map to full-text for better embeddings
full_form = {
    'B.Pharm':    'Bachelor of Pharmacy',
    'BSc':        'Bachelor of Science',
    'BA':         'Bachelor of Arts',
    'BCA':        'Bachelor of Computer Applications',
    'M.Tech':     'Master of Technology',
    'PhD':        'Doctor of Philosophy',
    'Class 12':   'Class 12 education',
    'B.Ed':       'Bachelor of Education',
    'LLB':        'Bachelor of Laws',
    'BE':         'Bachelor of Engineering',
    'M.Ed':       'Master of Education',
    'MSc':        'Master of Science',
    'BHM':        'Bachelor of Hotel Management',
    'M.Pharm':    'Master of Pharmacy',
    'MCA':        'Master of Computer Applications',
    'MA':         'Master of Arts',
    'B.Com':      'Bachelor of Commerce',
    'MD':         'Doctor of Medicine',
    'MBA':        'Master of Business Administration',
    'MBBS':       'Bachelor of Medicine and Bachelor of Surgery',
    'M.Com':      'Master of Commerce',
    'B.Arch':     'Bachelor of Architecture',
    'LLM':        'Master of Laws',
    'B.Tech':     'Bachelor of Technology',
    'BBA':        'Bachelor of Business Administration',
    'ME':         'Master of Engineering',
    'MHM':        'Master of Hotel Management',
    'Others':     'Other degree'
}

# categorize into broad “level” buckets
level_map = {
    'Class 12': '0',
    **{k: '1' for k in
       ['B.Pharm','BSc','BA','BCA','B.Ed','LLB','BE','BHM','B.Com','MBBS','B.Arch','B.Tech','BBA']},
    **{k: '2' for k in
       ['M.Tech','M.Ed','MSc','M.Pharm','MCA','MA','MBA','M.Com','ME','MHM','LLM']},
    'PhD': '3',
    'MD': '3',
    'Others': '0'
}

# and into “domain” buckets
domain_map = {
    'B.Pharm':'Pharmacy','M.Pharm':'Pharmacy',
    'BSc':'Science','MSc':'Science',
    'BA':'Arts','MA':'Arts',
    'BCA':'Computer Applications','MCA':'Computer Applications',
    'M.Tech':'Technology','B.Tech':'Technology',
    'BE':'Engineering','ME':'Engineering',
    'LLB':'Law','LLM':'Law',
    'B.Ed':'Education','M.Ed':'Education',
    'BBA':'Business','MBA':'Business',
    'B.Com':'Commerce','M.Com':'Commerce',
    'MBBS':'Medicine','MD':'Medicine',
    'B.Arch':'Architecture',
    'BHM':'Hospitality','MHM':'Hospitality',
    'Class 12':'High School','PhD':'Research','Others':'Other'
}

In [None]:
# ─── 3. apply mappings ──────────────────────────────────────────────────────
df_dep['degree_text']   = df_dep['Degree'].map(full_form).fillna(df_dep['Degree'])
df_dep['degree_level']  = df_dep['Degree'].map(level_map).fillna('Other')
df_dep['degree_domain'] = df_dep['Degree'].map(domain_map).fillna('Other')

In [None]:
# ─── 4. compute embeddings ─────────────────────────────────────────────────
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df_dep['degree_text'].tolist(),
                          convert_to_numpy=True,
                          show_progress_bar=True)  # shape (n, 384)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/872 [00:00<?, ?it/s]

In [None]:
# ─── 5. cluster those embeddings ────────────────────────────────────────────
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_dep['degree_cluster'] = kmeans.fit_predict(embeddings)

In [None]:
# ─── 6. (optional) reduce dims via PCA → new columns ────────────────────────
n_pca = 5
pca = PCA(n_components=n_pca, random_state=42)
emb_pca = pca.fit_transform(embeddings)  # shape (n, n_pca)
for i in range(n_pca):
    df_dep[f'degree_emb_pca_{i+1}'] = emb_pca[:, i]

In [None]:
df_dep.dropna(subset=['Dietary Habits'], inplace=True)

In [None]:
df_dep.drop(columns=['degree_text', 'Degree', 'degree_domain'], errors='ignore', inplace =True)

In [None]:
df_dep.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,degree_level,degree_cluster,degree_emb_pca_1,degree_emb_pca_2,degree_emb_pca_3,degree_emb_pca_4,degree_emb_pca_5
0,0,33,5.0,0.0,8.97,2,1,1.0,1,3,1,0,1,1,1,0.253879,0.243325,0.343663,-0.337468,-0.045405
2,0,31,3.0,0.0,7.03,5,0,1.0,0,9,1,1,0,1,1,0.221702,0.239069,0.015781,0.184283,0.126231
5,0,29,2.0,0.0,5.7,3,0,1.0,0,4,1,0,0,3,2,0.061047,-0.276469,0.533775,-0.294401,0.217425
6,0,30,3.0,0.0,9.54,4,1,1.0,0,1,2,0,0,1,1,0.235426,0.208007,0.092394,0.084074,0.243518
7,1,30,2.0,0.0,8.04,4,0,0.0,0,0,1,1,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673


In [None]:
X = df_dep.drop('Depression', axis=1)
y = df_dep['Depression']

# Split data into training and temporary sets (test + validation)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split temporary data into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


print("Training data shape:", X_train.shape, y_train.shape)
print("Validation data shape:", X_val.shape, y_val.shape)
print("Test data shape:", X_test.shape, y_test.shape)

# Scale numerical features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)



Training data shape: (10779, 19) (10779,)
Validation data shape: (3593, 19) (3593,)
Test data shape: (3593, 19) (3593,)


In [None]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Now you can use X_train_scaled, X_val_scaled, and X_test_scaled in your models


## Baseline Models

In [None]:
classification_models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
}

classification_params = {
    'Decision Tree': {'max_depth': [3,]},
    'Random Forest': {'n_estimators': [50], 'max_depth': [3]},
    'Gradient Boosting': {'n_estimators': [50], 'learning_rate': [0.01]},
    'Bagging': {'n_estimators': [50]},
    'K-Nearest Neighbors': {'n_neighbors': [5]},
    'Support Vector Machine': {'C': [0.1], 'kernel': ['linear']},
}

classification_results = []

for model_name in classification_models:
    model = classification_models[model_name]

    if model_name in classification_params:
        grid_search = GridSearchCV(model, classification_params[model_name], cv=5)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        training_time = grid_search.refit_time_
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        classification_results.append([model_name, best_params, training_time, accuracy])


results_df = pd.DataFrame(classification_results, columns=['Model', 'Best Parameters', 'Training Time (s)', 'Accuracy'])

results_df

Unnamed: 0,Model,Best Parameters,Training Time (s),Accuracy
0,Decision Tree,{'max_depth': 3},0.058978,0.820763
1,Random Forest,"{'max_depth': 3, 'n_estimators': 50}",0.346031,0.822989
2,Gradient Boosting,"{'learning_rate': 0.01, 'n_estimators': 50}",0.973771,0.807125
3,Bagging,{'n_estimators': 50},3.569257,0.831339
4,K-Nearest Neighbors,{'n_neighbors': 5},0.014994,0.801559
5,Support Vector Machine,"{'C': 0.1, 'kernel': 'linear'}",3.606805,0.847481


## DL BASELINE

In [None]:
# Assuming 'X' and 'y' are already defined from the previous code
# Convert data to PyTorch tensors
# X = torch.tensor(X_scaled, dtype=torch.float32)
X = torch.tensor(torch.tensor(X.values.astype(np.float32)))
y = torch.tensor(y.astype(int), dtype=torch.long)  # Assuming y is categorical


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Sequential model
class SequentialModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SequentialModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out


# Model parameters
input_size = X_train.shape[1]
hidden_size_seq = 64
output_size = len(np.unique(y))  # Number of unique classes
hidden_size_lstm = 32
num_layers_lstm = 2


# Initialize models and optimizers
seq_model = SequentialModel(input_size, hidden_size_seq, output_size)
lstm_model = LSTMModel(input_size, hidden_size_lstm, num_layers_lstm, output_size)

criterion = nn.CrossEntropyLoss()
seq_optimizer = optim.Adam(seq_model.parameters(), lr=0.001)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Training loop (example - adjust epochs and batch size as needed)
num_epochs = 10
batch_size = 32

  X = torch.tensor(torch.tensor(X.values.astype(np.float32)))


In [None]:
# Training loop for Sequential Model
for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        seq_optimizer.zero_grad()
        batch_X = X_train[i:i + batch_size]
        batch_y = y_train[i:i + batch_size]
        outputs = seq_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        seq_optimizer.step()
    print(f'Sequential Model - Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing loop for Sequential Model
with torch.no_grad():
    seq_outputs = seq_model(X_test)
    _, seq_predicted = torch.max(seq_outputs, 1)
    seq_accuracy = (seq_predicted == y_test).sum().item() / len(y_test)
    print(f'Sequential Model Accuracy: {seq_accuracy:.4f}')


Sequential Model - Epoch [1/10], Loss: 0.3016
Sequential Model - Epoch [2/10], Loss: 0.3102
Sequential Model - Epoch [3/10], Loss: 0.3099
Sequential Model - Epoch [4/10], Loss: 0.3052
Sequential Model - Epoch [5/10], Loss: 0.3012
Sequential Model - Epoch [6/10], Loss: 0.2973
Sequential Model - Epoch [7/10], Loss: 0.2954
Sequential Model - Epoch [8/10], Loss: 0.2890
Sequential Model - Epoch [9/10], Loss: 0.2872
Sequential Model - Epoch [10/10], Loss: 0.2837
Sequential Model Accuracy: 0.8475


In [None]:
# Training loop for LSTM Model
for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        lstm_optimizer.zero_grad()
        batch_X = X_train[i:i + batch_size].unsqueeze(1) # Reshape for LSTM input
        batch_y = y_train[i:i + batch_size]
        lstm_outputs = lstm_model(batch_X)
        loss = criterion(lstm_outputs, batch_y)
        loss.backward()
        lstm_optimizer.step()
    print(f'LSTM Model - Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


# Testing loop for LSTM model
with torch.no_grad():
    lstm_outputs = lstm_model(X_test.unsqueeze(1)) # Reshape for LSTM input
    _, lstm_predicted = torch.max(lstm_outputs, 1)
    lstm_accuracy = (lstm_predicted == y_test).sum().item() / len(y_test)
    print(f'LSTM Model Accuracy: {lstm_accuracy:.4f}')

LSTM Model - Epoch [1/10], Loss: 0.2744
LSTM Model - Epoch [2/10], Loss: 0.2587
LSTM Model - Epoch [3/10], Loss: 0.2510
LSTM Model - Epoch [4/10], Loss: 0.2462
LSTM Model - Epoch [5/10], Loss: 0.2429
LSTM Model - Epoch [6/10], Loss: 0.2407
LSTM Model - Epoch [7/10], Loss: 0.2395
LSTM Model - Epoch [8/10], Loss: 0.2386
LSTM Model - Epoch [9/10], Loss: 0.2380
LSTM Model - Epoch [10/10], Loss: 0.2371
LSTM Model Accuracy: 0.8500


# Dataset 2 -> Exam Scores

## Data Read And Pre-processing

In [None]:
# Download latest version
path = kagglehub.dataset_download("desalegngeb/students-exam-scores")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/students-exam-scores


In [None]:
path = "/root/.cache/kagglehub/datasets/desalegngeb/students-exam-scores/versions/2"

In [None]:
for filename in os.listdir(path):
    if filename.endswith(".csv"):  # Check if it's a CSV file
        filepath = os.path.join(path, filename)
        try:
            df_exam = pd.read_csv(filepath)
            print(f"Successfully read {filename} into a pandas DataFrame:")
            # print(df.head()) # Display first few rows of the dataframe
            break # Stop after reading the first CSV file found
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# If no CSV file is found
else:
    print("No CSV files found in the specified directory.")

Successfully read Expanded_data_with_more_features.csv into a pandas DataFrame:


In [None]:
df_exam.dropna(inplace=True)

In [None]:
df_exam.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# Gender
df_exam['Gender'] = df_exam['Gender'].map({'male': 0, 'female': 1})

# EthnicGroup
ethnic_mapping = {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4}
df_exam['EthnicGroup'] = df_exam['EthnicGroup'].map(ethnic_mapping)

# ParentEdu
parent_edu_mapping = {
    'high school': 0, 'some high school': 0,
    'some college': 1, "associate's degree": 1,
    "bachelor's degree": 2, "master's degree": 3
}
df_exam['ParentEduc'] = df_exam['ParentEduc'].map(parent_edu_mapping)

# Encode LunchType, TestPrep, IsFirstChild, TransportMeans as 0, 1
df_exam['LunchType'] = df_exam['LunchType'].map({'standard': 1, 'free/reduced': 0})
df_exam['TestPrep'] = df_exam['TestPrep'].map({'completed': 1, 'none': 0})
df_exam['IsFirstChild'] = df_exam['IsFirstChild'].map({'yes': 1, 'no': 0})

# ParentMaritalStatus
marital_mapping = {'single': 0, 'divorced': 1, 'widowed': 2, 'married': 3}
df_exam['ParentMaritalStatus'] = df_exam['ParentMaritalStatus'].map(marital_mapping)

# PracticeSport
sport_mapping = {'never': 0, 'sometimes': 1, 'regularly': 2}
df_exam['PracticeSport'] = df_exam['PracticeSport'].map(sport_mapping)

# WklyStudyHours
study_mapping = {'< 5': 0, '5 - 10': 1, '> 10': 2}
df_exam['WklyStudyHours'] = df_exam['WklyStudyHours'].map(study_mapping)

# TransportMeans
transport_mapping = {'private' : 1, 'school_bus' : 0}
df_exam['TransportMeans'] = df_exam['TransportMeans'].map(transport_mapping)

In [None]:
df_exam.columns

Index(['Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep',
       'ParentMaritalStatus', 'PracticeSport', 'IsFirstChild', 'NrSiblings',
       'TransportMeans', 'WklyStudyHours', 'MathScore', 'ReadingScore',
       'WritingScore'],
      dtype='object')

In [None]:
target_cols = ['MathScore','ReadingScore','WritingScore']
X = df_exam.drop(columns=target_cols)
y = df_exam[target_cols]

# 2. split into train / temp, then val / test (70 / 15 / 15)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

In [None]:
# 3. fit scaler only on train, apply to all
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

## Baseline Models

In [None]:
# helper to train & eval any regressor
def eval_model(reg, X_tr, y_tr, X_v, y_v, X_t, y_t):
    reg.fit(X_tr, y_tr)
    preds = reg.predict(X_v)
    mse  = mean_squared_error(y_v, preds)
    mae  = mean_absolute_error(y_v, preds)

    preds_t = reg.predict(X_t)
    mse_t  = mean_squared_error(y_t, preds_t)
    mae_t  = mean_absolute_error(y_t, preds_t)
    return mse, mae, mse_t, mae_t

In [None]:
# list of (name, estimator)
models = [
    ("DummyMean", MultiOutputRegressor(DummyRegressor(strategy="mean"))),
    ("LinearReg", MultiOutputRegressor(LinearRegression())),
    ("RidgeReg", MultiOutputRegressor(Ridge(alpha=1.0))),
    ("RandomForest", MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))),
    ("KNN", MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5))),
]

In [None]:
results = []
for name, est in models:
    mse, mae, mse_t, mae_t = eval_model(est, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)
    results.append((name, mse, mae, mse_t, mae_t))

# print a simple table
print("Model\t\tVal MSE\t\tVal MAE\t\tTest MSE\t\tTestMAE")
for name, mse, mae, mse_t, mae_t in results:
    print(f"{name:13s}{mse:12.4f}{mae:12.4f}{mse:12.4f}{mae:12.4f}")

Model		Val MSE		Val MAE		Test MSE		TestMAE
DummyMean        235.1882     12.4193    235.1882     12.4193
LinearReg        166.2936     10.4276    166.2936     10.4276
RidgeReg         166.2938     10.4277    166.2938     10.4277
RandomForest     207.9602     11.6051    207.9602     11.6051
KNN              200.6841     11.3686    200.6841     11.3686


### DL

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# 1. prepare tensors & dataloaders
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_tr = torch.tensor(X_train_scaled, dtype=torch.float32)
y_tr = torch.tensor(y_train.values,   dtype=torch.float32)
X_va = torch.tensor(X_val_scaled,   dtype=torch.float32)
y_va = torch.tensor(y_val.values,   dtype=torch.float32)
X_te = torch.tensor(X_test_scaled,  dtype=torch.float32)
y_te = torch.tensor(y_test.values,  dtype=torch.float32)

train_ds = TensorDataset(X_tr, y_tr)
val_ds   = TensorDataset(X_va, y_va)
test_ds  = TensorDataset(X_te, y_te)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(test_ds,  batch_size=64)

# 2. define a simple MLP
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.net(x)

model = MLP(input_dim=X_tr.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 3. training loop with val check
n_epochs = 50
for epoch in range(1, n_epochs+1):
    # train
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # val
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            val_losses.append(criterion(model(xb), yb).item())
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch}/{n_epochs}, Train Loss: {loss.item():.4f}, Val Loss: {sum(val_losses)/len(val_losses):.4f}")

# 4. final evaluation on test set
model.eval()
test_losses = []
test_maes   = []
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb)
        test_losses.append(criterion(out, yb).item())
        test_maes.append(torch.mean(torch.abs(out - yb)).item())

print(f"Test MSE: {sum(test_losses)/len(test_losses):.4f}, Test MAE: {sum(test_maes)/len(test_maes):.4f}")


Epoch 1/50, Train Loss: 245.6289, Val Loss: 213.7299
Epoch 10/50, Train Loss: 175.7169, Val Loss: 170.7773
Epoch 20/50, Train Loss: 130.6596, Val Loss: 171.1258
Epoch 30/50, Train Loss: 150.9118, Val Loss: 169.1565
Epoch 40/50, Train Loss: 189.5444, Val Loss: 167.5364
Epoch 50/50, Train Loss: 182.4652, Val Loss: 168.3983
Test MSE: 167.4233, Test MAE: 10.5138


### TabNET

In [None]:
# 2. imports
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [None]:
X_train = X_train_scaled
X_val   = X_val_scaled
X_test  = X_test_scaled

y_train_np = y_train.values
y_val_np   = y_val.values
y_test_np  = y_test.values

# 4. define and configure the TabNet regressor
tabnet = TabNetRegressor(
    n_d=16,              # width of the decision prediction layer
    n_a=16,              # width of the attention embedding for each mask
    n_steps=5,           # number of decision steps
    gamma=1.5,           # relaxation parameter
    lambda_sparse=1e-3,  # sparsity regularization
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr":2e-2, "weight_decay":1e-5},
    mask_type="entmax"   # “sparsemax” or “entmax”
)




In [None]:

tabnet.fit(
    X_train=X_train, y_train=y_train_np,
    eval_set=[(X_val, y_val_np)],
    eval_name=["val"],
    eval_metric=["rmse"],        # root‐mean‐squared error
    max_epochs=100,
    patience=15,                 # stop if no val‐improvement in 15 epochs
    batch_size=256,
    virtual_batch_size=64,
    num_workers=0,
    drop_last=False
)

# 6. evaluate on the test set
y_pred = tabnet.predict(X_test)

mse = mean_squared_error(y_test_np, y_pred)
mae = mean_absolute_error(y_test_np, y_pred)
print(f"TabNet Test MSE: {mse:.4f}, Test MAE: {mae:.4f}")

epoch 0  | loss: 1635.41355| val_rmse: 15.94337|  0:00:04s
epoch 1  | loss: 198.94999| val_rmse: 13.97863|  0:00:07s
epoch 2  | loss: 184.3098| val_rmse: 13.74509|  0:00:09s
epoch 3  | loss: 180.55917| val_rmse: 13.34957|  0:00:12s
epoch 4  | loss: 175.67549| val_rmse: 13.21869|  0:00:15s
epoch 5  | loss: 174.08982| val_rmse: 13.22999|  0:00:19s
epoch 6  | loss: 174.55972| val_rmse: 13.1372 |  0:00:21s
epoch 7  | loss: 171.96945| val_rmse: 13.04678|  0:00:24s
epoch 8  | loss: 171.21347| val_rmse: 13.19247|  0:00:27s
epoch 9  | loss: 170.57157| val_rmse: 13.11344|  0:00:29s
epoch 10 | loss: 171.87712| val_rmse: 13.05556|  0:00:32s
epoch 11 | loss: 169.67999| val_rmse: 13.18763|  0:00:34s
epoch 12 | loss: 169.23765| val_rmse: 13.12004|  0:00:36s
epoch 13 | loss: 168.41214| val_rmse: 13.0989 |  0:00:40s
epoch 14 | loss: 167.72487| val_rmse: 12.92999|  0:00:42s
epoch 15 | loss: 166.86166| val_rmse: 12.93946|  0:00:44s
epoch 16 | loss: 167.76953| val_rmse: 13.00476|  0:00:47s
epoch 17 | los



TabNet Test MSE: 167.8074, Test MAE: 10.5165


In [None]:
# prompt: df_dep['Gender'] = df_dep['Gender'].apply(lambda x: 0 if x <= 0 else 1) do this for all the binary columns

binary_cols = ['Gender', 'Family History of Mental Illness', 'Have you ever had suicidal thoughts ?']

for col in binary_cols:
    df_dep[col] = df_dep[col].apply(lambda x: 0 if x <= 0 else 1)
