In [1]:
import pandas as pd
from src.causal_graph import *
from src.scm.feedforward_ncm import FF_NCM
from src.scm.distribution import *
from torch.utils.data import DataLoader
from torch.utils.data._utils.collate import default_collate
from src.data import NCMDataset

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split

# Read the depression survey table from disk and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='data/df_dep.csv')
df.head(n=10)

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,degree_level,degree_cluster,degree_emb_pca_1,degree_emb_pca_2,degree_emb_pca_3,degree_emb_pca_4,degree_emb_pca_5
0,0,33,5.0,0.0,8.97,2,1,1.0,1,3,1,0,1,1,4,0.253879,0.243325,0.343663,-0.337468,-0.045405
1,0,31,3.0,0.0,7.03,5,0,1.0,0,9,1,1,0,1,1,0.221702,0.239069,0.015781,0.184283,0.126231
2,0,29,2.0,0.0,5.7,3,0,1.0,0,4,1,0,0,3,2,0.061047,-0.276469,0.533775,-0.294401,0.217425
3,0,30,3.0,0.0,9.54,4,1,1.0,0,1,2,0,0,1,4,0.235426,0.208007,0.092394,0.084074,0.243518
4,1,30,2.0,0.0,8.04,4,0,0.0,0,0,1,1,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
5,0,33,3.0,0.0,7.03,4,0,1.0,1,10,2,1,0,1,4,0.268271,0.238251,-0.039036,0.147155,0.202336
6,1,19,2.0,0.0,8.52,4,0,0.0,0,6,2,1,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
7,0,25,5.0,0.0,6.51,2,0,0.0,1,2,5,1,1,2,2,-0.032905,-0.350849,0.032525,0.216158,0.021605
8,1,20,5.0,0.0,7.25,3,1,1.0,1,10,3,0,1,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
9,0,19,2.0,0.0,7.83,2,1,0.0,0,6,3,0,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673


# Causal Model

In [2]:
# Column roles handed to create_expanded_sfm(X, Z, W, Y) in the next cell.
# NOTE(review): in the usual standard-fairness-model convention X is the
# protected attribute, Z the confounders, W the mediators and Y the outcome --
# confirm against src.causal_graph before relying on this reading.
X = 'Gender'
Y = 'Depression'

Z = ['Age', 'Family History of Mental Illness', 'Sleep Duration']

W = [
    # pressure / performance / lifestyle columns
    'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
    'Dietary Habits', 'Have you ever had suicidal thoughts ?',
    'Work/Study Hours', 'Financial Stress',
    # degree descriptors and their PCA embedding components
    'degree_level', 'degree_cluster',
    'degree_emb_pca_1', 'degree_emb_pca_2', 'degree_emb_pca_3',
    'degree_emb_pca_4', 'degree_emb_pca_5',
]

In [3]:
# Seed torch *before* the model is built so its initial weights are the same
# on every Restart & Run All. The only other seed in this notebook is set
# right before training, i.e. after the parameters already exist.
# NOTE(review): assumes FF_NCM draws its initial parameters from torch's
# global RNG -- confirm in src.scm.feedforward_ncm.
torch.manual_seed(0)

# Build the graph object from the column roles, then the feed-forward NCM on it.
standard_fairness_model = create_expanded_sfm(X, Z, W, Y)
ncm = FF_NCM(standard_fairness_model)

In [46]:
def train_ncm(model, dataloader, loss_fns, optimizer, device, num_epochs=10):
    """Train every node function of ``model`` on the batches of ``dataloader``.

    NOTE: this notebook defines ``train_ncm`` a second time in a later cell;
    whichever definition was executed last is the one actually called.

    Args:
        model: object exposing ``to``/``train``, ``v`` (iterable of node
            names), ``cg.pa`` and ``cg.v2c2`` (per-node key lists), ``f``
            (per-node callables taking ``pa=`` and ``u=``) and
            ``pu.sample(n=..., device=...)``.
        dataloader: sized iterable of batches. A batch is either a dict of
            tensors keyed by variable name, or a list of samples that is
            collated with ``default_collate`` first.
        loss_fns: mapping from node name to a loss callable
            ``(prediction, target) -> scalar tensor``.
        optimizer: optimizer stepped once per batch.
        device: device the model and every batch tensor are moved to.
        num_epochs: number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    model.to(device)
    model.train()

    def node_loss(node, observed, noise):
        # Feed the node its observed parents plus the noise entries the graph
        # lists for it, and score the prediction against the observed value.
        parents = {name: observed[name] for name in model.cg.pa[node]}
        exogenous = {name: noise[name] for name in model.cg.v2c2[node]}
        prediction = model.f[node](pa=parents, u=exogenous)
        return loss_fns[node](prediction, observed[node])

    for epoch in range(1, num_epochs + 1):
        running_loss = 0.0

        for raw_batch in dataloader:
            # A list of samples still has to be stacked into batched tensors.
            collated = default_collate(raw_batch) if isinstance(raw_batch, list) else raw_batch
            observed = {name: tensor.to(device) for name, tensor in collated.items()}

            # One noise draw per row; the row count is read off the first tensor.
            first_tensor = next(iter(observed.values()))
            noise = model.pu.sample(n=first_tensor.shape[0], device=device)

            # Joint objective: the per-node losses simply add up.
            step_loss = sum((node_loss(node, observed, noise) for node in model.v), 0.0)

            optimizer.zero_grad()
            step_loss.backward()
            optimizer.step()

            running_loss += step_loss.item()

        avg_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model

In [4]:
# Separate the target from the feature columns.
y = df['Depression']
x = df.drop(columns='Depression')

# 60 / 20 / 20 split: hold out 40% first, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.4, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Training data shape:", X_train.shape, y_train.shape)
print("Validation data shape:", X_val.shape, y_val.shape)
print("Test data shape:", X_test.shape, y_test.shape)

# Standardisation statistics come from the training split only and are then
# re-used, unchanged, for the validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

Training data shape: (10779, 19) (10779,)
Validation data shape: (3593, 19) (3593,)
Test data shape: (3593, 19) (3593,)


In [5]:
# The scaler returns bare arrays: put the column names back and append the
# (unscaled) target as the last column of each split.
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns).assign(
    Depression=y_train.values
)
val_df = pd.DataFrame(X_val_scaled, columns=X_val.columns).assign(
    Depression=y_val.values
)
test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns).assign(
    Depression=y_test.values
)

# Every feature column followed by the target.
variables = [*X_train.columns, "Depression"]

In [13]:
binary_cols = ['Gender', 'Family History of Mental Illness', 'Have you ever had suicidal thoughts ?']

# train_df / val_df / test_df were built from StandardScaler output, so these
# columns no longer hold 0/1. Map them back: values <= 0 become 0, everything
# else becomes 1.
# NOTE(review): this recovers the original labels only if both classes occur
# in the training split (so the fitted mean lies strictly between 0 and 1).
#
# Vectorised comparison instead of a per-element Python lambda. `~(s <= 0)` is
# used rather than `s > 0` so that NaN still maps to 1, exactly as the
# element-wise `0 if x <= 0 else 1` did.
for col in binary_cols:
    train_df[col] = (~train_df[col].le(0)).astype('int64')
    test_df[col] = (~test_df[col].le(0)).astype('int64')
    val_df[col] = (~val_df[col].le(0)).astype('int64')

In [14]:
# Spot-check the scaled features and the re-binarised columns.
train_df.head(n=5)

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,degree_level,degree_cluster,degree_emb_pca_1,degree_emb_pca_2,degree_emb_pca_3,degree_emb_pca_4,degree_emb_pca_5,Depression
0,0,1.080453,-0.847551,-0.009632,0.998851,1.518994,-1.56537,1.157674,1,-0.30986,-1.505205,1,-1.412209,-1.457237,-1.828773,0.312362,-0.125115,-0.106898,-0.093296,0
1,1,-0.148849,-0.123239,-0.009632,-0.270247,1.518994,0.638826,-0.863801,1,1.305217,0.582904,1,-0.16028,1.136053,0.722132,0.853257,-0.16993,0.808998,1.123598,1
2,0,1.285336,1.325387,-0.009632,0.555349,0.779295,0.638826,-0.863801,1,-0.848219,-1.505205,1,1.091648,-0.160592,0.41664,-1.33356,-0.364663,1.161359,0.505003,0
3,1,1.080453,1.325387,-0.009632,-1.55299,0.779295,-1.56537,-0.863801,0,1.305217,1.27894,0,1.091648,-0.160592,0.373413,-1.413536,-0.276702,0.04971,-1.908243,1
4,1,0.465802,1.325387,-0.009632,-0.181546,-1.439801,0.638826,-0.863801,1,0.228499,-0.113132,1,1.091648,-0.160592,0.23003,-1.748632,0.36276,0.806814,0.963826,1


In [15]:
# def collate1(batch_list):
#     inputs = torch.stack([item[0] for item in batch_list], dim=0)
#     labels = torch.stack([item[1] for item in batch_list], dim=0)
#     return {'inputs': inputs, 'labels': labels}

from src.data import NCMDataset
def train_ncm(model, dataloader, loss_fns, optimizer, device, num_epochs=10):
    """Jointly train all node functions of ``model`` on ``dataloader``.

    For every batch, one noise sample per row is drawn from ``model.pu``; each
    node ``v`` in ``model.v`` is then predicted by ``model.f[v]`` from the
    observed values of ``model.cg.pa[v]`` and the noise entries listed in
    ``model.cg.v2c2[v]``. The per-node losses are summed and the optimizer
    takes a single step on that sum.

    Args:
        model: object exposing ``to``/``train``, ``v``, ``cg.pa``, ``cg.v2c2``,
            ``f`` and ``pu.sample(n=..., device=...)``.
        dataloader: iterable of batches, each a dict of tensors keyed by
            variable name. It is re-iterated once per epoch and does not need
            to define ``__len__``.
        loss_fns: mapping from node name to ``(prediction, target) -> loss``.
        optimizer: optimizer over the model's parameters, stepped per batch.
        device: device the model and every batch tensor are moved to.
        num_epochs: number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    model.to(device)
    model.train()

    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            # batch is a dict of batched tensors
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = next(iter(batch.values())).shape[0]

            # sample exogenous noise
            u = model.pu.sample(n=batch_size, device=device)

            # accumulate loss over all nodes
            total_loss = 0.0
            for v in model.v:
                pa_keys = model.cg.pa[v]
                pa_vals = {k: batch[k] for k in pa_keys}

                u_keys = model.cg.v2c2[v]
                u_vals = {k: u[k] for k in u_keys}

                pred_v = model.f[v](pa=pa_vals, u=u_vals)
                loss_v = loss_fns[v](pred_v, batch[v])
                total_loss += loss_v

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            epoch_loss += total_loss.item()
            num_batches += 1

        # Average over the batches actually seen rather than len(dataloader):
        # an empty loader then reports NaN instead of raising
        # ZeroDivisionError, and loaders without __len__ work too.
        avg_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model
#=====

# Variable order used for the datasets: X, then Z, then W, then Y.
variables = [X, *Z, *W, Y]

train_set = NCMDataset(train_df, variables)
test_set = NCMDataset(test_df, variables)

train_dataloader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_set, batch_size=32, shuffle=True)

# Binary cross-entropy for the variables listed here, mean squared error for
# every other variable.
binary_vars = [X, 'Family History of Mental Illness', 'Have you ever had suicidal thoughts ?', Y]
loss_fns = {}
for v in variables:
    loss_fns[v] = nn.BCELoss() if v in binary_vars else nn.MSELoss()

# Seed set immediately before training; the optimizer is created after it.
torch.manual_seed(0)
optimizer = optim.Adam(ncm.parameters(), lr=1e-3)

# Train for a few epochs for quick testing
num_epochs = 3
trained_ncm = train_ncm(
    model=ncm,
    dataloader=train_dataloader,
    loss_fns=loss_fns,
    optimizer=optimizer,
    device='cpu',
    num_epochs=num_epochs,
)

# Compute accuracy on Y
def compute_accuracy(model, dataloader, device, target_var):
    """Return the fraction of rows whose thresholded prediction equals the label.

    The prediction for ``target_var`` comes from ``model.f[target_var]``, fed
    with the observed values of ``model.cg.pa[target_var]`` and a fresh noise
    sample from ``model.pu``. Because noise is re-sampled on every call, two
    calls can return slightly different values unless the RNG is seeded.

    Args:
        model: object exposing ``eval``, ``f``, ``cg.pa``, ``cg.v2c2`` and
            ``pu.sample(n=..., device=...)``. It is switched to eval mode and
            is not switched back.
        dataloader: iterable of dict batches of tensors keyed by variable name.
        device: device every batch tensor is moved to.
        target_var: name of the node to score; predictions above 0.5 count
            as class 1.

    Returns:
        ``correct / total`` as a float, or ``nan`` when ``dataloader`` yields
        no label elements (instead of raising ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = next(iter(batch.values())).shape[0]
            u = model.pu.sample(n=batch_size, device=device)
            preds = model.f[target_var](
                pa={k: batch[k] for k in model.cg.pa[target_var]},
                u={k: u[k] for k in model.cg.v2c2[target_var]}
            )
            labels = batch[target_var]

            # Align e.g. (B, 1) predictions with (B,) labels. Without this,
            # `==` would broadcast to a (B, B) grid and over-count matches.
            if preds.shape != labels.shape and preds.numel() == labels.numel():
                preds = preds.reshape(labels.shape)

            pred_labels = (preds > 0.5).float()
            correct += (pred_labels == labels).sum().item()
            total += labels.numel()

    if total == 0:
        return float("nan")
    return correct / total

# Score the outcome node on both splits (train first, then test), then report.
# compute_accuracy draws fresh noise on each call, so the figures can vary
# slightly from run to run.
train_acc = compute_accuracy(
    model=trained_ncm, dataloader=train_dataloader, device='cpu', target_var=Y
)
test_acc = compute_accuracy(
    model=trained_ncm, dataloader=test_dataloader, device='cpu', target_var=Y
)

print(f'Final train accuracy for {Y}: {train_acc:.4f}')
print(f'Final test accuracy  for {Y}: {test_acc:.4f}')


Epoch 1/3, Loss: 18.1659
Epoch 2/3, Loss: 18.0070
Epoch 3/3, Loss: 17.9774
Final train accuracy for Depression: 0.8545
Final test accuracy  for Depression: 0.8430
