In [1]:
import pandas as pd
from src.causal_graph import *
from src.scm.feedforward_ncm import FF_NCM
from src.scm.distribution import *
from torch.utils.data import DataLoader
from torch.utils.data._utils.collate import default_collate
from src.data import NCMDataset,tt_split
from src.queries import *
import numpy as np

from src.training.train import train_ncm, print_accuracy

import torch
import torch.nn as nn
import torch.optim as optim

df = pd.read_csv('data/df_dep.csv')
df.head(10)

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,degree_level,degree_cluster,degree_emb_pca_1,degree_emb_pca_2,degree_emb_pca_3,degree_emb_pca_4,degree_emb_pca_5
0,0,33,5.0,0.0,8.97,2,1,1.0,1,3,1,0,1,1,4,0.253879,0.243325,0.343663,-0.337468,-0.045405
1,0,31,3.0,0.0,7.03,5,0,1.0,0,9,1,1,0,1,1,0.221702,0.239069,0.015781,0.184283,0.126231
2,0,29,2.0,0.0,5.7,3,0,1.0,0,4,1,0,0,3,2,0.061047,-0.276469,0.533775,-0.294401,0.217425
3,0,30,3.0,0.0,9.54,4,1,1.0,0,1,2,0,0,1,4,0.235426,0.208007,0.092394,0.084074,0.243518
4,1,30,2.0,0.0,8.04,4,0,0.0,0,0,1,1,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
5,0,33,3.0,0.0,7.03,4,0,1.0,1,10,2,1,0,1,4,0.268271,0.238251,-0.039036,0.147155,0.202336
6,1,19,2.0,0.0,8.52,4,0,0.0,0,6,2,1,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
7,0,25,5.0,0.0,6.51,2,0,0.0,1,2,5,1,1,2,2,-0.032905,-0.350849,0.032525,0.216158,0.021605
8,1,20,5.0,0.0,7.25,3,1,1.0,1,10,3,0,1,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673
9,0,19,2.0,0.0,7.83,2,1,0.0,0,6,3,0,0,0,0,-0.69218,0.08292,-0.029387,-0.020745,-0.018673


# Causal Model

In [2]:
# Create model =============
X = 'Gender'

Z = [
    'Age',
    'Sleep Duration',
    'Family History of Mental Illness'
]

W = [
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Dietary Habits',
    'Have you ever had suicidal thoughts ?',
    'Work/Study Hours',
    'Financial Stress',
    'degree_level',
    'degree_cluster',
    'degree_emb_pca_1',
    'degree_emb_pca_2',
    'degree_emb_pca_3',
    'degree_emb_pca_4',
    'degree_emb_pca_5'
]
Y = 'Depression'
variables = [X] + Z + W + [Y]
binary_cols = ['Gender', 'Family History of Mental Illness', 'Have you ever had suicidal thoughts ?',Y]

ncm = FF_NCM(create_expanded_sfm_graph(X, Z, W, Y), discrete_vals=binary_cols)

In [3]:
# Process data =============
train_df, test_df = tt_split(df, binary_cols)

train_dataloader = NCMDataset(train_df, variables).get_dataloader()
test_dataloader = NCMDataset(test_df, variables).get_dataloader()

train_df.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,degree_level,degree_cluster,degree_emb_pca_1,degree_emb_pca_2,degree_emb_pca_3,degree_emb_pca_4,degree_emb_pca_5
0,0,1.080453,-0.847551,-0.009632,0.998851,1.518994,-1.56537,1.157674,1,-0.30986,-1.505205,1,0,-1.412209,-1.457237,-1.828773,0.312362,-0.125115,-0.106898,-0.093296
1,1,-0.148849,-0.123239,-0.009632,-0.270247,1.518994,0.638826,-0.863801,1,1.305217,0.582904,1,1,-0.16028,1.136053,0.722132,0.853257,-0.16993,0.808998,1.123598
2,0,1.285336,1.325387,-0.009632,0.555349,0.779295,0.638826,-0.863801,1,-0.848219,-1.505205,1,0,1.091648,-0.160592,0.41664,-1.33356,-0.364663,1.161359,0.505003
3,1,1.080453,1.325387,-0.009632,-1.55299,0.779295,-1.56537,-0.863801,0,1.305217,1.27894,0,1,1.091648,-0.160592,0.373413,-1.413536,-0.276702,0.04971,-1.908243
4,1,0.465802,1.325387,-0.009632,-0.181546,-1.439801,0.638826,-0.863801,1,0.228499,-0.113132,1,1,1.091648,-0.160592,0.23003,-1.748632,0.36276,0.806814,0.963826


In [4]:
# Train =============
torch.manual_seed(0)

# Train for a few epochs for quick testing
optimizer = optim.Adam(ncm.parameters(), lr=1e-3)
_ = train_ncm(ncm, train_dataloader, optimizer, 'cpu', num_epochs=3)

# print_accuracy(X, ncm, train_dataloader, test_dataloader)
# print_accuracy(Y, ncm, train_dataloader, test_dataloader)

Epoch 1/3, Loss: 0.0832
Epoch 2/3, Loss: 0.0663
Epoch 3/3, Loss: 0.0660


In [5]:
# for w in W: print_accuracy(w, ncm, train_dataloader, test_dataloader)

In [6]:
# for z in Z: print_accuracy(z, ncm, train_dataloader, test_dataloader)

In [7]:
# check that it's not random guessing X
sample = ncm(n=10,select={X})[X].tolist()
print([round(sample[i][0]) for i in range(len(sample))])

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


# SHIT EXPERIMENT CODE ============

In [8]:
train_df[Y].value_counts(normalize=True)


Depression
1    0.600427
0    0.399573
Name: proportion, dtype: float64

In [9]:
train_df.groupby(X)[Y].value_counts(normalize=True).unstack()


Depression,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.397849,0.602151
1,0.401853,0.598147


In [10]:
train_df.groupby(X)[Y].value_counts(normalize=True).unstack()

Depression,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.397849,0.602151
1,0.401853,0.598147


In [11]:
U = general_queries(ncm,{X},W,Y)
_ = exp_se(ncm,{X},Y, u=U)

TE (Depression=0) 	= -0.0024
NDE (Depression=0) 	= 0.0361
NIE (Depression=0) 	= 0.0385

TE (Depression=1) 	= 0.0024
NDE (Depression=1) 	= -0.0361
NIE (Depression=1) 	= -0.0385

Exp-SE_{'Gender': 1}(Depression=0) 	= -0.0171
Exp-SE_{'Gender': 1}(Depression=1) 	= 0.0171


In [13]:
x_specific_queries(ncm, {X}, W, Y, u=U, xvals={X:0})
_ = x_specific_queries(ncm, {X}, W, Y, u=U)

x-TE(Depression=0 | {'Gender': 0}) 	= -0.0013
x-DE(Depression=0 | {'Gender': 0}) 	= 0.0375
x-IE(Depression=0 | {'Gender': 0}) 	= 0.0388
x-SE(Depression=0) 			= -0.0338

x-TE(Depression=1 | {'Gender': 0}) 	= 0.0013
x-DE(Depression=1 | {'Gender': 0}) 	= -0.0375
x-IE(Depression=1 | {'Gender': 0}) 	= -0.0388
x-SE(Depression=1) 			= 0.0338

x-TE(Depression=0 | {'Gender': 1}) 	= -0.0036
x-DE(Depression=0 | {'Gender': 1}) 	= 0.0345
x-IE(Depression=0 | {'Gender': 1}) 	= 0.0381
x-SE(Depression=0) 			= -0.0338

x-TE(Depression=1 | {'Gender': 1}) 	= 0.0036
x-DE(Depression=1 | {'Gender': 1}) 	= -0.0345
x-IE(Depression=1 | {'Gender': 1}) 	= -0.0381
x-SE(Depression=1) 			= 0.0338



In [14]:
z1 = {'Family History of Mental Illness':1}
z_specific_queries(ncm, {X}, W, Y, z1, u=U)

z0 = {'Family History of Mental Illness':0}
z_specific_queries(ncm, {X}, W, Y, z0, u=U)
pass

z-TE(Depression=0 | {'Family History of Mental Illness': 1}) 	= 0.0497
z-DE(Depression=0 | {'Family History of Mental Illness': 1}) 	= 0.0619
z-IE(Depression=0 | {'Family History of Mental Illness': 1}) 	= 0.0122
z-SE_{'Gender': 1}(Depression=0) 			= -0.0260

z-TE(Depression=1 | {'Family History of Mental Illness': 1}) 	= -0.0497
z-DE(Depression=1 | {'Family History of Mental Illness': 1}) 	= -0.0619
z-IE(Depression=1 | {'Family History of Mental Illness': 1}) 	= -0.0122
z-SE_{'Gender': 1}(Depression=1) 			= 0.0260

z-TE(Depression=0 | {'Family History of Mental Illness': 0}) 	= -0.0269
z-DE(Depression=0 | {'Family History of Mental Illness': 0}) 	= 0.0240
z-IE(Depression=0 | {'Family History of Mental Illness': 0}) 	= 0.0509
z-SE_{'Gender': 1}(Depression=0) 			= -0.0186

z-TE(Depression=1 | {'Family History of Mental Illness': 0}) 	= 0.0269
z-DE(Depression=1 | {'Family History of Mental Illness': 0}) 	= -0.0240
z-IE(Depression=1 | {'Family History of Mental Illness': 0}) 	= -0.0509
z-

In [15]:
xz_specific_queries(ncm, {X}, W, Y, z0, u=U, xvals={X:0})
_ = xz_specific_queries(ncm, {X}, W, Y, z0, u=U)

xz-TE(Depression=0 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= -0.0235
xz-DE(Depression=0 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= 0.0275
xz-IE(Depression=0 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= 0.0510
xz-SE(Depression=0) 			= -0.0295

xz-TE(Depression=1 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= 0.0235
xz-DE(Depression=1 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= -0.0275
xz-IE(Depression=1 | ({'Gender': 0}, {'Family History of Mental Illness': 0})) 	= -0.0510
xz-SE(Depression=1) 			= 0.0295

xz-TE(Depression=0 | ({'Gender': 1}, {'Family History of Mental Illness': 0})) 	= -0.0310
xz-DE(Depression=0 | ({'Gender': 1}, {'Family History of Mental Illness': 0})) 	= 0.0197
xz-IE(Depression=0 | ({'Gender': 1}, {'Family History of Mental Illness': 0})) 	= 0.0507
xz-SE(Depression=0) 			= -0.0370

xz-TE(Depression=1 | ({'Gender': 1}, {'Family History of Mental Illness': 0})) 	= 0.0310
xz-DE

In [17]:
x_sym(ncm, {X}, W, Y, u=U)
pass

x-DEsym(Depression=0 | None) 	= 0.0351
x-IEsym(Depression=0 | None) 	= 0.0375
x-SE(Depression=0) 			= -0.0338

x-DEsym(Depression=1 | None) 	= -0.0351
x-IEsym(Depression=1 | None) 	= -0.0375
x-SE(Depression=1) 			= 0.0338

