In [1]:
import pandas as pd
from src.causal_graph import *
from src.scm.feedforward_ncm import FF_NCM
from src.scm.distribution import *
from torch.utils.data import DataLoader
from torch.utils.data._utils.collate import default_collate
from src.data import NCMDataset

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/df_exam.csv')
df.head(10)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,1,1,3,1,0,0,1,1,4.0,0,0,87,93,91
1,0,2,1,1,0,3,1,1,0.0,0,1,76,78,75
2,1,1,1,1,0,3,2,1,1.0,0,1,73,84,79
3,1,1,1,1,1,2,0,0,1.0,1,1,85,93,89
4,0,1,1,0,0,3,1,1,1.0,1,2,41,43,39
5,0,3,0,0,1,0,1,0,3.0,1,2,65,64,68
6,0,3,1,1,0,1,1,1,1.0,0,1,40,52,43
7,1,1,0,1,0,3,2,0,1.0,1,1,66,82,74
8,0,0,1,1,1,0,1,1,1.0,1,2,80,73,71
9,1,0,3,1,0,1,1,1,2.0,1,0,48,53,58


# Causal Model

In [2]:
df.columns

Index(['Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep',
       'ParentMaritalStatus', 'PracticeSport', 'IsFirstChild', 'NrSiblings',
       'TransportMeans', 'WklyStudyHours', 'MathScore', 'ReadingScore',
       'WritingScore'],
      dtype='object')

In [5]:
X = 'Gender'

Z = [
    'EthnicGroup',
    'ParentMaritalStatus',
    'IsFirstChild', 
    'NrSiblings',
    'TransportMeans',
]

W = [
    'ParentEduc',
    'LunchType',
    'TestPrep',
    'PracticeSport',
    'WklyStudyHours',
    'ReadingScore',
    'WritingScore'
]
Y = 'MathScore'

In [6]:
standard_fairness_model = create_expanded_sfm(X, Z, W, Y)
ncm = FF_NCM(standard_fairness_model)

In [7]:
def train_ncm(model, dataloader, loss_fns, optimizer, device, num_epochs=10):
    model.to(device)
    model.train()

    for epoch in range(1, num_epochs+1):
        epoch_loss = 0.0
        for batch in dataloader:
            # if DataLoader gives you back a list of samples, collate it
            if isinstance(batch, list):
                batch = default_collate(batch)
                # batch = torch.Tensor(batch)

            # now batch is a dict of batched tensors
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = next(iter(batch.values())).shape[0]

            # sample exogenous noise
            u = model.pu.sample(n=batch_size, device=device)

            # accumulate loss over all nodes
            total_loss = 0.0
            for v in model.v:
                pa_keys = model.cg.pa[v]
                pa_vals = {k: batch[k] for k in pa_keys}

                u_keys = model.cg.v2c2[v]
                u_vals = {k: u[k] for k in u_keys}

                pred_v = model.f[v](pa=pa_vals, u=u_vals)
                loss_v = loss_fns[v](pred_v, batch[v])
                total_loss += loss_v

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            epoch_loss += total_loss.item()

        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model

In [8]:
x = df.drop('MathScore', axis=1)
y = df['MathScore']

X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training data shape:", X_train.shape, y_train.shape)
print("Validation data shape:", X_val.shape, y_val.shape)
print("Test data shape:", X_test.shape, y_test.shape)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Training data shape: (11545, 13) (11545,)
Validation data shape: (3849, 13) (3849,)
Test data shape: (3849, 13) (3849,)


In [14]:
y_train = y_train.apply(lambda x: 1 if x > 52 else 0)
y_val = y_val.apply(lambda x: 1 if x > 52 else 0)
y_test = y_test.apply(lambda x: 1 if x > 52 else 0)

In [15]:
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
train_df['MathScore'] = y_train.values
val_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
val_df['MathScore'] = y_val.values
test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
test_df['MathScore'] = y_test.values

variables = list(X_train.columns) + ['MathScore']

In [16]:
binary_cols = ['Gender', 'LunchType', 'TestPrep', 'IsFirstChild']

for col in binary_cols:
    train_df[col] = train_df[col].apply(lambda x: 0 if x <= 0 else 1)
    test_df[col] = test_df[col].apply(lambda x: 0 if x <= 0 else 1)
    val_df[col] = val_df[col].apply(lambda x: 0 if x <= 0 else 1)

In [17]:
train_df.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,ReadingScore,WritingScore,MathScore
0,1,-1.034705,-0.999477,0,0,0.823963,-0.335256,0,0.600533,-0.849768,0.160755,-0.10917,-0.170846,0
1,0,1.591101,0.141864,0,0,0.823963,1.166696,1,-1.47692,-0.849768,-1.348116,-0.582762,-0.429507,1
2,0,-1.034705,0.141864,1,1,-0.717546,1.166696,0,0.600533,1.176791,-1.348116,-0.041514,-0.041516,1
3,1,-0.159436,-0.999477,1,0,-0.717546,1.166696,0,-0.784435,1.176791,0.160755,-0.312138,0.087815,1
4,1,-1.034705,-0.999477,0,0,0.823963,-0.335256,1,-0.784435,1.176791,0.160755,-3.086034,-2.951453,0


In [18]:
# def collate1(batch_list):
#     inputs = torch.stack([item[0] for item in batch_list], dim=0)
#     labels = torch.stack([item[1] for item in batch_list], dim=0)
#     return {'inputs': inputs, 'labels': labels}

from src.data import NCMDataset
def train_ncm(model, dataloader, loss_fns, optimizer, device, num_epochs=10):
    model.to(device)
    model.train()

    for epoch in range(1, num_epochs+1):
        epoch_loss = 0.0
        for batch in dataloader:
            # if DataLoader gives you back a list of samples, collate it
            # if isinstance(batch, list):
            #     # batch = default_collate(batch)
            #     batch = collate1(batch)

            # now batch is a dict of batched tensors
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = next(iter(batch.values())).shape[0]

            # sample exogenous noise
            u = model.pu.sample(n=batch_size, device=device)

            # accumulate loss over all nodes
            total_loss = 0.0
            for v in model.v:
                pa_keys = model.cg.pa[v]
                pa_vals = {k: batch[k] for k in pa_keys}

                u_keys = model.cg.v2c2[v]
                u_vals = {k: u[k] for k in u_keys}

                pred_v = model.f[v](pa=pa_vals, u=u_vals)
                loss_v = loss_fns[v](pred_v, batch[v])
                total_loss += loss_v

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            epoch_loss += total_loss.item()

        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model
#=====

variables = [X] + Z + W + [Y]

train_set = NCMDataset(train_df, variables)
test_set = NCMDataset(test_df, variables)

train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=32, shuffle=True)

loss_fns = {}
binary_vars = [X, 'Family History of Mental Illness', 'Have you ever had suicidal thoughts ?', Y]
for v in variables:
    if v in binary_vars:
        loss_fns[v] = nn.BCELoss()
    else:
        loss_fns[v] = nn.MSELoss()

torch.manual_seed(0)
optimizer = optim.Adam(ncm.parameters(), lr=1e-3)

# Train for a few epochs for quick testing
num_epochs = 3
trained_ncm = train_ncm(ncm, train_dataloader, loss_fns, optimizer, 'cpu', num_epochs)

# Compute accuracy on Y
def compute_accuracy(model, dataloader, device, target_var):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size = next(iter(batch.values())).shape[0]
            u = model.pu.sample(n=batch_size, device=device)
            preds = model.f[target_var](
                pa={k: batch[k] for k in model.cg.pa[target_var]},
                u={k: u[k] for k in model.cg.v2c2[target_var]}
            )
            labels = batch[target_var]
            pred_labels = (preds > 0.5).float()
            correct += (pred_labels == labels).sum().item()
            total += labels.numel()
    return correct / total

train_acc = compute_accuracy(trained_ncm, train_dataloader, 'cpu', Y)
print(f'Final train accuracy for {Y}: {train_acc:.4f}')

test_acc = compute_accuracy(trained_ncm, test_dataloader, 'cpu', Y)
print(f'Final test accuracy  for {Y}: {test_acc:.4f}')


Epoch 1/3, Loss: 10.5795
Epoch 2/3, Loss: 10.4721
Epoch 3/3, Loss: 10.4593
Final train accuracy for MathScore: 0.9267
Final test accuracy  for MathScore: 0.9244
