In [1]:
import os
import pickle
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import torch
from torch import nn
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Folder of the project checkout; the path below starts at the home-directory shorthand '~'.
base_path = 'Fair-ML-Causal-Inference'
path_parts = ('~', base_path, 'data', 'law_data.csv')
law_path = os.path.join(*path_parts)

# Read the raw law-school table.
data = pd.read_csv(law_path)

# Preview the first rows.
data.head()

Unnamed: 0.1,Unnamed: 0,race,sex,LSAT,UGPA,region_first,ZFYA,sander_index,first_pf
0,0,White,1,39.0,3.1,GL,-0.98,0.782738,1.0
1,1,White,1,36.0,3.0,GL,0.09,0.735714,1.0
2,2,White,2,30.0,3.1,MS,-0.35,0.670238,1.0
3,5,Hispanic,2,39.0,2.2,NE,0.58,0.697024,1.0
4,6,White,1,37.0,3.4,GL,-1.26,0.78631,1.0


In [31]:
# Show how many rows fall in each region before filtering.
region_counts = data['region_first'].value_counts()
display(region_counts)

# Keep every row whose region is not 'PO'.
data = data[data['region_first'].ne('PO')]

Prepping Data

In [32]:
# Columns used downstream: race/sex (one-hot encoded later), LSAT/UGPA, and ZFYA (the regression target).
cols_keep = ['race', 'sex', 'LSAT', 'UGPA', 'ZFYA']
# .copy() gives an independent frame, so later column assignments on law_data
# do not write into (or warn about) a slice of `data`.
law_data = data[cols_keep].copy()
law_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21790 entries, 0 to 21790
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   race    21790 non-null  object 
 1   sex     21790 non-null  int64  
 2   LSAT    21790 non-null  float64
 3   UGPA    21790 non-null  float64
 4   ZFYA    21790 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1021.4+ KB


In [33]:
# visualize the distributions for our OHE data
display(law_data['race'].value_counts(), law_data['sex'].value_counts())

# convert sex to category labels
# assign() builds a new frame with a string column instead of writing strings
# in place into the int64 'sex' column of a frame that was sliced from `data`.
# NOTE(review): the mapping 1 -> 'Female', anything else -> 'Male' is taken as-is; confirm against the data dictionary.
law_data = law_data.assign(sex=np.where(law_data['sex'] == 1, 'Female', 'Male'))

race
White          18284
Black           1282
Asian            845
Hispanic         488
Mexican          389
Other            293
Puertorican      110
Amerindian        99
Name: count, dtype: int64

sex
2    12253
1     9537
Name: count, dtype: int64

One-Hot Encoding (OHE)

In [34]:
# split the data first to avoid data leakage
# NOTE: `train` is also the name of the training function defined further down in this
# notebook; re-running this cell after that definition rebinds `train` to this DataFrame,
# so the function definition has to be re-run before training again.
train, test = train_test_split(law_data, train_size=0.8, random_state=256)

# explicit categories and their unique values (taken from the full frame so that
# both splits are encoded with the same set and order of columns)
categories = [('sex', list(law_data['sex'].unique())),
              ('race', list(law_data['race'].unique()))]

ohe_columns = [name for name, _ in categories]
ohe_categories = [values for _, values in categories]

# initialize OHE; newer scikit-learn spells the dense-output flag `sparse_output`,
# older releases only accept `sparse`
try:
    enc = OneHotEncoder(sparse_output=False, categories=ohe_categories)
except TypeError:
    enc = OneHotEncoder(sparse=False, categories=ohe_categories)

# fit on the training split only; the test split is transformed with the fitted encoder
enc.fit(train[ohe_columns])


def _one_hot_frame(frame):
    """Return `frame` with `ohe_columns` replaced by their one-hot indicator columns.

    The encoder must already be fitted. The result has a fresh 0..n-1 index and the
    indicator columns are named by the bare category value (e.g. 'sex_Female' -> 'Female').
    """
    encoded = pd.DataFrame(
        enc.transform(frame[ohe_columns]),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    # concatenate transformed cols with non transformed
    combined = pd.concat([frame.drop(ohe_columns, axis=1), encoded], axis=1).reset_index(drop=True)
    # strip the '<column>_' prefix that the encoder adds to each category name
    combined.columns = [col.split('_')[1] if '_' in col else col for col in combined.columns]
    return combined


train_trans = _one_hot_frame(train)
test_trans = _one_hot_frame(test)




In [35]:
# number of rows in each split
n_train = train_trans.shape[0]
n_test = test_trans.shape[0]

# round LSAT to the nearest whole number in both splits
train_trans['LSAT'] = train_trans['LSAT'].round()
test_trans['LSAT'] = test_trans['LSAT'].round()

# Features are every column except the target 'ZFYA'.
# The target is passed as a NumPy array (.values) rather than a Series: torch.tensor
# otherwise reads the Series element by element through label lookups, which is slow
# and only works while the index is exactly 0..n-1.
X_train = torch.tensor(train_trans.drop(['ZFYA'], axis=1).values, dtype=torch.float32)
y_train = torch.tensor(train_trans['ZFYA'].values, dtype=torch.float32)
X_test = torch.tensor(test_trans.drop(['ZFYA'], axis=1).values, dtype=torch.float32)
y_test = torch.tensor(test_trans['ZFYA'].values, dtype=torch.float32)

In [36]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a DataFrame whose target column is 'ZFYA'.

    All columns other than 'ZFYA' are the features. Both parts are converted to
    float32 tensors once at construction, so item access is a cheap tensor index
    instead of dropping a column from the whole frame on every call.
    Changes made to the DataFrame after construction are not reflected in the items.
    """

    def __init__(self, dataframe):
        """Store `dataframe` and cache its feature matrix and target vector as tensors."""
        self.dataframe = dataframe
        self._features = torch.tensor(dataframe.drop(['ZFYA'], axis=1).values, dtype=torch.float32)
        self._targets = torch.tensor(dataframe['ZFYA'].values, dtype=torch.float32)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Return (features, target) for the row at position `idx`.

        Indexing is positional, so it does not depend on the DataFrame's index labels.
        """
        x = self._features[idx]
        y = self._targets[idx]
        return x, y

Full Model

In [48]:
# Plain linear regression expressed as a torch module (subclass of nn.Module).
class LinearRegressionModel(nn.Module):
    """One fully connected layer mapping `input_size` features to `output_size` outputs."""

    def __init__(self, input_size, output_size):
        # the parent initializer has to run before any sub-module is registered
        super().__init__()

        # the single fully connected layer; its attribute name is part of the state_dict keys
        self.fc1 = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to `x` and return its output."""
        return self.fc1(x)
    
# evaluate model performance
def evaluate(model, X_train, y_test):
    """Return the root-mean-squared error of `model` on the given features and targets.

    Args:
        model: torch module producing one prediction per input row.
        X_train: feature tensor to predict on (despite the name, any feature
            tensor works; the training loop passes its held-out features).
        y_test: target tensor with one value per row of `X_train`.

    Returns:
        The RMSE as a NumPy float.

    The model is put in evaluation mode for the forward pass and its previous
    train/eval mode is restored afterwards. Inputs are moved to the device of
    the model's parameters, if it has any.
    """
    # run on whatever device the model lives on so CPU tensors work with a GPU model
    first_param = next(model.parameters(), None)
    if first_param is not None:
        X_train = X_train.to(first_param.device)
        y_test = y_test.to(first_param.device)

    was_training = model.training
    model.eval()  # layers such as dropout behave deterministically in eval mode
    try:
        # Make predictions
        with torch.no_grad():  # disable gradient computation
            # squeeze only the trailing dim so a single-row input stays 1-D like its target
            predictions = model(X_train).squeeze(-1)
    finally:
        model.train(was_training)

    # Calculate RMSE
    mse = torch.nn.functional.mse_loss(predictions, y_test)
    rmse = np.sqrt(mse.item())

    return rmse

def train(network, train_dataset, test_dataset, file_name_model, n_epochs=10, batch_size = 25):
    """Fit `network` with Adam on an MSE loss and checkpoint the best epoch.

    Args:
        network: torch module to optimise; it is moved to CUDA when available.
        train_dataset: DataFrame with feature columns plus the target column 'ZFYA'.
        test_dataset: DataFrame with the same columns; it is used to compute the
            per-epoch validation RMSE that drives checkpoint selection.
        file_name_model: checkpoint file name without extension; the state dict
            is written to ``file_name_model + '.pt'`` whenever the validation
            score improves.
        n_epochs: number of passes over the training data.
        batch_size: mini-batch size for the training loader.

    Returns:
        (validation_scores, train_losses): the validation RMSE and the mean
        training MSE, one entry per epoch.

    Raises:
        AssertionError: if `file_name_model` is not a string.
    """
    assert isinstance(file_name_model, str), "The filename is not a string"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_loader = torch.utils.data.DataLoader(Dataset(train_dataset), batch_size = batch_size, shuffle=True)

    # held-out tensors live on the same device as the network
    X_test = torch.tensor(test_dataset.drop(['ZFYA'], axis=1).values, dtype=torch.float32).to(device)
    y_test = torch.tensor(test_dataset['ZFYA'].values, dtype=torch.float32).to(device)

    # Move network to GPU if available
    network = network.to(device)

    # define optimizer and loss (built once, not once per batch)
    optimizer = torch.optim.Adam(network.parameters())
    criterion = nn.MSELoss()

    # best validation score initialization
    validation_score_best = float('inf')
    train_losses = []
    validation_scores = []

    # train loop
    for epoch in range(n_epochs):
        network.train()
        epoch_loss = 0.0
        for batch in tqdm(data_loader, leave=False):
            # unpack batch and move it to the network's device
            X, y = batch
            X, y = X.to(device), y.to(device)

            # zero parameter gradients
            optimizer.zero_grad()

            # forward pass; drop only the trailing size-1 dim so the output
            # has shape [batch] like y, even for a batch of one row
            output = network(X).squeeze(-1)

            # calculate loss
            loss = criterion(output, y)
            epoch_loss += loss.item()

            # backward pass and optimize
            loss.backward()
            optimizer.step() # update model parameters

        avg_epoch_loss = epoch_loss / len(data_loader)  # Average loss per epoch
        train_losses.append(avg_epoch_loss)  # Append average epoch loss

        network.eval() # evaluation mode
        validation_score = evaluate(network, X_test, y_test)
        validation_scores.append(validation_score)
        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}, validation score: {validation_score}')
        network.train() # back to train mode

        # keep the weights of the best-scoring epoch on disk
        if validation_score < validation_score_best:
            validation_score_best = validation_score
            torch.save(network.state_dict(), file_name_model+'.pt')

    print(f'Best validation score:{validation_score_best}')
    return validation_scores, train_losses

In [49]:
# create and train model
# Seed torch so the weight initialisation and the shuffled batch order (and therefore
# the reported scores) are the same on every run.
torch.manual_seed(256)
# every column except the target 'ZFYA' is an input feature
full_model = LinearRegressionModel(train_trans.shape[1]-1, 1)
full_validation_scores, full_train_losses = train(full_model, train_trans, test_trans, 'full_model', n_epochs=15, batch_size=20)

  0%|          | 0/872 [00:00<?, ?it/s]

                                                 

Epoch 1, validation score: 0.9341887489372261


                                                 

Epoch 6, validation score: 0.8754007579756609


                                                 

Epoch 11, validation score: 0.8718918706430719


                                                 

Best validation score:0.8716028547604374




Unaware Model

In [50]:
# one-hot indicator columns derived from 'sex' and 'race'
protected_attributes = ['Female', 'Male', 'White', 'Hispanic', 'Asian', 'Black', 'Other', 'Mexican', 'Puertorican', 'Amerindian']

# drop the protected indicators so the model only sees the remaining features
train_unaware = train_trans.drop(protected_attributes, axis=1)
test_unaware = test_trans.drop(protected_attributes, axis=1)

# Seed torch so the weight initialisation and the shuffled batch order (and therefore
# the reported scores) are the same on every run.
torch.manual_seed(256)
unaware_model = LinearRegressionModel(train_unaware.shape[1]-1, 1)
unaware_validation_scores, unaware_train_losses = train(unaware_model, train_unaware, test_unaware, 'unaware_model', n_epochs=15, batch_size=20)

                                                 

Epoch 1, validation score: 0.9424677518999135


                                                 

Epoch 6, validation score: 0.9147393092896225


                                                 

Epoch 11, validation score: 0.9047014554912369


                                                 

Best validation score:0.9021603947086761




Unfair K