In [257]:
## import the required package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [258]:
## Read the file
df = pd.read_csv("../data/compas-scores-two-years.csv")

## Filter the data
df = df[(df["race"] == "African-American") | (df["race"] == "Caucasian")]

## Change the race entry
df["race"] = np.where(df["race"] == "African-American", 0, 1)

## We drop attributes that is clearly independent with the two_year_recid, ie. name, id, r_case_number, c_case_number,
## as well as the attribute with all NaN value
## as well as the duplicate columns
df.drop(["id", "name", "first", "last", "r_case_number", "c_case_number",
         "violent_recid",
         "decile_score.1", "priors_count.1"],
        axis = 1, inplace=True)

In [316]:
## We drop the column with the number of NaN value exceed 1000
df.drop(["vr_charge_desc", "vr_offense_date",
        "vr_charge_degree", "vr_case_number", "c_arrest_date",
        "c_arrest_date", "r_jail_out", "r_jail_in", "r_days_from_arrest",
        "r_charge_desc", "r_charge_degree", "r_offense_date"],
        axis = 1, inplace=True)

## Remove the rows with NaN
df.dropna(inplace=True)

## we also drop the columns represent the date
df.drop(['compas_screening_date', 'dob', 'c_jail_in', 'c_jail_out',
         'c_offense_date', 'screening_date', 'v_screening_date',
         'in_custody', 'out_custody', 'days_b_screening_arrest'],
        axis = 1, inplace=True)

## We drop the somehow repeated attribute, age_cat, score_text, v_score_text
df.drop(['age_cat', 'score_text', 'v_score_text', 'is_recid', "v_decile_score"],
        axis = 1, inplace=True)

## Set sex into 0 or 1, c_charge_degree into 0 or 1
df["sex"] = np.where(df["sex"] == "Male", 0, 1)
df["c_charge_degree"] = np.where(df["c_charge_degree"] == "M", 0, 1)

## Drop the column with only 1 input or too many category
df.drop(['type_of_assessment', 'v_type_of_assessment', 'c_charge_desc'],
        axis = 1, inplace=True)

KeyError: "['vr_charge_desc', 'vr_offense_date', 'vr_charge_degree', 'vr_case_number', 'c_arrest_date', 'c_arrest_date', 'r_jail_out', 'r_jail_in', 'r_days_from_arrest', 'r_charge_desc', 'r_charge_degree', 'r_offense_date'] not found in axis"

In [317]:
df.head(5)

Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_days_from_compas,c_charge_degree,is_violent_recid,start,end,event,two_year_recid
1,0,34,0,0,3,0,0,0,1.0,1,1,9,159,1,1
2,0,24,0,0,4,0,1,4,1.0,1,0,0,63,0,1
6,0,41,1,0,6,0,0,14,1.0,1,0,5,40,1,1
8,1,39,1,0,1,0,0,0,1.0,0,0,2,747,0,0
9,0,21,1,0,3,0,0,1,308.0,1,1,0,428,1,1


In [329]:
from sklearn.linear_model import LogisticRegression
scaler = StandardScaler()
X, y = df.drop("two_year_recid", axis = 1, inplace = False), df.two_year_recid
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=5243)


baseline = LogisticRegression(random_state=5243,max_iter=1000)

baseline.fit(X_train, y_train)
preds = baseline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

preds_all = baseline.predict(X_test)
accuracy = sum(y_test == preds_all)/len(y_test)
print("Accuracy: %f" % (accuracy))

## Calculate D_{all}
def D_all_func(data = df):
    P_y1r1 = len(data[(data["two_year_recid"] == 1) & (data["race"] == 1)])/len(data[data['race'] == 1])
    P_y1r0 = len(data[(data["two_year_recid"] == 1) & (data["race"] == 0)])/len(data[data['race'] == 0])
    D_all = P_y1r0-P_y1r1
    return(D_all)

D_all_base = D_all_func()
print("discriminatioin",D_all_base)

RMSE: 0.219308
Accuracy: 0.951904
discriminatioin 0.13598888652499458


In [330]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pandas as pd 
import re
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
#1
df_1 = df.loc[df['two_year_recid']==1]
X_1, y_1 = df_1.drop("two_year_recid", axis = 1, inplace = False), df_1.two_year_recid
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=5243)

x_1 = t.from_numpy(np.array(X_train_1)).to(t.float32)
y_1 = t.from_numpy(np.array(y_train_1).astype('float32')).reshape(x_1.shape[0],1)

x_test_1 = t.from_numpy(np.array(X_test_1)).to(t.float32)
y_test_1 = t.from_numpy(np.array(y_test_1).astype('float32')).reshape(x_test_1.shape[0],1)

#0
df_0 = df.loc[df['two_year_recid']==0]
X_0, y_0 = df_0.drop("two_year_recid", axis = 1, inplace = False), df_0.two_year_recid
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0, y_0, test_size=0.2, random_state=5243)

x_0 = t.from_numpy(np.array(X_train_0)).to(t.float32)
y_0 = t.from_numpy(np.array(y_train_0).astype('float32')).reshape(x_0.shape[0],1)

x_test_0 = t.from_numpy(np.array(X_test_0)).to(t.float32)
y_test_0 = t.from_numpy(np.array(y_test_0).astype('float32')).reshape(x_test_0.shape[0],1)

In [331]:
x_0

tensor([[  1.,  28.,   1.,  ...,  44., 792.,   0.],
        [  1.,  54.,   1.,  ...,   0., 921.,   0.],
        [  1.,  25.,   1.,  ...,   0., 827.,   0.],
        ...,
        [  1.,  20.,   1.,  ...,   0., 921.,   0.],
        [  0.,  25.,   0.,  ...,   1., 758.,   0.],
        [  0.,  23.,   0.,  ...,   0., 870.,   0.]])

In [332]:
class PRLoss():
    def __init__(self, eta=1.0):
        super(PRLoss, self).__init__()
        self.eta = eta
    def forward(self,output_1,output_0):
        N_1 = t.tensor(output_1.shape[0])
        N_0   = t.tensor(output_0.shape[0])
        Dxisi = t.stack((N_0,N_1),axis=0)

        y_pred_1 = t.sum(output_1)
        y_pred_0 = t.sum(output_0)
        P_ys = t.stack((y_pred_0,y_pred_1),axis=0) / Dxisi

        P = t.cat((output_1,output_0),0)
        P_y = t.sum(P) / (x_1.shape[0]+x_0.shape[0])

        P_s1y1 = t.log(P_ys[1]) - t.log(P_y)
        P_s1y0 = t.log(1-P_ys[1]) - t.log(1-P_y)
        P_s0y1 = t.log(P_ys[0]) - t.log(P_y)
        P_s0y0 = t.log(1-P_ys[0]) - t.log(1-P_y)

        PI_s1y1 = output_1 * P_s1y1
        PI_s1y0 =(1- output_1) * P_s1y0
        PI_s0y1 = output_0 * P_s0y1
        PI_s0y0 = (1- output_0 )* P_s0y0
        PI = t.sum(PI_s1y1) + t.sum(PI_s1y0) + t.sum(PI_s0y1) + t.sum(PI_s0y0)
        PI = self.eta * PI
        return PI

In [333]:
def accuracy( Model_1,Model_0, x_1, y_1,x_0,y_0):
    y1_pred = (Model_1(x_1) >= 0.5)
    y0_pred = (Model_0(x_0) >= 0.5)
    accu_1  = t.sum(y1_pred.flatten() == y_1.flatten()) / x_1.shape[0]
    accu_0  = t.sum(y0_pred.flatten() == y_0.flatten()) / x_0.shape[0]
    accuracy = (accu_1 + accu_0) / 2
    return round(accuracy.item(),6),y1_pred,y0_pred

In [334]:
class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()        
        self.w = nn.Linear(x_1.shape[1], out_features=1, bias=True)
        self.sigmod = nn.Sigmoid()
    def forward(self,x):
        w = self.w(x)
        output = self.sigmod(w)
        return output
    
class PRLR():#using linear
    def __init__(self, eta=1.0,epochs = 3000,lr = 0.01):
        super(PRLR, self).__init__()
        self.eta = eta
        self.epochs = epochs
        self.lr = lr
        self.model_1 = LogisticRegression()
        self.model_0 = LogisticRegression()
    def fit(self,x_1,y_1,x_0,y_0,x_test_1,y_test_1,x_test_0,y_test_0):
        criterion = nn.BCELoss(reduction='sum')
        PI = PRLoss(eta=self.eta)
        epochs = self.epochs
        optimizer = t.optim.Adam(list(self.model_1.parameters())+ list(self.model_0.parameters()), self.lr, weight_decay=1e-5)
        for epoch in range(self.epochs):
            optimizer.zero_grad()
            output_1 = self.model_1(x_1)
            output_0 = self.model_0(x_0)
            self.output=output_1
            logloss = criterion(output_1, y_1)+ criterion(output_0, y_0)
            PIloss = PI.forward(output_1,output_0)
            loss = PIloss +logloss
            loss.backward()
            optimizer.step()
        self.model_1.eval()
        self.model_0.eval()
        accu,y1_pred,y0_pred= accuracy(self.model_1,self.model_0,x_test_1,y_test_1,x_test_0,y_test_0)
        return accu,y1_pred,y0_pred

In [336]:
eta_list=[0.0,1.0,2.0,3.0,4.0,5.0,10.0,15.0,20.0,25.0,30.0,80.0]
for i in range(len(eta_list)):
    PR = PRLR(eta = eta_list[i], epochs = 1000, lr = 0.01)
    accu,y1_pred,y0_pred=PR.fit(x_1,y_1,x_0,y_0,x_test_1,y_test_1,x_test_0,y_test_0)
    df_x_test_1 = pd.DataFrame(x_test_1, columns=df.columns[:-1])
    df_x_test_0 = pd.DataFrame(x_test_0, columns=df.columns[:-1])
    df_features = pd.concat([df_x_test_1, df_x_test_0], axis=0).reset_index(drop=True)

    df_y_pred = pd.DataFrame(np.vstack((y1_pred, y0_pred)), columns=['two_year_recid'])
    final_df = pd.concat([df_features, df_y_pred], axis=1)
    
    D_all_base = D_all_func(data=final_df)
    
    print("accuracy:",float(accu), end=" ")
    print("discrimination",D_all_base)

accuracy: 1.0 discriminatioin 0.1193639892505225
accuracy: 1.0 discriminatioin 0.1193639892505225
accuracy: 0.504193 discriminatioin 0.0026251617398228323
accuracy: 0.5 discriminatioin 0.0
accuracy: 0.477012 discriminatioin -0.013250223947446998
accuracy: 0.502096 discriminatioin 0.00338409475465313
accuracy: 0.723867 discriminatioin -0.11331740818154673
accuracy: 0.265127 discriminatioin -0.15303075544938788
accuracy: 0.204005 discriminatioin -0.18435851497959588
accuracy: 0.480843 discriminatioin 0.004839753160147307
accuracy: 0.60972 discriminatioin 0.0123419926346173
accuracy: 0.466746 discriminatioin -0.04110679804916889
