In [1]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pandas as pd 
import re
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
# Relative path to the raw COMPAS scores CSV.
data = '../data/compas-scores-two-years.csv'
# Load the raw table into a DataFrame.
compas_scores = pd.read_csv(data)
# Preview the first five rows.
compas_scores.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


# Some Exploration of the dataset

In [3]:
# Category distribution of `v_score_text`: number of rows per level.
compas_scores['v_score_text'].value_counts()

v_score_text
Low       4761
Medium    1739
High       714
Name: count, dtype: int64

In [4]:
# (rows, columns) of the raw frame.
compas_scores.shape 

(7214, 53)

In [5]:
# Per-column dtypes of the raw frame.
compas_scores.dtypes

id                           int64
name                        object
first                       object
last                        object
compas_screening_date       object
sex                         object
dob                         object
age                          int64
age_cat                     object
race                        object
juv_fel_count                int64
decile_score                 int64
juv_misd_count               int64
juv_other_count              int64
priors_count                 int64
days_b_screening_arrest    float64
c_jail_in                   object
c_jail_out                  object
c_case_number               object
c_offense_date              object
c_arrest_date               object
c_days_from_compas         float64
c_charge_degree             object
c_charge_desc               object
is_recid                     int64
r_case_number               object
r_charge_degree             object
r_days_from_arrest         float64
r_offense_date      

In [6]:
# While exploring the dataset, also keep track of the columns that need to be removed.

# Columns with more than this many missing values are dropped outright.
MAX_MISSING_VALUES = 2000

# Vectorised NaN count per column (avoids a Python-level sum over every cell).
missing_counts = compas_scores.isnull().sum()
for col, num_nan in missing_counts.items():
    print(f"{col}, {num_nan}")

# Start from the mostly-empty columns, in their original column order.
col_to_drops = missing_counts[missing_counts > MAX_MISSING_VALUES].index.tolist()

# Manually selected columns to drop, with the author's rationale:
# - id, name, first, last, dob, c_offense_date, screening_date,
#   v_screening_date, compas_screening_date: judged to have no predictive power.
# - age: dropped because the `age_cat` feature already captures it.
# - type_of_assessment, v_type_of_assessment: noted as holding the same value in every row.
# - c_case_number: just a random string.
#
# Kept here and transformed later in `process_data`:
# - in_custody / out_custody are processed into a duration.
# - c_jail_in / c_jail_out are processed into the number of hours in jail.
col_to_drops += ["id", "name", "first", "last", "dob", "age", "c_offense_date", "type_of_assessment", "screening_date", "v_type_of_assessment", "v_screening_date", "compas_screening_date", "c_case_number"]

id, 0
name, 0
first, 0
last, 0
compas_screening_date, 0
sex, 0
dob, 0
age, 0
age_cat, 0
race, 0
juv_fel_count, 0
decile_score, 0
juv_misd_count, 0
juv_other_count, 0
priors_count, 0
days_b_screening_arrest, 307
c_jail_in, 307
c_jail_out, 307
c_case_number, 22
c_offense_date, 1159
c_arrest_date, 6077
c_days_from_compas, 22
c_charge_degree, 0
c_charge_desc, 29
is_recid, 0
r_case_number, 3743
r_charge_degree, 3743
r_days_from_arrest, 4898
r_offense_date, 3743
r_charge_desc, 3801
r_jail_in, 4898
r_jail_out, 4898
violent_recid, 7214
is_violent_recid, 0
vr_case_number, 6395
vr_charge_degree, 6395
vr_offense_date, 6395
vr_charge_desc, 6395
type_of_assessment, 0
decile_score.1, 0
score_text, 0
screening_date, 0
v_type_of_assessment, 0
v_decile_score, 0
v_score_text, 0
v_screening_date, 0
in_custody, 236
out_custody, 236
priors_count.1, 0
start, 0
end, 0
event, 0
two_year_recid, 0


# Data Preprocessing 

In [7]:
def process_data(df, columns_to_drop=None):
    """Clean the raw COMPAS frame and derive the two duration features.

    Steps, in order:
      1. Drop the unwanted columns.
      2. Add ``custody_duration`` (whole days between ``in_custody`` and
         ``out_custody``) and ``jail_duration_hours`` (hours between
         ``c_jail_in`` and ``c_jail_out``), then drop the four raw
         timestamp columns.
      3. Drop every row that still contains a missing value.
      4. Move ``two_year_recid`` to the rightmost position.

    Timestamps that cannot be parsed are coerced to ``NaT`` for *all four*
    timestamp columns, so such rows are removed in step 3 instead of
    aborting the whole run.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.
    columns_to_drop : list of str, optional
        Columns to remove in step 1. Defaults to the notebook-level
        ``col_to_drops`` list, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        New cleaned frame; the original row index labels are kept.

    Raises
    ------
    KeyError
        If a column to drop or one of the required columns is absent.
    """
    if columns_to_drop is None:
        # Fall back to the list assembled in the exploration cell.
        columns_to_drop = col_to_drops

    # `drop` returns a new frame, so the caller's frame is never touched.
    df_processed = df.drop(columns=columns_to_drop)

    def _elapsed(start_col, end_col):
        """Return end - start as a timedelta Series; unparseable values become NaT."""
        started = pd.to_datetime(df_processed[start_col], errors='coerce')
        ended = pd.to_datetime(df_processed[end_col], errors='coerce')
        return ended - started

    df_processed['custody_duration'] = _elapsed('in_custody', 'out_custody').dt.days
    df_processed['jail_duration_hours'] = _elapsed('c_jail_in', 'c_jail_out') / np.timedelta64(1, 'h')
    df_processed = df_processed.drop(
        columns=['in_custody', 'out_custody', 'c_jail_in', 'c_jail_out']
    )

    df_processed = df_processed.dropna()

    # maintain the order such that two_year_recid is always rightmost
    target = df_processed.pop('two_year_recid')
    df_processed['two_year_recid'] = target

    return df_processed

# Build the cleaned frame from the raw table.
compas_scores_processed = process_data(compas_scores)

In [8]:
# Remove pandas' column-display limit (a global option, so it also affects later cells).
pd.set_option('display.max_columns', None) 
# Display the cleaned frame.
compas_scores_processed

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,is_violent_recid,decile_score.1,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
0,Male,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,F,Aggravated Assault w/Firearm,0,0,1,Low,1,Low,0,0,327,0,7.0,23.627222,0
1,Male,25 - 45,African-American,0,3,0,0,0,-1.0,1.0,F,Felony Battery w/Prior Convict,1,1,3,Low,1,Low,0,9,159,1,10.0,241.857222,1
2,Male,Less than 25,African-American,0,4,0,1,4,-1.0,1.0,F,Possession of Cocaine,1,0,4,Low,3,Low,4,0,63,0,0.0,26.058333,1
5,Male,25 - 45,Other,0,1,0,0,0,0.0,0.0,M,Battery,0,0,1,Low,1,Low,0,1,853,0,1.0,31.643889,0
6,Male,25 - 45,Caucasian,0,6,0,0,14,-1.0,1.0,F,Possession Burglary Tools,1,0,6,Medium,2,Low,14,5,40,1,18.0,151.168333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,Male,Less than 25,African-American,0,7,0,0,0,-1.0,1.0,F,Deliver Cannabis,0,0,7,Medium,5,Medium,0,1,860,0,2.0,45.681389,0
7210,Male,Less than 25,African-American,0,3,0,0,0,-1.0,1.0,F,Leaving the Scene of Accident,0,0,3,Low,5,Medium,0,1,790,0,2.0,44.832778,0
7211,Male,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,F,Aggravated Battery / Pregnant,0,0,1,Low,1,Low,0,0,808,0,1.0,26.029167,0
7212,Female,25 - 45,African-American,0,2,0,0,3,-1.0,1.0,M,Battery on Law Enforc Officer,0,0,2,Low,2,Low,3,0,754,0,1.0,28.200556,0


In [9]:
def convert_to_num(df):
    """Return a copy of ``df`` with every object/category column label-encoded.

    Each non-numeric column is replaced by the integer codes of a fresh
    ``LabelEncoder``. For columns with fewer than 10 distinct values (counted
    before encoding) the category -> integer mapping is also printed.
    Numeric columns pass through unchanged, and the input frame is not modified.
    """
    encoded = df.copy()

    for name in encoded.columns:
        kind = encoded[name].dtype
        # Only non-numeric columns need encoding.
        if not (kind == object or kind == 'category'):
            continue

        # Count the distinct categories before the column is overwritten.
        n_categories = encoded[name].nunique()

        # Every non-numeric column is encoded, whatever its cardinality.
        label_encoder = LabelEncoder()
        encoded[name] = label_encoder.fit_transform(encoded[name])

        # The mapping is only reported for low-cardinality columns.
        if n_categories >= 10:
            continue

        print(f'Column: {name}')
        codes = label_encoder.transform(label_encoder.classes_)
        for category, number in zip(label_encoder.classes_, codes):
            print(f"  '{category}' is mapped to {number}")

    return encoded
    

In [10]:
# Label-encode the non-numeric columns of the cleaned frame (prints the small mappings).
compas_scores_processed_numeric = convert_to_num(compas_scores_processed)
# Display the encoded frame.
compas_scores_processed_numeric

Column: sex
  'Female' is mapped to 0
  'Male' is mapped to 1
Column: age_cat
  '25 - 45' is mapped to 0
  'Greater than 45' is mapped to 1
  'Less than 25' is mapped to 2
Column: race
  'African-American' is mapped to 0
  'Asian' is mapped to 1
  'Caucasian' is mapped to 2
  'Hispanic' is mapped to 3
  'Native American' is mapped to 4
  'Other' is mapped to 5
Column: c_charge_degree
  'F' is mapped to 0
  'M' is mapped to 1
Column: score_text
  'High' is mapped to 0
  'Low' is mapped to 1
  'Medium' is mapped to 2
Column: v_score_text
  'High' is mapped to 0
  'Low' is mapped to 1
  'Medium' is mapped to 2


Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,is_violent_recid,decile_score.1,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
0,1,1,5,0,1,0,0,0,-1.0,1.0,0,15,0,0,1,1,1,1,0,0,327,0,7.0,23.627222,0
1,1,0,0,0,3,0,0,0,-1.0,1.0,0,161,1,1,3,1,1,1,0,9,159,1,10.0,241.857222,1
2,1,2,0,0,4,0,1,4,-1.0,1.0,0,304,1,0,4,1,3,1,4,0,63,0,0.0,26.058333,1
5,1,0,5,0,1,0,0,0,0.0,0.0,1,42,0,0,1,1,1,1,0,1,853,0,1.0,31.643889,0
6,1,0,2,0,6,0,0,14,-1.0,1.0,0,282,1,0,6,2,2,1,14,5,40,1,18.0,151.168333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1,2,0,0,7,0,0,0,-1.0,1.0,0,115,0,0,7,2,5,2,0,1,860,0,2.0,45.681389,0
7210,1,2,0,0,3,0,0,0,-1.0,1.0,0,201,0,0,3,1,5,2,0,1,790,0,2.0,44.832778,0
7211,1,1,5,0,1,0,0,0,-1.0,1.0,0,19,0,0,1,1,1,1,0,0,808,0,1.0,26.029167,0
7212,0,0,0,0,2,0,0,3,-1.0,1.0,1,48,0,0,2,1,2,1,3,0,754,0,1.0,28.200556,0


In [9]:
# Save the cleaned frame to CSV, without the index column.
# NOTE(review): this writes `compas_scores_processed` (categories still stored as text),
# not the label-encoded frame — confirm that is the intended artifact.
compas_scores_processed.to_csv("../data/compas-scores-two-years_processed.csv", index = False)

In [37]:
# Uncomment to remove pandas' row-display limit so every charge description is listed:
# pd.set_option('display.max_rows', None)
# Frequency of each charge description in the cleaned frame.
compas_scores_processed["c_charge_desc"].value_counts()

c_charge_desc
Battery                                                 1125
arrest case no charge                                   1052
Possession of Cocaine                                    464
Grand Theft in the 3rd Degree                            420
Driving While License Revoked                            197
Driving Under The Influence                              128
Felony Battery (Dom Strang)                               98
Pos Cannabis W/Intent Sel/Del                             95
Grand Theft (Motor Vehicle)                               95
Felony Driving While Lic Suspd                            94
Possess Cannabis/20 Grams Or Less                         90
Burglary Unoccupied Dwelling                              84
Burglary Conveyance Unoccup                               75
Poss3,4 Methylenedioxymethcath                            72
Possession of Cannabis                                    72
DUI Property Damage/Injury                                71
Felony Pet

# Define PRLR

In [78]:
class PRLoss(nn.Module):
    """Fairness regulariser: ``eta`` times the quantity the code labels ``PI``.

    With ``p`` the model outputs, ``s`` the group (female / male) and
    ``mean_s`` / ``mean_all`` the average output within a group / overall::

        PI = sum over samples of
               p       * (log(mean_s)     - log(mean_all))
             + (1 - p) * (log(1 - mean_s) - log(1 - mean_all))

    The term is zero when both groups have the same mean output.

    Notes
    -----
    * Assumes the outputs are probabilities strictly inside (0, 1), e.g.
      sigmoid outputs — TODO confirm against the model feeding this loss.
    * Both groups must be non-empty, and group means of exactly 0 or 1 hit
      ``log(0)``; either case yields a non-finite result.
    * The original author annotated this class "using linear"; presumably
      that refers to the model producing the outputs — TODO confirm.

    Parameters
    ----------
    eta : float, default 1.0
        Weight applied to the regulariser.
    """

    def __init__(self, eta=1.0):
        super().__init__()
        self.eta = eta

    def forward(self, output_f, output_m):
        """Return ``eta * PI`` for one batch.

        Parameters
        ----------
        output_f : torch.Tensor
            Model outputs for the female samples; the first dimension
            indexes samples.
        output_m : torch.Tensor
            Model outputs for the male samples, laid out like ``output_f``.

        Returns
        -------
        torch.Tensor
            Scalar tensor, differentiable with respect to both inputs.
        """
        # Group sizes as plain ints: no device mismatch with the outputs.
        n_female = output_f.shape[0]
        n_male = output_m.shape[0]

        # Pr[y|s]
        p_y_female = t.sum(output_f) / n_female
        p_y_male = t.sum(output_m) / n_male

        # Pr[y], normalised by the batch actually passed in (the previous
        # code read the sizes of two unrelated notebook globals).
        p_y = t.sum(t.cat((output_f, output_m), 0)) / (n_female + n_male)

        # log(Pr[y|s] / Pr[y]) for every (s, y) combination; s1 = female, s0 = male.
        log_ratio_s1y1 = t.log(p_y_female) - t.log(p_y)
        log_ratio_s1y0 = t.log(1 - p_y_female) - t.log(1 - p_y)
        log_ratio_s0y1 = t.log(p_y_male) - t.log(p_y)
        log_ratio_s0y0 = t.log(1 - p_y_male) - t.log(1 - p_y)

        # PI: each sample's output weights the log-ratio of its own group.
        pi = (
            t.sum(output_f * log_ratio_s1y1)
            + t.sum((1 - output_f) * log_ratio_s1y0)
            + t.sum(output_m * log_ratio_s0y1)
            + t.sum((1 - output_m) * log_ratio_s0y0)
        )
        return self.eta * pi