In [1]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pandas as pd 
import re
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
# Location of the raw COMPAS scores CSV, relative to the notebook's working directory.
data = '../data/compas-scores-two-years.csv'
# Raw table; the preprocessing cells below derive new frames from it.
compas_scores = pd.read_csv(data)
# Preview the first rows.
compas_scores.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


# Some Exploration of the dataset

In [3]:
# Check category distribution of a certain feature 
compas_scores['v_score_text'].value_counts()

v_score_text
Low       4761
Medium    1739
High       714
Name: count, dtype: int64

In [4]:
compas_scores.shape 

(7214, 53)

In [5]:
compas_scores.dtypes

id                           int64
name                        object
first                       object
last                        object
compas_screening_date       object
sex                         object
dob                         object
age                          int64
age_cat                     object
race                        object
juv_fel_count                int64
decile_score                 int64
juv_misd_count               int64
juv_other_count              int64
priors_count                 int64
days_b_screening_arrest    float64
c_jail_in                   object
c_jail_out                  object
c_case_number               object
c_offense_date              object
c_arrest_date               object
c_days_from_compas         float64
c_charge_degree             object
c_charge_desc               object
is_recid                     int64
r_case_number               object
r_charge_degree             object
r_days_from_arrest         float64
r_offense_date      

In [3]:
# While exploring the dataset, also keep track of the columns that need to be removed.

# A column with more than this many missing values is dropped outright.
MAX_MISSING_VALUES = 2000

col_to_drops = []
for col in compas_scores.columns:
    # Vectorised null count (instead of summing the boolean Series element by element).
    num_nan = int(compas_scores[col].isnull().sum())
    if num_nan > MAX_MISSING_VALUES:
        col_to_drops.append(col)
    print(f"{col}, {num_nan}")

# Columns removed by hand, with the author's rationale:
# - id, name, first, last, dob, c_offense_date, screening_date, compas_screening_date and
#   v_screening_date are judged to have no predictive value.
# - age is dropped because the age_cat column already carries the age range.
# - type_of_assessment and v_type_of_assessment hold the same value in every row.
# - c_case_number is just an identifier string.
# in_custody/out_custody and c_jail_in/c_jail_out are NOT dropped here: process_data turns them
# into a custody duration (days) and a jail duration (hours).
col_to_drops += ["id", "name", "first", "last", "dob", "age", "c_offense_date", "type_of_assessment", "screening_date", "v_type_of_assessment", "v_screening_date", "compas_screening_date", "c_case_number"]

id, 0
name, 0
first, 0
last, 0
compas_screening_date, 0
sex, 0
dob, 0
age, 0
age_cat, 0
race, 0
juv_fel_count, 0
decile_score, 0
juv_misd_count, 0
juv_other_count, 0
priors_count, 0
days_b_screening_arrest, 307
c_jail_in, 307
c_jail_out, 307
c_case_number, 22
c_offense_date, 1159
c_arrest_date, 6077
c_days_from_compas, 22
c_charge_degree, 0
c_charge_desc, 29
is_recid, 0
r_case_number, 3743
r_charge_degree, 3743
r_days_from_arrest, 4898
r_offense_date, 3743
r_charge_desc, 3801
r_jail_in, 4898
r_jail_out, 4898
violent_recid, 7214
is_violent_recid, 0
vr_case_number, 6395
vr_charge_degree, 6395
vr_offense_date, 6395
vr_charge_desc, 6395
type_of_assessment, 0
decile_score.1, 0
score_text, 0
screening_date, 0
v_type_of_assessment, 0
v_decile_score, 0
v_score_text, 0
v_screening_date, 0
in_custody, 236
out_custody, 236
priors_count.1, 0
start, 0
end, 0
event, 0
two_year_recid, 0


# Data Preprocessing 

In [3]:
# import os
# # File path
# file_path = '../data/compas-scores-two-years_processed_numeric_discretized.csv'

# if os.path.exists(file_path):
#     print("The processed data csv file already exists, stop running the preprocessing")

The processed data csv file already exists, stop running the preprocessing


In [4]:
def process_data(df, cols_to_drop=None):
    """Return a cleaned copy of ``df`` with duration features and no missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must contain ``in_custody``, ``out_custody``, ``c_jail_in``,
        ``c_jail_out`` and ``two_year_recid``. The input is not modified.
    cols_to_drop : list of str, optional
        Columns to remove first. When omitted, the module-level ``col_to_drops`` list is
        used, which is what the original one-argument call relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        Frame without the dropped columns, with ``custody_duration`` (whole days) and
        ``jail_duration_hours`` (fractional hours) replacing the four timestamp columns,
        rows containing any missing value removed, and ``two_year_recid`` as the last column.
        The original row index is kept (not reset).
    """
    if cols_to_drop is None:
        # Backward-compatible default: fall back to the notebook-level list.
        cols_to_drop = col_to_drops

    # drop() already returns a new frame, so the caller's frame is left untouched.
    df_processed = df.drop(columns=list(cols_to_drop))

    # Custody stay in whole days. Unparseable timestamps become NaT and the row is removed
    # by dropna() below.
    df_processed['in_custody'] = pd.to_datetime(df_processed['in_custody'], errors='coerce')
    df_processed['out_custody'] = pd.to_datetime(df_processed['out_custody'], errors='coerce')
    df_processed['custody_duration'] = (df_processed['out_custody'] - df_processed['in_custody']).dt.days
    df_processed = df_processed.drop(columns=['in_custody', 'out_custody'])

    # Jail stay in fractional hours.
    # NOTE(review): unlike the custody columns these are parsed strictly (no errors='coerce'),
    # so a malformed timestamp raises instead of being dropped -- confirm this is intended.
    df_processed['c_jail_in'] = pd.to_datetime(df_processed['c_jail_in'])
    df_processed['c_jail_out'] = pd.to_datetime(df_processed['c_jail_out'])
    df_processed['jail_duration_hours'] = (df_processed['c_jail_out'] - df_processed['c_jail_in']) / np.timedelta64(1, 'h')
    df_processed = df_processed.drop(columns=['c_jail_in', 'c_jail_out'])

    df_processed = df_processed.dropna()

    # Keep the label as the rightmost column: pop it and append it again.
    df_processed['two_year_recid'] = df_processed.pop('two_year_recid')

    return df_processed

# Cleaned frame derived from the raw data; process_data relies on the col_to_drops list built above.
compas_scores_processed = process_data(compas_scores)

In [5]:
compas_scores_processed

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
0,Male,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,...,Low,1,Low,0,0,327,0,7.0,23.627222,0
1,Male,25 - 45,African-American,0,3,0,0,0,-1.0,1.0,...,Low,1,Low,0,9,159,1,10.0,241.857222,1
2,Male,Less than 25,African-American,0,4,0,1,4,-1.0,1.0,...,Low,3,Low,4,0,63,0,0.0,26.058333,1
5,Male,25 - 45,Other,0,1,0,0,0,0.0,0.0,...,Low,1,Low,0,1,853,0,1.0,31.643889,0
6,Male,25 - 45,Caucasian,0,6,0,0,14,-1.0,1.0,...,Medium,2,Low,14,5,40,1,18.0,151.168333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,Male,Less than 25,African-American,0,7,0,0,0,-1.0,1.0,...,Medium,5,Medium,0,1,860,0,2.0,45.681389,0
7210,Male,Less than 25,African-American,0,3,0,0,0,-1.0,1.0,...,Low,5,Medium,0,1,790,0,2.0,44.832778,0
7211,Male,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,...,Low,1,Low,0,0,808,0,1.0,26.029167,0
7212,Female,25 - 45,African-American,0,2,0,0,3,-1.0,1.0,...,Low,2,Low,3,0,754,0,1.0,28.200556,0


In [6]:
def convert_to_num(df):
    """Label-encode the low-cardinality text columns of ``df`` and return the result.

    Columns whose dtype is ``object`` or ``category`` and that have fewer than 10 distinct
    values are replaced by integer codes from a fresh ``LabelEncoder``; the chosen mapping
    is printed for each such column. All other columns are copied unchanged, and the input
    frame is not modified.

    NOTE(review): LabelEncoder numbers classes in sorted order, so the codes carry no
    ordinal meaning (the printed mapping shows e.g. High=0, Low=1, Medium=2).
    """
    encoded_df = df.copy()

    for name in encoded_df.columns:
        kind = encoded_df[name].dtype
        is_textual = kind == object or kind == 'category'
        # Skip numeric columns and text columns with 10 or more distinct values.
        if not is_textual or encoded_df[name].nunique() >= 10:
            continue

        label_encoder = LabelEncoder()
        encoded_df[name] = label_encoder.fit_transform(encoded_df[name])

        # Report which integer each category received.
        print(f'Column: {name}')
        codes = label_encoder.transform(label_encoder.classes_)
        for category, number in zip(label_encoder.classes_, codes):
            print(f"  '{category}' is mapped to {number}")

    return encoded_df
    

In [7]:
# Encode the low-cardinality text columns as integers (the mapping is printed below), then display the frame.
compas_scores_processed_numeric = convert_to_num(compas_scores_processed)
compas_scores_processed_numeric

Column: sex
  'Female' is mapped to 0
  'Male' is mapped to 1
Column: age_cat
  '25 - 45' is mapped to 0
  'Greater than 45' is mapped to 1
  'Less than 25' is mapped to 2
Column: race
  'African-American' is mapped to 0
  'Asian' is mapped to 1
  'Caucasian' is mapped to 2
  'Hispanic' is mapped to 3
  'Native American' is mapped to 4
  'Other' is mapped to 5
Column: c_charge_degree
  'F' is mapped to 0
  'M' is mapped to 1
Column: score_text
  'High' is mapped to 0
  'Low' is mapped to 1
  'Medium' is mapped to 2
Column: v_score_text
  'High' is mapped to 0
  'Low' is mapped to 1
  'Medium' is mapped to 2


Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
0,1,1,5,0,1,0,0,0,-1.0,1.0,...,1,1,1,0,0,327,0,7.0,23.627222,0
1,1,0,0,0,3,0,0,0,-1.0,1.0,...,1,1,1,0,9,159,1,10.0,241.857222,1
2,1,2,0,0,4,0,1,4,-1.0,1.0,...,1,3,1,4,0,63,0,0.0,26.058333,1
5,1,0,5,0,1,0,0,0,0.0,0.0,...,1,1,1,0,1,853,0,1.0,31.643889,0
6,1,0,2,0,6,0,0,14,-1.0,1.0,...,2,2,1,14,5,40,1,18.0,151.168333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1,2,0,0,7,0,0,0,-1.0,1.0,...,2,5,2,0,1,860,0,2.0,45.681389,0
7210,1,2,0,0,3,0,0,0,-1.0,1.0,...,1,5,2,0,1,790,0,2.0,44.832778,0
7211,1,1,5,0,1,0,0,0,-1.0,1.0,...,1,1,1,0,0,808,0,1.0,26.029167,0
7212,0,0,0,0,2,0,0,3,-1.0,1.0,...,1,2,1,3,0,754,0,1.0,28.200556,0


## Further work: dealing with continuous-valued features (including a string column)
The paper appears to require every feature to be discretized.

In [8]:
# Count the distinct values of every column to decide which features still need discretization.
for column in compas_scores_processed_numeric.columns:
    print(f"{column}: {compas_scores_processed_numeric[column].nunique()} unique values")

sex: 2 unique values
age_cat: 3 unique values
race: 6 unique values
juv_fel_count: 11 unique values
decile_score: 10 unique values
juv_misd_count: 10 unique values
juv_other_count: 9 unique values
priors_count: 37 unique values
days_b_screening_arrest: 423 unique values
c_days_from_compas: 369 unique values
c_charge_degree: 2 unique values
c_charge_desc: 420 unique values
is_recid: 2 unique values
is_violent_recid: 2 unique values
decile_score.1: 10 unique values
score_text: 3 unique values
v_decile_score: 10 unique values
v_score_text: 3 unique values
priors_count.1: 37 unique values
start: 236 unique values
end: 1110 unique values
event: 2 unique values
custody_duration: 366 unique values
jail_duration_hours: 6815 unique values
two_year_recid: 2 unique values


In [9]:
# Bin the columns that have too many unique values; the bin edges were chosen from the value distribution of each continuous column.

def bin_var(data, var, bins, group_names):
    """Discretise ``data[var]`` in place into integer-labelled, left-closed bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; column ``var`` is overwritten with the bin labels.
    var : str
        Name of the column to bin.
    bins : sequence of numbers (or anything ``pandas.cut`` accepts)
        Bin edges. Intervals are closed on the left and open on the right
        (``right=False``), so a value equal to the last edge is NOT covered.
    group_names : sequence of int
        One label per interval; the labels are cast to ``int`` after binning.

    Returns
    -------
    None. The frame is mutated in place.

    Raises
    ------
    ValueError
        If any value is missing or falls outside every bin. The check runs before the
        column is overwritten, so ``data`` is left unchanged on failure.
    """
    binned = pd.cut(data[var], bins, labels=group_names, right=False)

    # pd.cut yields NaN for values outside the edges (or NaN inputs); casting those to int
    # would fail with an unhelpful message after the column had already been replaced.
    unassigned = binned.isna()
    if unassigned.any():
        raise ValueError(
            f"bin_var: {int(unassigned.sum())} value(s) in column '{var}' are missing or fall "
            f"outside the bins {bins!r}; extend the bin edges or clean the column first."
        )

    data[var] = binned.astype(int)
    
# Discretisation plan, applied in order: (column, left-closed bin edges, integer labels).
# NOTE: bin_var overwrites the columns in place, so running this cell twice would re-bin
# values that have already been replaced by their labels.
binning_plan = [
    ('priors_count', [0, 2, 12, 20, float("inf")], [1, 2, 3, 4]),
    ("days_b_screening_arrest", [float("-inf"), -1, 0, 1, float("inf")], [1, 2, 3, 4]),
    ("c_days_from_compas", [0, 1, 2, float("inf")], [1, 2, 3]),
    ('priors_count.1', [0, 2, 12, 20, float("inf")], [1, 2, 3, 4]),
    ('start', [0, 1, 2, 5, 40, float("inf")], [1, 2, 3, 4, 5]),
    ('end', [0, 400, 800, 1200], [1, 2, 3]),
    ('custody_duration', [float("-inf"), 0, 100, 500, 1000, 2000, 5000, float("inf")], [1, 2, 3, 4, 5, 6, 7]),
    ('jail_duration_hours', [float("-inf"), 0, 100, 500, 1000, 2000, 5000, float("inf")], [1, 2, 3, 4, 5, 6, 7]),
]
for column_name, edges, labels in binning_plan:
    bin_var(compas_scores_processed_numeric, column_name, edges, labels)

In [10]:
# Re-check the distinct-value counts after binning the numeric columns.
for column in compas_scores_processed_numeric.columns:
    print(f"{column}: {compas_scores_processed_numeric[column].nunique()} unique values")

sex: 2 unique values
age_cat: 3 unique values
race: 6 unique values
juv_fel_count: 11 unique values
decile_score: 10 unique values
juv_misd_count: 10 unique values
juv_other_count: 9 unique values
priors_count: 4 unique values
days_b_screening_arrest: 4 unique values
c_days_from_compas: 3 unique values
c_charge_degree: 2 unique values
c_charge_desc: 420 unique values
is_recid: 2 unique values
is_violent_recid: 2 unique values
decile_score.1: 10 unique values
score_text: 3 unique values
v_decile_score: 10 unique values
v_score_text: 3 unique values
priors_count.1: 4 unique values
start: 5 unique values
end: 3 unique values
event: 2 unique values
custody_duration: 6 unique values
jail_duration_hours: 7 unique values
two_year_recid: 2 unique values


In [11]:
# Handle c_charge_desc, which holds free-text charge descriptions.
# It cannot be binned like the numeric columns because it is a string column, so each description is mapped to a coarse category.
import re

# Ordered (category id, pattern) rules; the first pattern that matches decides the category.
# NOTE(review): because of this ordering some listed keywords can never decide the outcome,
# e.g. "Identity Theft" is caught by "Theft" (2) and "Sexual Battery" by "Battery" (3).
# That ordering is kept as the author wrote it -- confirm whether it is intended.
_CHARGE_RULES = (
    # 1: Drug Offenses (explicit substance keywords only; see the possession fallback below)
    (1, re.compile('Drug|Cocaine|Cannabis|Methamphetamine|Heroin|MDMA|Ecstasy|Morphine|Oxycodone|LSD|Fentanyl|Alprazolam|Codeine|Hydromorphone|Diazepam|Clonazepam|Amphetamine|Phentermine|Buprenorphine|Methadone|Hydrocodone|Butylone|Ethylone|Benzylpiperazine|MDA', re.IGNORECASE)),
    # 2: Theft and Burglary
    (2, re.compile('Theft|Burglary|Robbery|Steal|Stolen|Shoplifting|Larceny|Embezzlement|Extortion', re.IGNORECASE)),
    # 3: Assault and Battery
    (3, re.compile('Assault|Battery|Molest|Abuse|Domestic Violence|Stalking|Harm|Violence', re.IGNORECASE)),
    # 4: DUI and Traffic Offenses
    (4, re.compile('DUI|Driving|Traffic|DWI|License|Motor Vehicle|Eluding', re.IGNORECASE)),
    # 5: Fraud and Financial Crimes
    (5, re.compile('Fraud|Forgery|Counterfeit|Embezzle|Scam|Extort|Deceit|Defraud|Identity Theft', re.IGNORECASE)),
    # 6: Weapon-Related Charges
    (6, re.compile('Firearm|Weapon|Gun|Armed|Ammunition', re.IGNORECASE)),
    # 7: Sexual Offenses
    (7, re.compile('Sexual|Lewd|Lascivious|Molestation|Rape', re.IGNORECASE)),
)

# Generic possession wording. It used to be part of the drug rule, where it shadowed every
# later rule: "Possession Firearm ..." was labelled a drug offence while the abbreviated
# "Poss Firearm ..." was labelled a weapon offence. It is now only consulted when no
# specific rule matched, so possession charges with no other keyword still map to 1.
_POSSESSION_FALLBACK = re.compile('Possession of|Possess', re.IGNORECASE)


def categorize_charge(x):
    """Map a charge description to one of eight coarse integer categories.

    Parameters
    ----------
    x : str
        Charge description; matching is case-insensitive substring/regex search.

    Returns
    -------
    int
        1 drug, 2 theft/burglary, 3 assault/battery, 4 DUI/traffic, 5 fraud/financial,
        6 weapon-related, 7 sexual offence, 8 miscellaneous (nothing matched).
    """
    for category, pattern in _CHARGE_RULES:
        if pattern.search(x):
            return category

    if _POSSESSION_FALLBACK.search(x):
        return 1  # Drug Offenses (possession with no more specific keyword)

    return 8  # Miscellaneous Crimes

# Replace each (whitespace-trimmed) charge description by its coarse category id.
# NOTE: the column is overwritten in place, so re-running this cell fails once it holds integers.
compas_scores_processed_numeric['c_charge_desc'] = compas_scores_processed_numeric.c_charge_desc.apply(
    lambda description: categorize_charge(description.strip())
)

In [12]:
# Re-check the distinct-value counts after c_charge_desc has been mapped to categories.
for column in compas_scores_processed_numeric.columns:
    print(f"{column}: {compas_scores_processed_numeric[column].nunique()} unique values")

sex: 2 unique values
age_cat: 3 unique values
race: 6 unique values
juv_fel_count: 11 unique values
decile_score: 10 unique values
juv_misd_count: 10 unique values
juv_other_count: 9 unique values
priors_count: 4 unique values
days_b_screening_arrest: 4 unique values
c_days_from_compas: 3 unique values
c_charge_degree: 2 unique values
c_charge_desc: 8 unique values
is_recid: 2 unique values
is_violent_recid: 2 unique values
decile_score.1: 10 unique values
score_text: 3 unique values
v_decile_score: 10 unique values
v_score_text: 3 unique values
priors_count.1: 4 unique values
start: 5 unique values
end: 3 unique values
event: 2 unique values
custody_duration: 6 unique values
jail_duration_hours: 7 unique values
two_year_recid: 2 unique values


In [17]:
# Save CSV
# Only the fully discretized frame is written; the two intermediate exports below are disabled.
# compas_scores_processed.to_csv("../data/compas-scores-two-years_processed.csv", index = False)
# compas_scores_processed_numeric.to_csv("../data/compas-scores-two-years_processed_numeric.csv", index = False)
# index=False: the row index (which has gaps after dropna) is not written to the file.
compas_scores_processed_numeric.to_csv("../data/compas-scores-two-years_processed_numeric_discretized.csv", index = False)

# Define Logistic Regression & PRLR

In [26]:
# df = pd.read_csv('../data/compas-scores-two-years_processed_numeric.csv')
# df

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
0,1,1,5,0,1,0,0,0,-1.0,1.0,...,1,1,1,0,0,327,0,7.0,23.627222,0
1,1,0,0,0,3,0,0,0,-1.0,1.0,...,1,1,1,0,9,159,1,10.0,241.857222,1
2,1,2,0,0,4,0,1,4,-1.0,1.0,...,1,3,1,4,0,63,0,0.0,26.058333,1
3,1,0,5,0,1,0,0,0,0.0,0.0,...,1,1,1,0,1,853,0,1.0,31.643889,0
4,1,0,2,0,6,0,0,14,-1.0,1.0,...,2,2,1,14,5,40,1,18.0,151.168333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,1,2,0,0,7,0,0,0,-1.0,1.0,...,2,5,2,0,1,860,0,2.0,45.681389,0
6896,1,2,0,0,3,0,0,0,-1.0,1.0,...,1,5,2,0,1,790,0,2.0,44.832778,0
6897,1,1,5,0,1,0,0,0,-1.0,1.0,...,1,1,1,0,0,808,0,1.0,26.029167,0
6898,0,0,0,0,2,0,0,3,-1.0,1.0,...,1,2,1,3,0,754,0,1.0,28.200556,0


In [13]:
# Short alias for the discretized frame (same object, not a copy).
df = compas_scores_processed_numeric

In [14]:
df.dtypes

sex                        int64
age_cat                    int64
race                       int64
juv_fel_count              int64
decile_score               int64
juv_misd_count             int64
juv_other_count            int64
priors_count               int64
days_b_screening_arrest    int64
c_days_from_compas         int64
c_charge_degree            int64
c_charge_desc              int64
is_recid                   int64
is_violent_recid           int64
decile_score.1             int64
score_text                 int64
v_decile_score             int64
v_score_text               int64
priors_count.1             int64
start                      int64
end                        int64
event                      int64
custody_duration           int64
jail_duration_hours        int64
two_year_recid             int64
dtype: object

In [15]:
# Confirm that every column of the modelling frame now has a small number of distinct values.
for column in df.columns:
    print(f"{column}: {df[column].nunique()} unique values")

sex: 2 unique values
age_cat: 3 unique values
race: 6 unique values
juv_fel_count: 11 unique values
decile_score: 10 unique values
juv_misd_count: 10 unique values
juv_other_count: 9 unique values
priors_count: 4 unique values
days_b_screening_arrest: 4 unique values
c_days_from_compas: 3 unique values
c_charge_degree: 2 unique values
c_charge_desc: 8 unique values
is_recid: 2 unique values
is_violent_recid: 2 unique values
decile_score.1: 10 unique values
score_text: 3 unique values
v_decile_score: 10 unique values
v_score_text: 3 unique values
priors_count.1: 4 unique values
start: 5 unique values
end: 3 unique values
event: 2 unique values
custody_duration: 6 unique values
jail_duration_hours: 7 unique values
two_year_recid: 2 unique values


In [16]:
# Split the frame into train (80%) and test (20%) partitions; random_state fixes the shuffle.
# NOTE(review): Test_data is not used by any cell visible in this section -- the metrics
# reported further down are computed on the training tensors.
from sklearn.model_selection import train_test_split
Train_data, Test_data = train_test_split(df, test_size=0.2, random_state=42)
Train_data

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,custody_duration,jail_duration_hours,two_year_recid
1781,1,2,0,0,4,0,0,1,2,2,...,1,5,2,1,1,2,0,2,2,0
2215,1,1,0,0,5,0,0,2,2,2,...,2,1,1,2,4,2,1,2,3,1
1010,1,1,3,0,1,0,0,1,3,1,...,1,1,1,1,2,3,0,2,2,0
4725,1,0,2,0,5,0,0,2,1,3,...,2,3,1,2,1,2,0,2,2,0
2884,1,0,2,1,10,0,0,2,4,3,...,0,10,0,2,1,1,1,3,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,1,0,2,0,8,0,0,2,1,3,...,0,4,1,2,1,2,1,2,7,1
5435,0,0,2,0,3,0,0,2,2,3,...,1,1,1,2,1,3,0,2,2,0
5473,1,1,2,0,5,0,0,1,2,2,...,2,3,1,1,1,3,0,2,2,0
5645,1,0,0,0,6,0,0,1,2,2,...,2,5,2,1,5,3,1,2,5,0


In [19]:
#Try linear
class LogisticRegression(nn.Module):
    """Logistic regression: one linear layer with bias followed by a sigmoid.

    Parameters
    ----------
    in_features : int, optional
        Number of input features. When omitted, the width is taken from the module-level
        tensor ``x_female`` (the original behaviour), which must then already exist.
    """

    def __init__(self, in_features=None):
        super(LogisticRegression, self).__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the notebook-level tensor.
            in_features = x_female.shape[1]
        # One output unit; the layer learns its own bias term (bias=True).
        self.w = nn.Linear(in_features, out_features=1, bias=True)
        self.sigmod = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(w(x)): values in (0, 1), one per input row."""
        return self.sigmod(self.w(x))

In [20]:
class PRLoss():#using linear
    """Prejudice-index style regulariser over the predictions of two groups.

    ``forward`` receives the predicted probabilities of the two groups (named "female" and
    "male" in this file) and returns ``eta`` times the sum, over all samples and both
    outcomes, of the predicted probability times ``log(Pr[y|s] / Pr[y])``.
    """

    def __init__(self, eta=1.0):
        super(PRLoss, self).__init__()
        # Weight applied to the regulariser.
        self.eta = eta

    def forward(self, output_f, output_m):
        """Compute the weighted regulariser.

        Parameters
        ----------
        output_f, output_m : torch.Tensor
            Predicted probabilities for the two groups; the first dimension is the number
            of samples in the group.

        Returns
        -------
        torch.Tensor
            Scalar tensor ``eta * PI``.
        """
        # Group sizes come from the tensors actually passed in. The pooled denominator
        # used to read the notebook-level x_female/x_male tensors, which was wrong (or a
        # NameError) for any other batch.
        n_female = output_f.shape[0]
        n_male = output_m.shape[0]

        # Pr[y=1 | s] for each group: mean predicted probability within the group.
        p_y_female = t.sum(output_f) / n_female
        p_y_male = t.sum(output_m) / n_male

        # Pr[y=1] over both groups pooled together.
        pooled = t.cat((output_f, output_m), 0)
        p_y = t.sum(pooled) / (n_female + n_male)

        # log(Pr[y|s] / Pr[y]) for every (group, outcome) combination.
        log_ratio_f_pos = t.log(p_y_female) - t.log(p_y)
        log_ratio_f_neg = t.log(1 - p_y_female) - t.log(1 - p_y)
        log_ratio_m_pos = t.log(p_y_male) - t.log(p_y)
        log_ratio_m_neg = t.log(1 - p_y_male) - t.log(1 - p_y)

        # Weight each log-ratio by the per-sample predicted probability of that outcome.
        PI = (
            t.sum(output_f * log_ratio_f_pos)
            + t.sum((1 - output_f) * log_ratio_f_neg)
            + t.sum(output_m * log_ratio_m_pos)
            + t.sum((1 - output_m) * log_ratio_m_neg)
        )
        return self.eta * PI

In [21]:
def accuracy(Model_f, Model_m, x_female, y_female, x_male, y_male):
    """Return the mean of the two per-group accuracies, rounded to 4 decimals.

    Each model's output is thresholded at 0.5 and compared with the group's labels. The
    result is the unweighted average of the two group accuracies, so it is not weighted
    by group size.

    Parameters
    ----------
    Model_f, Model_m : callable
        Models returning probabilities for ``x_female`` / ``x_male``.
    x_female, x_male : torch.Tensor
        Feature tensors; the first dimension is the number of samples.
    y_female, y_male : torch.Tensor
        Labels (0/1) for the corresponding group.

    Returns
    -------
    float
    """
    # Evaluation only: no autograd graph is needed.
    with t.no_grad():
        yf_pred = (Model_f(x_female) >= 0.5)
        ym_pred = (Model_m(x_male) >= 0.5)
    accu_f = t.sum(yf_pred.flatten() == y_female.flatten()) / x_female.shape[0]
    accu_m = t.sum(ym_pred.flatten() == y_male.flatten()) / x_male.shape[0]
    mean_accuracy = (accu_f + accu_m) / 2
    return round(mean_accuracy.item(), 4)


In [22]:
def CVS(Model_f, Model_m, x_female, x_male):
    """Return the Calders-Verwer discrimination score, rounded to 4 decimals.

    The score is the absolute difference between the fraction of positive predictions
    (model output >= 0.5) in the two groups.

    Parameters
    ----------
    Model_f, Model_m : callable
        Models returning probabilities for ``x_female`` / ``x_male``.
    x_female, x_male : torch.Tensor
        Feature tensors; the first dimension is the number of samples.

    Returns
    -------
    float
    """
    # Evaluation only: no autograd graph is needed.
    with t.no_grad():
        yf_pred = (Model_f(x_female) >= 0.5)
        ym_pred = (Model_m(x_male) >= 0.5)
    # Share of positive predictions within each group.
    P_y1_s1 = t.sum(yf_pred) / x_female.shape[0]
    P_y1_s0 = t.sum(ym_pred) / x_male.shape[0]
    CV_score = t.abs(P_y1_s0 - P_y1_s1)
    return round(CV_score.item(), 4)
    

In [23]:
class PRLR():#using linear
    """Two logistic-regression models trained jointly with the PRLoss regulariser.

    Parameters
    ----------
    eta : float
        Weight passed to ``PRLoss``.
    epochs : int
        Number of full-batch optimisation steps.
    lr : float
        Learning rate for Adam.

    Attributes
    ----------
    model_f, model_m
        The fitted models after ``fit`` (``None`` before), kept so they can be evaluated
        on other data; they used to be discarded when ``fit`` returned.
    """

    def __init__(self, eta=0.0, epochs=3000, lr=0.01):
        super(PRLR, self).__init__()
        self.eta = eta
        self.epochs = epochs
        self.lr = lr
        self.model_f = None
        self.model_m = None

    def fit(self, x_female, y_female, x_male, y_male):
        """Train both models and return ``(accuracy, CV score)`` on the data passed in.

        The objective is the summed binary cross-entropy of both groups plus the PRLoss
        term. Both metrics are computed on the same tensors used for training.

        NOTE(review): the models are created with ``LogisticRegression()`` (no arguments),
        so their input width is whatever that class uses by default, not necessarily
        ``x_female.shape[1]`` of the arguments passed here. Weight initialisation is not
        seeded, so results vary between runs.
        """
        model_f = LogisticRegression()
        model_m = LogisticRegression()
        criterion = nn.BCELoss(reduction='sum')
        PI = PRLoss(eta=self.eta)
        # One optimiser over the parameters of both models.
        optimizer = t.optim.Adam(
            list(model_f.parameters()) + list(model_m.parameters()),
            self.lr,
            weight_decay=1e-5,
        )
        for _ in range(self.epochs):
            optimizer.zero_grad()
            output_f = model_f(x_female)
            output_m = model_m(x_male)
            logloss = criterion(output_f, y_female) + criterion(output_m, y_male)
            PIloss = PI.forward(output_f, output_m)
            loss = PIloss + logloss
            loss.backward()
            optimizer.step()
        model_f.eval()
        model_m.eval()
        # Keep the trained models available to the caller.
        self.model_f = model_f
        self.model_m = model_m
        accu = accuracy(model_f, model_m, x_female, y_female, x_male, y_male)
        cvs = CVS(model_f, model_m, x_female, x_male)
        return accu, cvs

In [24]:
# Estimator with regulariser weight eta=1.0, 3000 optimisation steps and Adam learning rate 0.01.
PR = PRLR(eta = 1.0, epochs = 3000, lr = 0.01)

In [26]:
# The two groups are split on `race`, not on sex, despite the *_female / *_male names:
# per the encoder mapping printed earlier, race 0 is 'African-American' and race 2 is 'Caucasian'.
# The x_female / x_male names are kept because other cells read these module-level tensors.
# Features are every column except the label (two_year_recid) and race.
# NOTE(review): the features still include is_recid, is_violent_recid, event, start and end,
# which look outcome-derived -- confirm they are meant to be model inputs (possible label leakage).
f = Train_data.loc[Train_data['race']==0]
x_female = t.from_numpy(np.array(f.drop(columns=['two_year_recid','race']))).to(t.float32)
# Labels as a float32 column vector with one row per sample.
y_female = t.from_numpy(np.array(f['two_year_recid']).astype('float32')).reshape(x_female.shape[0],1)

m = Train_data.loc[Train_data['race']==2]
x_male = t.from_numpy(np.array(m.drop(columns=['two_year_recid','race']))).to(t.float32)
y_male = t.from_numpy(np.array(m['two_year_recid']).astype('float32')).reshape(x_male.shape[0],1)
# fit() returns (accuracy, CV score) measured on these same training tensors; shown as a 2x1 tensor.
t.tensor(PR.fit(x_female,y_female,x_male,y_male)).reshape(2,1)

tensor([[0.9806],
        [0.1208]])