In [1]:
import os
import tkinter
import numpy as np
import random as r

import pandas as pd  # For reading CSV files
import time

from pandas.core.frame import DataFrame
from plotgen import Window
from LRModel import Model

# Path to all CSVs
DATA_PATH = "csv_data"


def read_records(data_path=None):
    """Read every CSV file in a directory into one concatenated DataFrame.

    Args:
        data_path: Directory to scan. Defaults to the module-level
            ``DATA_PATH`` when omitted, so existing ``read_records()``
            callers behave as before.

    Returns:
        A DataFrame holding the rows of all CSV files, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
    """
    if data_path is None:
        data_path = DATA_PATH

    # os.listdir returns entries in arbitrary order and includes non-CSV
    # files and sub-directories; keep only CSV files and sort them so the
    # row order of the result is reproducible across runs and platforms.
    csv_names = sorted(
        name for name in os.listdir(data_path)
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(data_path, name))
    )
    if not csv_names:
        raise ValueError("No CSV files found in '{}'".format(data_path))

    frames = []
    length = 0
    for record in csv_names:
        print("Reading " + record)
        data = pd.read_csv(os.path.join(data_path, record))
        length += data.shape[0]
        frames.append(data)

    # Create the dataframe and return it.
    all_records = pd.concat(frames, axis=0, ignore_index=True)
    print("Total number of rows: {}".format(length))
    return all_records


def test_data(num_records):
    """Generate a DataFrame of ``num_records`` synthetic records.

    For each record a race, an age in [18, 80] and a sex are drawn at
    random (in that order), and the remaining fields are filled in by
    ``rand_record``.

    Args:
        num_records: Number of rows to generate.

    Returns:
        A DataFrame whose columns are the field names declared in
        ``dtypes`` below. With ``num_records`` of 0 an empty, typed frame
        with those columns is returned.
    """
    races = ['A', 'B', 'C', 'D']
    sexs = ['F','M']

    dtypes = np.dtype(
        [
            ("sex", str),
            ("age", int),
            ("race", str),
            ("juv_fel_count", int),
            ("juv_misd_count", int),
            ("juv_other_count", int),
            ("priors_count", int),
            ("days_b_screening_arrest", int),
            ("c_days_from_compas", int),
            ("c_charge_degree", str),
            ("is_recid", int),
            ("r_charge_degree", str),
            ("decile_score", int)
        ]
    )
    columns = list(dtypes.names)

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for _ in range(num_records):
        race = r.choice(races)
        age = r.randint(18,80)
        sex = r.choice(sexs)
        records.append(rand_record(sex,age,race,columns))

    if not records:
        # Nothing generated: return the empty, typed template frame.
        return DataFrame(np.empty(0,dtype=dtypes))

    return DataFrame(records, columns=columns)

def rand_record(sex,age,race,columns):
    """Build one synthetic record as a ``{column name: value}`` dict.

    The first three values are the given ``sex``, ``age`` and ``race``.
    They are followed by eight random integers, an ``r_charge_degree``
    that is 'F' or 'M' only when ``is_recid`` came out as 1 (otherwise
    ''), and a ``decile_score`` that depends solely on ``race``.

    Args:
        sex, age, race: Values copied into the first three fields.
        columns: Indexable sequence of column names; entry ``i`` names the
            i-th generated value, so it needs at least 13 entries.

    Returns:
        dict mapping ``columns[i]`` to the i-th generated value.
    """
    # Inclusive upper bounds of the random integer fields, in field order:
    # juv_fel_count, juv_misd_count, juv_other_count, priors_count,
    # days_b_screening_arrest, c_days_from_compas, c_charge_degree, is_recid.
    # NOTE(review): c_charge_degree is drawn as an integer here although
    # test_data declares that column as str -- confirm which is intended.
    upper_bounds = (3, 3, 3, 5, 10, 4, 3, 1)

    values = [sex, age, race]
    for bound in upper_bounds:
        values.append(r.randint(0, bound))

    # is_recid is the value drawn last; a re-offence charge degree is only
    # generated when it is 1.
    is_recid = values[-1]
    if is_recid == 1:
        values.append(r.choice(['F','M']))
    else:
        values.append('')

    # decile_score: base of 1 plus a fixed race-dependent offset.
    race_offsets = {'B': 2, 'C': 5, 'D': 8}
    values.append(1 + race_offsets.get(race, 0))

    rec_dict = {}
    for position, value in enumerate(values):
        rec_dict[columns[position]] = value

    return rec_dict


if __name__ == "__main__":
    # Wall-clock timestamp taken before any work, for the timing report below.
    t_begin = time.time()

    # Disabled alternative 1 -- interactive Tk plot window over the records:
    #   root = tkinter.Tk()
    #   window = Window(root, read_records())
    #   tkinter.mainloop()
    #
    # Disabled alternative 2 -- model built from a random synthetic frame:
    #   rand_model = Model(test_data(1000))
    #   rand_model.convert_data()

    print('----------------')

    # Build the model from the CSV records. compas_model, X_test and y_test
    # are used by later cells, so these names must stay as they are.
    compas_records = read_records()
    compas_model = Model(compas_records)
    # A three-value variant that also returned `history` is disabled:
    #   X_test, y_test, history = compas_model.convert_data()
    X_test, y_test = compas_model.convert_data()

    elapsed = time.time() - t_begin
    print("\n\nScript execution time: {} seconds".format(elapsed))


----------------
Reading compas-scores-two-years.csv
Total number of rows: 7214
Original number of columns: 13
COUNTS:
2400
2400
4800
Spliting dataset 80/20...
(3103, 24) (776, 24)
Training...
Model train R2 score: 0.45084205276556577
Model test R2 score: 0.40326472801174584
MSE Train: 4.51
MSE Test: 4.87



Script execution time: 2.8941807746887207 seconds


In [5]:
# Control for an attribute (currently hardcoded to race) and compare scores.
#
# For every test sample, compas_model.score_attribute() is called with the
# sample row and its COMPAS score, and the per-race results are compared
# against that COMPAS score.

# A sample counts as "accurately predicted" when at least one per-race score
# is closer to the COMPAS score than this threshold.
# NOTE(review): the printed scores look like they sit on a roughly 1-10 scale,
# in which case a threshold of 10 accepts every sample (count == len(X_test)).
# Confirm the intended value.
ACC_DIFF_THRESHOLD = 10

count = 0
recid_avg_race = [[],[],[],[]]
for row_idx in range(len(X_test)):
    compas_score = y_test.iat[row_idx, 0]
    sample = X_test.iloc[[row_idx]]
    race_scores = compas_model.score_attribute(sample, compas_score) # input shape is a single df row and the recidivism score

    # Difference of each per-race score from the COMPAS score. The loops use
    # their own indices; the original reused the outer loop's `i`.
    diffs = [race_scores[race_idx] - compas_score for race_idx in range(len(race_scores))]
    acc_pred = any(abs(diff) < ACC_DIFF_THRESHOLD for diff in diffs)

    # If the sample passed the filter, add each race's difference to its tally.
    if acc_pred:
        count += 1
        for race_idx, diff in enumerate(diffs):
            recid_avg_race[race_idx].append(diff)

print(len(X_test))
print(count)
# NOTE(review): the heading says "baseline Caucasian", but the differences
# above are taken against the COMPAS score -- confirm the wording.
print("baseline Caucasian learned recidivism \nscore disparities by race:")
print("African American: " + str(round(np.mean(recid_avg_race[0]), 2)))
print("Hispanic: " + str(round(np.mean(recid_avg_race[1]), 2)))
#print("White: " + str(np.mean(recid_avg_race[2])))
print("Other: " + str(round(np.mean(recid_avg_race[3]), 2)))

    

compas score: 5.0
African-American score: 4.91
Hispanic score: 4.9
Caucasian score: 4.93
Other score: 4.88
---------------------------
compas score: 9.0
African-American score: 3.07
Hispanic score: 3.06
Caucasian score: 3.09
Other score: 3.04
---------------------------
compas score: 9.0
African-American score: 5.43
Hispanic score: 5.42
Caucasian score: 5.44
Other score: 5.4
---------------------------
compas score: 1.0
African-American score: 2.31
Hispanic score: 2.3
Caucasian score: 2.32
Other score: 2.28
---------------------------
compas score: 9.0
African-American score: 6.55
Hispanic score: 6.55
Caucasian score: 6.57
Other score: 6.53
---------------------------
compas score: 6.0
African-American score: 3.63
Hispanic score: 3.62
Caucasian score: 3.65
Other score: 3.6
---------------------------
compas score: 2.0
African-American score: 3.23
Hispanic score: 3.22
Caucasian score: 3.25
Other score: 3.2
---------------------------
compas score: 5.0
African-American score: 7.05
Hispan

In [7]:
# Report the sample count, the number of samples that passed the accuracy
# filter, and the mean score difference per race. These values are computed
# in the earlier cell that fills `count` and `recid_avg_race`.
print(len(X_test))
print(count)
print("learned recidivism \nscore disparities by race trained with\n on AA_count = CC_count:")

# (label, position in recid_avg_race). Position 2 ("White") is skipped,
# as in the disabled line: print("White: " + str(np.mean(recid_avg_race[2])))
reported_races = (("African American: ", 0), ("Hispanic: ", 1), ("Other: ", 3))
for label, race_idx in reported_races:
    print(label + str(round(np.mean(recid_avg_race[race_idx]), 2)))

776
776
baseline Caucasian learned recidivism 
score disparities by race trained with
 on AA_count = CC_count:
African American: 0.06
Hispanic: 0.06
Other: 0.04


In [4]:
# Disabled training-curve plots. They read a `history` object, which the
# active `X_test, y_test = compas_model.convert_data()` call does not
# provide; only the commented-out three-value variant of that call does.
# NOTE(review): `history.history[...]` looks like a Keras-style History
# object -- confirm before re-enabling, or delete this cell.

# import matplotlib.pyplot as plt

# plt.plot(history.history['loss'])
# plt.title('loss')
# plt.show()

# plt.plot(history.history['mse'])
# plt.title('mse')
# plt.show()

# plt.plot(history.history['mae'])
# plt.title('mae')
# plt.show()

# plt.plot(history.history['val_loss'])
# plt.title('val_loss')
# plt.show()

# plt.plot(history.history['val_mse'])
# plt.title('val_mse')
# plt.show()

# plt.plot(history.history['val_mae'])
# plt.title('val_mae')
# plt.show()
