In [1]:
import pandas as pd
import random,time,csv
import numpy as np
import math,copy,os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics


import sys
sys.path.append(os.path.abspath('..'))

from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy
from Generate_Samples import generate_samples

# Load Data

In [2]:
## Load dataset
dataset_orig = pd.read_csv('../data/compas-scores-two-years.csv')



## Drop categorical features
## Removed two duplicate coumns - 'decile_score','priors_count'
dataset_orig = dataset_orig.drop(['id','name','first','last','compas_screening_date',
                                  'dob','age','juv_fel_count','decile_score',
                                  'juv_misd_count','juv_other_count','days_b_screening_arrest',
                                  'c_jail_in','c_jail_out','c_case_number','c_offense_date','c_arrest_date',
                                  'c_days_from_compas','c_charge_desc','is_recid','r_case_number','r_charge_degree',
                                  'r_days_from_arrest','r_offense_date','r_charge_desc','r_jail_in','r_jail_out',
                                  'violent_recid','is_violent_recid','vr_case_number','vr_charge_degree','vr_offense_date',
                                  'vr_charge_desc','type_of_assessment','decile_score','score_text','screening_date',
                                  'v_type_of_assessment','v_decile_score','v_score_text','v_screening_date','in_custody',
                                  'out_custody','start','end','event'],axis=1)

## Drop NULL values
dataset_orig = dataset_orig.dropna()


## Change symbolics to numerics
dataset_orig['sex'] = np.where(dataset_orig['sex'] == 'Female', 1, 0)
dataset_orig['race'] = np.where(dataset_orig['race'] != 'Caucasian', 0, 1)
dataset_orig['priors_count'] = np.where((dataset_orig['priors_count'] >= 1 ) & (dataset_orig['priors_count'] <= 3), 3, dataset_orig['priors_count'])
dataset_orig['priors_count'] = np.where(dataset_orig['priors_count'] > 3, 4, dataset_orig['priors_count'])
dataset_orig['age_cat'] = np.where(dataset_orig['age_cat'] == 'Greater than 45',45,dataset_orig['age_cat'])
dataset_orig['age_cat'] = np.where(dataset_orig['age_cat'] == '25 - 45', 25, dataset_orig['age_cat'])
dataset_orig['age_cat'] = np.where(dataset_orig['age_cat'] == 'Less than 25', 0, dataset_orig['age_cat'])
dataset_orig['c_charge_degree'] = np.where(dataset_orig['c_charge_degree'] == 'F', 1, 0)


protected_attribute = 'sex'

## Rename class column
dataset_orig.rename(index=str, columns={"two_year_recid": "Probability"}, inplace=True)

# Here did not rec means 0 is the favorable lable
dataset_orig['Probability'] = np.where(dataset_orig['Probability'] == 0, 1, 0)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset_orig = pd.DataFrame(scaler.fit_transform(dataset_orig),columns = dataset_orig.columns)

default_score_dict = {"accuracy":[],
                       "F1":[],
                      "aod race":[],
                      "eod race":[],
                      "SPD race":[],
                      "DI  race":[],
                     "aod sex":[],
                      "eod sex":[],
                      "SPD sex":[],
                      "DI  sex":[]}

improved_score_dict = {"accuracy":[],
                       "F1":[],
                      "aod race":[],
                      "eod race":[],
                      "SPD race":[],
                      "DI  race":[],
                     "aod sex":[],
                      "eod sex":[],
                      "SPD sex":[],
                      "DI  sex":[]}

# Fair-SMOTE

In [3]:
for i in range(5):
    
    dataset_orig_train, dataset_orig_test = train_test_split(dataset_orig, test_size=0.2, shuffle = True)

    X_train, y_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability'], dataset_orig_train['Probability']
    X_test , y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test['Probability']

    clf = LogisticRegression()

    default_score_dict["accuracy"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'accuracy'))
    default_score_dict["F1"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'F1'))
    default_score_dict["aod race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'aod'))
    default_score_dict["eod race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'eod'))
    default_score_dict["SPD race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'SPD'))
    default_score_dict["DI  race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'DI'))

    default_score_dict["aod sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'aod'))
    default_score_dict["eod sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'eod'))
    default_score_dict["SPD sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'SPD'))
    default_score_dict["DI  sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'DI'))
    
    # Find Class & Protected attribute Distribution
    # first one is class value and second one is protected attribute value
    zero_zero = len(dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 0)])
    zero_one = len(dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 1)])
    one_zero = len(dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 0)])
    one_one = len(dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 1)])

    print(zero_zero,zero_one,one_zero,one_one)
    
    # Sort these four
    
    maximum = max(zero_zero,zero_one,one_zero,one_one)
    if maximum == zero_zero:
        print("zero_zero is maximum")
    if maximum == zero_one:
        print("zero_one is maximum")
    if maximum == one_zero:
        print("one_zero is maximum")
    if maximum == one_one:
        print("one_one is maximum")
        
    zero_one_to_be_incresed = maximum - zero_one ## where class is 0 attribute is 1
    one_zero_to_be_incresed = maximum - one_zero ## where class is 1 attribute is 0
    one_one_to_be_incresed = maximum - one_one ## where class is 1 attribute is 1

    print(zero_one_to_be_incresed,one_zero_to_be_incresed,one_one_to_be_incresed)

    df_zero_one = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 1)]
    df_one_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 0)]
    df_one_one = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 1)]


    df_zero_one = generate_samples(zero_one_to_be_incresed,df_zero_one,'Compas')
    df_one_zero = generate_samples(one_zero_to_be_incresed,df_one_zero,'Compas')
    df_one_one = generate_samples(one_one_to_be_incresed,df_one_one,'Compas')   
    
    df = df_zero_one.append(df_one_zero)
    df = df.append(df_one_one)

    df['race'] = df['race'].astype(float)
    df['sex'] = df['sex'].astype(float)

    df_zero_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 0)]
    df = df.append(df_zero_zero)
    
    # Check score after Fair-SMOTE
    
    X_train, y_train = df.loc[:, df.columns != 'Probability'], df['Probability']    

    clf = LogisticRegression() # LSR
    
    improved_score_dict["accuracy"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'accuracy'))
    improved_score_dict["F1"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'F1'))
    improved_score_dict["aod race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'aod'))
    improved_score_dict["eod race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'eod'))
    improved_score_dict["SPD race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'SPD'))
    improved_score_dict["DI  race"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "race", 'DI'))

    improved_score_dict["aod sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'aod'))
    improved_score_dict["eod sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'eod'))
    improved_score_dict["SPD sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'SPD'))
    improved_score_dict["DI  sex"].append(measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, "sex", 'DI'))
   

2212 405 2411 743
one_zero is maximum
2006 0 1668
2229 386 2442 714
one_zero is maximum
2056 0 1728
2203 407 2440 721
one_zero is maximum
2033 0 1719
2187 399 2464 721
one_zero is maximum
2065 0 1743
2195 403 2457 716
one_zero is maximum
2054 0 1741


# Analyze Scores 

In [4]:
import statistics

print("Default Scores")
print("---------------")

print("accuracy ",statistics.median(default_score_dict['accuracy']))
print("F1 ",statistics.median(default_score_dict['F1']))
print("aod sex ",statistics.median(default_score_dict['aod sex']))
print("eod sex ",statistics.median(default_score_dict['eod sex']))
print("SPD sex ",statistics.median(default_score_dict['SPD sex']))
print("DI  sex",statistics.median(default_score_dict['DI  sex']))
print("aod race ",statistics.median(default_score_dict['aod race']))
print("eod race ",statistics.median(default_score_dict['eod race']))
print("SPD race ",statistics.median(default_score_dict['SPD race']))
print("DI  race",statistics.median(default_score_dict['DI  race']))


print("---------------------------------------------------------------------")


print("Fair-SMOTE Scores")
print("---------------")


print("accuracy ",statistics.median(improved_score_dict['accuracy']))
print("F1 ",statistics.median(improved_score_dict['F1']))
print("aod sex ",statistics.median(improved_score_dict['aod sex']))
print("eod sex ",statistics.median(improved_score_dict['eod sex']))
print("SPD sex ",statistics.median(improved_score_dict['SPD sex']))
print("DI  sex",statistics.median(improved_score_dict['DI  sex']))
print("aod race ",statistics.median(improved_score_dict['aod race']))
print("eod race ",statistics.median(improved_score_dict['eod race']))
print("SPD race ",statistics.median(improved_score_dict['SPD race']))
print("DI  race",statistics.median(improved_score_dict['DI  race']))

Default Scores
---------------
accuracy  0.63
F1  0.67
aod sex  -0.02
eod sex  0.23
SPD sex  0.27
DI  sex 0.34
aod race  -0.03
eod race  0.13
SPD race  0.17
DI  race 0.26
---------------------------------------------------------------------
Fair-SMOTE Scores
---------------
accuracy  0.62
F1  0.64
aod sex  -0.01
eod sex  -0.09
SPD sex  0.06
DI  sex -0.13
aod race  0.0
eod race  0.11
SPD race  0.15
DI  race 0.25
