In [1]:
import pandas as pd
import random,time,csv
import numpy as np
import math,copy,os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics


import sys
sys.path.append(os.path.abspath('..'))

from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy
from Generate_Samples import generate_samples

# Load Dataset

In [2]:
## Load dataset
from sklearn import preprocessing
dataset_orig = pd.read_csv('../data/adult.data.csv')

## Drop rows that contain NULL values
dataset_orig = dataset_orig.dropna()

## Drop the columns that are not used as model features
dataset_orig = dataset_orig.drop(['workclass','fnlwgt','education','marital-status','occupation','relationship','native-country'],axis=1)

## Change symbolics to numerics
# The leading space inside ' Male', ' White' and ' <=50K' is part of the value being
# matched (presumably the CSV keeps the space that follows each comma -- confirm
# against the data file before "cleaning up" these literals).
dataset_orig['sex'] = np.where(dataset_orig['sex'] == ' Male', 1, 0)
dataset_orig['race'] = np.where(dataset_orig['race'] != ' White', 0, 1)
dataset_orig['Probability'] = np.where(dataset_orig['Probability'] == ' <=50K', 0, 1)


## Discretize age into decade buckets 0, 10, 20, ..., 70
# Floor every age to the start of its decade, then clamp: anything below 10
# (including negatives) becomes 0 and anything from 70 upwards becomes 70.
# Computing the bucket arithmetically treats every decade uniformly, so the
# 10-19 range is mapped to 10 like all the others.
dataset_orig['age'] = ((dataset_orig['age'] // 10) * 10).clip(lower=0, upper=70)

# Attribute used to form the subgroups that are balanced later in the notebook.
protected_attribute = 'race'

from sklearn.preprocessing import MinMaxScaler

# Rescale every remaining column with MinMaxScaler. Rebuilding the DataFrame from
# the returned array keeps the column names but replaces the index with a fresh
# default one.
# NOTE(review): the scaler is fit on the complete dataset, i.e. before the
# train/test split performed in the experiment loop -- confirm this is intended.
scaler = MinMaxScaler()
dataset_orig = pd.DataFrame(scaler.fit_transform(dataset_orig),columns = dataset_orig.columns)

# Score collectors for the model trained on the original training split:
# one initially empty list per recorded metric, appended to on every run.
default_score_dict = {
    metric_name: []
    for metric_name in (
        "accuracy",
        "F1",
        "aod race",
        "eod race",
        "SPD race",
        "DI  race",
        "aod sex",
        "eod sex",
        "SPD sex",
        "DI  sex",
    )
}

# Score collectors for the model trained on the rebalanced data: same keys in
# the same order, each with its own independent list.
improved_score_dict = {metric_name: [] for metric_name in default_score_dict}

# Fair-SMOTE

In [3]:
# Every value recorded per run, as (score-dict key, protected attribute, metric name).
# The order of this list is the order in which measure_final_score is called.
# 'accuracy' and 'F1' are evaluated with `protected_attribute`; the fairness
# metrics are evaluated once for "race" and once for "sex".
score_specs = [
    ("accuracy", protected_attribute, 'accuracy'),
    ("F1", protected_attribute, 'F1'),
    ("aod race", "race", 'aod'),
    ("eod race", "race", 'eod'),
    ("SPD race", "race", 'SPD'),
    ("DI  race", "race", 'DI'),
    ("aod sex", "sex", 'aod'),
    ("eod sex", "sex", 'eod'),
    ("SPD sex", "sex", 'SPD'),
    ("DI  sex", "sex", 'DI'),
]

# Repeat the whole experiment five times; each run appends one value per metric
# to default_score_dict (original training data) and improved_score_dict
# (rebalanced training data).
for i in range(5):

    # NOTE(review): no random_state is passed, so every execution of this cell
    # draws different splits and the recorded scores are not reproducible.
    dataset_orig_train, dataset_orig_test = train_test_split(dataset_orig, test_size=0.2, shuffle = True)

    X_train, y_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability'], dataset_orig_train['Probability']
    X_test , y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test['Probability']

    clf = LogisticRegression()

    # Baseline scores on the untouched training split.
    for score_key, score_attribute, score_metric in score_specs:
        default_score_dict[score_key].append(
            measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, score_attribute, score_metric))

    # Find Class & Protected attribute Distribution
    # first one is class value and second one is protected attribute value
    df_zero_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 0)]
    df_zero_one = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 1)]
    df_one_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 0)]
    df_one_one = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 1)]

    zero_zero = len(df_zero_zero)
    zero_one = len(df_zero_one)
    one_zero = len(df_one_zero)
    one_one = len(df_one_one)

    print(zero_zero,zero_one,one_zero,one_one)

    # Report which subgroup(s) hold the largest row count (ties print every match).
    maximum = max(zero_zero,zero_one,one_zero,one_one)
    for subgroup_name, subgroup_size in (("zero_zero", zero_zero),
                                         ("zero_one", zero_one),
                                         ("one_zero", one_zero),
                                         ("one_one", one_one)):
        if maximum == subgroup_size:
            print(subgroup_name + " is maximum")

    # Number of rows each subgroup is short of the largest one.
    # NOTE(review): the class-0 / attribute-1 subgroup is never oversampled, so
    # the result is only balanced when that subgroup is the largest one (which
    # is what the recorded output of this cell shows) -- confirm before reusing
    # this cell with another dataset or protected attribute.
    zero_zero_to_be_incresed = maximum - zero_zero ## where both are 0
    one_zero_to_be_incresed = maximum - one_zero ## where class is 1 attribute is 0
    one_one_to_be_incresed = maximum - one_one ## where class is 1 attribute is 1

    print(zero_zero_to_be_incresed,one_zero_to_be_incresed,one_one_to_be_incresed)

    # presumably each call returns the subgroup extended by the requested number
    # of synthetic rows -- TODO confirm in Generate_Samples
    df_zero_zero = generate_samples(zero_zero_to_be_incresed,df_zero_zero,'Adult')
    df_one_zero = generate_samples(one_zero_to_be_incresed,df_one_zero,'Adult')
    df_one_one = generate_samples(one_one_to_be_incresed,df_one_one,'Adult')

    # DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the
    # frames row-wise in the same order and keeps their indexes, as append did.
    df = pd.concat([df_zero_zero, df_one_zero, df_one_one])

    df['race'] = df['race'].astype(float)
    df['sex'] = df['sex'].astype(float)

    # The class-0 / attribute-1 rows are added unchanged, after the casts above.
    df = pd.concat([df, df_zero_one])

    # Check score after Fair-SMOTE

    X_train, y_train = df.loc[:, df.columns != 'Probability'], df['Probability']

    clf = LogisticRegression() # LSR

    for score_key, score_attribute, score_metric in score_specs:
        improved_score_dict[score_key].append(
            measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, score_attribute, score_metric))
    


4831 24902 862 8478
zero_one is maximum
20071 24040 16424
4803 24985 845 8440
zero_one is maximum
20182 24140 16545
4823 24905 870 8475
zero_one is maximum
20082 24035 16430
4759 24895 873 8546
zero_one is maximum
20136 24022 16349
4825 24899 870 8479
zero_one is maximum
20074 24029 16420


# Analyze Scores 

In [21]:
import statistics


def _print_median_report(heading, score_dict):
    """Print `heading`, a dashed rule, then the median of every recorded metric.

    `score_dict` maps each metric key to the list of per-run values; one line is
    printed per metric, as the label followed by the median of that list.
    """
    print(heading)
    print("---------------")
    # (printed label, score_dict key) -- labels are emitted verbatim, including
    # the missing trailing space on the two "DI" rows.
    report_rows = (
        ("accuracy ", 'accuracy'),
        ("F1 ", 'F1'),
        ("aod race ", 'aod race'),
        ("eod race ", 'eod race'),
        ("SPD race ", 'SPD race'),
        ("DI  race", 'DI  race'),
        ("aod sex ", 'aod sex'),
        ("eod sex ", 'eod sex'),
        ("SPD sex ", 'SPD sex'),
        ("DI  sex", 'DI  sex'),
    )
    for row_label, row_key in report_rows:
        print(row_label, statistics.median(score_dict[row_key]))


_print_median_report("Default Scores", default_score_dict)

print("---------------------------------------------------------------------")

_print_median_report("Fair-SMOTE Scores", improved_score_dict)

Default Scores
---------------
accuracy  0.82
F1  0.53
aod race  0.04
eod race  0.13
SPD race  0.09
DI  race 0.6
aod sex  0.1
eod sex  0.28
SPD sex  0.16
DI  sex 0.85
---------------------------------------------------------------------
Fair-SMOTE Scores
---------------
accuracy  0.76
F1  0.6
aod race  -0.02
eod race  -0.01
SPD race  0.08
DI  race 0.23
aod sex  0.05
eod sex  0.34
SPD sex  0.35
DI  sex 0.75
