In [1]:
import pandas as pd
import random,time,csv
import numpy as np
import math,copy,os
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sklearn.metrics as metrics


import sys
sys.path.append(os.path.abspath('..'))

from SMOTE import smote
from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy
from Generate_Samples import generate_samples

# Load Dataset

In [2]:
## Load dataset
from sklearn import preprocessing
dataset_orig = pd.read_csv('../data/adult.data.csv')

## Drop NULL values
dataset_orig = dataset_orig.dropna()

## Drop categorical features
dataset_orig = dataset_orig.drop(['workclass','fnlwgt','education','marital-status','occupation','relationship','native-country'],axis=1)

## Change symbolics to numerics
dataset_orig['sex'] = np.where(dataset_orig['sex'] == ' Male', 1, 0)
dataset_orig['race'] = np.where(dataset_orig['race'] != ' White', 0, 1)
dataset_orig['Probability'] = np.where(dataset_orig['Probability'] == ' <=50K', 0, 1)


## Discretize age
dataset_orig['age'] = np.where(dataset_orig['age'] >= 70, 70, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 60 ) & (dataset_orig['age'] < 70), 60, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 50 ) & (dataset_orig['age'] < 60), 50, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 40 ) & (dataset_orig['age'] < 50), 40, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 30 ) & (dataset_orig['age'] < 40), 30, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 20 ) & (dataset_orig['age'] < 30), 20, dataset_orig['age'])
dataset_orig['age'] = np.where((dataset_orig['age'] >= 10 ) & (dataset_orig['age'] < 10), 10, dataset_orig['age'])
dataset_orig['age'] = np.where(dataset_orig['age'] < 10, 0, dataset_orig['age'])

protected_attribute = 'sex'

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset_orig = pd.DataFrame(scaler.fit_transform(dataset_orig),columns = dataset_orig.columns)


dataset_orig_train, dataset_orig_test = train_test_split(dataset_orig, test_size=0.2, shuffle = True)
# dataset_orig

In [3]:
dataset_orig_train

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week,Probability
1989,0.245283,0.600000,0.0,0.0,0.00000,0.00000,0.346939,0.0
17806,0.245283,0.866667,1.0,0.0,0.00000,0.38315,0.551020,0.0
41753,0.245283,0.600000,1.0,0.0,0.00000,0.00000,0.397959,1.0
5235,0.056604,0.400000,1.0,0.0,0.00000,0.00000,0.397959,0.0
2572,0.245283,0.266667,1.0,1.0,0.00000,0.00000,0.397959,0.0
...,...,...,...,...,...,...,...,...
13060,0.811321,0.266667,0.0,1.0,0.00000,0.00000,0.397959,0.0
22391,0.245283,0.200000,1.0,1.0,0.03942,0.00000,0.846939,0.0
36702,0.056604,0.666667,1.0,0.0,0.00000,0.00000,0.397959,0.0
29151,0.056604,0.800000,1.0,0.0,0.00000,0.00000,0.397959,0.0


## note: so they are only using numerical attributes

# Check Original Score

In [4]:
X_train, y_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability'], dataset_orig_train['Probability']
X_test , y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test['Probability']

clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100) # LSR

print("recall :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'recall'))
print("far :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'far'))
print("precision :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'precision'))
print("accuracy :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'accuracy'))
print("F1 Score :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'F1'))
print("aod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'aod'))
print("eod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'eod'))

print("SPD:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'SPD'))
print("DI:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'DI'))

recall : 0.41
far : 0.06
precision : 0.7
accuracy : 0.81
F1 Score : 0.52
aod :sex 0.1
eod :sex 0.28
SPD: 0.18
DI: 0.87


## check flip rate for comparison before fairsmote operation

In [5]:
clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100) # LSR
clf.fit(X_train, y_train)

X_train_flipped = X_train.copy()
X_train_flipped['sex'] = 1 - X_train['sex']
X_train_flipped

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
1989,0.245283,0.600000,0.0,1.0,0.00000,0.00000,0.346939
17806,0.245283,0.866667,1.0,1.0,0.00000,0.38315,0.551020
41753,0.245283,0.600000,1.0,1.0,0.00000,0.00000,0.397959
5235,0.056604,0.400000,1.0,1.0,0.00000,0.00000,0.397959
2572,0.245283,0.266667,1.0,0.0,0.00000,0.00000,0.397959
...,...,...,...,...,...,...,...
13060,0.811321,0.266667,0.0,0.0,0.00000,0.00000,0.397959
22391,0.245283,0.200000,1.0,0.0,0.03942,0.00000,0.846939
36702,0.056604,0.666667,1.0,1.0,0.00000,0.00000,0.397959
29151,0.056604,0.800000,1.0,1.0,0.00000,0.00000,0.397959


In [6]:
X_train

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
1989,0.245283,0.600000,0.0,0.0,0.00000,0.00000,0.346939
17806,0.245283,0.866667,1.0,0.0,0.00000,0.38315,0.551020
41753,0.245283,0.600000,1.0,0.0,0.00000,0.00000,0.397959
5235,0.056604,0.400000,1.0,0.0,0.00000,0.00000,0.397959
2572,0.245283,0.266667,1.0,1.0,0.00000,0.00000,0.397959
...,...,...,...,...,...,...,...
13060,0.811321,0.266667,0.0,1.0,0.00000,0.00000,0.397959
22391,0.245283,0.200000,1.0,1.0,0.03942,0.00000,0.846939
36702,0.056604,0.666667,1.0,0.0,0.00000,0.00000,0.397959
29151,0.056604,0.800000,1.0,0.0,0.00000,0.00000,0.397959


In [7]:
y_org = clf.predict(X_train)
y_flp = clf.predict(X_train_flipped)

mismatches = (y_org != y_flp).sum()
mismatches

4201

In [8]:
X_test_flipped = X_test.copy()
X_test_flipped['sex'] = 1 - X_test['sex']

y_org = clf.predict(X_test)
y_flp = clf.predict(X_test_flipped)

mismatches = (y_org != y_flp).sum()
mismatches

1129

In [9]:
4413/39073

0.11294244107183989

# Check SMOTE Scores

In [10]:
def apply_smote(df):
    df.reset_index(drop=True,inplace=True)
    cols = df.columns
    smt = smote(df)
    df = smt.run()
    df.columns = cols
    return df

# dataset_orig_train, dataset_orig_test = train_test_split(dataset_orig, test_size=0.2, random_state=0,shuffle = True)

X_train, y_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability'], dataset_orig_train['Probability']
X_test , y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test['Probability']

train_df = X_train
train_df['Probability'] = y_train

train_df = apply_smote(train_df)

y_train = train_df.Probability
X_train = train_df.drop('Probability', axis = 1)

clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100) # LSR

print("recall :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'recall'))
print("far :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'far'))
print("precision :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'precision'))
print("accuracy :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'accuracy'))
print("F1 Score :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'F1'))
print("aod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'aod'))
print("eod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'eod'))

print("SPD:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'SPD'))
print("DI:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'DI'))

recall : 0.68
far : 0.19
precision : 0.54
accuracy : 0.78
F1 Score : 0.6
aod :sex 0.08
eod :sex 0.44
SPD: 0.38
DI: 0.87


# Find Class & Protected attribute Distribution

In [11]:
# first one is class value and second one is protected attribute value
zero_zero = len(dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 0)])
zero_one = len(dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 1)])
one_zero = len(dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 0)])
one_one = len(dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 1)])

print(zero_zero,zero_one,one_zero,one_one)

11546 18246 1397 7884


# Sort these four

In [12]:
maximum = max(zero_zero,zero_one,one_zero,one_one)
if maximum == zero_zero:
    print("zero_zero is maximum")
if maximum == zero_one:
    print("zero_one is maximum")
if maximum == one_zero:
    print("one_zero is maximum")
if maximum == one_one:
    print("one_one is maximum")

zero_one is maximum


In [13]:
zero_zero_to_be_incresed = maximum - zero_zero ## where both are 0
one_zero_to_be_incresed = maximum - one_zero ## where class is 1 attribute is 0
one_one_to_be_incresed = maximum - one_one ## where class is 1 attribute is 1

print(zero_zero_to_be_incresed,one_zero_to_be_incresed,one_one_to_be_incresed)

6700 16849 10362


In [14]:
df_zero_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 0)]
df_one_zero = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 0)]
df_one_one = dataset_orig_train[(dataset_orig_train['Probability'] == 1) & (dataset_orig_train[protected_attribute] == 1)]

df_zero_zero['race'] = df_zero_zero['race'].astype(str)
df_zero_zero['sex'] = df_zero_zero['sex'].astype(str)


df_one_zero['race'] = df_one_zero['race'].astype(str)
df_one_zero['sex'] = df_one_zero['sex'].astype(str)

df_one_one['race'] = df_one_one['race'].astype(str)
df_one_one['sex'] = df_one_one['sex'].astype(str)


df_zero_zero = generate_samples(zero_zero_to_be_incresed,df_zero_zero,'Adult')
df_one_zero = generate_samples(one_zero_to_be_incresed,df_one_zero,'Adult')
df_one_one = generate_samples(one_one_to_be_incresed,df_one_one,'Adult')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zero_zero['race'] = df_zero_zero['race'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zero_zero['sex'] = df_zero_zero['sex'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_one_zero['race'] = df_one_zero['race'].astype(str)
A value is trying to be set on a copy of 

# Append the dataframes

In [15]:
df = df_zero_zero.append(df_one_zero)
df = df.append(df_one_one)

df['race'] = df['race'].astype(float)
df['sex'] = df['sex'].astype(float)

df_zero_one = dataset_orig_train[(dataset_orig_train['Probability'] == 0) & (dataset_orig_train[protected_attribute] == 1)]
df = df.append(df_zero_one)

In [16]:
df.shape

(72984, 8)

In [17]:
df

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week,Probability
0,0.245283,0.600000,0.0,0.0,0.00000,0.00000,0.346939,0.0
1,0.245283,0.866667,1.0,0.0,0.00000,0.38315,0.551020,0.0
2,0.056604,0.400000,1.0,0.0,0.00000,0.00000,0.397959,0.0
3,0.056604,0.533333,0.0,0.0,0.00000,0.00000,0.397959,0.0
4,0.811321,0.600000,1.0,0.0,0.02964,0.00000,0.408163,0.0
...,...,...,...,...,...,...,...,...
13298,0.245283,0.600000,0.0,1.0,0.00000,0.00000,0.295918,0.0
3048,0.622642,0.600000,1.0,1.0,0.00000,0.00000,0.295918,0.0
13060,0.811321,0.266667,0.0,1.0,0.00000,0.00000,0.397959,0.0
22391,0.245283,0.200000,1.0,1.0,0.03942,0.00000,0.846939,0.0


# Check Score after oversampling

In [18]:
X_train, y_train = df.loc[:, df.columns != 'Probability'], df['Probability']
X_test , y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test['Probability']

clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100) # LSR


print("recall :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'recall'))
print("far :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'far'))
print("precision :", measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'precision'))
print("accuracy :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'accuracy'))
print("F1 Score :",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'F1'))
print("aod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'aod'))
print("eod :"+protected_attribute,measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'eod'))

print("SPD:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'SPD'))
print("DI:",measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test, protected_attribute, 'DI'))

recall : 0.7
far : 0.24
precision : 0.49
accuracy : 0.75
F1 Score : 0.58
aod :sex 0.02
eod :sex 0.05
SPD: 0.11
DI: 0.28


# Verification 

In [19]:
# first one is class value and second one is protected attribute value
zero_zero = len(df[(df['Probability'] == 0) & (df[protected_attribute] == 0)])
zero_one = len(df[(df['Probability'] == 0) & (df[protected_attribute] == 1)])
one_zero = len(df[(df['Probability'] == 1) & (df[protected_attribute] == 0)])
one_one = len(df[(df['Probability'] == 1) & (df[protected_attribute] == 1)])

print(zero_zero,zero_one,one_zero,one_one)

18246 18246 18246 18246


## note: check flip rate again

In [21]:
clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100) # LSR
clf.fit(X_train, y_train)

X_train_flipped = X_train.copy()
X_train_flipped['sex'] = 1 - X_train['sex']
X_train_flipped

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
0,0.245283,0.600000,0.0,1.0,0.00000,0.00000,0.346939
1,0.245283,0.866667,1.0,1.0,0.00000,0.38315,0.551020
2,0.056604,0.400000,1.0,1.0,0.00000,0.00000,0.397959
3,0.056604,0.533333,0.0,1.0,0.00000,0.00000,0.397959
4,0.811321,0.600000,1.0,1.0,0.02964,0.00000,0.408163
...,...,...,...,...,...,...,...
13298,0.245283,0.600000,0.0,0.0,0.00000,0.00000,0.295918
3048,0.622642,0.600000,1.0,0.0,0.00000,0.00000,0.295918
13060,0.811321,0.266667,0.0,0.0,0.00000,0.00000,0.397959
22391,0.245283,0.200000,1.0,0.0,0.03942,0.00000,0.846939


In [22]:
X_train

Unnamed: 0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
0,0.245283,0.600000,0.0,0.0,0.00000,0.00000,0.346939
1,0.245283,0.866667,1.0,0.0,0.00000,0.38315,0.551020
2,0.056604,0.400000,1.0,0.0,0.00000,0.00000,0.397959
3,0.056604,0.533333,0.0,0.0,0.00000,0.00000,0.397959
4,0.811321,0.600000,1.0,0.0,0.02964,0.00000,0.408163
...,...,...,...,...,...,...,...
13298,0.245283,0.600000,0.0,1.0,0.00000,0.00000,0.295918
3048,0.622642,0.600000,1.0,1.0,0.00000,0.00000,0.295918
13060,0.811321,0.266667,0.0,1.0,0.00000,0.00000,0.397959
22391,0.245283,0.200000,1.0,1.0,0.03942,0.00000,0.846939


In [23]:
y_org = clf.predict(X_train)
y_flp = clf.predict(X_train_flipped)

In [24]:
mismatches = (y_org != y_flp).sum()
mismatches

1639

In [27]:
1639/72800

0.022513736263736265

In [28]:
X_test_flipped = X_test.copy()
X_test_flipped['sex'] = 1 - X_test['sex']

y_org = clf.predict(X_test)
y_flp = clf.predict(X_test_flipped)

mismatches = (y_org != y_flp).sum()
mismatches

180