In [1]:
import pandas as pd
import numpy as np
import copy

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

random_state = 1

In [2]:
df = pd.read_csv('../data/cleaned.csv', index_col=0)
df.head()

Unnamed: 0,Duration,Credit Amount,Installment rate,Residence,Number of credits,Maintenance,Target,Account Status_<0,Account Status_<200,Account Status_>200,...,Housing_own,Housing_rent,Job_management/ highly qualified employee,Job_skilled employee / official,Job_unemployed/ unskilled - non-resident,Job_unskilled - resident,Telephone_none,Telephone_yes,Foreign_no,Foreign_yes
0,6,1169,4,4,2,1,1,1,0,0,...,1,0,0,1,0,0,0,1,0,1
1,48,5951,2,2,1,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
2,12,2096,2,3,1,2,1,0,0,0,...,1,0,0,0,0,1,1,0,0,1
3,42,7882,2,4,1,2,1,1,0,0,...,0,0,0,1,0,0,1,0,0,1
4,24,4870,3,4,2,2,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1


In [3]:
sensitive_indexes = []
sensitive_columns = []

for index, column in enumerate(df.loc[:, df.columns != 'Target'].columns):
    if column.startswith("Sex") or column.startswith("Age") or column.startswith("Foreign"):
        sensitive_indexes.append(index)
        sensitive_columns.append(column)
        
sensitive_indexes, sensitive_columns


([35, 36, 37, 38, 46, 47, 48, 49, 62, 63],
 ['Sex_female divorced/separated/married',
  'Sex_male divorced/separated',
  'Sex_male married/widowed',
  'Sex_male single',
  'Age_Middle1',
  'Age_Middle2',
  'Age_Older',
  'Age_Younger',
  'Foreign_no',
  'Foreign_yes'])

In [4]:
Xs = df.loc[:, df.columns != 'Target']
ys = df['Target']

combined = ["" for _ in range(Xs.shape[0])]

for index in sensitive_indexes:
    for j in range(Xs.shape[0]):
        combined[j] = combined[j] + "_" + str(Xs.iloc[j, index].item())


In [5]:
classes = dict()

for index, entry in enumerate(combined):
    if entry not in classes:
        classes[entry] = [index]
    else:
        indexes = classes[entry]
        indexes.append(index)
        classes[entry] = indexes
        
positives = dict()

for (key, value) in classes.items():
    positives[key] = 0
    for index in value:
        positives[key] += ys[index]
    
positives

{'_0_0_0_1_0_0_1_0_0_1': 116,
 '_1_0_0_0_0_0_0_1_0_1': 70,
 '_0_0_0_1_0_1_0_0_0_1': 125,
 '_0_1_0_0_0_0_1_0_0_1': 13,
 '_0_0_1_0_1_0_0_0_0_1': 21,
 '_1_0_0_0_1_0_0_0_0_1': 41,
 '_0_0_0_1_0_0_0_1_0_1': 45,
 '_1_0_0_0_0_0_1_0_0_1': 48,
 '_0_0_0_1_1_0_0_0_0_1': 93,
 '_0_0_0_1_0_0_1_0_1_0': 6,
 '_0_0_1_0_0_0_0_1_1_0': 3,
 '_0_0_1_0_0_1_0_0_0_1': 15,
 '_1_0_0_0_0_1_0_0_0_1': 37,
 '_0_1_0_0_0_0_0_1_0_1': 4,
 '_0_0_1_0_0_0_0_1_0_1': 20,
 '_1_0_0_0_1_0_0_0_1_0': 2,
 '_0_1_0_0_1_0_0_0_0_1': 5,
 '_0_1_0_0_0_1_0_0_0_1': 8,
 '_0_0_1_0_1_0_0_0_1_0': 2,
 '_0_0_0_1_0_1_0_0_1_0': 9,
 '_0_0_1_0_0_0_1_0_0_1': 6,
 '_0_0_0_1_1_0_0_0_1_0': 5,
 '_0_1_0_0_0_0_1_0_1_0': 0,
 '_1_0_0_0_0_1_0_0_1_0': 2,
 '_1_0_0_0_0_0_0_1_1_0': 1,
 '_0_0_1_0_0_1_0_0_1_0': 0,
 '_0_0_0_1_0_0_0_1_1_0': 3}

In [6]:
target = 0.7

target_positives = dict()

for (key, value) in classes.items():
    target_positives[key] = round(target*len(value))
    
target_positives

{'_0_0_0_1_0_0_1_0_0_1': 113,
 '_1_0_0_0_0_0_0_1_0_1': 87,
 '_0_0_0_1_0_1_0_0_0_1': 115,
 '_0_1_0_0_0_0_1_0_0_1': 13,
 '_0_0_1_0_1_0_0_0_0_1': 18,
 '_1_0_0_0_1_0_0_0_0_1': 46,
 '_0_0_0_1_0_0_0_1_0_1': 51,
 '_1_0_0_0_0_0_1_0_0_1': 43,
 '_0_0_0_1_1_0_0_0_0_1': 89,
 '_0_0_0_1_0_0_1_0_1_0': 4,
 '_0_0_1_0_0_0_0_1_1_0': 2,
 '_0_0_1_0_0_1_0_0_0_1': 13,
 '_1_0_0_0_0_1_0_0_0_1': 36,
 '_0_1_0_0_0_0_0_1_0_1': 3,
 '_0_0_1_0_0_0_0_1_0_1': 22,
 '_1_0_0_0_1_0_0_0_1_0': 3,
 '_0_1_0_0_1_0_0_0_0_1': 9,
 '_0_1_0_0_0_1_0_0_0_1': 10,
 '_0_0_1_0_1_0_0_0_1_0': 1,
 '_0_0_0_1_0_1_0_0_1_0': 6,
 '_0_0_1_0_0_0_1_0_0_1': 7,
 '_0_0_0_1_1_0_0_0_1_0': 4,
 '_0_1_0_0_0_0_1_0_1_0': 1,
 '_1_0_0_0_0_1_0_0_1_0': 1,
 '_1_0_0_0_0_0_0_1_1_0': 1,
 '_0_0_1_0_0_1_0_0_1_0': 1,
 '_0_0_0_1_0_0_0_1_1_0': 2}

In [7]:
diff_positives = dict()

for (key, value) in classes.items():
    diff_positives[key] = target_positives[key] - positives[key]
    
diff_positives

{'_0_0_0_1_0_0_1_0_0_1': -3,
 '_1_0_0_0_0_0_0_1_0_1': 17,
 '_0_0_0_1_0_1_0_0_0_1': -10,
 '_0_1_0_0_0_0_1_0_0_1': 0,
 '_0_0_1_0_1_0_0_0_0_1': -3,
 '_1_0_0_0_1_0_0_0_0_1': 5,
 '_0_0_0_1_0_0_0_1_0_1': 6,
 '_1_0_0_0_0_0_1_0_0_1': -5,
 '_0_0_0_1_1_0_0_0_0_1': -4,
 '_0_0_0_1_0_0_1_0_1_0': -2,
 '_0_0_1_0_0_0_0_1_1_0': -1,
 '_0_0_1_0_0_1_0_0_0_1': -2,
 '_1_0_0_0_0_1_0_0_0_1': -1,
 '_0_1_0_0_0_0_0_1_0_1': -1,
 '_0_0_1_0_0_0_0_1_0_1': 2,
 '_1_0_0_0_1_0_0_0_1_0': 1,
 '_0_1_0_0_1_0_0_0_0_1': 4,
 '_0_1_0_0_0_1_0_0_0_1': 2,
 '_0_0_1_0_1_0_0_0_1_0': -1,
 '_0_0_0_1_0_1_0_0_1_0': -3,
 '_0_0_1_0_0_0_1_0_0_1': 1,
 '_0_0_0_1_1_0_0_0_1_0': -1,
 '_0_1_0_0_0_0_1_0_1_0': 1,
 '_1_0_0_0_0_1_0_0_1_0': -1,
 '_1_0_0_0_0_0_0_1_1_0': 0,
 '_0_0_1_0_0_1_0_0_1_0': 1,
 '_0_0_0_1_0_0_0_1_1_0': -1}

In [8]:
for (key, value) in classes.items():
    temp_Xs = Xs.iloc[value, :]
    temp_ys = ys.iloc[value]
    
    try:
        lr = LogisticRegression(max_iter=1000, random_state=random_state)
        lr.fit(temp_Xs, temp_ys)
    except:
        print("Too few data in this class")
        continue
        
    positive_probs = lr.predict_proba(temp_Xs)[:, 1]
    
    positive_indexes_with_probs = dict()
    negative_indexes_with_probs = dict()
    
    for (index, positive_prob) in enumerate(positive_probs):
        true_y = ys.iloc[value[index]]
        if true_y == 1:
            positive_indexes_with_probs[value[index]] = positive_prob
        else:
            negative_indexes_with_probs[value[index]] = positive_prob
    
    diff = diff_positives[key]
    if diff > 0:
        # need more positives
        # remove |diff| negatives
        sorted_n = dict(sorted(negative_indexes_with_probs.items(), key=lambda item: item[1], reverse=True))
        replace_indexes = list(sorted_n.keys())[:diff]
        
        # add |diff| positives
        sorted_p = dict(sorted(positive_indexes_with_probs.items(), key=lambda item: item[1]))
        border_index = list(sorted_p.keys())[0]
        
        for i in replace_indexes:
            Xs.iloc[i, :] = Xs.iloc[border_index, :]
            ys.iloc[i] = ys.iloc[border_index]
    else:
        diff = -diff
        # need more negatives
        # remove |diff| positives
        sorted_p = dict(sorted(positive_indexes_with_probs.items(), key=lambda item: item[1]))
        replace_indexes = list(sorted_p.keys())[:diff]
        
        # add |diff| negatives
        sorted_n = dict(sorted(negative_indexes_with_probs.items(), key=lambda item: item[1], reverse=True))
        border_index = list(sorted_n.keys())[0]
        
        for i in replace_indexes:
            Xs.iloc[i, :] = Xs.iloc[border_index, :]
            ys.iloc[i] = ys.iloc[border_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A va

Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class
Too few data in this class


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xs.iloc[i, :] = Xs.iloc[border_index, :]


In [9]:
classes = dict()

for index, entry in enumerate(combined):
    if entry not in classes:
        classes[entry] = [index]
    else:
        indexes = classes[entry]
        indexes.append(index)
        classes[entry] = indexes
        
positives = dict()

for (key, value) in classes.items():
    positives[key] = 0
    for index in value:
        positives[key] += ys[index]
    
positives

{'_0_0_0_1_0_0_1_0_0_1': 113,
 '_1_0_0_0_0_0_0_1_0_1': 87,
 '_0_0_0_1_0_1_0_0_0_1': 115,
 '_0_1_0_0_0_0_1_0_0_1': 13,
 '_0_0_1_0_1_0_0_0_0_1': 18,
 '_1_0_0_0_1_0_0_0_0_1': 46,
 '_0_0_0_1_0_0_0_1_0_1': 51,
 '_1_0_0_0_0_0_1_0_0_1': 43,
 '_0_0_0_1_1_0_0_0_0_1': 89,
 '_0_0_0_1_0_0_1_0_1_0': 6,
 '_0_0_1_0_0_0_0_1_1_0': 3,
 '_0_0_1_0_0_1_0_0_0_1': 13,
 '_1_0_0_0_0_1_0_0_0_1': 36,
 '_0_1_0_0_0_0_0_1_0_1': 4,
 '_0_0_1_0_0_0_0_1_0_1': 22,
 '_1_0_0_0_1_0_0_0_1_0': 3,
 '_0_1_0_0_1_0_0_0_0_1': 9,
 '_0_1_0_0_0_1_0_0_0_1': 10,
 '_0_0_1_0_1_0_0_0_1_0': 2,
 '_0_0_0_1_0_1_0_0_1_0': 9,
 '_0_0_1_0_0_0_1_0_0_1': 7,
 '_0_0_0_1_1_0_0_0_1_0': 5,
 '_0_1_0_0_0_0_1_0_1_0': 0,
 '_1_0_0_0_0_1_0_0_1_0': 2,
 '_1_0_0_0_0_0_0_1_1_0': 1,
 '_0_0_1_0_0_1_0_0_1_0': 0,
 '_0_0_0_1_0_0_0_1_1_0': 3}

Here I use 5 fold CV.

In [10]:
Xs = df.loc[:, df.columns != 'Target']
ys = df['Target']

all_preds = np.zeros(df.shape[0])

kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
for train_index, test_index in kf.split(df):
    X_train, y_train = Xs.iloc[train_index], ys.iloc[train_index]
    X_test, y_test = Xs.iloc[test_index], ys.iloc[test_index]
    
    lr = LogisticRegression(max_iter=1000, random_state=random_state)
    lr.fit(X_train, y_train)
    
    all_preds[test_index] = lr.predict(X_test)

accuracy_score(ys, all_preds)

0.775

In [11]:
orig_df = pd.read_csv('../data/orig.csv', index_col=0)
orig_df.rename(columns={"Target": "label_value"}, inplace=True)
orig_df['score'] = all_preds
orig_df.head()

Unnamed: 0,Account Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment,Installment rate,Sex,Other debtors,...,Age,Other installments,Housing,Number of credits,Job,Maintenance,Telephone,Foreign,label_value,score
0,<0,6,critical account,radio/television,1169,no,>= 7 years,4,male single,none,...,Older,none,own,2,skilled employee / official,1,yes,yes,1,1.0
1,<200,48,existing credits paid back duly till now,radio/television,5951,<100,1 <= < 4 years,2,female divorced/separated/married,none,...,Younger,none,own,1,skilled employee / official,1,none,yes,0,1.0
2,no,12,critical account,education,2096,<100,4 <= < 7 years,2,male single,none,...,Older,none,own,1,unskilled - resident,2,none,yes,1,1.0
3,<0,42,existing credits paid back duly till now,furniture/equipment,7882,<100,4 <= < 7 years,2,male single,guarantor,...,Older,none,for free,1,skilled employee / official,2,none,yes,1,0.0
4,<0,24,delay in paying off,car (new),4870,<100,1 <= < 4 years,3,male single,none,...,Older,none,for free,2,skilled employee / official,2,none,yes,0,0.0


In [12]:
orig_df.to_csv('../data/fair_processed.csv')

In [13]:
# orig_df['label_value'].value_counts()

In [14]:
# Xs = df.loc[:, df.columns != 'Target']
# ys = df['Target']

# remove_special = lambda string: string.replace("<", "")

# Xs.columns = map(remove_special, Xs.columns)

# clf = XGBClassifier()
# kfold = KFold(n_splits=10)
# results = cross_val_score(clf, Xs, ys, cv=kfold)
# print(f"Accuracy: {results.mean()} ({results.std()})")