<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Celeb-attribute-dataset" data-toc-modified-id="Celeb-attribute-dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Celeb attribute dataset</a></span></li><li><span><a href="#Dutch-census-dataset" data-toc-modified-id="Dutch-census-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dutch census dataset</a></span></li><li><span><a href="#Adult-dataset" data-toc-modified-id="Adult-dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Adult dataset</a></span></li><li><span><a href="#Compas-dataset" data-toc-modified-id="Compas-dataset-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compas dataset</a></span></li><li><span><a href="#German-credit" data-toc-modified-id="German-credit-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>German credit</a></span></li><li><span><a href="#Communities-and-crime" data-toc-modified-id="Communities-and-crime-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Communities and crime</a></span></li></ul></div>

In [10]:
import numpy as np
from scipy.io import arff
import pandas as pd
import pickle

In [11]:
def get_X_y(df, y_cols, keep_sen=True):
    """Split a DataFrame into a feature matrix and a target matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    y_cols : list of str
        Columns extracted into ``y``. The first entry is always excluded from
        ``X``; the remaining entries are excluded only when ``keep_sen`` is
        falsy. Callers in this notebook pass the class label first, followed
        by the sensitive attributes.
    keep_sen : bool, default True
        Whether ``y_cols[1:]`` stay in ``X``. Any truthy/falsy value is
        accepted (including ``numpy.bool_``); the previous identity checks
        against ``True``/``False`` left ``X`` undefined for such values.

    Returns
    -------
    X : numpy.ndarray
        Columns of ``df`` that are not in ``y_cols`` (original order),
        followed, when ``keep_sen`` is truthy, by ``y_cols[1:]`` in reversed
        order.
    y : numpy.ndarray
        The ``y_cols`` columns, in the order given.
    index : dict
        Maps each name in ``y_cols`` to its column position in ``y``.
    """
    y_cols = list(y_cols)
    feature_cols = [col for col in df.columns if col not in y_cols]
    y = df[y_cols].to_numpy()

    if keep_sen:
        # Kept target columns trail the features in reversed y_cols order;
        # y_cols[:0:-1] is that reversed list without y_cols[0].
        x_cols = feature_cols + y_cols[:0:-1]
    else:
        x_cols = feature_cols
    X = df[x_cols].to_numpy()

    index = {col: i for i, col in enumerate(y_cols)}
    return X, y, index

def one_hot_encode(df, cat_cols):
    """Swap each column listed in ``cat_cols`` for its one-hot indicator columns.

    Columns are handled in the order given: the indicator columns (named with
    the source column as prefix) are appended on the right of the frame and
    the source column is then removed. The input frame is not modified; the
    encoded frame is returned.
    """
    encoded = df
    for name in cat_cols:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        encoded = encoded.join(indicators).drop(columns=[name])
    return encoded

### Adult dataset
Label: `class` (1 if income is above 50K, else 0)  
Sensitive attributes: `race`, `sex`  
`race`: 1 if White, else 0  
`sex`: 1 if Male, else 0

In [22]:
# Load the full Adult CSV; the trailing expression displays its (rows, columns) shape.
adult = pd.read_csv('data/adult/adult-full.csv')
adult.shape

(48842, 15)

In [23]:
# Row count for each raw value of the 'class' column (label balance).
adult['class'].value_counts()

<=50K    37155
>50K     11687
Name: class, dtype: int64

In [24]:
# Discard the columns that are not used downstream.
adult = adult.drop(columns=['education', 'fnlwgt', 'capitalgain', 'capitalloss'])
# Binarize the label: '<=50K' becomes 0, every other value becomes 1.
adult['class'] = (adult['class'] != '<=50K').astype('int64')

In [25]:
# Pool every native-country value that occurs at most 150 times into a single
# 'other' bucket.
counts = adult['native-country'].value_counts()
replace = counts.loc[counts <= 150].index
adult['native-country'] = adult['native-country'].replace(to_replace=replace, value='other')

In [26]:
# One-hot encode the nominal columns of the Adult frame.
cat_cols_adult = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'native-country',
]
adult = one_hot_encode(adult, cat_cols_adult)

# Binarize the sensitive attributes: 1 for 'Male' / 'White', 0 for any other value.
adult['sex'] = (adult['sex'] == 'Male').astype('int64')
adult['race'] = (adult['race'] == 'White').astype('int64')

In [27]:
# Label column first, then the sensitive attributes: with keep_sen left at its
# default, get_X_y removes only the first entry from X.
y_cols = ['class', 'sex', 'race']
adult_X, adult_y, adult_index = get_X_y(adult, y_cols)
# NOTE(review): this repeats the call above with identical arguments, so
# adult_X_a holds the same values as adult_X — confirm whether one of the two
# was meant to pass keep_sen=False.
# The result is indexed rather than unpacked into `_`/`__`, so IPython's
# output-history variables are not overwritten.
adult_X_a = get_X_y(adult, y_cols)[0]

In [28]:
# Write the Adult arrays to disk, then pickle the lookup that maps each
# target column name to its position in y.
np.save('data/adult/X.npy', adult_X)
np.save('data/adult/X_a.npy', adult_X_a)
np.save('data/adult/y.npy', adult_y)

with open('data/adult/index', 'wb') as index_file:
    pickle.dump(adult_index, index_file)


### Compas dataset

In [29]:
# Load the Compas CSV; the trailing expression displays its (rows, columns) shape.
compas = pd.read_csv('data/compas/propublica-recidivism_numerical-binsensitive.csv')
compas.shape

(6167, 403)

In [30]:
# Share of rows with two_year_recid == 1, computed from the frame itself
# instead of hand-copied counts so it cannot drift from the data.
int((compas['two_year_recid'] == 1).sum()) / len(compas)

0.45548889249229774

In [31]:
# Row count for each value of the two_year_recid column (label balance).
compas['two_year_recid'].value_counts()

0    3358
1    2809
Name: two_year_recid, dtype: int64

In [32]:
# Label column first, then the sensitive attributes: with keep_sen left at its
# default, get_X_y removes only the first entry from X.
y_cols = ['two_year_recid', 'race', 'sex']
compas_X, compas_y, compas_index = get_X_y(compas, y_cols)
# NOTE(review): this repeats the call above with identical arguments, so
# compas_X_a holds the same values as compas_X — confirm whether one of the
# two was meant to pass keep_sen=False.
# The result is indexed rather than unpacked into `_`/`__`, so IPython's
# output-history variables are not overwritten.
compas_X_a = get_X_y(compas, y_cols)[0]

In [33]:
# Write the Compas arrays to disk, then pickle the lookup that maps each
# target column name to its position in y.
np.save('data/compas/X.npy', compas_X)
np.save('data/compas/X_a.npy', compas_X_a)
np.save('data/compas/y.npy', compas_y)

with open('data/compas/index', 'wb') as index_file:
    pickle.dump(compas_index, index_file)
