In [1]:
#Importing Libraries
#Data Handling
import pandas as pd
import numpy as np

import sys
sys.path.append('../../')

#Pipelines
import source.pipes as op

#Sampling
from sklearn.model_selection import train_test_split

#Storing trains
import pickle
%load_ext autoreload
%autoreload 2

In [2]:
# Load the 'two_years_recid' sheet of the COMPAS workbook; the first column becomes the index.
compas_workbook = '../../data/to-ml-compas/compas_final.xlsx'
ds = pd.read_excel(compas_workbook, sheet_name='two_years_recid', index_col=0)


In [16]:
# Print position, name, dtype and percentage of missing values for every column of `ds`.
# NOTE(review): '{3:2f}' sets a minimum width of 2 (default six decimals are printed);
# '{3:.2f}' may have been intended -- confirm before changing the output format.
n_rows = len(ds)
for position, column in enumerate(ds.columns):
    missing_pct = 100*ds[column].isna().sum()/n_rows
    print("{0}-{1}: {2}, Nan: {3:2f}".format(position, column, ds[column].dtype, missing_pct))

0-sex: object, Nan: 0.000000
1-age: int64, Nan: 0.000000
2-age_cat: object, Nan: 0.000000
3-race: object, Nan: 0.000000
4-juv_fel_count: int64, Nan: 0.000000
5-juv_misd_count: int64, Nan: 0.000000
6-juv_other_count: int64, Nan: 0.000000
7-priors_count: int64, Nan: 0.000000
8-c_days_jail: int64, Nan: 0.000000
9-c_charge_degree: object, Nan: 0.000000
10-is_recid: int64, Nan: 0.000000
11-two_year_recid: int64, Nan: 0.000000


In [18]:
# Split the frame by column position (positions follow the column listing printed above).
target_positions = [11]                          # two_year_recid
sensitive_positions = [3]                        # race
feature_positions = [0, 1, 2, 4, 5, 6, 7, 8, 9]  # all columns except race (3), is_recid (10) and the target (11)

# List indexers keep each selection as a DataFrame, even when it has a single column.
Y = ds.iloc[:, target_positions]
A = ds.iloc[:, sensitive_positions]
X = ds.iloc[:, feature_positions]

# Split Dataset

Split the dataset into train and test sets. The train set is used to fit the estimators (imputation, normalization, and predictive models). The fitted estimators are then only applied (transform/predict) to the test set; they are never re-fitted on it.

In this case, 70% of the data is used for training and 30% for testing, stratified on the target variable.

In [19]:
# Pair the target with the feature frame; the sensitive attribute (race) is appended as the last column.
features_with_race = pd.concat([X, A], axis='columns')
causal_context = (Y, features_with_race)

In [20]:
# Print position, name, dtype and percentage of missing values for every feature in `X`.
# The percentage is taken over the rows of `X` itself (not `ds`), so the report stays
# correct even if `ds` has been rebound by another cell.
# NOTE(review): '{3:2f}' sets a minimum width of 2 (six decimals are printed);
# '{3:.2f}' may have been intended -- confirm before changing the output format.
for i, x in enumerate(X.columns):
    print("{0}-{1}: {2}, Nan: {3:2f}".format(i, x, X[x].dtype, 100*X[x].isna().sum()/len(X)))

0-sex: object, Nan: 0.000000
1-age: int64, Nan: 0.000000
2-age_cat: object, Nan: 0.000000
3-juv_fel_count: int64, Nan: 0.000000
4-juv_misd_count: int64, Nan: 0.000000
5-juv_other_count: int64, Nan: 0.000000
6-priors_count: int64, Nan: 0.000000
7-c_days_jail: int64, Nan: 0.000000
8-c_charge_degree: object, Nan: 0.000000


In [22]:
# For every analysis scenario: split the data, fit the preprocessing pipelines on the
# training split, apply them to the test split, and pickle every artefact under
# ../../pipes/compas-recid/<key>/.
#
# Loop-local names (X_prep, test_data, A_splits) are deliberately different from the
# notebook-level `X`, `ds` and `A`, so the cells above can be re-run after this one
# without reading clobbered values.
# NOTE(review): both scenarios currently read the same `causal_context`, so the
# artefacts written for 'compas' and 'fairness' come from identical inputs -- confirm
# this is intended.
analysis_scenarios = ['compas', 'fairness']

# Races retained in the 'race' subsets (rows of any other race are dropped there).
races_kept = ['African-American', 'Caucasian']

for key in analysis_scenarios:
    print('Considering {} context'.format(key))
    out_dir = '../../pipes/compas-recid/'+key+'/'
    Y_final, X_final = causal_context

    # 70/30 split, stratified on the target; the fixed seed makes it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X_final, Y_final, test_size = 0.3, stratify= Y_final, random_state = 1)
    A_train = X_train.loc[:,['race']]
    A_test = X_test.loc[:,['race']]

    # Boolean selectors, computed once and reused for X, y and A below.
    train_keep = np.array(A_train['race'].isin(races_kept))
    test_keep = np.array(A_test['race'].isin(races_kept))
    not_race_col = np.array(X_final.columns != 'race')

    # Training sets: 'aware' keeps every row and the race column;
    # 'race' keeps only rows of the two retained races and drops the race column.
    X_train_notaware_race = X_train.iloc[train_keep, not_race_col]
    X_train_aware = X_train

    train_datasets = {
        'aware': X_train_aware,
        'race': X_train_notaware_race,
    }

    # Fitted preprocessing results, keyed by training-set name
    preprocessing_pickles = {}
    preprocessing_pickles_norm = {}

    enc = 'label'

    # Positions of the numerical columns: age, the three juv_* counts, priors_count, c_days_jail.
    idnumerical = [1, 3, 4, 5, 6,7]

    # Fit one plain and one normalized pipeline per training set
    for td in train_datasets:
        print('Training prepro for {}'.format(td))
        data = train_datasets[td]
        X_prep, pipe_nom, pipe_num, numerical_features , nominal_features = op.preprocessing(data, idnumerical=idnumerical, imputation=True, encode = enc, normalization = False)
        preprocessing_pickles[td] = (X_prep, pipe_nom, pipe_num)

        X_prep, pipe_nom, pipe_num, numerical_features , nominal_features = op.preprocessing(data, idnumerical=idnumerical, imputation=True, encode = enc, normalization = True)
        preprocessing_pickles_norm[td] = (X_prep, pipe_nom, pipe_num)

        # Numerical names first, then nominal ones; built as a new list so the
        # list returned by op.preprocessing is not mutated.
        feature_names = list(numerical_features) + list(nominal_features)
        with open(out_dir+'preprocessing_features_'+td+'.pickle', 'wb') as f:
            pickle.dump(feature_names, f)

        print('Total of variables: {0}'.format(len(feature_names)))

    # Store the fitted pipelines (together with the transformed training data)
    with open(out_dir+'preprocessing.pickle', 'wb') as f:
        pickle.dump(preprocessing_pickles, f)

    with open(out_dir+'preprocessing_norm.pickle', 'wb') as f:
        pickle.dump(preprocessing_pickles_norm, f)

    # Test sets, built with the same rules as the training sets
    X_test_notaware_race = X_test.iloc[test_keep, not_race_col]
    X_test_aware = X_test

    test_datasets = {
        'aware': X_test_aware,
        'race': X_test_notaware_race,
    }

    # Apply the pipelines fitted on train to the matching test set
    data_test_prepro = {}
    data_test_prepro_norm = {}

    for d in preprocessing_pickles.keys():
        print("Transforming test dataset {}".format(d))
        test_data = test_datasets[d]

        _ , pnom, pnum = preprocessing_pickles[d]
        data_test_prepro[d] = op.applypreprocessing(test_data, pnom, pnum, idnumerical= idnumerical)

        _ , pnom, pnum = preprocessing_pickles_norm[d]
        data_test_prepro_norm[d] = op.applypreprocessing(test_data, pnom, pnum, idnumerical=idnumerical)

    with open(out_dir+'preprocessing_test.pickle','wb') as f:
        pickle.dump(data_test_prepro,f)
    with open(out_dir+'preprocessing_test_norm.pickle', 'wb') as f:
        pickle.dump(data_test_prepro_norm, f)

    # Targets aligned with each train/test dataset
    y_train_race = y_train.iloc[train_keep, :]
    y_test_race = y_test.iloc[test_keep, :]

    y = {'train_aware': y_train,
         'train_race': y_train_race,
         'test_aware': y_test,
         'test_race': y_test_race,
    }

    with open(out_dir+'y.pickle', 'wb') as f:
        pickle.dump(y,f)

    # Sensitive attribute aligned with each train/test dataset
    A_train_race = A_train.iloc[train_keep, :]
    A_test_race = A_test.iloc[test_keep, :]

    A_splits = {'train_aware': A_train,
                'train_race': A_train_race,
                'test_aware': A_test,
                'test_race': A_test_race,
               }
    with open(out_dir+'A.pickle','wb') as f:
        pickle.dump(A_splits, f)

    print()

Considering compas context
Training prepro for aware
Total of variables: 10
Training prepro for race
Total of variables: 9
Transforming test dataset aware
Transforming test dataset race

Considering fairness context
Training prepro for aware
Total of variables: 10
Training prepro for race
Total of variables: 9
Transforming test dataset aware
Transforming test dataset race

