# DATA PARTITIONING

Given a raw data set, this notebook performs the following operations:
- randomly partitions a raw data set into a training set and a test set
- further splits the training set into training and validation fold combinations
- applies `MaxAbsScaler` and exports scaled and raw data as CSV and pickle files

This ensures that fairness processors trained in other notebooks use the same data partitioning.

## 1. Parameters and preparations

In [1]:
##### PARAMETERS

# project root; all other folders below are derived from it
path      = 'H:/Fair Credit Scoring/'
func_path = f'{path}functions/'
data_path = f'{path}data/'
res_path  = f'{path}results/'
out_path  = f'{path}output/'

# name of the data set to partition; supported values are
# 'bene', 'german', 'uk', 'taiwan', 'pkdd', 'gmsc' and 'homecredit'
data = 'taiwan'

# partitioning setup
test_ratio = 0.3   # share of the rows held out as the test set
num_folds  = 5     # number of folds the training set is split into
seed       = 1     # random seed used for the shuffled splits

In [2]:
##### PACKAGES

import sys
sys.path.append(func_path)

import pickle
import numpy as np
import time
from sklearn.preprocessing import MaxAbsScaler

from load_data import *

!pip install numba==0.48
from aif360.metrics import BinaryLabelDatasetMetric

!pip install BlackBoxAuditing

Collecting numba==0.48
  Downloading numba-0.48.0-1-cp38-cp38-win_amd64.whl (2.1 MB)
Collecting llvmlite<0.32.0,>=0.31.0dev0
  Downloading llvmlite-0.31.0-cp38-cp38-win_amd64.whl (13.6 MB)
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.31.0 numba-0.48.0
Collecting BlackBoxAuditing
  Downloading BlackBoxAuditing-0.1.54.tar.gz (2.6 MB)
Collecting networkx
  Downloading networkx-2.5.1-py3-none-any.whl (1.6 MB)
Building wheels for collected packages: BlackBoxAuditing
  Building wheel for BlackBoxAuditing (setup.py): started
  Building wheel for BlackBoxAuditing (setup.py): finished with status 'done'
  Created wheel for BlackBoxAuditing: filename=BlackBoxAuditing-0.1.54-py2.py3-none-any.whl size=1394769 sha256=e4380eb6de69f782d16bbfd18a8edec57fec2bef06b11be06f7f79dd0203cfa5
  Stored in directory: c:\users\kozodoin4.hub.rdc\appdata\local\pip\cache\wheels\e3\77\36\a32ec1b04c2ebe2c45e88d42f33f22f987e76aad3f297b681e
Successfully built BlackBoxAuditing
Installi

## 2. Data import

In [4]:
##### DATA IMPORT

# location of the raw CSV that belongs to the selected data set
raw_file = f'{data_path}raw/{data}.csv'

# load it with the project loader; the data set name is passed along as well
dataset_orig = load_dataset(path = raw_file, data = data)

In [5]:
##### DATA PREP

# protected attribute and the attribute values that define each group
protected           = 'AGE'
unprivileged_groups = [{protected: 0}]
privileged_groups   = [{protected: 1}]

# sanity check: shape of the data frame stored in the data set's metadata
print(dataset_orig.metadata['params']['df'].shape)

(50000, 186)


## 3. Partitioning and scaling

In [6]:
##### PARTITIONING

# seed NumPy's global random state so that the shuffled split below is reproducible
np.random.seed(seed)

# train / test partitioning: a (1 - test_ratio) share of the shuffled rows
# goes to the training set, the remaining rows form the test set
dataset_orig_train, dataset_orig_test = dataset_orig.split([1 - test_ratio], shuffle = True)
tr = dataset_orig_train.convert_to_dataframe()[0]
te = dataset_orig_test.convert_to_dataframe()[0]

# export the (unscaled) test set as CSV and pickle
test_stem = data_path + 'prepared/' + data + '_orig_test'
te.to_csv(test_stem + '.csv', index = False, header = True)
# the context manager makes sure the pickle is flushed and the file handle is closed
with open(test_stem + '.pkl', 'wb') as pkl_file:
    pickle.dump(dataset_orig_test, pkl_file)
print(tr.shape, te.shape)

# cross-validation on the training set: split it into `num_folds` shuffled folds
# NOTE(review): the name `skf` suggests stratified folds, but the call only
# requests a shuffled split - confirm whether stratification is intended
skf = dataset_orig_train.split(num_or_size_splits = num_folds, seed = seed, shuffle = True)

(35000, 186) (15000, 186)


In [7]:
##### SCALING

def _export_dataset(dataset, stem):
    """Save `dataset` as `<stem>.csv` and `<stem>.pkl`.

    Args:
        dataset: data set object offering `convert_to_dataframe()`.
        stem: output path without file extension.

    Returns:
        The data frame that was written to the CSV file.
    """
    frame = dataset.convert_to_dataframe()[0]
    frame.to_csv(stem + '.csv', index = False, header = True)
    # the context manager makes sure the pickle is flushed and the file handle is closed
    with open(stem + '.pkl', 'wb') as pkl_file:
        pickle.dump(dataset, pkl_file)
    return frame


# each iteration needs one validation fold and at least one training fold
if num_folds < 2:
    raise ValueError('num_folds must be at least 2, got {}'.format(num_folds))

# data set attributes that are stacked row-wise when training folds are merged
fold_attributes = ['features', 'instance_names', 'instance_weights',
                   'labels', 'protected_attributes', 'scores']

# common prefix of every exported file
out_prefix = data_path + 'prepared/' + data + '_'

# timer
cv_start = time.time()

# data partitioning loop: every fold serves as the validation set once
for fold in range(num_folds):

    ##### DATA PARTITIONING

    # validation fold
    data_valid = skf[fold].copy()

    # train folds: start from a copy of the first remaining fold and replace
    # its row-wise attributes with the concatenation over all remaining folds
    train_folds = [f for f in range(num_folds) if f != fold]
    data_train  = skf[train_folds[0]].copy()
    if len(train_folds) > 1:
        for attr in fold_attributes:
            stacked = np.concatenate([getattr(skf[f], attr) for f in train_folds], axis = 0)
            setattr(data_train, attr, stacked)

    # test set (copied so that scaling does not touch `dataset_orig_test`)
    data_test = dataset_orig_test.copy()

    # export the unscaled folds as CSV and pickle
    tr = _export_dataset(data_train, out_prefix + 'orig_' + str(fold) + '_train')
    va = _export_dataset(data_valid, out_prefix + 'orig_' + str(fold) + '_valid')
    print('--', tr.shape, va.shape)


    ##### SCALING

    # scale features; despite its name this is a MaxAbsScaler. It is fitted on the
    # training folds only and then applied to the validation fold and the test set
    min_max_scaler      = MaxAbsScaler()
    data_train.features = min_max_scaler.fit_transform(data_train.features)
    data_valid.features = min_max_scaler.transform(data_valid.features)
    data_test.features  = min_max_scaler.transform(data_test.features)

    # export the scaled partitions as CSV and pickle
    tr = _export_dataset(data_train, out_prefix + 'scaled_' + str(fold) + '_train')
    va = _export_dataset(data_valid, out_prefix + 'scaled_' + str(fold) + '_valid')
    te = _export_dataset(data_test,  out_prefix + 'scaled_' + str(fold) + '_test')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

-- (28000, 186) (7000, 186)
-- (28000, 186) (7000, 186)
-- (28000, 186) (7000, 186)
-- (28000, 186) (7000, 186)
-- (28000, 186) (7000, 186)

Finished in 1.90 minutes
