## Imports

In [1]:
# Import packages
import os
import random
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, normalized_mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale, normalize
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_validate, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
%matplotlib qt

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


## Loading Datasets

In [123]:
# Every input CSV sits in a 'Datasets' folder under the current working directory.
adult_train_path, adult_test_path, fair_tgan_adult_path, base_tgan_adult_path = (
    os.path.join(os.getcwd(), 'Datasets', csv_name)
    for csv_name in (
        'adult_train.csv',
        'adult_test.csv',
        'fair_tgan_adult_data_18_small.csv',
        'base_tgan_adult_data_2_small.csv',
    )
)

# Column labels, listed in the order the comma-separated fields appear on each line.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'Income',
]

In [None]:
%matplotlib qt

In [124]:
def prep_data(path, columns):
    """Read a comma-separated file and return it as a DataFrame with coarsened categories.

    Every non-blank line is split on ',' and rewritten field by field:

    * field 13: 'United-States' becomes 'US', any other value becomes 'Non-US';
    * field 3: 'Preschool' through '7th-8th' collapse to 'prim-middle-school',
      '9th' through '12th' collapse to 'high-school', other values are kept;
    * fields 0, 2, 4, 10, 11 and 12 are converted to ``int``.

    Fields are compared verbatim (no whitespace trimming), so the file is
    expected to use bare ',' separators.

    Parameters
    ----------
    path : str
        Location of the file to read.
    columns : list of str
        Column labels for the resulting frame, one per field.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank input line, labelled with ``columns``.

    Raises
    ------
    IndexError
        If a non-blank line has too few fields.
    ValueError
        If one of the integer fields cannot be parsed, or the number of
        fields does not match ``len(columns)``.
    """
    int_fields = (0, 2, 4, 10, 11, 12)
    primary_levels = ('Preschool', '1st-4th', '5th-6th', '7th-8th')
    secondary_levels = ('9th', '10th', '11th', '12th')

    rows = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path) as handle:
        for raw_line in handle:
            # Drop only the line terminator; slicing off the last character
            # would corrupt the label on a final line without a newline.
            raw_line = raw_line.rstrip('\n')
            if not raw_line.strip():
                continue  # tolerate blank lines (e.g. at end of file)

            fields = raw_line.split(',')

            fields[13] = 'US' if fields[13] == 'United-States' else 'Non-US'

            if fields[3] in primary_levels:
                fields[3] = 'prim-middle-school'
            elif fields[3] in secondary_levels:
                fields[3] = 'high-school'

            for position in int_fields:
                fields[position] = int(fields[position])

            rows.append(fields)

    # Use the caller-supplied labels rather than a notebook-level global.
    return pd.DataFrame(rows, columns=columns)

In [125]:
# Load the four datasets (original train/test, then the two synthetic sets) in one pass,
# all with the same column labels.
adult_train_data, adult_test_data, fair_tgan_adult_data, base_tgan_adult_data = (
    prep_data(csv_path, columns=column_names)
    for csv_path in (
        adult_train_path,
        adult_test_path,
        fair_tgan_adult_path,
        base_tgan_adult_path,
    )
)

## Crosstabulation on Original Dataset

In [36]:
def _sex_income_counts(frame):
    """Count rows of ``frame`` in each sex x income cell.

    A row is "female" when its 'sex' value equals 'Female' and "positive" when
    its 'Income' value equals '>50K'; every other value falls into the
    male / negative bucket respectively.

    Returns a tuple ``(f_pos, f_neg, m_pos, m_neg)`` of plain ints.
    """
    is_female = frame['sex'] == 'Female'
    is_positive = frame['Income'] == '>50K'
    return (
        int((is_female & is_positive).sum()),
        int((is_female & ~is_positive).sum()),
        int((~is_female & is_positive).sum()),
        int((~is_female & ~is_positive).sum()),
    )


# Counts cover every row of each frame (no hard-coded row total), so they stay
# in step with the crosstab bars they are later used to annotate.
train_f_pos, train_f_neg, train_m_pos, train_m_neg = _sex_income_counts(adult_train_data)
test_f_pos, test_f_neg, test_m_pos, test_m_neg = _sex_income_counts(adult_test_data)

In [45]:
# Grouped bar chart: one group per sex, one bar per income class (training data).
ax = pd.crosstab(adult_train_data['sex'], adult_train_data['Income']).plot(kind='bar')
ax.set_title('Income class per sex in training data')
ax.set_xlabel('Sex')
ax.set_ylabel('Count')  # bar heights are row counts; income class is the legend

# Write each count just above its bar. The x offsets are hand-tuned to the bar
# positions; label colours presumably mirror the default bar colours (confirm
# if the plotting style changes).
for x_pos, count in zip([-0.25, 0.75], [train_f_neg, train_m_neg]):
    ax.text(x_pos, count + 100, str(count), {'color': 'blue', 'size': 14})
for x_pos, count in zip([0.03, 1.03], [train_f_pos, train_m_pos]):
    ax.text(x_pos, count + 100, str(count), {'color': 'orange', 'size': 14})

In [48]:
# Grouped bar chart: one group per sex, one bar per income class (test data).
ax = pd.crosstab(adult_test_data['sex'], adult_test_data['Income']).plot(kind='bar')
ax.set_title('Income class per sex in test data')
ax.set_xlabel('Sex')
ax.set_ylabel('Count')  # bar heights are row counts; income class is the legend

# Write each count just above its bar. The x offsets are hand-tuned to the bar
# positions; label colours presumably mirror the default bar colours (confirm
# if the plotting style changes).
for x_pos, count in zip([-0.23, 0.77], [test_f_neg, test_m_neg]):
    ax.text(x_pos, count + 100, str(count), {'color': 'blue', 'size': 14})
for x_pos, count in zip([0.05, 1.05], [test_f_pos, test_m_pos]):
    ax.text(x_pos, count + 100, str(count), {'color': 'orange', 'size': 14})

## Sensitive attribute and output label distributions

In [5]:
# 30162 distinct row positions drawn from 0..39999, used to subsample the
# synthetic datasets. A dedicated seeded generator keeps the subsample (and
# every plot built from it) identical across kernel restarts without touching
# the global `random` state.
SAMPLE_SEED = 42
idx = random.Random(SAMPLE_SEED).sample(range(0, 40000), 30162)

In [126]:
# Stack the 'sex' column of the original data on top of the sampled rows of
# both synthetic datasets, as a one-column frame.
df_sex = pd.concat(
    [
        adult_train_data['sex'],
        base_tgan_adult_data['sex'].iloc[idx],
        fair_tgan_adult_data['sex'].iloc[idx],
    ],
    axis=0,
).to_frame(name='sex')

In [127]:
# Stack the 'Income' column of the original data on top of the sampled rows of
# both synthetic datasets, as a one-column frame.
df_income = pd.concat(
    [
        adult_train_data['Income'],
        base_tgan_adult_data['Income'].iloc[idx],
        fair_tgan_adult_data['Income'].iloc[idx],
    ],
    axis=0,
).to_frame(name='Income')

In [128]:
# One provenance tag per stacked row, in concatenation order: all original
# rows first, then the base-TGAN sample, then the fair-TGAN sample.
# The synthetic tag counts come from len(idx) so they always equal the number
# of sampled rows instead of repeating a magic number.
adult_train = np.repeat('Original', adult_train_data.shape[0])
base_tgan = np.repeat('Base TGAN Model', len(idx))
fair_tgan = np.repeat('Fair TGAN Model 6', len(idx))
data_tags = np.concatenate((adult_train, base_tgan, fair_tgan), axis=0)

In [129]:
# Label every row of both comparison frames with the dataset it came from.
df_sex['Data'] = df_income['Data'] = data_tags

In [130]:
def _binary_counts(column, positive_value):
    """Return ``(matches, others)`` for ``column``.

    ``matches`` is the number of entries equal to ``positive_value``;
    ``others`` is every remaining entry. Both are plain ints.
    """
    matches = int((column == positive_value).sum())
    return matches, len(column) - matches


# Count the synthetic data on the same idx-sampled rows that feed the
# distribution plots, so the numbers written on the bars match the bar heights
# (counting the first N rows instead would describe different rows).
base_sample = base_tgan_adult_data.iloc[idx]
fair_sample = fair_tgan_adult_data.iloc[idx]

f_or, m_or = _binary_counts(adult_train_data['sex'], 'Female')
f_base, m_base = _binary_counts(base_sample['sex'], 'Female')
f_fair, m_fair = _binary_counts(fair_sample['sex'], 'Female')

pos_or, neg_or = _binary_counts(adult_train_data['Income'], '>50K')
pos_base, neg_base = _binary_counts(base_sample['Income'], '>50K')
pos_fair, neg_fair = _binary_counts(fair_sample['Income'], '>50K')

In [131]:
# Grouped bar chart of the 'sex' attribute: one group per sex, one bar per dataset.
pd.crosstab(df_sex['sex'], df_sex['Data']).plot(kind='bar')
plt.title('Sex attribute distribution between datasets')
plt.xticks([0, 1], ['Female', 'Male'], rotation=360)
plt.xlabel('Sex')
plt.ylabel('Count')

# Each entry is (x position, y position, count shown, colour). Positions are
# hand-tuned offsets around the two bar groups. Heights are listed explicitly
# per label rather than chosen by comparing count values, so two equal counts
# can no longer send a label to the wrong height.
sex_labels = [
    # base TGAN
    (-0.3, f_base + 200, f_base, 'blue'),
    (0.7, m_base + 200, m_base, 'blue'),
    # fair TGAN; the male label sits at the original-data bar height
    # (presumably to keep it clear of the neighbouring label - TODO confirm)
    (-0.085, f_fair + 200, f_fair, 'orange'),
    (0.89, m_or + 200, m_fair, 'darkorange'),
    # original data
    (0.1, f_or + 200, f_or, 'green'),
    (1.10, m_or + 200, m_or, 'green'),
]
for x_pos, y_pos, count, colour in sex_labels:
    plt.text(x_pos, y_pos, str(count), {'color': colour, 'size': 12})
plt.show()

In [133]:
# Grouped bar chart of the output label: one group per income class, one bar per dataset.
# (The former bare `plt.clf` was never called and had no effect; pandas opens
# its own figure for this plot, so no clearing is needed.)
pd.crosstab(df_income['Income'], df_income['Data']).plot(kind='bar')
plt.title('Output label distribution between datasets')
plt.xticks([0, 1], ['<=50K', '>50K'], rotation=360)
plt.xlabel('Income')
plt.ylabel('Count')

# Each entry is (x position, y position, count shown, colour). Positions are
# hand-tuned offsets around the two bar groups. Heights are listed explicitly
# per label rather than chosen by comparing count values, so two equal counts
# can no longer send a label to the wrong height.
income_labels = [
    # base TGAN
    (-0.32, neg_base + 200, neg_base, 'blue'),
    (0.72, pos_base + 200, pos_base, 'blue'),
    # fair TGAN; the '<=50K' label sits at the original-data bar height
    # (presumably to keep it clear of the neighbouring label - TODO confirm)
    (-0.11, neg_or + 200, neg_fair, 'orange'),
    (0.92, pos_fair + 200, pos_fair, 'orange'),
    # original data
    (0.1, neg_or + 200, neg_or, 'green'),
    (1.10, pos_or + 200, pos_or, 'green'),
]
for x_pos, y_pos, count, colour in income_labels:
    plt.text(x_pos, y_pos, str(count), {'color': colour, 'size': 12})
plt.show()

## Correlation Matrices

In [78]:
def corr_matrix(data):
    """Draw a heatmap of the pairwise correlations between the numeric columns of ``data``.

    The current pyplot figure is cleared first, the heatmap is drawn with a
    diverging palette centred on 0 over the fixed range [-1, 1], the x tick
    labels are rotated 45 degrees, and the figure is shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns (e.g. string categories) are
        left out of the correlation.

    Returns
    -------
    None
    """
    plt.clf()
    # Select numeric/bool columns explicitly: newer pandas releases no longer
    # drop string columns inside DataFrame.corr() and raise instead.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr = numeric_data.corr()
    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
    )
    plt.show()

In [64]:
# Correlation heatmap for the frame loaded from adult_train.csv.
corr_matrix(adult_train_data)

In [65]:
# Correlation heatmap for the frame loaded from adult_test.csv.
corr_matrix(adult_test_data)

In [73]:
# Correlation heatmap for the frame loaded from base_tgan_adult_data_2_small.csv.
corr_matrix(base_tgan_adult_data)

In [79]:
# Correlation heatmap for the frame loaded from fair_tgan_adult_data_18_small.csv.
corr_matrix(fair_tgan_adult_data)