In [134]:
import numpy as np
import pandas as pd

In [135]:
# Directory prefix for the raw Adult files; the pickles produced later in this
# notebook are written under the same prefix.
path = '../data/adult/'

In [136]:
# Raw train/test splits sit side by side under `path`.
train_file, test_file = (path + fname for fname in ('adult.data', 'adult.test'))

In [137]:
def raw_to_df(fname):
  """Read one raw Adult split into a DataFrame with named columns.

  The input is parsed as header-less text. Fields are split on commas, and
  any spaces around a comma are consumed by the separator regex (a regex
  separator is why the python parsing engine is requested).

  Args:
    fname: path or file-like object accepted by `pd.read_csv`.

  Returns:
    DataFrame with the 15 columns listed below, in that order.
  """
  column_names = [
      'age', 'workclass', 'fnlwgt', 'education', 'education-num',
      'marital-status', 'occupation', 'relationship', 'race', 'sex',
      'capital-gain', 'capital-loss', 'hours-per-week',
      'native-country', 'income',
  ]
  return pd.read_csv(
      fname,
      header=None,
      names=column_names,
      sep=' *, *',
      engine='python',
  )

# Parse both splits with the same loader.
df_train, df_test = map(raw_to_df, (train_file, test_file))

In [138]:
# Row/column counts of the train split, then the test split.
for split in (df_train, df_test):
  print(split.shape)

(32561, 15)
(16281, 15)


In [139]:
# Training rows in which at least one field equals '?'.
df_train.loc[df_train.eq('?').any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
38,31,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,38,?,>50K
51,18,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,?,<=50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
93,30,Private,117747,HS-grad,9,Married-civ-spouse,Sales,Wife,Asian-Pac-Islander,Female,0,1573,35,?,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
128,35,?,129305,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K


In [140]:
# Left edges of the half-open age intervals used by pd.cut (right=False);
# the last interval is open-ended.
age_buckets = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float('+inf')]
# Numeric columns that are z-scored instead of one-hot encoded.
continuous_cols = ['capital-gain', 'capital-loss', 'education-num', 'hours-per-week']

def featurize(df):
  """Turn a raw Adult frame into a model-ready (X, y) pair.

  Steps: map the income label to 0/1, drop `fnlwgt`, bucket `age` into the
  half-open intervals given by `age_buckets`, one-hot encode the remaining
  non-numeric columns, and z-score the columns in `continuous_cols`.

  The input frame is not modified (all work happens on a copy).

  Args:
    df: DataFrame containing `income`, `fnlwgt`, `age` and every column
      named in `continuous_cols`.

  Returns:
    (X, y): X is the feature DataFrame without `income`; y is the `income`
    column after label mapping.
  """
  df = df.copy()
  # income to binary classification; the '.'-suffixed spellings map to the
  # same codes. Values that are not one of the four labels pass through
  # unchanged.
  income_codes = {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}
  # Assign the mapped column back rather than calling
  # `df['income'].replace(..., inplace=True)`: that chained in-place call acts
  # on a temporary under pandas Copy-on-Write and leaves `df` untouched, after
  # which `income` would be one-hot encoded and the drop below would fail.
  df['income'] = df['income'].map(lambda label: income_codes.get(label, label))
  # prune columns
  df = df.drop('fnlwgt', axis=1)
  # bucket age
  df['age'] = pd.cut(df['age'], age_buckets, right=False)
  # one-hot
  df = pd.get_dummies(df, sparse=True)
  # split X, y
  X = df.drop('income', axis=1)
  y = df['income']
  # normalize continuous cols in X
  # NOTE: mean/std come from the frame being featurized, so separately
  # featurized frames are each scaled by their own statistics.
  for col in continuous_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()
  return X, y

In [141]:
# Featurize each split on its own: train first, then test.
(X_train, y_train), (X_test, y_test) = featurize(df_train), featurize(df_test)

In [142]:
# make sure train and test are not missing features
def match_cols(a, b):
  """Give `a` and `b` the same columns, in the same order, in place.

  A column present in only one frame is added to the other filled with 0.
  Afterwards `b`'s columns are rearranged to follow `a`'s order, so the two
  frames line up positionally as well as by name.

  Args:
    a: DataFrame, mutated in place (missing columns are appended).
    b: DataFrame, mutated in place (missing columns are added, then all
      columns are reordered to match `a`).

  Returns:
    None.
  """
  # Walk the columns in frame order instead of taking set differences, so the
  # appended columns land in a reproducible order from run to run.
  a_missing = [col for col in b.columns if col not in a.columns]
  b_missing = [col for col in a.columns if col not in b.columns]
  for col in a_missing:
    a[col] = 0
  for col in b_missing:
    b[col] = 0
  # The fills above are appended at the end of each frame, so the layouts can
  # still differ. Popping and re-assigning every column in a's order rebuilds
  # b's layout without rebinding `b`, which keeps the change visible to the
  # caller.
  if list(b.columns) != list(a.columns):
    for col in list(a.columns):
      b[col] = b.pop(col)

match_cols(X_train, X_test)
# Both splits should now report the same number of feature columns.
for features in (X_train, X_test):
  print(features.shape)

(32561, 117)
(16281, 117)


In [143]:
# Distinct values of the protected attributes in the training split.
print(df_train['race'].unique())
print(df_train['sex'].unique())
nums = [str(i) for i in range(11)]
print(nums)
# Replace raw ages by the integer index of their bucket. One label per
# interval, derived from age_buckets so the two cannot drift apart.
age_labels = range(len(age_buckets) - 1)
for frame in (df_train, df_test):
  # This cell overwrites the column it reads. Once bucketed the column is
  # categorical; cutting it again would fail or re-bucket the bucket indices,
  # so skip frames that were already processed to keep the cell re-runnable.
  if frame['age'].dtype.name != 'category':
    frame['age'] = pd.cut(frame['age'], age_buckets, right=False, labels=age_labels)
print(df_train['age'].unique())

['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
['Male' 'Female']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
[4, 7, 2, 6, 3, ..., 1, 8, 10, 0, 9]
Length: 11
Categories (11, int64): [0 < 1 < 2 < 3 ... 7 < 8 < 9 < 10]


In [144]:
# Integer codes for the protected attributes, numbered in listing order.
race_ids = {name: code for code, name in enumerate(
    ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'])}
sex_ids = {name: code for code, name in enumerate(['Male', 'Female'])}

def featurize_protected(df, protected, id_map):
  """Return column `protected` of `df` with its values translated via `id_map`.

  Values that are not keys of `id_map` are kept as they are. `df` itself is
  left unmodified; the result is a new Series.
  """
  return df[protected].replace(id_map)

# Encode race and sex for the train and test frames, and take independent
# copies of their age columns.
race_train, race_test = (
    featurize_protected(split, 'race', race_ids) for split in (df_train, df_test))
sex_train, sex_test = (
    featurize_protected(split, 'sex', sex_ids) for split in (df_train, df_test))
age_train, age_test = (split['age'].copy() for split in (df_train, df_test))

In [145]:
# Persist every artifact under `path`, one pickle per object, in a fixed order.
artifacts = [
    (X_train, 'X_train.pkl'),
    (y_train, 'y_train.pkl'),
    (X_test, 'X_test.pkl'),
    (y_test, 'y_test.pkl'),
    (race_train, 'race_train.pkl'),
    (race_test, 'race_test.pkl'),
    (sex_train, 'sex_train.pkl'),
    (sex_test, 'sex_test.pkl'),
    (age_train, 'age_train.pkl'),
    (age_test, 'age_test.pkl'),
]
for obj, fname in artifacts:
  obj.to_pickle(path + fname)

In [149]:
# Percentage of each sex code (1 is reported as women, 0 as men) in the train
# split, the test split, and both combined.
num_train = len(sex_train)
num_test = len(sex_test)

# Count with vectorized comparisons: Series.iteritems was removed in
# pandas 2.0, and one pass per count is enough. int() keeps the arithmetic
# below on plain Python numbers.
women_train = int((sex_train == 1).sum())
men_train = int((sex_train == 0).sum())
women_test = int((sex_test == 1).sum())
men_test = int((sex_test == 0).sum())

print ("TRAIN prop women: ", 100.0*women_train/num_train)
print ("TRAIN prop men: ", 100.0*men_train/num_train)

print ("TEST prop women: ", 100.0*women_test/num_test)
print ("TEST prop men: ", 100.0*men_test/num_test)

print ("Total pop women: ", 100.0*(women_train + women_test)/(num_train+num_test))
print ("Total pop men: ", 100.0*(men_train + men_test)/(num_train+num_test))

('TRAIN prop women: ', 33.07945087681582)
('TRAIN prop men: ', 66.92054912318417)
('TEST prop women: ', 33.29648056016215)
('TEST prop men: ', 66.70351943983785)
('Total pop women: ', 33.15179558576635)
('Total pop men: ', 66.84820441423365)


In [150]:
# Percentage of each race code (0..4) over train and test combined.
num_train = len(race_train)
num_test = len(race_test)

for q in range(5):
    # Vectorized count; Series.iteritems was removed in pandas 2.0.
    matches = int((race_train == q).sum()) + int((race_test == q).sum())
    print ("Total race pop demographic ", q, ": ", 100.0*matches/(num_train+num_test))


('Total race pop demographic ', 0, ': ', 85.5042791040498)
('Total race pop demographic ', 1, ': ', 9.592154293435977)
('Total race pop demographic ', 2, ': ', 3.110028254371238)
('Total race pop demographic ', 3, ': ', 0.9622865566520618)
('Total race pop demographic ', 4, ': ', 0.83125179149093)


In [151]:
# Percentage of each age-bucket index (0..10) over train and test combined;
# `s` accumulates the percentages and is printed as a sanity check.
num_train = len(age_train)
num_test = len(age_test)
s = 0
for q in range(11):
    # Vectorized count; Series.iteritems was removed in pandas 2.0.
    matches = int((age_train == q).sum()) + int((age_test == q).sum())
    # Compute the percentage once and reuse it for the total and the report.
    share = 100.0*matches/(num_train+num_test)
    s += share
    print ("Total age pop demographic ", q, ": ", share)
print(s)

('Total age pop demographic ', 0, ': ', 1.2182138323573972)
('Total age pop demographic ', 1, ': ', 16.04561647762172)
('Total age pop demographic ', 2, ': ', 12.45444494492445)
('Total age pop demographic ', 3, ': ', 13.295933827443594)
('Total age pop demographic ', 4, ': ', 13.175136153310675)
('Total age pop demographic ', 5, ': ', 11.789034028090578)
('Total age pop demographic ', 6, ': ', 10.167478809221572)
('Total age pop demographic ', 7, ': ', 7.790426272470415)
('Total age pop demographic ', 8, ': ', 5.761434830678515)
('Total age pop demographic ', 9, ': ', 4.029319028704803)
('Total age pop demographic ', 10, ': ', 4.272961795176283)
100.0
