In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [91]:
# Directory holding the raw input files; the pickled outputs are written here too.
# The trailing slash is required because file paths are built by string concatenation.
path = '../data/adult/'

In [92]:
# Full paths of the raw training and test files inside `path`.
train_file, test_file = (path + fname for fname in ('adult.data', 'adult.test'))

In [93]:
def raw_to_df(fname):
  """Read a headerless comma-separated file into a DataFrame with 15 named columns.

  Args:
    fname: path (or file-like object) accepted by ``pd.read_csv``.

  Returns:
    DataFrame whose columns are, in order: age, workclass, fnlwgt, education,
    education-num, marital-status, occupation, relationship, race, sex,
    capital-gain, capital-loss, hours-per-week, native-country, income.
  """
  column_names = [
      'age', 'workclass', 'fnlwgt', 'education', 'education-num',
      'marital-status', 'occupation', 'relationship', 'race', 'sex',
      'capital-gain', 'capital-loss', 'hours-per-week',
      'native-country', 'income',
  ]
  # The separator is a regex (a comma with optional spaces on either side), so the
  # surrounding blanks are stripped from every field; regex separators need the
  # python parsing engine.
  frame = pd.read_csv(fname, sep=' *, *', header=None, names=column_names,
                      engine='python')
  return frame

# Parse the raw training and test files into DataFrames.
df_train_init, df_test = map(raw_to_df, (train_file, test_file))

In [94]:
# Split the original training data into train (80%) and validation (20%) sets.
# random_state is fixed so the same split is produced on every run.
df_train, df_valid = train_test_split(df_train_init, test_size=0.2, random_state=1234)

In [95]:
# Row/column counts of the train, validation and test frames, in that order.
for split in (df_train, df_valid, df_test):
  print(split.shape)

(26048, 15)
(6513, 15)
(16281, 15)


In [96]:
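# Disabled debugging helpers: each expression lists the rows of a split that contain
# a '?' entry in any column (presumably the dataset's missing-value placeholder).
# df_train[(df_train == '?').any(axis=1)]
# df_valid[(df_valid == '?').any(axis=1)]
# df_test[(df_test == '?').any(axis=1)]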

In [97]:
# Left-closed age bins [lo, hi); the last bin is open-ended.
age_buckets = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float('+inf')]
# Numeric columns that are standardized (zero mean, unit std) instead of one-hot encoded.
continuous_cols = ['capital-gain', 'capital-loss', 'education-num', 'hours-per-week']

def featurize(df, norm_stats=None):
  """Turn a raw frame into a feature matrix X and a binary income label y.

  Steps: encode ``income`` as 0/1, drop ``fnlwgt``, bucket ``age`` with
  ``age_buckets``, one-hot encode the non-numeric columns, and standardize the
  columns listed in ``continuous_cols``. The input frame is not modified.

  Args:
    df: DataFrame with at least the columns 'income', 'fnlwgt', 'age' and every
      column in ``continuous_cols``.
    norm_stats: optional mapping ``column -> (mean, std)`` used to standardize the
      continuous columns. When None (the default) the statistics of ``df`` itself
      are used. Pass statistics computed on the training split to scale the
      validation and test splits consistently with it.

  Returns:
    (X, y): X is the feature DataFrame without the 'income' column and y is the
    'income' Series (0 for '<=50K', 1 for '>50K').

  Raises:
    ValueError: if 'income' contains a non-null label other than
      '<=50K', '<=50K.', '>50K' or '>50K.'.
  """
  df = df.copy()
  # income to binary classification (the trailing-dot variants are accepted too).
  # Assigning the mapped column back avoids a chained in-place `replace`, which does
  # not update the frame under pandas Copy-on-Write.
  income_ids = {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}
  income = df['income'].map(income_ids)
  unknown = income.isna() & df['income'].notna()
  if unknown.any():
    raise ValueError('unexpected income labels: %r'
                     % sorted(set(map(str, df.loc[unknown, 'income']))))
  df['income'] = income
  # prune columns
  df = df.drop(columns='fnlwgt')
  # bucket age
  df['age'] = pd.cut(df['age'], age_buckets, right=False)
  # one-hot; income is numeric at this point, so it is left as a single column
  df = pd.get_dummies(df, sparse=True)
  # split X, y
  X = df.drop(columns='income')
  y = df['income']
  # normalize continuous cols in X
  for col in continuous_cols:
    if norm_stats is None:
      mean, std = X[col].mean(), X[col].std()
    else:
      mean, std = norm_stats[col]
    X[col] = (X[col] - mean) / std
  return X, y

In [98]:
# NOTE(review): X_train is first assigned in the following cell (In [99]); on a fresh
# kernel that cell has to run before this one, otherwise this raises NameError.
X_train.head()

Unnamed: 0,education-num,capital-gain,capital-loss,hours-per-week,"age_[0.0, 18.0)","age_[18.0, 25.0)","age_[25.0, 30.0)","age_[30.0, 35.0)","age_[35.0, 40.0)","age_[40.0, 45.0)",...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_Holand-Netherlands
29187,-0.422033,-0.146332,-0.219001,4.760819,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
23535,1.132479,-0.146332,-0.219001,-0.039399,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
14450,-0.422033,-0.146332,-0.219001,-0.039399,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2727,-0.422033,-0.146332,-0.219001,-0.039399,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
18474,-0.033405,-0.146332,-0.219001,-1.666592,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [99]:
# Build the feature matrix and label vector for each split.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = map(
    featurize, (df_train, df_valid, df_test))

In [107]:
# make sure train and test are not missing features
def match_cols(a, b):
  """Give two feature frames the same columns, in the same order, in place.

  Columns present in only one frame are added to the other, filled with 0. The
  added columns are appended in sorted order so the result is reproducible, and
  ``b`` is then reordered to follow the column order of ``a``. Without the
  reordering the two frames would hold the same column set in different
  positions, which misaligns features once the frames are converted to arrays.

  Args:
    a: DataFrame, modified in place (missing columns appended at the end).
    b: DataFrame, modified in place (missing columns added, then reordered like a).

  Returns:
    None.
  """
  b_missing = set(a.columns) - set(b.columns)
  a_missing = set(b.columns) - set(a.columns)
  for col in sorted(a_missing, key=str):
    a[col] = 0
  for col in sorted(b_missing, key=str):
    b[col] = 0
  # Popping a column and assigning it back moves it to the end, so doing this for
  # every column in a's order leaves b ordered exactly like a without rebinding b.
  for col in list(a.columns):
    b[col] = b.pop(col)

# Align every pair of splits, then report the resulting shapes.
for left, right in ((X_train, X_test), (X_train, X_valid), (X_test, X_valid)):
  match_cols(left, right)
for features in (X_train, X_test, X_valid):
  print(features.shape)

(26048, 117)
(16281, 117)
(6513, 117)


In [108]:
# Show the distinct race and sex labels in the training split.
for protected in ('race', 'sex'):
  print(df_train[protected].unique())
# Integer age-bucket id (0..10) per row, using the same left-closed bins as age_buckets.
age_labels = range(11)
age_train, age_valid, age_test = (
    pd.cut(split['age'], age_buckets, right=False, labels=age_labels).astype(int)
    for split in (df_train, df_valid, df_test)
)
print(age_train.unique())

['White' 'Black' 'Other' 'Amer-Indian-Eskimo' 'Asian-Pac-Islander']
['Female' 'Male']
[ 3  8  1  6  2  4  7  5  9 10  0]


In [109]:
# Integer ids for the protected attributes; a label's id is its position in the list.
race_ids = {label: idx for idx, label in enumerate(
    ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'])}
sex_ids = {label: idx for idx, label in enumerate(['Male', 'Female'])}

def featurize_protected(df, protected, id_map):
  """Encode one protected-attribute column as ids.

  Args:
    df: DataFrame containing the column ``protected``; it is not modified.
    protected: name of the column to encode (e.g. 'race' or 'sex').
    id_map: mapping from each label in the column to its id.

  Returns:
    A new Series with the same index and name as ``df[protected]`` holding the
    mapped ids. Null entries stay null.

  Raises:
    ValueError: if the column contains a non-null label missing from ``id_map``.
  """
  labels = df[protected]
  # `map` produces a numeric result directly; `replace` with a dict depends on
  # pandas' deprecated silent downcasting and would leave unknown labels as strings.
  ids = labels.map(id_map)
  unmapped = ids.isna() & labels.notna()
  if unmapped.any():
    raise ValueError('labels in %r without an id: %r'
                     % (protected, sorted(set(map(str, labels[unmapped])))))
  return ids

# Encode race and sex as integer ids for the train, validation and test splits.
race_train, race_valid, race_test = (
    featurize_protected(split, 'race', race_ids)
    for split in (df_train, df_valid, df_test)
)
sex_train, sex_valid, sex_test = (
    featurize_protected(split, 'sex', sex_ids)
    for split in (df_train, df_valid, df_test)
)
race_train.head()

29187    0
23535    0
14450    0
2727     0
18474    0
Name: race, dtype: int64

In [110]:
# Persist every prepared object next to the raw data; keys are the output file names
# and the dict order is the order in which the files are written.
pickle_targets = {
    'X_train.pkl': X_train,
    'y_train.pkl': y_train,
    'X_valid.pkl': X_valid,
    'y_valid.pkl': y_valid,
    'X_test.pkl': X_test,
    'y_test.pkl': y_test,
    'race_train.pkl': race_train,
    'race_valid.pkl': race_valid,
    'race_test.pkl': race_test,
    'sex_train.pkl': sex_train,
    'sex_valid.pkl': sex_valid,
    'sex_test.pkl': sex_test,
    'age_train.pkl': age_train,
    'age_valid.pkl': age_valid,
    'age_test.pkl': age_test,
}
for pickle_name, obj in pickle_targets.items():
    obj.to_pickle(path + pickle_name)

In [111]:
# Gender percentages on the train and test splits.
# NOTE: out of date -- the validation split is not included, so the "Total pop"
# figures cover train + test only.

num_train = len(sex_train)
num_test = len(sex_test)

# Vectorized counts instead of looping over Series.iteritems(), which was removed in
# pandas 2.0. Ids follow sex_ids: 0 = Male, 1 = Female.
women_train = int((sex_train == 1).sum())
men_train = int((sex_train == 0).sum())
women_test = int((sex_test == 1).sum())
men_test = int((sex_test == 0).sum())

print ("TRAIN prop women: ", 100.0*women_train/num_train)
print ("TRAIN prop men: ", 100.0*men_train/num_train)

print ("TEST prop women: ", 100.0*women_test/num_test)
print ("TEST prop men: ", 100.0*men_test/num_test)

print ("Total pop women: ", 100.0*(women_train + women_test)/(num_train+num_test))
print ("Total pop men: ", 100.0*(men_train + men_test)/(num_train+num_test))

TRAIN prop women:  32.89695945945946
TRAIN prop men:  67.10304054054055
TEST prop women:  33.29648056016215
TEST prop men:  66.70351943983785
Total pop women:  33.050627229558934
Total pop men:  66.94937277044107


In [112]:
# Percentage of each race id (0..4) over train + test combined.
num_train = len(race_train)
num_test = len(race_test)

for q in range(5):
    # Vectorized count; Series.iteritems() was removed in pandas 2.0.
    race_count = int((race_train == q).sum()) + int((race_test == q).sum())
    print ("Total race pop demographic ", q, ": ", 100.0*race_count/(num_train+num_test))


Total race pop demographic  0 :  85.51584020411538
Total race pop demographic  1 :  9.570270972619245
Total race pop demographic  2 :  3.1066172127855607
Total race pop demographic  3 :  0.9709655319048407
Total race pop demographic  4 :  0.8363060785749722


In [113]:
# Percentage of each age-bucket id (0..10) over train + test combined; `s` accumulates
# the percentages as a sanity check that they add up to 100.
num_train = len(age_train)
num_test = len(age_test)
s = 0
for q in range(11):
    # Vectorized count; Series.iteritems() was removed in pandas 2.0.
    age_count = int((age_train == q).sum()) + int((age_test == q).sum())
    pct = 100.0*age_count/(num_train+num_test)
    s += pct
    print ("Total age pop demographic ", q, ": ", pct)
print(s)

Total age pop demographic  0 :  1.245009331663871
Total age pop demographic  1 :  16.064636537598336
Total age pop demographic  2 :  12.471355335585532
Total age pop demographic  3 :  13.279312055564743
Total age pop demographic  4 :  13.22970067802216
Total age pop demographic  5 :  11.729547118996432
Total age pop demographic  6 :  10.123083465236599
Total age pop demographic  7 :  7.772449148338019
Total age pop demographic  8 :  5.7336577759928185
Total age pop demographic  9 :  4.077582744690401
Total age pop demographic  10 :  4.273665808311087
100.00000000000003
