In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
path = '../data/adult/'

In [21]:
train_file = path + 'adult.data'
test_file = path + 'adult.test'

In [22]:
def raw_to_df(fname):
  return pd.read_csv(fname, header=None, delimiter = ' *, *', engine='python',
                     names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                            'marital-status', 'occupation', 'relationship', 'race', 'sex',
                            'capital-gain', 'capital-loss', 'hours-per-week',
                            'native-country', 'income'])

df_train_init = raw_to_df(train_file)
df_test = raw_to_df(test_file)

In [23]:
#separate original train into train and validation
df_train, df_valid = train_test_split(df_train_init, test_size=0.2)

In [24]:
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(26048, 15)
(6513, 15)
(16281, 15)


In [25]:
df_train[(df_train == '?').any(axis=1)]
df_valid[(df_valid == '?').any(axis=1)]
df_test[(df_test == '?').any(axis=1)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K.
13,58,?,299831,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,35,United-States,<=50K.
19,40,Private,85019,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,45,?,>50K.
22,72,?,132015,7th-8th,4,Divorced,?,Not-in-family,White,Female,0,0,6,United-States,<=50K.
35,65,?,191846,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K.
65,41,Private,109912,Bachelors,13,Never-married,Other-service,Not-in-family,White,Female,0,0,40,?,<=50K.
75,17,?,165361,10th,6,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K.
83,44,Self-emp-inc,223881,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,99999,0,50,?,>50K.
89,41,?,38434,Masters,14,Married-civ-spouse,?,Wife,White,Female,7688,0,10,United-States,>50K.


In [26]:
age_buckets = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float('+inf')]
continuous_cols = ['capital-gain', 'capital-loss', 'education-num', 'hours-per-week']

def featurize(df):
  df = df.copy()
  # income to binary classification
  df['income'].replace(['<=50K', '<=50K.'], 0, inplace=True)
  df['income'].replace(['>50K', '>50K.'], 1, inplace=True)
  # prune columns
  df.drop('fnlwgt', axis=1, inplace=True)
  # bucket age
  df['age'] = pd.cut(df['age'], age_buckets, right=False)
  # one-hot
  df = pd.get_dummies(df, sparse=True)
  # split X, y
  X = df.drop('income', axis=1)
  y = df['income']
  # normalize continuous cols in X
  for col in continuous_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()
  return X, y

In [27]:
X_train, y_train = featurize(df_train)
X_valid, y_valid = featurize(df_valid)
X_test, y_test = featurize(df_test)

In [28]:
# make sure train and test are not missing features
def match_cols(a, b):
  b_missing = set(a.columns) - set(b.columns)
  a_missing = set(b.columns) - set(a.columns)
  for col in a_missing:
    a[col] = 0
  for col in b_missing:
    b[col] = 0

match_cols(X_train, X_test)
match_cols(X_train, X_valid)
print(X_train.shape)
print(X_test.shape)
print(X_valid.shape)
print(y_valid.shape)

(26048, 117)
(16281, 117)
(6513, 117)
(6513,)


In [29]:
print(df_train['race'].unique())
print(df_train['sex'].unique())
nums = [str(i) for i in range(11)]
print(nums)
df_train['age'] = pd.cut(df_train['age'], age_buckets, right=False, labels=range(11))
df_valid['age'] = pd.cut(df_valid['age'], age_buckets, right=False, labels=range(11))
df_test['age'] = pd.cut(df_test['age'], age_buckets, right=False, labels=range(11))
print(df_train['age'].unique())

['White' 'Asian-Pac-Islander' 'Black' 'Amer-Indian-Eskimo' 'Other']
['Male' 'Female']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
[4, 6, 8, 7, 2, ..., 5, 1, 9, 10, 0]
Length: 11
Categories (11, int64): [0 < 1 < 2 < 3 ... 7 < 8 < 9 < 10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
race_ids = {'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo': 3, 'Other': 4}
sex_ids = {'Male': 0, 'Female': 1}

def featurize_protected(df, protected, id_map):
  col = df[protected].copy()
  col.replace(id_map, inplace=True)
  return col

race_train = featurize_protected(df_train, 'race', race_ids)
race_valid = featurize_protected(df_valid, 'race', race_ids)
race_test = featurize_protected(df_test, 'race', race_ids)
sex_train = featurize_protected(df_train, 'sex', sex_ids)
sex_valid = featurize_protected(df_valid, 'sex', sex_ids)
sex_test = featurize_protected(df_test, 'sex', sex_ids)
age_train = df_train['age'].copy()
age_valid = df_valid['age'].copy()
age_test = df_test['age'].copy()

In [31]:
X_train.to_pickle(path + 'X_train.pkl')
y_train.to_pickle(path + 'y_train.pkl')

X_valid.to_pickle(path + 'X_valid.pkl')
y_valid.to_pickle(path + 'y_valid.pkl')

X_test.to_pickle(path + 'X_test.pkl')
y_test.to_pickle(path + 'y_test.pkl')

race_train.to_pickle(path + 'race_train.pkl')
race_valid.to_pickle(path + 'race_valid.pkl')
race_test.to_pickle(path + 'race_test.pkl')

sex_train.to_pickle(path + 'sex_train.pkl')
sex_valid.to_pickle(path + 'sex_valid.pkl')
sex_test.to_pickle(path + 'sex_test.pkl')

age_train.to_pickle(path + 'age_train.pkl')
age_valid.to_pickle(path + 'age_valid.pkl')
age_test.to_pickle(path + 'age_test.pkl')

In [32]:
# NOW COMPUTE STATS ON TRAIN AND TEST -- OUT OF DATE AS NOT COMPUTED ON VALID

num_train = sum(1 for i,j in sex_train.iteritems())
num_test = sum(1 for i,j in sex_test.iteritems())

print ("TRAIN prop women: ", 100.0*sum(1 for i, j in sex_train.iteritems() if (j == 1))/num_train)
print ("TRAIN prop men: ", 100.0*sum(1 for i, j in sex_train.iteritems() if (j == 0))/num_train)

print ("TEST prop women: ", 100.0*sum(1 for i, j in sex_test.iteritems() if (j == 1))/num_test)
print ("TEST prop men: ", 100.0*sum(1 for i, j in sex_test.iteritems() if (j == 0))/num_test)

print ("Total pop women: ", 100.0*(sum(1 for i, j in sex_train.iteritems() if (j == 1)) + sum(1 for i, j in sex_test.iteritems() if (j == 1)))/(num_train+num_test))
print ("Total pop men: ", 100.0*(sum(1 for i, j in sex_train.iteritems() if (j == 0)) + sum(1 for i, j in sex_test.iteritems() if (j == 0)))/(num_train+num_test))

TRAIN prop women:  33.07739557739558
TRAIN prop men:  66.92260442260442
TEST prop women:  33.29648056016215
TEST prop men:  66.70351943983785
Total pop women:  33.16166221739233
Total pop men:  66.83833778260767


In [33]:
num_train = sum(1 for i,j in race_train.iteritems())
num_test = sum(1 for i,j in race_test.iteritems())

for q in range(5):
    print ("Total race pop demographic ", q, ": ", 100.0*(sum(1 for i, j in race_train.iteritems() if (j == q)) + sum(1 for i, j in race_test.iteritems() if (j == q)))/(num_train+num_test))


Total race pop demographic  0 :  85.43315457487775
Total race pop demographic  1 :  9.686030853551939
Total race pop demographic  2 :  3.0711805145408584
Total race pop demographic  3 :  0.9733279784544875
Total race pop demographic  4 :  0.8363060785749722


In [34]:
num_train = sum(1 for i,j in age_train.iteritems())
num_test = sum(1 for i,j in age_test.iteritems())
s = 0
for q in range(11):
    s += 100.0*(sum(1 for i, j in age_train.iteritems() if (j == q)) + sum(1 for i, j in age_test.iteritems() if (j == q)))/(num_train+num_test)
    print ("Total age pop demographic ", q, ": ", 100.0*(sum(1 for i, j in age_train.iteritems() if (j == q)) + sum(1 for i, j in age_test.iteritems() if (j == q)))/(num_train+num_test))
print(s)

Total age pop demographic  0 :  1.2473717782135179
Total age pop demographic  1 :  16.052824304850102
Total age pop demographic  2 :  12.457180656287651
Total age pop demographic  3 :  13.201351319426397
Total age pop demographic  4 :  13.232063124571807
Total age pop demographic  5 :  11.722459779347492
Total age pop demographic  6 :  10.108908785938718
Total age pop demographic  7 :  7.796073613834487
Total age pop demographic  8 :  5.802168725932575
Total age pop demographic  9 :  4.025608920598171
Total age pop demographic  10 :  4.3539889909990785
100.00000000000001
