In [146]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [147]:
path = '../data/german/'

In [148]:
train_file = path + 'german.data'
test_file = path + 'german.test'

In [149]:
def raw_to_df(fname):
  df = pd.read_csv(train_file, header=None)

  df = pd.concat([df[0].str.split(' ', expand=True)], axis=1)
  df.columns = ['checking','duration','creadit-history','purpose','credit-amount','saving', 
                            'employment-status','roi','sex','guarantors','residence-since','property','age', 
                            'other-installments','housing','credits','job-status','num-people','telephone', 
                            'worker','label']
  df['label'].replace(['1'], 0, inplace=True)
  df['label'].replace(['2'], 1, inplace=True)
  df['sex'].replace(['A91', 'A93', 'A94'], 'male', inplace=True)
  df['sex'].replace(['A92', 'A95'], 'female', inplace=True)
  df['age'] = df['age'].astype(int)
  df['duration'] = df['duration'].astype(int)
  df['credit-amount'] = df['credit-amount'].astype(int)
  df['roi'] = df['roi'].astype(int)
  df['residence-since'] = df['residence-since'].astype(int)
  df['credits'] = df['credits'].astype(int)
  df['num-people'] = df['num-people'].astype(int)
  return df

df_train_init = raw_to_df(train_file)
df_test = raw_to_df(test_file)

df_train_init.head()

Unnamed: 0,checking,duration,creadit-history,purpose,credit-amount,saving,employment-status,roi,sex,guarantors,...,property,age,other-installments,housing,credits,job-status,num-people,telephone,worker,label
0,A11,6,A34,A43,1169,A65,A75,4,male,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,female,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,male,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,male,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,male,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


In [150]:
df_train, df_valid = train_test_split(df_train_init, test_size=0.2, random_state=1234)

In [151]:
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

df_train.head()

(640, 21)
(160, 21)
(800, 21)


Unnamed: 0,checking,duration,creadit-history,purpose,credit-amount,saving,employment-status,roi,sex,guarantors,...,property,age,other-installments,housing,credits,job-status,num-people,telephone,worker,label
324,A14,18,A34,A40,1028,A61,A73,4,female,A101,...,A121,36,A143,A152,2,A173,1,A191,A201,0
147,A14,12,A34,A40,682,A62,A74,4,female,A101,...,A123,51,A143,A152,2,A173,1,A192,A201,0
652,A11,24,A32,A40,2303,A61,A75,4,male,A102,...,A121,45,A143,A152,1,A173,1,A191,A201,1
786,A14,22,A32,A43,2675,A63,A75,3,male,A101,...,A123,40,A143,A152,1,A173,1,A191,A201,0
329,A12,6,A32,A43,1068,A61,A75,4,male,A101,...,A123,28,A143,A152,1,A173,2,A191,A201,0


In [152]:
age_buckets = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, float('+inf')]
continuous_cols = ['duration', 'credit-amount', 'roi', 'residence-since', 'credits', 'num-people']

def featurize(df):
  df = df.copy()
  ftu = df.copy()
  ftu = ftu.drop('age', axis=1)
  ftu = ftu.drop('sex', axis=1)
  # bucket age
  df['age'] = pd.cut(df['age'], age_buckets, right=False)
  # one-hot
  df = pd.get_dummies(df, sparse=True)
  ftu = pd.get_dummies(ftu, sparse=True)
  # split X, y
  X = df.drop('label', axis=1)
  X_ftu = ftu.drop('label', axis=1)
  y = df['label']
  # normalize continuous cols in X
  for col in continuous_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()
  for col in continuous_cols:
    X_ftu[col] = (X_ftu[col] - X_ftu[col].mean()) / X_ftu[col].std()
  return X, y, X_ftu

In [153]:
X_train, y_train, X_train_ftu = featurize(df_train)
X_valid, y_valid, X_valid_ftu = featurize(df_valid)
X_test, y_test, X_test_ftu = featurize(df_test)
X_train.head()

Unnamed: 0,duration,credit-amount,roi,residence-since,credits,num-people,checking_A11,checking_A12,checking_A13,checking_A14,...,housing_A152,housing_A153,job-status_A171,job-status_A172,job-status_A173,job-status_A174,telephone_A191,telephone_A192,worker_A201,worker_A202
324,-0.216884,-0.774769,0.936075,0.146416,1.074258,-0.406813,0,0,0,1,...,1,0,0,0,1,0,1,0,1,0
147,-0.706211,-0.896298,0.936075,0.146416,1.074258,-0.406813,0,0,0,1,...,1,0,0,0,1,0,0,1,1,0
652,0.272443,-0.326939,0.936075,-1.655628,-0.693159,-0.406813,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
786,0.109334,-0.196278,0.060215,1.047438,-0.693159,-0.406813,0,0,0,1,...,1,0,0,0,1,0,1,0,1,0
329,-1.195538,-0.76072,0.936075,1.047438,-0.693159,2.45429,0,1,0,0,...,1,0,0,0,1,0,1,0,1,0


In [154]:
# make sure train and test are not missing features
def match_cols(a, b):
  b_missing = set(a.columns) - set(b.columns)
  a_missing = set(b.columns) - set(a.columns)
  for col in a_missing:
    a[col] = 0
  for col in b_missing:
    b[col] = 0

match_cols(X_train, X_test)
match_cols(X_train, X_valid)
match_cols(X_test, X_valid)
print(X_train.shape)
print(X_test.shape)
print(X_valid.shape)
print(X_train_ftu.shape)

(640, 69)
(800, 69)
(160, 69)
(640, 56)


In [155]:
print(df_train['sex'].unique())
age_train = pd.cut(df_train['age'], age_buckets, right=False, labels=range(11)).astype(int)
age_valid = pd.cut(df_valid['age'], age_buckets, right=False, labels=range(11)).astype(int)
age_test = pd.cut(df_test['age'], age_buckets, right=False, labels=range(11)).astype(int)
print(age_train.unique())

['female' 'male']
[ 4  7  6  5  2  3  1  9  8 10]


In [156]:
sex_ids = {'male': 0, 'female': 1}

def featurize_protected(df, protected, id_map):
  col = df[protected].copy()
  col.replace(id_map, inplace=True)
  return col

sex_train = featurize_protected(df_train, 'sex', sex_ids)
sex_valid = featurize_protected(df_valid, 'sex', sex_ids)
sex_test = featurize_protected(df_test, 'sex', sex_ids)
sex_train.head()

324    1
147    1
652    0
786    0
329    0
Name: sex, dtype: int64

In [157]:
X_train.to_pickle(path + 'X_train.pkl')
y_train.to_pickle(path + 'y_train.pkl')
X_train_ftu.to_pickle(path + 'X_train_ftu.pkl')

X_valid.to_pickle(path + 'X_valid.pkl')
y_valid.to_pickle(path + 'y_valid.pkl')
X_valid_ftu.to_pickle(path + 'X_valid_ftu.pkl')

X_test.to_pickle(path + 'X_test.pkl')
y_test.to_pickle(path + 'y_test.pkl')
X_test_ftu.to_pickle(path + 'X_test_ftu.pkl')

sex_train.to_pickle(path + 'sex_train.pkl')
sex_valid.to_pickle(path + 'sex_valid.pkl')
sex_test.to_pickle(path + 'sex_test.pkl')

age_train.to_pickle(path + 'age_train.pkl')
age_valid.to_pickle(path + 'age_valid.pkl')
age_test.to_pickle(path + 'age_test.pkl')

In [158]:
# NOW COMPUTE STATS ON TRAIN AND TEST -- OUT OF DATE AS NOT COMPUTED ON VALID

num_train = sum(1 for i,j in sex_train.iteritems())
num_test = sum(1 for i,j in sex_test.iteritems())

print ("TRAIN prop women: ", sum(1 for i, j in sex_train.iteritems() if (j == 1)), 100.0*sum(1 for i, j in sex_train.iteritems() if (j == 1))/num_train)
print ("TRAIN prop men: ", 100.0*sum(1 for i, j in sex_train.iteritems() if (j == 0))/num_train)

print ("TEST prop women: ", 100.0*sum(1 for i, j in sex_test.iteritems() if (j == 1))/num_test)
print ("TEST prop men: ", 100.0*sum(1 for i, j in sex_test.iteritems() if (j == 0))/num_test)

print ("Total pop women: ", 100.0*(sum(1 for i, j in sex_train.iteritems() if (j == 1)) + sum(1 for i, j in sex_test.iteritems() if (j == 1)))/(num_train+num_test))
print ("Total pop men: ", 100.0*(sum(1 for i, j in sex_train.iteritems() if (j == 0)) + sum(1 for i, j in sex_test.iteritems() if (j == 0)))/(num_train+num_test))

TRAIN prop women:  212 33.125
TRAIN prop men:  66.875
TEST prop women:  31.875
TEST prop men:  68.125
Total pop women:  32.43055555555556
Total pop men:  67.56944444444444


In [159]:
num_train = sum(1 for i,j in age_train.iteritems())
num_test = sum(1 for i,j in age_test.iteritems())
s = 0
for q in range(11):
    s += 100.0*(sum(1 for i, j in age_train.iteritems() if (j == q)) + sum(1 for i, j in age_test.iteritems() if (j == q)))/(num_train+num_test)
    print ("Total age pop demographic ", q, ": ", 100.0*(sum(1 for i, j in age_train.iteritems() if (j == q)) + sum(1 for i, j in age_test.iteritems() if (j == q)))/(num_train+num_test))
print(s)

Total age pop demographic  0 :  0.0
Total age pop demographic  1 :  15.694444444444445
Total age pop demographic  2 :  22.569444444444443
Total age pop demographic  3 :  17.5
Total age pop demographic  4 :  16.041666666666668
Total age pop demographic  5 :  8.75
Total age pop demographic  6 :  6.736111111111111
Total age pop demographic  7 :  4.791666666666667
Total age pop demographic  8 :  2.7083333333333335
Total age pop demographic  9 :  2.986111111111111
Total age pop demographic  10 :  2.2222222222222223
100.00000000000001
