In [28]:
from __future__ import print_function
import pandas as pd
import numpy as np

DF = pd.read_csv('data/adult_data.csv')

# Binary target for logistic classification: 1 when the income bracket
# string contains ">50K", 0 otherwise.
earns_over_50k = DF["income_bracket"].apply(lambda bracket: ">50K" in bracket)
DF['income_label'] = earns_over_50k.astype(int)

# Extra target used later to illustrate multiclass classification: the age
# is bucketed with the bin edges below and each bucket is labelled 0, 1, 2, ...
age_groups = [0, 25, 50, 90]
age_labels = range(len(age_groups) - 1)
DF['age_group'] = pd.cut(DF['age'], bins=age_groups, labels=age_labels)

DF.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,income_label,age_group
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,2
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,1


In [29]:
# Columns fed to the "wide-side" (linear part) of the model
wide_cols = [
    'age',
    'hours_per_week',
    'education',
    'relationship',
    'workclass',
    'occupation',
    'native_country',
    'gender',
]

# Groups of columns that are combined into a single crossed feature
crossed_cols = (
    ['education', 'occupation'],
    ['native_country', 'occupation'],
)

# (column name, embedding dimension) pairs for the "deep-side"
embeddings_cols = [
    ('education', 10),
    ('relationship', 8),
    ('workclass', 10),
    ('occupation', 10),
    ('native_country', 10),
]

# Numeric columns passed to the deep-side alongside the embedded columns
continuous_cols = ["age", "hours_per_week"]

# Name of the label column and the kind of problem being modelled
target = 'income_label'
method = 'logistic'

# Hidden layers for the "deep-side" of the model
hidden_layers = [100, 50]

In [30]:
# embeddings_cols may contain (column, embedding_dim) pairs or bare column
# names. Bare names get the default dimension def_dim.
#
# The previous check, len(embeddings_cols[0]) == 1, could not detect bare
# names (len('education') is 9, not 1), so each entry is inspected instead.
#
# NOTE: this cell rebinds embeddings_cols to a plain list of column names.
# Re-run the configuration cell before re-running this one, otherwise every
# column silently falls back to def_dim.

# Fallback embedding dimension. It is not defined in any earlier visible cell;
# the value is a reviewer choice -- adjust as needed.
def_dim = 8

emb_dim = dict()
emb_names = []
for emb in embeddings_cols:
    if isinstance(emb, (tuple, list)):
        emb_name, emb_size = emb
    else:
        emb_name, emb_size = emb, def_dim
    emb_dim[emb_name] = emb_size
    emb_names.append(emb_name)

embeddings_cols = emb_names
deep_cols = embeddings_cols + continuous_cols

In [31]:
# Labels as a plain numpy array
Y = np.array(DF[target])

# Columns needed by either side of the model, de-duplicated in order of first
# appearance. A set() would make the column order (and everything derived
# from it, e.g. categorical_columns) vary from one run to the next.
selected_cols = []
for col in wide_cols + deep_cols:
    if col not in selected_cols:
        selected_cols.append(col)

# Subset first and copy afterwards: only the needed columns are duplicated and
# df_tmp is an independent frame that can safely receive new columns.
df_tmp = DF[selected_cols].copy()

# Build the crossed columns: the values of each group of columns are joined
# row-wise with '-' and stored under the column names joined with '_'.
crossed_columns = []
for cols in crossed_cols:
    colname = '_'.join(cols)
    df_tmp[colname] = df_tmp[cols].apply(lambda x: '-'.join(x), axis=1)
    crossed_columns.append(colname)

# Extract the categorical column names that can be one hot encoded later
categorical_columns = list(df_tmp.select_dtypes(include=['object']).columns)

In [32]:
def label_encode(df, cols=None):
    """
    Helper function to label-encode some features of a given dataset.

    Every distinct value of an encoded column is replaced by an integer index.
    Indexes are assigned in the order returned by ``Series.unique()``.

    Parameters:
    --------
    df  (pd.Dataframe): dataset to encode. It is modified in place.
    cols (list): optional - columns to be label-encoded. When None, all the
    columns of dtype ``object`` are encoded.

    Returns:
    ________
    val_to_idx (dict) : Dictionary of dictionaries mapping, for each encoded
    column, every original value to its integer index
    df (pd.Dataframe): the same (mutated) df with Label-encoded features.
    """

    # "is None" rather than "== None": the equality test is evaluated
    # element-wise on array-like inputs and cannot be used as a condition.
    if cols is None:
        cols = list(df.select_dtypes(include=['object']).columns)

    # Build the value -> index mapping for every requested column.
    val_to_idx = dict()
    for col in cols:
        val_to_idx[col] = {val: idx for idx, val in enumerate(df[col].unique())}

    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for col, mapping in val_to_idx.items():
        df[col] = df[col].apply(lambda x: mapping[x])

    return val_to_idx, df

# Encode the dataframe and keep the encoding dictionary only for the
# deep_cols (for the wide_cols it is unnecessary)
encoding_dict, df_tmp = label_encode(df_tmp)
encoding_dict = {k: encoding_dict[k] for k in encoding_dict if k in deep_cols}

# One (column name, number of distinct values, embedding dimension) tuple per
# encoded deep column. .items() is used because .iteritems() only exists on
# Python 2 dictionaries.
embeddings_input = []
for k, v in encoding_dict.items():
    embeddings_input.append((k, len(v), emb_dim[k]))

# Select the deep_cols and record each column's position; these indexes will
# be used later to slice the tensors
df_deep = df_tmp[deep_cols]
deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}

In [33]:
# The wide side uses the raw wide columns plus the crossed columns.
wide_feature_cols = wide_cols + crossed_columns
df_wide = df_tmp[wide_feature_cols]

# df_tmp is no longer needed once the wide and deep frames are extracted.
del df_tmp

# One-hot encode only the wide features that were categorical.
dummy_cols = [col for col in wide_feature_cols if col in categorical_columns]
df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

In [34]:
from sklearn.model_selection import train_test_split
from collections import namedtuple

seed = 1981

# Split the wide features, deep features and labels in ONE call so the three
# arrays are always partitioned with the same row indices. The labels were
# previously split with a hardcoded random_state, which would have misaligned
# them with the features as soon as `seed` was changed.
(X_train_wide, X_test_wide,
 X_train_deep, X_test_deep,
 y_train, y_test) = train_test_split(
    df_wide.values, df_deep.values, Y, test_size=0.3, random_state=seed)

# Building the output dictionary
wd_dataset = dict()
# Lightweight containers grouping the wide inputs, deep inputs and labels
train_dataset = namedtuple('train_dataset', 'wide, deep, labels')
test_dataset  = namedtuple('test_dataset' , 'wide, deep, labels')
wd_dataset['train_dataset'] = train_dataset(X_train_wide, X_train_deep, y_train)
wd_dataset['test_dataset']  = test_dataset(X_test_wide, X_test_deep, y_test)
# (column, number of distinct values, embedding dimension) per embedded column
wd_dataset['embeddings_input']  = embeddings_input
# Position of every deep column inside the deep feature array
wd_dataset['deep_column_idx'] = deep_column_idx
# value -> index mappings produced by label_encode for the deep columns
wd_dataset['encoding_dict'] = encoding_dict

In [35]:
# Show the training split: a namedtuple with the fields wide, deep and labels
print(wd_dataset['train_dataset'])

train_dataset(wide=array([[46, 50,  0, ...,  0,  0,  0],
       [32, 45,  1, ...,  0,  0,  0],
       [30, 30,  0, ...,  0,  0,  0],
       ..., 
       [40, 40,  0, ...,  0,  0,  0],
       [45, 37,  1, ...,  0,  0,  0],
       [40, 45,  1, ...,  0,  0,  0]]), deep=array([[ 3,  1,  6, ...,  0, 46, 50],
       [ 0,  0,  2, ...,  0, 32, 45],
       [ 1,  4,  2, ...,  0, 30, 30],
       ..., 
       [ 1,  0,  2, ...,  0, 40, 40],
       [ 0,  1,  2, ...,  0, 45, 37],
       [ 0,  1,  2, ...,  0, 40, 45]]), labels=array([1, 0, 0, ..., 0, 0, 0]))


In [36]:
# Show the (column, number of distinct values, embedding dimension) tuples
print(wd_dataset['embeddings_input'])

[('workclass', 9, 10), ('education', 16, 10), ('native_country', 42, 10), ('relationship', 6, 8), ('occupation', 15, 10)]


In [37]:
# Show the mapping from each deep column name to its position in df_deep
print(wd_dataset['deep_column_idx'])

{'hours_per_week': 6, 'native_country': 4, 'relationship': 1, 'age': 5, 'workclass': 2, 'education': 0, 'occupation': 3}


In [38]:
# Display the per-column value -> integer index mappings kept for the deep columns
wd_dataset['encoding_dict']

{'education': {'10th': 12,
  '11th': 2,
  '12th': 15,
  '1st-4th': 13,
  '5th-6th': 11,
  '7th-8th': 8,
  '9th': 4,
  'Assoc-acdm': 6,
  'Assoc-voc': 7,
  'Bachelors': 0,
  'Doctorate': 9,
  'HS-grad': 1,
  'Masters': 3,
  'Preschool': 14,
  'Prof-school': 10,
  'Some-college': 5},
 'native_country': {'?': 4,
  'Cambodia': 17,
  'Canada': 10,
  'China': 28,
  'Columbia': 16,
  'Cuba': 1,
  'Dominican-Republic': 24,
  'Ecuador': 19,
  'El-Salvador': 25,
  'England': 9,
  'France': 26,
  'Germany': 11,
  'Greece': 35,
  'Guatemala': 27,
  'Haiti': 22,
  'Holand-Netherlands': 41,
  'Honduras': 8,
  'Hong': 38,
  'Hungary': 40,
  'India': 3,
  'Iran': 12,
  'Ireland': 39,
  'Italy': 14,
  'Jamaica': 2,
  'Japan': 29,
  'Laos': 20,
  'Mexico': 5,
  'Nicaragua': 36,
  'Outlying-US(Guam-USVI-etc)': 32,
  'Peru': 31,
  'Philippines': 13,
  'Poland': 15,
  'Portugal': 23,
  'Puerto-Rico': 7,
  'Scotland': 33,
  'South': 6,
  'Taiwan': 21,
  'Thailand': 18,
  'Trinadad&Tobago': 34,
  'United-Sta