In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from wide_deep.data_utils import prepare_data

DF = pd.read_csv('data/adult_data.csv')
DF['income_label'] = (DF["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

wide_cols = ['age','hours_per_week','education', 'relationship','workclass',
             'occupation','native_country','gender']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])
embeddings_cols = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"]
target = 'income_label'
method = 'logistic'

wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)

In [5]:
wd_dataset['train_dataset']

train_dataset(wide=array([[46, 50,  0, ...,  0,  0,  0],
       [32, 45,  1, ...,  0,  0,  0],
       [30, 30,  0, ...,  0,  0,  0],
       ..., 
       [40, 40,  0, ...,  0,  0,  0],
       [45, 37,  1, ...,  0,  0,  0],
       [40, 45,  1, ...,  0,  0,  0]]), deep=array([[ 3,  1,  6, ...,  0, 46, 50],
       [ 0,  0,  2, ...,  0, 32, 45],
       [ 1,  4,  2, ...,  0, 30, 30],
       ..., 
       [ 1,  0,  2, ...,  0, 40, 40],
       [ 0,  1,  2, ...,  0, 45, 37],
       [ 0,  1,  2, ...,  0, 40, 45]]), labels=array([1, 0, 0, ..., 0, 0, 0]))

In [14]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable
from torch.utils.data import DataLoader

class Wide(nn.Module):
    """
    Wide-side consists in simply in "pluging" the features into the output neuron(s)

    Parameters:
    ----------
    wide_dim: int. Number of features per observation
    method  : str. Regression, logistic or multiclass
    n_class : int. number of classes. Defaults to 1 if logistic or regression
    """
    def __init__(self, wide_dim, n_class):

        super(Wide, self).__init__()
        self.wide_dim = wide_dim
        self.n_class = n_class

        self.linear = nn.Linear(self.wide_dim, self.n_class)

    def forward(self,X):

        out = F.sigmoid(self.linear(X))

        return out


In [15]:
wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class  = 1
wide_model = Wide(wide_dim, n_class)

In [16]:
print(wide_model)

Wide (
  (linear): Linear (798 -> 1)
)


In [25]:
wd_dataset['train_dataset'].labels.reshape(-1, 1).shape

(34189, 1)

In [28]:
train_dataset = np.hstack([wd_dataset['train_dataset'].labels.reshape(-1, 1), wd_dataset['train_dataset'].wide])
train_dataset

array([[ 1, 46, 50, ...,  0,  0,  0],
       [ 0, 32, 45, ...,  0,  0,  0],
       [ 0, 30, 30, ...,  0,  0,  0],
       ..., 
       [ 0, 40, 40, ...,  0,  0,  0],
       [ 0, 45, 37, ...,  0,  0,  0],
       [ 0, 40, 45, ...,  0,  0,  0]])

In [34]:
optimizer = torch.optim.Adam(model.parameters())
batch_size = 64
n_epochs = 10
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
# from http://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(n_epochs):
    total=0
    correct=0
    for i, batch in enumerate(train_loader):

        X_w = Variable(batch[:, 1:]).float()
        y = Variable(batch[:, 0]).float()

        optimizer.zero_grad()
        y_pred = wide_model(X_w)
        loss = F.binary_cross_entropy(y_pred, y)
        loss.backward()
        optimizer.step()

        total+= y.size(0)
        y_pred_cat = (y_pred > 0.5).squeeze(1).float()
        correct+= float((y_pred_cat == y).sum().data[0])

    print ('Epoch {} of {}, Loss: {}, accuracy: {}'.format(epoch+1,
        n_epochs, round(loss.data[0],3), round(correct/total,4)))

Epoch 1 of 10, Loss: 0.427, accuracy: 0.8371
Epoch 2 of 10, Loss: 0.501, accuracy: 0.8363
Epoch 3 of 10, Loss: 0.414, accuracy: 0.8368
Epoch 4 of 10, Loss: 0.183, accuracy: 0.8361
Epoch 5 of 10, Loss: 0.37, accuracy: 0.8366
Epoch 6 of 10, Loss: 0.345, accuracy: 0.837
Epoch 7 of 10, Loss: 0.187, accuracy: 0.8366
Epoch 8 of 10, Loss: 0.972, accuracy: 0.8379
Epoch 9 of 10, Loss: 0.303, accuracy: 0.8376
Epoch 10 of 10, Loss: 0.447, accuracy: 0.8373


In [35]:
class Deep(nn.Module):
    """
    Deep-side, which consists in a series of embeddings and numerical 
    features passed through a series of dense layers.

    Params:
    --------
    embeddings_input (tuple): 3-elements tuple with the embeddings "set-up" -
    (col_name, unique_values, embeddings dim)
    continuous_cols (list) : list with the name of the continuum columns
    deep_column_idx (dict) : dictionary where the keys are column names and the values
    their corresponding index in the deep-side input tensor
    hidden_layers (list) : list with the number of units per hidden layer
    n_class (int) : number of classes. Defaults to 1 if logistic or regression
    """
    def __init__(self,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,n_class):

        super(Deep, self).__init__()
        self.deep_column_idx = deep_column_idx
        self.embeddings_input = embeddings_input
        self.continuous_cols = continuous_cols
        self.hidden_layers = hidden_layers
        self.n_class = n_class

        for col,val,dim in self.embeddings_input:
            setattr(self, 'emb_layer_'+col, nn.Embedding(val, dim))

        input_emb_dim = np.sum([emb[2] for emb in self.embeddings_input])
        self.linear_1 = nn.Linear(input_emb_dim+len(continuous_cols), self.hidden_layers[0])
        for i,h in enumerate(self.hidden_layers[1:],1):
            setattr(self, 'linear_'+str(i+1), nn.Linear( self.hidden_layers[i-1], self.hidden_layers[i] ))

        self.output = nn.Linear(self.hidden_layers[-1], n_class)

    def forward(self, X):

        emb = [getattr(self, 'emb_layer_'+col)(X[:,self.deep_column_idx[col]])
               for col,_,_ in self.embeddings_input]

        cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]
        cont = [X[:, cont_idx].float()]

        deep_inp = torch.cat(emb+cont, 1)

        x_deep = F.relu(self.linear_1(deep_inp))
        for i in range(1,len(self.hidden_layers)):
            x_deep = F.relu( getattr(self, 'linear_'+str(i+1))(x_deep) )

        out = F.sigmoid(self.output(x_deep))

        return out


In [44]:
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
hidden_layers = [100,50]
deep_model = Deep(embeddings_input, continuous_cols, deep_column_idx, hidden_layers, n_class)

In [45]:
deep_model

Deep (
  (emb_layer_workclass): Embedding(9, 10)
  (emb_layer_education): Embedding(16, 10)
  (emb_layer_native_country): Embedding(42, 10)
  (emb_layer_relationship): Embedding(6, 8)
  (emb_layer_occupation): Embedding(15, 10)
  (linear_1): Linear (50 -> 100)
  (linear_2): Linear (100 -> 50)
  (output): Linear (50 -> 1)
)

In [46]:
train_dataset = np.hstack([wd_dataset['train_dataset'].labels.reshape(-1, 1), wd_dataset['train_dataset'].deep])
train_dataset

array([[ 1,  3,  1, ...,  0, 46, 50],
       [ 0,  0,  0, ...,  0, 32, 45],
       [ 0,  1,  4, ...,  0, 30, 30],
       ..., 
       [ 0,  1,  0, ...,  0, 40, 40],
       [ 0,  0,  1, ...,  0, 45, 37],
       [ 0,  0,  1, ...,  0, 40, 45]])

In [47]:
optimizer = torch.optim.Adam(deep_model.parameters())
batch_size = 64
n_epochs = 10
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
# from http://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(n_epochs):
    total=0
    correct=0
    for i, batch in enumerate(train_loader):

        X_d = Variable(batch[:, 1:])
        y = Variable(batch[:, 0]).float()

        optimizer.zero_grad()
        y_pred = deep_model(X_d)
        loss = F.binary_cross_entropy(y_pred, y)
        loss.backward()
        optimizer.step()

        total+= y.size(0)
        y_pred_cat = (y_pred > 0.5).squeeze(1).float()
        correct+= float((y_pred_cat == y).sum().data[0])

    print ('Epoch {} of {}, Loss: {}, accuracy: {}'.format(epoch+1,
        n_epochs, round(loss.data[0],3), round(correct/total,4)))

Epoch 1 of 10, Loss: 0.549, accuracy: 0.8149
Epoch 2 of 10, Loss: 0.409, accuracy: 0.8342
Epoch 3 of 10, Loss: 0.41, accuracy: 0.837
Epoch 4 of 10, Loss: 0.177, accuracy: 0.8381
Epoch 5 of 10, Loss: 0.15, accuracy: 0.8383
Epoch 6 of 10, Loss: 0.519, accuracy: 0.8404
Epoch 7 of 10, Loss: 0.383, accuracy: 0.8405
Epoch 8 of 10, Loss: 0.348, accuracy: 0.8403
Epoch 9 of 10, Loss: 0.701, accuracy: 0.8404
Epoch 10 of 10, Loss: 0.175, accuracy: 0.8417


In [51]:
class WideDeep(nn.Module):

    def __init__(self, wide_dim, embeddings_input, continuous_cols, deep_column_idx, hidden_layers, n_class):

        super(WideDeep, self).__init__()
        self.wide_dim = wide_dim
        self.deep_column_idx = deep_column_idx
        self.embeddings_input = embeddings_input
        self.continuous_cols = continuous_cols
        self.hidden_layers = hidden_layers
        self.n_class = n_class

        for col,val,dim in self.embeddings_input:
            setattr(self, 'emb_layer_'+col, nn.Embedding(val, dim))

        input_emb_dim = np.sum([emb[2] for emb in self.embeddings_input])
        self.linear_1 = nn.Linear(input_emb_dim+len(continuous_cols), self.hidden_layers[0])
        for i,h in enumerate(self.hidden_layers[1:],1):
            setattr(self, 'linear_'+str(i+1), nn.Linear( self.hidden_layers[i-1], self.hidden_layers[i] ))

        self.output = nn.Linear(self.hidden_layers[-1]+self.wide_dim, n_class)

    def forward(self, X_w, X_d):

        emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]])
               for col,_,_ in self.embeddings_input]

        cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]
        cont = [X_d[:, cont_idx].float()]

        deep_inp = torch.cat(emb+cont, 1)

        x_deep = F.relu(self.linear_1(deep_inp))
        for i in range(1,len(self.hidden_layers)):
            x_deep = F.relu( getattr(self, 'linear_'+str(i+1))(x_deep) )

        wide_deep_input = torch.cat([x_deep, X_w], 1)

        out = F.sigmoid(self.output(wide_deep_input))

        return out

In [52]:
wide_deep_model = WideDeep(wide_dim, embeddings_input, continuous_cols, deep_column_idx, hidden_layers, n_class)

In [54]:
wide_deep_model

WideDeep (
  (emb_layer_workclass): Embedding(9, 10)
  (emb_layer_education): Embedding(16, 10)
  (emb_layer_native_country): Embedding(42, 10)
  (emb_layer_relationship): Embedding(6, 8)
  (emb_layer_occupation): Embedding(15, 10)
  (linear_1): Linear (50 -> 100)
  (linear_2): Linear (100 -> 50)
  (output): Linear (848 -> 1)
)

In [57]:
class WideDeepLoader(Dataset):
    """Helper to facilitate loading the data to the pytorch models.

    Parameters:
    --------
    data: namedtuple with 3 elements - (wide_input_data, deep_inp_data, target)
    """
    def __init__(self, data):

        self.X_wide = data.wide
        self.X_deep = data.deep
        self.Y = data.labels

    def __getitem__(self, idx):

        xw = self.X_wide[idx]
        xd = self.X_deep[idx]
        y  = self.Y[idx]

        return xw, xd, y

    def __len__(self):
        return len(self.Y)


train_dataset = wd_dataset['train_dataset']
widedeep_dataset = WideDeepLoader(train_dataset)
train_loader = torch.utils.data.DataLoader(dataset=widedeep_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

In [59]:
optimizer = torch.optim.Adam(wide_deep_model.parameters())

batch_size = 64
n_epochs = 10
for epoch in range(n_epochs):
    total=0
    correct=0
    for i, (X_wide, X_deep, target) in enumerate(train_loader):
        X_d = Variable(X_deep)
        X_w = Variable(X_wide).float()
        y = Variable(target).float()

        optimizer.zero_grad()
        y_pred = wide_deep_model(X_w, X_d)
        loss = F.binary_cross_entropy(y_pred, y)
        loss.backward()
        optimizer.step()

        total+= y.size(0)
        y_pred_cat = (y_pred > 0.5).squeeze(1).float()
        correct+= float((y_pred_cat == y).sum().data[0])

    print ('Epoch {} of {}, Loss: {}, accuracy: {}'.format(epoch+1,
        n_epochs, round(loss.data[0],3), round(correct/total,4)))


Epoch 1 of 10, Loss: 0.532, accuracy: 0.8176
Epoch 2 of 10, Loss: 0.238, accuracy: 0.8355
Epoch 3 of 10, Loss: 0.382, accuracy: 0.838
Epoch 4 of 10, Loss: 0.659, accuracy: 0.8372
Epoch 5 of 10, Loss: 0.26, accuracy: 0.8396
Epoch 6 of 10, Loss: 0.435, accuracy: 0.8411
Epoch 7 of 10, Loss: 0.326, accuracy: 0.8387
Epoch 8 of 10, Loss: 0.378, accuracy: 0.8416
Epoch 9 of 10, Loss: 0.13, accuracy: 0.8426
Epoch 10 of 10, Loss: 0.267, accuracy: 0.8421
