In [1]:
from __future__ import print_function
import torch
import numpy as np
import pandas as pd
from wide_deep.torch_model import WideDeep
from wide_deep.data_utils import prepare_data

DF = pd.read_csv('data/adult_data.csv')
DF['income_label'] = (DF["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
age_groups = [0, 25, 50, 90]

DF.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,income_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


In [2]:
# Experiment set up
wide_cols = ['age','hours_per_week','education', 'relationship','workclass',
             'occupation','native_country','gender']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])
embeddings_cols = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"]
target = 'income_label'
method = 'logistic'
wd_dataset = prepare_data(DF, wide_cols,crossed_cols, embeddings_cols, continuous_cols, target)

In [3]:
# Network set up
wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class=1 # for logistic and regression
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]

# Build the model
model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,encoding_dict,n_class)

# Run it as your usual sklearn fit/predict
train_dataset = wd_dataset['train_dataset']
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)

  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


Epoch 1 of 10, Loss: 0.459, accuracy: 0.8134
Epoch 2 of 10, Loss: 0.309, accuracy: 0.8348
Epoch 3 of 10, Loss: 0.381, accuracy: 0.8374
Epoch 4 of 10, Loss: 0.121, accuracy: 0.8396
Epoch 5 of 10, Loss: 0.177, accuracy: 0.84
Epoch 6 of 10, Loss: 0.205, accuracy: 0.8405
Epoch 7 of 10, Loss: 0.547, accuracy: 0.8405
Epoch 8 of 10, Loss: 0.481, accuracy: 0.8406
Epoch 9 of 10, Loss: 0.23, accuracy: 0.8409
Epoch 10 of 10, Loss: 0.298, accuracy: 0.8428


In [5]:
# Test your results
test_dataset  = wd_dataset['test_dataset']
pred = model.predict(test_dataset)

from sklearn.metrics import accuracy_score
print(accuracy_score(pred, test_dataset.labels))

0.838121886303


In [None]:
print(model.get_embeddings('workclass'))

In [6]:
age_groups = [0, 25, 50, 90]
age_labels = range(len(age_groups) - 1)
DF['age_group'] = pd.cut(DF['age'], age_groups, labels=age_labels)
wide_cols = ['hours_per_week','education', 'relationship','workclass',
             'occupation','native_country','gender']
crossed_cols  = (['education', 'occupation'], ['native_country', 'occupation'])
embeddings_cols  = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native_country',10)]
continuous_cols = ["hours_per_week"]
target = 'age_group'
method = 'multiclass'

wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_unique = len(np.unique(wd_dataset['train_dataset'].labels))
n_class=3
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]

model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,encoding_dict,n_class,dropout=0.2)
model.compile(method=method)
train_dataset = wd_dataset['train_dataset']
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)

Epoch 1 of 10, Loss: 1.014, accuracy: 0.6636
Epoch 2 of 10, Loss: 0.929, accuracy: 0.6867
Epoch 3 of 10, Loss: 0.909, accuracy: 0.69
Epoch 4 of 10, Loss: 0.8, accuracy: 0.6935
Epoch 5 of 10, Loss: 0.715, accuracy: 0.6946
Epoch 6 of 10, Loss: 0.772, accuracy: 0.6965
Epoch 7 of 10, Loss: 0.918, accuracy: 0.6974
Epoch 8 of 10, Loss: 0.77, accuracy: 0.6969
Epoch 9 of 10, Loss: 0.883, accuracy: 0.701
Epoch 10 of 10, Loss: 0.802, accuracy: 0.7006


In [8]:
test_dataset  = wd_dataset['test_dataset']
pred = model.predict_proba(test_dataset)
print(pred)

[[  9.99934077e-01   6.59206926e-05   9.15232445e-10]
 [  1.75810200e-09   9.99999881e-01   9.33926714e-08]
 [  3.12858418e-07   9.99998152e-01   1.50581332e-06]
 ..., 
 [  1.66599937e-02   9.58539546e-01   2.48004813e-02]
 [  5.63067124e-06   9.99965906e-01   2.84431426e-05]
 [  8.40241969e-01   1.57500654e-01   2.25735293e-03]]


In [15]:
from sklearn.metrics import f1_score, accuracy_score
print(f1_score(model.predict(test_dataset), test_dataset.labels, average="weighted"))
print(accuracy_score(model.predict(test_dataset), test_dataset.labels))

0.72878063226
0.700266157101


In [16]:
wide_cols = ['hours_per_week','education', 'relationship','workclass',
             'occupation','native_country','gender']
crossed_cols  = (['education', 'occupation'], ['native_country', 'occupation'])
embeddings_cols  = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native_country',10)]
continuous_cols = ["hours_per_week"]
target = 'age'
hidden_layers = [100,50]
method = 'regression'

wd_dataset = prepare_data(DF, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_unique = len(np.unique(wd_dataset['train_dataset'].labels))
n_class=1
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]

model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,encoding_dict,n_class)
model.compile(method=method)
train_dataset = wd_dataset['train_dataset']
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)

Epoch 1 of 10, Loss: 60.014
Epoch 2 of 10, Loss: 164.762
Epoch 3 of 10, Loss: 118.137
Epoch 4 of 10, Loss: 221.99
Epoch 5 of 10, Loss: 106.474
Epoch 6 of 10, Loss: 100.781
Epoch 7 of 10, Loss: 56.632
Epoch 8 of 10, Loss: 56.965
Epoch 9 of 10, Loss: 142.998
Epoch 10 of 10, Loss: 59.393


In [17]:
test_dataset  = wd_dataset['test_dataset']
pred = model.predict(test_dataset)
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(pred, test_dataset.labels)))

11.1512667138
