In [21]:
import numpy as np
import pandas as pd
import torch

In [22]:
from pytorch_widedeep.preprocessing import WidePreprocessor, DeepPreprocessor
from pytorch_widedeep.models import Wide, DeepDense, WideDeep
from pytorch_widedeep.metrics import BinaryAccuracy

In [23]:
# Load the Adult dataset from a zipped CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/adult/adult.csv.zip')

In [24]:
# let's do some formatting for convenience: hyphens in column names become underscores
df.columns = [c.replace("-", "_") for c in df.columns]

# let's add one categorical variable: age cut into 9 buckets labelled 0..8
# (pd.cut bins are right-closed by default, so the first bucket is (16, 25])
df['age_buckets'] = pd.cut(df.age, bins=[16, 25, 30, 35, 40, 45, 50, 55, 60, 91], labels=np.arange(9))

# let's create a binary target: 1 where the income string contains ">50K", else 0.
# The guard keeps this cell re-runnable: once 'income' has been replaced by
# 'income_label' there is nothing left to convert, so a second run is a no-op
# instead of a KeyError.
if 'income' in df.columns:
    # vectorised literal substring match (regex=False) instead of a per-row lambda
    df['income_label'] = df['income'].str.contains('>50K', regex=False).astype(int)
    # rebind rather than mutate in place
    df = df.drop('income', axis=1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,age_buckets,income_label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,3,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,4,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0,0


In [25]:
# Columns fed to the wide (linear) component.
wide_cols = [
    'age_buckets',
    'education',
    'relationship',
    'workclass',
    'occupation',
    'native_country',
    'gender',
]

# Pairs of columns to be crossed for the wide component.
crossed_cols = [
    ('education', 'occupation'),
    ('native_country', 'occupation'),
]

# (column name, embedding dimension) pairs for the deep component.
cat_embed_cols = [
    ('education', 16),
    ('relationship', 8),
    ('workclass', 16),
    ('occupation', 16),
    ('native_country', 16),
]

# Numeric columns passed to the deep component alongside the embeddings.
continuous_cols = ["age", "hours_per_week"]

# Name of the label column.
target = 'income_label'

In [26]:
# TARGET
# `target` enters this cell as the label column name and leaves it as the
# array of label values. Guarding on the type keeps the cell re-runnable:
# without it a second execution would try to index `df` with the
# already-extracted array and fail.
if isinstance(target, str):
    target = df[target].values

# WIDE
# Fit the wide preprocessor on the wide and crossed columns and build the
# matrix consumed by the wide component.
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)

# DEEP
# Fit the deep preprocessor on the embedding and continuous columns; the fitted
# object is kept because later cells read attributes from it.
prepare_deep = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)

In [27]:
# Inspect the wide input matrix (numpy prints a truncated view).
X_wide

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# Check the deep input's dimensions (rows, columns).
X_deep.shape

(48842, 7)

In [29]:
# Wide component: input width taken from the wide matrix, single output unit.
wide = Wide(wide_dim=X_wide.shape[1], output_dim=1)

# Deep component: two hidden layers (64 then 32 units), wired up with the column
# index and embedding specification produced by the fitted deep preprocessor.
# Note: with this single-entry dropout list, the printed model shows p=0.0 on
# the first dense layer and p=0.5 on the second.
deepdense = DeepDense(
    hidden_layers=[64, 32],
    dropout=[0.5],
    deep_column_idx=prepare_deep.deep_column_idx,
    embed_input=prepare_deep.embeddings_input,
    continuous_cols=continuous_cols,
)

In [30]:
# Display the wide component's layers.
wide

Wide(
  (wide_linear): Linear(in_features=805, out_features=1, bias=True)
)

In [31]:
# Display the deep component's embedding and dense layers.
deepdense

DeepDense(
  (embed_layers): ModuleDict(
    (emb_layer_education): Embedding(16, 16)
    (emb_layer_native_country): Embedding(42, 16)
    (emb_layer_occupation): Embedding(15, 16)
    (emb_layer_relationship): Embedding(6, 8)
    (emb_layer_workclass): Embedding(9, 16)
  )
  (dense): Sequential(
    (dense_layer_0): Sequential(
      (0): Linear(in_features=74, out_features=64, bias=True)
      (1): LeakyReLU(negative_slope=0.01, inplace=True)
      (2): Dropout(p=0.0, inplace=False)
    )
    (dense_layer_1): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): LeakyReLU(negative_slope=0.01, inplace=True)
      (2): Dropout(p=0.5, inplace=False)
    )
  )
)

In [32]:
# Combine the wide and deep components into a single model.
model = WideDeep(wide=wide, deepdense=deepdense)

In [33]:
# Display the combined architecture; the repr shows a Linear(32 -> 1) layer appended after the DeepDense block.
model

WideDeep(
  (wide): Wide(
    (wide_linear): Linear(in_features=805, out_features=1, bias=True)
  )
  (deepdense): Sequential(
    (0): DeepDense(
      (embed_layers): ModuleDict(
        (emb_layer_education): Embedding(16, 16)
        (emb_layer_native_country): Embedding(42, 16)
        (emb_layer_occupation): Embedding(15, 16)
        (emb_layer_relationship): Embedding(6, 8)
        (emb_layer_workclass): Embedding(9, 16)
      )
      (dense): Sequential(
        (dense_layer_0): Sequential(
          (0): Linear(in_features=74, out_features=64, bias=True)
          (1): LeakyReLU(negative_slope=0.01, inplace=True)
          (2): Dropout(p=0.0, inplace=False)
        )
        (dense_layer_1): Sequential(
          (0): Linear(in_features=64, out_features=32, bias=True)
          (1): LeakyReLU(negative_slope=0.01, inplace=True)
          (2): Dropout(p=0.5, inplace=False)
        )
      )
    )
    (1): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [34]:
# Configure training. The metric is passed as the class itself, not an instance.
# NOTE(review): method='logistic' presumably selects a binary-classification objective — confirm against the library docs.
model.compile(method='logistic', metrics=[BinaryAccuracy])

In [30]:
# Train for 10 epochs on mini-batches of 256 rows, holding out 20% of the
# rows for the per-epoch validation pass reported in the log below.
model.fit(
    X_wide=X_wide,
    X_deep=X_deep,
    target=target,
    n_epochs=10,
    batch_size=256,
    val_split=0.2,
)

epoch 1: 100%|██████████| 153/153 [00:01<00:00, 93.95it/s, loss=0.441, metrics={'acc': 0.7879}]
valid: 100%|██████████| 39/39 [00:00<00:00, 141.92it/s, loss=0.373, metrics={'acc': 0.7966}]
epoch 2: 100%|██████████| 153/153 [00:01<00:00, 100.67it/s, loss=0.359, metrics={'acc': 0.834}] 
valid: 100%|██████████| 39/39 [00:00<00:00, 145.02it/s, loss=0.363, metrics={'acc': 0.8339}]
epoch 3: 100%|██████████| 153/153 [00:01<00:00, 101.43it/s, loss=0.353, metrics={'acc': 0.838}] 
valid: 100%|██████████| 39/39 [00:00<00:00, 144.21it/s, loss=0.357, metrics={'acc': 0.8375}]
epoch 4: 100%|██████████| 153/153 [00:01<00:00, 101.58it/s, loss=0.348, metrics={'acc': 0.84}]  
valid: 100%|██████████| 39/39 [00:00<00:00, 141.01it/s, loss=0.355, metrics={'acc': 0.8394}]
epoch 5: 100%|██████████| 153/153 [00:01<00:00, 99.78it/s, loss=0.345, metrics={'acc': 0.8415}] 
valid: 100%|██████████| 39/39 [00:00<00:00, 142.29it/s, loss=0.354, metrics={'acc': 0.8408}]
epoch 6: 100%|██████████| 153/153 [00:01<00:00, 97.