# Simple Binary Classification with defaults

In this notebook we will train a Wide and Deep model and simply a "Deep" model using the well known adult dataset

In [16]:
import numpy as np
import pandas as pd
import torch

from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, TabNet
from pytorch_widedeep.metrics import Accuracy, Precision
from pytorch_widedeep.datasets import load_adult

In [2]:
df = load_adult(as_frame=True)
df.head()

  and should_run_async(code)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# For convenience, we'll replace '-' with '_'
df.columns = [c.replace("-", "_") for c in df.columns]
# binary target
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop("income", axis=1, inplace=True)
df.head()

  and should_run_async(code)


Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [4]:
df.drop(["fnlwgt", "educational_num"], axis=1, inplace=True)

  and should_run_async(code)


### Preparing the data

In [5]:
# Define wide, crossed and deep tabular columns
wide_cols = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]
crossed_cols = [("education", "occupation"), ("native_country", "occupation")]

  and should_run_async(code)


In [6]:
cat_embed_cols = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "native_country",
]
continuous_cols = ["age", "hours_per_week"]

In [7]:
cat_embed_cols

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital_gain',
 'capital_loss',
 'native_country']

In [8]:
# TARGET
target_col = "income_label"
target = df[target_col].values

let's see what the preprocessors do

In [12]:
# wide
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = wide_preprocessor.fit_transform(df)

In [10]:
# # wide_preprocessor has an attribute called encoding_dict with the encoding dictionary
# wide_preprocessor.encoding_dict

In [9]:
# deeptabular
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=continuous_cols
)
X_tab = tab_preprocessor.fit_transform(df)

In [10]:
# check the docs to understand the useful attributes that the tab_preprocessor has. For example,
# as well as an encoding dictionary, tab_preprocessor has an attribute called cat_embed_input
# that specifies the categortical columns that will be represented as embeddings, the number
# of different categories per feature, and the dimension of the embeddings as defined by some
# of the internal rules of thumb that the preprocessor has (have a look to the docs)
tab_preprocessor.cat_embed_input

  and should_run_async(code)


[('workclass', 9, 5),
 ('education', 16, 8),
 ('marital_status', 7, 5),
 ('occupation', 15, 7),
 ('relationship', 6, 4),
 ('race', 5, 4),
 ('gender', 2, 2),
 ('capital_gain', 123, 24),
 ('capital_loss', 99, 21),
 ('native_country', 42, 13)]

In [13]:
print(X_wide)
print(X_wide.shape)

[[  1  10  26 ...  61 103 328]
 [  1  11  27 ...  61 104 329]
 [  2  12  27 ...  61 105 330]
 ...
 [  1  11  28 ...  61 115 335]
 [  1  11  26 ...  61 115 335]
 [  7  11  27 ...  61 127 336]]
(48842, 10)


  and should_run_async(code)


In [14]:
print(X_tab)
print(X_tab.shape)

[[ 1.          1.          1.         ...  1.         -0.99512893
  -0.03408696]
 [ 1.          2.          2.         ...  1.         -0.04694151
   0.77292975]
 [ 2.          3.          2.         ...  1.         -0.77631645
  -0.03408696]
 ...
 [ 1.          2.          3.         ...  1.          1.41180837
  -0.03408696]
 [ 1.          2.          1.         ...  1.         -1.21394141
  -1.64812038]
 [ 7.          2.          2.         ...  1.          0.97418341
  -0.03408696]]
(48842, 12)


### Defining the model

In [19]:
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
model = TabNet(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=continuous_cols,
)

  and should_run_async(code)


Let's first find out how a linear model performs 

In [None]:
model_checkpoint = ModelCheckpoint(
    filepath=f'./checkpoints/{args.model}/{args.country}/chkp',
    save_best_only=True,
    max_save=1,
)

callbacks = [
    LRHistory(n_epochs=10),
    EarlyStopping(patience=10),
    model_checkpoint,
]
metrics = [auroc]

trainer = Trainer(
    model,
    objective="binary",
    optimizers=torch.optim.Adam(model.parameters(), lr=0.01),
    callbacks=callbacks,
    metrics=metrics,
    verbose=True,
    seed=1,
)

In [20]:
tab_net_widedeep = WideDeep(deeptabular=model)

  and should_run_async(code)


In [21]:
tab_net_widedeep

WideDeep(
  (deeptabular): Sequential(
    (0): TabNet(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cat_embed): DiffSizeCatEmbeddings(
          (embed_layers): ModuleDict(
            (emb_layer_workclass): Embedding(10, 5, padding_idx=0)
            (emb_layer_education): Embedding(17, 8, padding_idx=0)
            (emb_layer_marital_status): Embedding(8, 5, padding_idx=0)
            (emb_layer_occupation): Embedding(16, 7, padding_idx=0)
            (emb_layer_relationship): Embedding(7, 4, padding_idx=0)
            (emb_layer_race): Embedding(6, 4, padding_idx=0)
            (emb_layer_gender): Embedding(3, 2, padding_idx=0)
            (emb_layer_capital_gain): Embedding(124, 24, padding_idx=0)
            (emb_layer_capital_loss): Embedding(100, 21, padding_idx=0)
            (emb_layer_native_country): Embedding(43, 13, padding_idx=0)
          )
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (cont_norm): Identity()
      

In [24]:
tab_trainer.fit(X_tab=X_tab, target=target, n_epochs=4, batch_size=128, val_split=0.2)

epoch 1: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 96.66it/s, loss=0.418, metrics={'acc': 0.8024, 'prec': 0.6664}]
valid: 100%|██████████████████████████████████████| 77/77 [00:00<00:00, 168.33it/s, loss=0.355, metrics={'acc': 0.8317, 'prec': 0.8402}]
epoch 2: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 98.26it/s, loss=0.376, metrics={'acc': 0.8222, 'prec': 0.7253}]
valid: 100%|██████████████████████████████████████| 77/77 [00:00<00:00, 172.84it/s, loss=0.338, metrics={'acc': 0.8546, 'prec': 0.7901}]
epoch 3: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 99.54it/s, loss=0.362, metrics={'acc': 0.8305, 'prec': 0.7402}]
valid: 100%|████████████████████████████████████████| 77/77 [00:00<00:00, 173.62it/s, loss=0.327, metrics={'acc': 0.863, 'prec': 0.761}]
epoch 4: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 97.46it/s, loss=0.351, metrics={'acc': 0.8373, 'prec': 0.7478}]
valid: 100%|█████████████████████████████

The best result I ever obtained with `LightGBM` on this dataset is 0.8782...so we are pretty close.

Let's combine the `wide` and `tab_mlp` components see if it helps

In [27]:
wd_trainer.fit(
    X_wide=X_wide, X_tab=X_tab, target=target, n_epochs=4, batch_size=128, val_split=0.2
)

epoch 1: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 87.74it/s, loss=0.511, metrics={'acc': 0.7659, 'prec': 0.5139}]
valid: 100%|███████████████████████████████████████| 77/77 [00:00<00:00, 144.96it/s, loss=0.406, metrics={'acc': 0.795, 'prec': 0.5659}]
epoch 2: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 90.59it/s, loss=0.405, metrics={'acc': 0.8114, 'prec': 0.6346}]
valid: 100%|██████████████████████████████████████| 77/77 [00:00<00:00, 148.66it/s, loss=0.371, metrics={'acc': 0.8189, 'prec': 0.6099}]
epoch 3: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 89.61it/s, loss=0.368, metrics={'acc': 0.8298, 'prec': 0.6874}]
valid: 100%|██████████████████████████████████████| 77/77 [00:00<00:00, 150.18it/s, loss=0.353, metrics={'acc': 0.8325, 'prec': 0.6342}]
epoch 4: 100%|███████████████████████████████████| 306/306 [00:03<00:00, 88.74it/s, loss=0.348, metrics={'acc': 0.8407, 'prec': 0.7187}]
valid: 100%|█████████████████████████████

For this particular case, the combination of both did not lead to better results that using just the tab_mlp model. 

Note that we have use a `TabMlp` model, but we could use any other model in the library using the same syntax