# Simple Binary Classification with defaults

In this notebook we will train a Wide and Deep model, and also a simple "Deep" model, using the well-known Adult dataset.

In [1]:
import numpy as np
import pandas as pd
import torch

from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, Precision
from pytorch_widedeep.datasets import load_adult

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the adult dataset as a pandas DataFrame and preview its first rows
df = load_adult(as_frame=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# For convenience, we'll replace '-' with '_' in the column names
df.columns = [c.replace("-", "_") for c in df.columns]
# Binary target: 1 when the income string contains ">50K", 0 otherwise.
# `.str.contains` with regex=False performs a vectorised literal substring
# match instead of calling a Python lambda once per row.
df["income_label"] = df["income"].str.contains(">50K", regex=False).astype(int)
# Rebind the name rather than dropping in place, so the cell reads as a
# plain transformation of `df`
df = df.drop(columns="income")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [4]:
# Discard the two columns that are not used by any model in this notebook.
# `columns=` states the axis explicitly and rebinding avoids `inplace=True`.
df = df.drop(columns=["fnlwgt", "educational_num"])

### Preparing the data

In [5]:
# Columns fed to the wide (linear) component, in the order they are encoded
wide_cols = (
    "workclass education marital_status occupation "
    "relationship race gender native_country"
).split()
# Pairs of columns that the wide preprocessor will cross
crossed_cols = [
    ("education", "occupation"),
    ("native_country", "occupation"),
]

In [6]:
# Categorical columns that will be represented through embeddings
cat_embed_cols = (
    "workclass education marital_status occupation relationship "
    "race gender capital_gain capital_loss native_country"
).split()
# Numerical columns passed to the deep component as continuous features
continuous_cols = "age hours_per_week".split()

In [7]:
# Name of the binary label column, and its values as a NumPy array
target_col = "income_label"
target = df.loc[:, target_col].values

Let's see what the preprocessors do.

In [8]:
# Wide component: fit the preprocessor on the wide and crossed columns,
# then encode the dataframe with it
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_cols,
    crossed_cols=crossed_cols,
)
X_wide = wide_preprocessor.fit_transform(df)

In [9]:
# wide_preprocessor has an attribute called encoding_dict that holds the encoding dictionary.
# Uncomment the next line to inspect it:
# wide_preprocessor.encoding_dict

In [10]:
# deeptabular
# Prepare the inputs for the deep tabular component. The categorical columns
# to embed are passed through the canonical `cat_embed_cols` argument, which
# matches the `cat_embed_input` attribute used later in this notebook.
# The continuous columns are also the ones to scale.
tab_preprocessor = TabPreprocessor(
    cat_embed_cols=cat_embed_cols,
    continuous_cols=continuous_cols,
    cols_to_scale=continuous_cols,
)
X_tab = tab_preprocessor.fit_transform(df)

In [11]:
# Check the docs to understand the useful attributes that the tab_preprocessor has. For example,
# as well as an encoding dictionary, tab_preprocessor has an attribute called cat_embed_input
# that specifies the categorical columns that will be represented as embeddings, the number
# of different categories per feature, and the dimension of the embeddings as defined by some
# of the internal rules of thumb that the preprocessor has (have a look at the docs)
tab_preprocessor.cat_embed_input

[('workclass', 9, 5),
 ('education', 16, 8),
 ('marital_status', 7, 5),
 ('occupation', 15, 7),
 ('relationship', 6, 4),
 ('race', 5, 4),
 ('gender', 2, 2),
 ('capital_gain', 123, 24),
 ('capital_loss', 99, 21),
 ('native_country', 42, 13)]

In [12]:
# Show the encoded wide input followed by its shape, one per line
print(X_wide, X_wide.shape, sep="\n")

[[  1  10  26 ...  61 103 328]
 [  1  11  27 ...  61 104 329]
 [  2  12  27 ...  61 105 330]
 ...
 [  1  11  28 ...  61 115 335]
 [  1  11  26 ...  61 115 335]
 [  7  11  27 ...  61 127 336]]
(48842, 10)


In [13]:
# Show the preprocessed tabular input followed by its shape, one per line
print(X_tab, X_tab.shape, sep="\n")

[[ 1.          1.          1.         ...  1.         -0.99512893
  -0.03408696]
 [ 1.          2.          2.         ...  1.         -0.04694151
   0.77292975]
 [ 2.          3.          2.         ...  1.         -0.77631645
  -0.03408696]
 ...
 [ 1.          2.          3.         ...  1.          1.41180837
  -0.03408696]
 [ 1.          2.          1.         ...  1.         -1.21394141
  -1.64812038]
 [ 7.          2.          2.         ...  1.          0.97418341
  -0.03408696]]
(48842, 12)


### Defining the model

In [14]:
# Wide (linear) component: input_dim is the number of distinct values in X_wide
n_wide_values = np.unique(X_wide).shape[0]
wide = Wide(input_dim=n_wide_values, pred_dim=1)

# Deep component: an MLP over the categorical embeddings and the continuous columns
tab_mlp_kwargs = dict(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[400, 200],
    mlp_dropout=0.5,
    mlp_activation="leaky_relu",
)
tab_mlp = TabMlp(**tab_mlp_kwargs)

Let's first find out how a linear model performs 

In [15]:
wide

Wide(
  (wide_linear): Embedding(809, 1, padding_idx=0)
)

Before being passed to the Trainer, the models need to be "constructed" with the ``WideDeep`` constructor class. For the particular case of the wide/linear model, not much really happens

In [16]:
# Wrap the wide component alone, giving a purely linear model
lin_model = WideDeep(wide=wide)

In [17]:
lin_model

WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(809, 1, padding_idx=0)
  )
)

In [18]:
# Binary-classification trainer for the linear model, optimised with AdamW
# (lr=0.01) and reporting accuracy and precision
lin_optimizer = torch.optim.AdamW(lin_model.parameters(), lr=0.01)
lin_trainer = Trainer(
    model=lin_model,
    objective="binary",
    optimizers=lin_optimizer,
    metrics=[Accuracy, Precision],
)

In [19]:
# Train the linear model on the wide input only, with a 0.2 validation split
lin_trainer.fit(
    X_wide=X_wide,
    target=target,
    n_epochs=4,
    batch_size=128,
    val_split=0.2,
)

epoch 1: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 109.04it/s, loss=0.426, metrics={'acc': 0.7983, 'prec': 0.6152}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 102.46it/s, loss=0.366, metrics={'acc': 0.832, 'prec': 0.6916}]
epoch 2: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 130.27it/s, loss=0.364, metrics={'acc': 0.8305, 'prec': 0.6933}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 150.46it/s, loss=0.361, metrics={'acc': 0.8357, 'prec': 0.6982}]
epoch 3: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 133.19it/s, loss=0.359, metrics={'acc': 0.8329, 'prec': 0.6994}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 145.75it/s, loss=0.361, metrics={'acc': 0.836, 'prec': 0.7009}]
epoch 4: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 130.91it/s, loss=0.358, metrics={'acc': 0.8333, 'prec': 0

Bear in mind that `wide` is a linear model where the non-linearities are captured via the crossed columns. For the crossed-columns to be effective one needs proper business knowledge. There is no magic formula to produce them

Let's have a look at the tabular model by itself.

In [20]:
# Wrap the TabMlp component alone, with no wide component
tab_model = WideDeep(deeptabular=tab_mlp)

In [21]:
tab_model

WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_embed): DiffSizeCatEmbeddings(
        (embed_layers): ModuleDict(
          (emb_layer_workclass): Embedding(10, 5, padding_idx=0)
          (emb_layer_education): Embedding(17, 8, padding_idx=0)
          (emb_layer_marital_status): Embedding(8, 5, padding_idx=0)
          (emb_layer_occupation): Embedding(16, 7, padding_idx=0)
          (emb_layer_relationship): Embedding(7, 4, padding_idx=0)
          (emb_layer_race): Embedding(6, 4, padding_idx=0)
          (emb_layer_gender): Embedding(3, 2, padding_idx=0)
          (emb_layer_capital_gain): Embedding(124, 24, padding_idx=0)
          (emb_layer_capital_loss): Embedding(100, 21, padding_idx=0)
          (emb_layer_native_country): Embedding(43, 13, padding_idx=0)
        )
        (embedding_dropout): Dropout(p=0.1, inplace=False)
      )
      (cont_norm): Identity()
      (encoder): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
        

You can see how the `WideDeep` class has added a final prediction layer that collects the activations from the last layer of the model and plugs them into the output neuron. If this were a multiclass classification problem, the prediction dimension (i.e. the size of that final layer) would need to be specified via the `pred_dim` argument when instantiating the `WideDeep` class, as we will see later.

In [22]:
# Binary-classification trainer for the deep tabular model, optimised with
# AdamW (lr=0.001) and reporting accuracy and precision
tab_optimizer = torch.optim.AdamW(tab_model.parameters(), lr=0.001)
tab_trainer = Trainer(
    model=tab_model,
    objective="binary",
    optimizers=tab_optimizer,
    metrics=[Accuracy, Precision],
)

In [23]:
# Train the deep tabular model on the tabular input only, with a 0.2 validation split
tab_trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=4,
    batch_size=128,
    val_split=0.2,
)

epoch 1: 100%|███████████████████████████████████████████| 306/306 [00:03<00:00, 97.00it/s, loss=0.37, metrics={'acc': 0.8267, 'prec': 0.7037}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 134.91it/s, loss=0.313, metrics={'acc': 0.8588, 'prec': 0.7577}]
epoch 2: 100%|███████████████████████████████████████████| 306/306 [00:03<00:00, 86.86it/s, loss=0.319, metrics={'acc': 0.8514, 'prec': 0.761}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:01<00:00, 73.13it/s, loss=0.296, metrics={'acc': 0.8675, 'prec': 0.7685}]
epoch 3: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 79.07it/s, loss=0.305, metrics={'acc': 0.8574, 'prec': 0.7646}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 130.11it/s, loss=0.289, metrics={'acc': 0.8696, 'prec': 0.7765}]
epoch 4: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 87.39it/s, loss=0.296, metrics={'acc': 0.8622, 'prec': 0

The best result I ever obtained with `LightGBM` on this dataset is 0.8782...so we are pretty close.

Let's combine the `wide` and `tab_mlp` components and see if it helps.

In [24]:
# Re-instantiate both components (the earlier instances have already been
# trained) and join them into a single Wide & Deep model
n_wide_values = np.unique(X_wide).shape[0]
wide = Wide(input_dim=n_wide_values, pred_dim=1)

tab_mlp_kwargs = dict(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[400, 200],
    mlp_dropout=0.5,
    mlp_activation="leaky_relu",
)
tab_mlp = TabMlp(**tab_mlp_kwargs)

wd_model = WideDeep(wide=wide, deeptabular=tab_mlp)

In [25]:
# Binary-classification trainer for the combined Wide & Deep model, optimised
# with AdamW (lr=0.001) and reporting accuracy and precision
wd_optimizer = torch.optim.AdamW(wd_model.parameters(), lr=0.001)
wd_trainer = Trainer(
    model=wd_model,
    objective="binary",
    optimizers=wd_optimizer,
    metrics=[Accuracy, Precision],
)

In [26]:
# Train the combined model: both the wide and the tabular inputs are supplied
wd_trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=4,
    batch_size=128,
    val_split=0.2,
)

epoch 1: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 77.48it/s, loss=0.418, metrics={'acc': 0.8047, 'prec': 0.6154}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 110.51it/s, loss=0.321, metrics={'acc': 0.8521, 'prec': 0.7059}]
epoch 2: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 82.70it/s, loss=0.333, metrics={'acc': 0.8428, 'prec': 0.7141}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 112.52it/s, loss=0.299, metrics={'acc': 0.866, 'prec': 0.7447}]
epoch 3: 100%|██████████████████████████████████████████| 306/306 [00:04<00:00, 74.34it/s, loss=0.312, metrics={'acc': 0.8533, 'prec': 0.7404}]
valid: 100%|███████████████████████████████████████████████| 77/77 [00:00<00:00, 89.86it/s, loss=0.29, metrics={'acc': 0.8683, 'prec': 0.7496}]
epoch 4: 100%|██████████████████████████████████████████| 306/306 [00:04<00:00, 65.32it/s, loss=0.301, metrics={'acc': 0.8591, 'prec': 0

For this particular case, the combination of both did not lead to better results than using just the `tab_mlp` model, when training for only 4 epochs.

Note that we have used a `TabMlp` model, but we could use any other model in the library with the same syntax.

In [27]:
from pytorch_widedeep.models import TabTransformer

The parameters of the `TabTransformer` are the following:

```
column_idx: Dict[str, int],
cat_embed_input: Optional[List[Tuple[str, int]]] = None,
cat_embed_dropout: Optional[float] = None,
use_cat_bias: Optional[bool] = None,
cat_embed_activation: Optional[str] = None,
shared_embed: Optional[bool] = None,
add_shared_embed: Optional[bool] = None,
frac_shared_embed: Optional[float] = None,
continuous_cols: Optional[List[str]] = None,
cont_norm_layer: Optional[Literal["batchnorm", "layernorm"]] = None,
embed_continuous: Optional[bool] = None,
embed_continuous_method: Optional[Literal["standard", "piecewise", "periodic"]] = None,
cont_embed_dropout: Optional[float] = None,
cont_embed_activation: Optional[str] = None,
quantization_setup: Optional[Dict[str, List[float]]] = None,
n_frequencies: Optional[int] = None,
sigma: Optional[float] = None,
share_last_layer: Optional[bool] = None,
full_embed_dropout: Optional[bool] = None,
input_dim: int = 32,
n_heads: int = 8,
use_qkv_bias: bool = False,
n_blocks: int = 4,
attn_dropout: float = 0.2,
ff_dropout: float = 0.1,
ff_factor: int = 4,
transformer_activation: str = "gelu",
use_linear_attention: bool = False,
use_flash_attention: bool = False,
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
mlp_dropout: float = 0.1,
mlp_batchnorm: bool = False,
mlp_batchnorm_last: bool = False,
mlp_linear_first: bool = True,
```

Please see the documentation for details on each of them. For now, let's see how one could use a `TabTransformer` model in a few lines of code.

In [28]:
# TabTransformer over the same preprocessed inputs used for the TabMlp.
# Arguments not listed here keep the defaults shown in the signature above.
tab_transformer = TabTransformer(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=continuous_cols,
    embed_continuous_method="standard",
    cont_norm_layer="layernorm",
    cont_embed_dropout=0.2,
    cont_embed_activation="leaky_relu",
    n_heads=4,
    ff_dropout=0.2,
    mlp_dropout=0.5,
    mlp_activation="leaky_relu",
    # The signature declares this flag as `bool`, so pass a real boolean
    # rather than the string "True"
    mlp_linear_first=True,
)

In [29]:
# Wrap the TabTransformer alone; note that this rebinds `tab_model`, replacing the TabMlp-based model
tab_model = WideDeep(deeptabular=tab_transformer)

In [30]:
tab_model

WideDeep(
  (deeptabular): Sequential(
    (0): TabTransformer(
      (cat_embed): SameSizeCatEmbeddings(
        (embed): Embedding(325, 32, padding_idx=0)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (cont_norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
      (cont_embed): ContEmbeddings(
        INFO: [ContLinear = weight(n_cont_cols, embed_dim) + bias(n_cont_cols, embed_dim)]
        (linear): ContLinear(n_cont_cols=2, embed_dim=32, embed_dropout=0.2)
        (activation_fn): LeakyReLU(negative_slope=0.01, inplace=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (encoder): Sequential(
        (transformer_block0): TransformerEncoder(
          (attn): MultiHeadedAttention(
            (dropout): Dropout(p=0.2, inplace=False)
            (q_proj): Linear(in_features=32, out_features=32, bias=False)
            (kv_proj): Linear(in_features=32, out_features=64, bias=False)
            (out_proj): Linear(in_features=32, out_features=32,

In [31]:
# Binary-classification trainer for the TabTransformer-based model, optimised
# with AdamW (lr=0.001) and reporting accuracy and precision
transformer_optimizer = torch.optim.AdamW(tab_model.parameters(), lr=0.001)
tab_trainer = Trainer(
    model=tab_model,
    objective="binary",
    optimizers=transformer_optimizer,
    metrics=[Accuracy, Precision],
)

In [32]:
# Train the TabTransformer-based model for a single epoch, with a 0.2 validation split
tab_trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=1,
    batch_size=128,
    val_split=0.2,
)

epoch 1: 100%|██████████████████████████████████████████| 306/306 [00:11<00:00, 27.57it/s, loss=0.359, metrics={'acc': 0.8334, 'prec': 0.7082}]
valid: 100%|███████████████████████████████████████████████| 77/77 [00:01<00:00, 57.89it/s, loss=0.33, metrics={'acc': 0.8536, 'prec': 0.7152}]
