In [1]:
import numpy as np
import pandas as pd
import torch

from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import FTTransformer, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.datasets import load_adult

In [2]:
# Load the Adult dataset as a pandas DataFrame (as_frame=True) and preview
# the first rows.
df = load_adult(as_frame=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# For convenience, swap every '-' in the column names for '_'
df.columns = df.columns.str.replace("-", "_", regex=False)

# Binary target: 1 when the income label contains ">50K", 0 otherwise
df["target"] = df["income"].str.contains(">50K", regex=False).astype(int)

# "income" is now encoded in "target"; "educational_num" is removed as well
df = df.drop(columns=["income", "educational_num"])

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,target
0,25,Private,226802,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [4]:
# Split the feature columns into categorical and continuous groups.
# A column counts as categorical when it has object dtype or fewer than 50
# distinct values (50 is just an arbitrary threshold chosen for this example);
# every other feature column is treated as continuous.
target_col = "target"
cat_cols, cont_cols = [], []
for col in df.columns:
    # Exclude the target up front so the exclusion holds whatever its dtype is,
    # instead of relying on `and`/`or` precedence inside one combined condition.
    if col == target_col:
        continue
    if df[col].dtype == "O" or df[col].nunique() < 50:
        cat_cols.append(col)
    else:
        cont_cols.append(col)

In [5]:
# Labels as an array, later passed to the trainer's `target` argument
target = df[target_col].to_numpy()

# Fit the tabular preprocessor on the categorical / continuous split and
# transform the frame in one step.
# NOTE(review): for_transformer=True presumably switches the encoding to the
# layout expected by attention-based models - confirm against the library docs.
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_cols,
    continuous_cols=cont_cols,
    for_transformer=True,
)
X_tab = tab_preprocessor.fit_transform(df)

In [7]:
# FT-Transformer built from the metadata collected by the fitted preprocessor
ft_transformer = FTTransformer(
    # column layout and embedding setup come straight from the preprocessor
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=tab_preprocessor.continuous_cols,
    # architecture hyper-parameters (36 is a multiple of the 6 heads)
    input_dim=36,
    n_heads=6,
    n_blocks=3,
)

In [8]:
# The FT-Transformer is the only component of the WideDeep model (deeptabular)
model = WideDeep(deeptabular=ft_transformer)

# Binary objective, tracking accuracy
trainer = Trainer(model, objective="binary", metrics=[Accuracy])

# A single epoch, with 20% of the rows held out for validation
trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=1,
    batch_size=256,
    val_split=0.2,
)

epoch 1: 100%|██████████| 153/153 [00:15<00:00,  9.80it/s, loss=0.35, metrics={'acc': 0.8361}] 
valid: 100%|██████████| 39/39 [00:01<00:00, 25.04it/s, loss=0.315, metrics={'acc': 0.8522}]


In [9]:
# Tab2Vec pairs the fitted preprocessor with the trained model so that raw
# dataframes can be turned into embedding vectors
t2v = Tab2Vec(tab_preprocessor=tab_preprocessor, model=model)

In [10]:
# Assume this sample is a test set that still contains the target column:
# transform then returns both the embeddings and the target values
labelled_sample = df.sample(100)
X_vec, y = t2v.transform(labelled_sample, target_col="target")

In [11]:
# X_vec is the sampled dataframe turned into embeddings: one row per sampled
# record, one column per embedding dimension
X_vec.shape

(100, 468)

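`468 = input_dim (36) * n_cols (13)`: each of the 13 feature columns contributes one 36-dimensional embedding, and these are concatenated into a single vector per row.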

In [12]:
# ...or, when no target column is passed, only the embeddings are returned
unlabelled_sample = df.sample(100)
X_vec = t2v.transform(unlabelled_sample)