# AUC multiclass computation

## Initial imports

In [1]:
import numpy as np
import pandas as pd
from torch.optim import SGD, lr_scheduler

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.models import TabMlp, WideDeep
from torchmetrics import AUC, AUROC
from pytorch_widedeep.initializers import XavierNormal
from pytorch_widedeep.datasets import load_ecoli
from pytorch_widedeep.utils import LabelEncoder

from sklearn.model_selection import train_test_split

# Let pandas display up to 200 columns and 300 rows before truncating a frame
pd.options.display.max_columns = 200
pd.options.display.max_rows = 300

2021-12-19 11:41:25.345062: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-19 11:41:25.345109: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from .mio5_utils import VarReader5


In [2]:
# Load the E. coli dataset shipped with pytorch_widedeep as a DataFrame
df = load_ecoli(as_frame=True)
# Preview the first rows to check the columns and the "class" target
df.head()

Unnamed: 0,SequenceName,mcg,gvh,lip,chg,aac,alm1,alm2,class
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [4]:
# Number of samples per class, to see how imbalanced the target is
df["class"].value_counts()

cp     143
im      77
pp      52
imU     35
om      20
omL      5
imS      2
imL      2
Name: class, dtype: int64

In [5]:
# Remove the three rarest classes (5, 2 and 2 samples in the counts above)
# and renumber the remaining rows from 0.
# NOTE(review): presumably these are too small for the stratified splits below - confirm.
rare_classes = ["omL", "imS", "imL"]
df = df.loc[~df["class"].isin(rare_classes)].reset_index(drop=True)

In [6]:
# Integer-encode the "class" column, then shift the codes down by one.
# NOTE(review): the shift presumably exists because the encoder numbers
# categories from 1 - confirm against the LabelEncoder documentation.
encoder = LabelEncoder(["class"])
df_enc = encoder.fit_transform(df)
df_enc["class"] = df_enc["class"].sub(1)

In [7]:
# Drop the "SequenceName" column, which is not needed in this example.
# errors="ignore" keeps the cell re-runnable: df_enc is overwritten here, so
# without it a second execution raises KeyError because the column is already gone.
df_enc = df_enc.drop(columns=["SequenceName"], errors="ignore")

In [8]:
# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets. Both calls use
# random_state=1 so the split is reproducible.
df_train, df_holdout = train_test_split(
    df_enc, test_size=0.2, stratify=df_enc["class"], random_state=1
)
df_valid, df_test = train_test_split(
    df_holdout, test_size=0.5, stratify=df_holdout["class"], random_state=1
)

## Preparing the data

In [9]:
# Every column except the "class" target is used as a continuous feature
continuous_cols = [col for col in df_enc.columns if col != "class"]

In [10]:
# deeptabular input: fit the preprocessor (with scaling) on the training split
# only, then apply the fitted transform to the validation and test splits
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, scale=True)
X_tab_train = tab_preprocessor.fit_transform(df_train)
X_tab_valid, X_tab_test = (
    tab_preprocessor.transform(split) for split in (df_valid, df_test)
)

# target arrays, one per split
y_train, y_valid, y_test = (
    split["class"].values for split in (df_train, df_valid, df_test)
)

# features and target bundled in the dict form handed to trainer.fit
X_train = dict(X_tab=X_tab_train, target=y_train)
X_val = dict(X_tab=X_tab_valid, target=y_valid)

## Define the model

In [11]:
# MLP over the preprocessed tabular features, wrapped in WideDeep with one
# prediction unit per distinct target class
n_classes = df_enc["class"].nunique()
deeptabular = TabMlp(
    continuous_cols=tab_preprocessor.continuous_cols,
    column_idx=tab_preprocessor.column_idx,
)
model = WideDeep(pred_dim=n_classes, deeptabular=deeptabular)
model

WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cont_norm): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (tab_mlp): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Dropout(p=0.1, inplace=False)
            (1): Linear(in_features=7, out_features=200, bias=True)
            (2): ReLU(inplace=True)
          )
          (dense_layer_1): Sequential(
            (0): Dropout(p=0.1, inplace=False)
            (1): Linear(in_features=200, out_features=100, bias=True)
            (2): ReLU(inplace=True)
          )
        )
      )
    )
    (1): Linear(in_features=100, out_features=5, bias=True)
  )
)

In [12]:
# AUROC metric from torchmetrics, sized to the number of distinct target classes.
# NOTE(review): newer torchmetrics releases appear to require an explicit `task`
# argument here - confirm against the installed version before upgrading.
auroc = AUROC(num_classes=df_enc["class"].nunique())



In [13]:
# Optimizer: SGD over the parameters of the deeptabular component only
deep_opt = SGD(params=model.deeptabular.parameters(), lr=0.1)
# LR scheduler: StepLR with step_size=3, attached to that optimizer
deep_sch = lr_scheduler.StepLR(optimizer=deep_opt, step_size=3)

# Trainer: per-component settings are dicts keyed by the model component name
trainer = Trainer(
    model,
    objective="multiclass_focal_loss",
    metrics=[auroc],
    optimizers={"deeptabular": deep_opt},
    lr_schedulers={"deeptabular": deep_sch},
    initializers={"deeptabular": XavierNormal},
)

# Train for 5 epochs in batches of 50, passing the validation split as X_val
trainer.fit(X_train=X_train, X_val=X_val, batch_size=50, n_epochs=5)

epoch 1: 100%|██████████| 6/6 [00:00<00:00, 107.87it/s, loss=0.116, metrics={'AUROC': 0.4505}]
valid: 100%|██████████| 1/1 [00:00<00:00,  5.51it/s, loss=0.109, metrics={'AUROC': 0.4985}]
epoch 2: 100%|██████████| 6/6 [00:00<00:00, 111.62it/s, loss=0.11, metrics={'AUROC': 0.49}]
valid: 100%|██████████| 1/1 [00:00<00:00, 14.97it/s, loss=0.103, metrics={'AUROC': 0.532}]
epoch 3: 100%|██████████| 6/6 [00:00<00:00, 98.67it/s, loss=0.104, metrics={'AUROC': 0.5155}]
valid: 100%|██████████| 1/1 [00:00<00:00, 15.71it/s, loss=0.0979, metrics={'AUROC': 0.5642}]
epoch 4: 100%|██████████| 6/6 [00:00<00:00, 105.48it/s, loss=0.0987, metrics={'AUROC': 0.561}]
valid: 100%|██████████| 1/1 [00:00<00:00, 14.19it/s, loss=0.0976, metrics={'AUROC': 0.5703}]
epoch 5: 100%|██████████| 6/6 [00:00<00:00, 98.81it/s, loss=0.0993, metrics={'AUROC': 0.522}]
valid: 100%|██████████| 1/1 [00:00<00:00, 15.59it/s, loss=0.0969, metrics={'AUROC': 0.5776}]
