# AUC multiclass computation

## Initial imports

In [16]:
import numpy as np
import pandas as pd
from torch.optim import SGD, lr_scheduler

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.models import TabMlp, WideDeep
from torchmetrics import AUC, AUROC
from pytorch_widedeep.initializers import XavierNormal
from pytorch_widedeep.datasets import load_ecoli, load_california_housing
from pytorch_widedeep.utils import LabelEncoder

from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pytorch_widedeep.training._wd_dataset import WideDeepDataset
from torch.utils.data.dataloader import DataLoader

from scipy.io import arff
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)

In [66]:
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import gen_features

cont_cols_def = gen_features(
    columns=list(map(lambda x:[x], cont_cols)),
    classes=[StandardScaler]
)

cat_cols_def = gen_features(
    columns=list(map(lambda x:[x], cat_cols)),
    classes=[None]
)

target_col_def = ([target_col], None, {})

cont_cols_def.extend(cat_cols_def)
cont_cols_def.extend(target_col_def)

mapper_df = DataFrameMapper(cont_cols_def, df_out=True)
mapper_df.fit_transform(df)

In [24]:
wddt = WideDeepDataset(X_tab=df.drop(columns=["MedHouseVal"]).values , target=df["MedHouseVal"].values, lds=False, reweight="inverse")

Using re-weighting: [INVERSE]


In [90]:
lds = False
reweight="sqrt_inv"

In [91]:
keys = np.linspace(df["MedHouseVal"].min(), df["MedHouseVal"].max(), num=50, endpoint=True)
labels = df["MedHouseVal"].values
value_dict = dict(zip(keys[:-1], np.histogram(labels, keys)[0]))

if reweight == "sqrt_inv":
    value_dict = {k: np.sqrt(v) for k, v in value_dict.items()}
    #print(value_dict)
elif reweight == "inverse":
    value_dict = {k: np.clip(v, 5, 1000) for k, v in value_dict.items()}  # clip weights for inverse re-weight
    #print(value_dict)

num_per_label=[]
for label in labels:
    difference_array = np.absolute(keys[:-1]-label)
    index = difference_array.argmin()
    num_per_label.append(value_dict[keys[index]])

print(f"Using re-weighting: [{reweight.upper()}]")

if lds:
    lds_kernel_window = get_lds_kernel_window(lds_kernel, lds_ks, lds_sigma)
    print(f"Using LDS: [{lds_kernel.upper()}] ({lds_ks}/{lds_sigma})")
    smoothed_value = convolve1d(
        np.asarray([v for _, v in value_dict.items()]), weights=lds_kernel_window, mode="constant")
    num_per_label = [smoothed_value[min(max_target - 1, int(label))] for label in labels]

weights = [np.float32(1 / x) for x in num_per_label]
scaling = len(weights) / np.sum(weights)
weights = [scaling * x for x in weights]

Using re-weighting: [SQRT_INV]


In [92]:
weights

[2.587421367056114,
 1.6564268330127332,
 1.5688547458255422,
 1.3002281835150116,
 1.3002281835150116,
 1.0895009985300896,
 1.5176428241713207,
 1.0346927110839197,
 0.8611989266097908,
 1.0413754086639448,
 1.233504800545987,
 1.0346927110839197,
 0.8789033671906682,
 0.9117574871922768,
 0.7768481507498354,
 0.7829893441427713,
 0.6866303560772578,
 0.6866303560772578,
 0.7768481507498354,
 0.7768481507498354,
 0.7829893441427713,
 0.7768481507498354,
 0.8231684205829631,
 0.7883010436082976,
 0.7419939628705237,
 0.7883010436082976,
 0.7676760531594234,
 0.7883010436082976,
 0.7883010436082976,
 0.7419939628705237,
 0.8421339223252955,
 0.8231684205829631,
 0.8231684205829631,
 0.7883010436082976,
 0.8231684205829631,
 0.7676760531594234,
 0.7883010436082976,
 0.7883010436082976,
 0.9117574871922768,
 0.7957217462011265,
 0.6866303560772578,
 0.6866303560772578,
 0.8231684205829631,
 0.9117574871922768,
 0.7636532956012267,
 0.7636532956012267,
 0.7829893441427713,
 0.741993962870

In [3]:
df = load_california_housing(as_frame=True)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [21]:
# imbalance of the classes
df["class"].value_counts()

cp     143
im      77
pp      52
imU     35
om      20
omL      5
imS      2
imL      2
Name: class, dtype: int64

In [22]:
df = df.loc[~df["class"].isin(["omL", "imS", "imL"])]
df.reset_index(inplace=True, drop=True)

In [23]:
encoder = LabelEncoder(["class"])
df_enc = encoder.fit_transform(df)
df_enc["class"] = df_enc["class"]-1

In [7]:
# drop columns we won't need in this example
df_enc = df_enc.drop(columns=["SequenceName"])

In [8]:
df_train, df_valid = train_test_split(df_enc, test_size=0.2, stratify=df_enc["class"], random_state=1)
df_valid, df_test = train_test_split(df_valid, test_size=0.5, stratify=df_valid["class"], random_state=1)

## Preparing the data

In [9]:
continuous_cols = df_enc.drop(columns=["class"]).columns.values.tolist()

In [10]:
# deeptabular
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, scale=True)
X_tab_train = tab_preprocessor.fit_transform(df_train)
X_tab_valid = tab_preprocessor.transform(df_valid)
X_tab_test = tab_preprocessor.transform(df_test)

# target
y_train = df_train["class"].values
y_valid = df_valid["class"].values
y_test = df_test["class"].values

X_train = {"X_tab": X_tab_train, "target": y_train}
X_val = {"X_tab": X_tab_valid, "target": y_valid}

## Define the model

In [11]:
deeptabular = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular, pred_dim=df_enc["class"].nunique())
model

WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_embed_and_cont): CatEmbeddingsAndCont(
        (cont_norm): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (tab_mlp): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Dropout(p=0.1, inplace=False)
            (1): Linear(in_features=7, out_features=200, bias=True)
            (2): ReLU(inplace=True)
          )
          (dense_layer_1): Sequential(
            (0): Dropout(p=0.1, inplace=False)
            (1): Linear(in_features=200, out_features=100, bias=True)
            (2): ReLU(inplace=True)
          )
        )
      )
    )
    (1): Linear(in_features=100, out_features=5, bias=True)
  )
)

In [12]:
auroc = AUROC(num_classes=df_enc["class"].nunique())



In [13]:
# Optimizers
deep_opt = SGD(model.deeptabular.parameters(), lr=0.1)
# LR Scheduler
deep_sch = lr_scheduler.StepLR(deep_opt, step_size=3)
# Hyperparameters
trainer = Trainer(
    model,
    objective="multiclass_focal_loss",
    lr_schedulers={"deeptabular": deep_sch},
    initializers={"deeptabular": XavierNormal},
    optimizers={"deeptabular": deep_opt},
    metrics=[auroc],
)

trainer.fit(X_train=X_train, X_val=X_val, n_epochs=5, batch_size=50)

epoch 1: 100%|██████████| 6/6 [00:00<00:00, 84.27it/s, loss=0.111, metrics={'AUROC': 0.285}]
valid: 100%|██████████| 1/1 [00:00<00:00,  5.57it/s, loss=0.106, metrics={'AUROC': 0.3309}]
epoch 2: 100%|██████████| 6/6 [00:00<00:00, 111.92it/s, loss=0.106, metrics={'AUROC': 0.3124}]
valid: 100%|██████████| 1/1 [00:00<00:00,  4.99it/s, loss=0.102, metrics={'AUROC': 0.375}]
epoch 3: 100%|██████████| 6/6 [00:00<00:00, 109.51it/s, loss=0.102, metrics={'AUROC': 0.3459}]
valid: 100%|██████████| 1/1 [00:00<00:00,  6.70it/s, loss=0.0967, metrics={'AUROC': 0.4444}]
epoch 4: 100%|██████████| 6/6 [00:00<00:00, 106.40it/s, loss=0.0984, metrics={'AUROC': 0.3717}]
valid: 100%|██████████| 1/1 [00:00<00:00,  5.93it/s, loss=0.0963, metrics={'AUROC': 0.4516}]
epoch 5: 100%|██████████| 6/6 [00:00<00:00, 93.06it/s, loss=0.0975, metrics={'AUROC': 0.3877}]
valid: 100%|██████████| 1/1 [00:00<00:00,  5.98it/s, loss=0.0961, metrics={'AUROC': 0.4404}]
