# How to use the model

To understand the model it would be convenient if you have gone through demo1 and 2, however you can learn how to use the model simply reading this notebook.

### 0. Load the data

Note that, as long as your dataset is in a state similar to that of adult.csv or listings.csv (after `airbnb_data_preprocessing.py`), i.e. remove NaN, impute missing values, etc..., you are "good to go".

In [1]:
import torch
import numpy as np
import pandas as pd
import pickle

from pathlib import Path
from torchvision import transforms
from sklearn.metrics import mean_squared_error
from prepare_data import prepare_data_adult, prepare_data_airbnb

In [2]:
DATA_PATH=Path('data')

##  1. Logistic regression with the adult dataset

### 1.1 Set up and Prepare the Data

In [3]:
# the following will all happen if you simply run: python prepare_data.py --dataset adult
DF_adult = pd.read_csv(DATA_PATH/'adult/adult.csv')
DF_adult.columns = [c.replace("-", "_") for c in DF_adult.columns]
DF_adult['income_label'] = (DF_adult["income"].apply(lambda x: ">50K" in x)).astype(int)
DF_adult.drop("income", axis=1, inplace=True)
DF_adult['age_buckets'] = pd.cut(DF_adult.age, bins=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
    labels=np.arange(9))
out_dir = DATA_PATH/'adult/wide_deep_data/'

# WIDE
wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
    'native_country','gender']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation'])

# DEEP DENSE
embeddings_cols = [('education',16), ('relationship',16), ('workclass',16),
    ('occupation',16),('native_country',16)]
continuous_cols = ["age","hours_per_week"]
standardize_cols = continuous_cols

#TARGET: logistic
target = 'income_label'
DF_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_label,age_buckets
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0,0.0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0,3.0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1,1.0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1,4.0
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0,


In [4]:
wd_dataset_adult = prepare_data_adult(
    DF_adult, wide_cols,
    crossed_cols,
    embeddings_cols,
    continuous_cols,
    standardize_cols,
    target, out_dir,
    scale=True
    )

Wide and Deep adult data preparation completed.


In [5]:
wd_dataset_adult.keys()

dict_keys(['train', 'valid', 'test', 'cat_embeddings_input', 'cat_embeddings_encoding_dict', 'continuous_cols', 'deep_column_idx'])

In [6]:
wd_dataset_adult['train'].keys()

dict_keys(['wide', 'deep_dense', 'target'])

### 1.2 Define the parameter dictionary

In [7]:
params = dict()
params['wide'] = dict(
    wide_dim = wd_dataset_adult['train']['wide'].shape[1]
    )
params['deep_dense'] = dict(
    embeddings_input = wd_dataset_adult['cat_embeddings_input'],
    embeddings_encoding_dict = wd_dataset_adult['cat_embeddings_encoding_dict'],
    continuous_cols = wd_dataset_adult['continuous_cols'],
    deep_column_idx = wd_dataset_adult['deep_column_idx'],
    hidden_layers = [64,32],
    dropout = [0.5]
    )

### 1.3 Build the Model

In [8]:
from widedeep.models.wide_deep import WideDeepLoader, WideDeep
model1 = WideDeep(output_dim=1, **params)

In [9]:
model1

WideDeep(
  (wide): Wide(
    (wlinear): Linear(in_features=805, out_features=1, bias=True)
  )
  (deep_dense): DeepDense(
    (emb_layer_education): Embedding(16, 16)
    (emb_layer_relationship): Embedding(6, 16)
    (emb_layer_workclass): Embedding(9, 16)
    (emb_layer_occupation): Embedding(15, 16)
    (emb_layer_native_country): Embedding(42, 16)
    (dense): Sequential(
      (dense_layer_0): Sequential(
        (0): Linear(in_features=82, out_features=64, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.0)
      )
      (dense_layer_1): Sequential(
        (0): Linear(in_features=64, out_features=32, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.5)
      )
      (last_linear): Linear(in_features=32, out_features=1, bias=True)
    )
  )
)

### 1.4 Compile and Run it

In [10]:
optimizer={'widedeep': ['Adam', 0.01]} 

In [11]:
model1.compile(method='logistic', optimizer=optimizer)

In [12]:
use_cuda = torch.cuda.is_available()

In [13]:
if use_cuda:
    model1 = model1.cuda()

In [14]:
train_set = WideDeepLoader(wd_dataset_adult['train'], mode='train')
valid_set = WideDeepLoader(wd_dataset_adult['valid'], mode='train')
test_set = WideDeepLoader(wd_dataset_adult['test'], mode='test')
train_loader = torch.utils.data.DataLoader(dataset=train_set,
    batch_size=128,shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_set,
    batch_size=128,shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set,
    batch_size=32,shuffle=False)
model1.fit(n_epochs=10, train_loader=train_loader, eval_loader=valid_loader)

epoch 1: 100%|██████████| 229/229 [00:02<00:00, 114.23it/s, acc=0.83, loss=0.367] 
valid: 100%|██████████| 77/77 [00:00<00:00, 167.71it/s, acc=0.84, loss=0.356] 
epoch 2: 100%|██████████| 229/229 [00:01<00:00, 127.67it/s, acc=0.84, loss=0.346] 
valid: 100%|██████████| 77/77 [00:00<00:00, 211.71it/s, acc=0.841, loss=0.345]
epoch 3: 100%|██████████| 229/229 [00:01<00:00, 128.27it/s, acc=0.842, loss=0.342]
valid: 100%|██████████| 77/77 [00:00<00:00, 172.08it/s, acc=0.838, loss=0.348]
epoch 4: 100%|██████████| 229/229 [00:01<00:00, 128.06it/s, acc=0.842, loss=0.34] 
valid: 100%|██████████| 77/77 [00:00<00:00, 211.86it/s, acc=0.84, loss=0.349] 
epoch 5: 100%|██████████| 229/229 [00:01<00:00, 127.87it/s, acc=0.841, loss=0.339]
valid: 100%|██████████| 77/77 [00:00<00:00, 210.27it/s, acc=0.841, loss=0.351]
epoch 6: 100%|██████████| 229/229 [00:01<00:00, 127.71it/s, acc=0.844, loss=0.337]
valid: 100%|██████████| 77/77 [00:00<00:00, 210.53it/s, acc=0.84, loss=0.349] 
epoch 7: 100%|██████████| 22

## 2. Regression with the airbnb dataset using all: Wide, Deep_Dense, Deep_Text and Deep_Image. Also, multiple optimizers and learning rate schedulers

### 2.1. Set up and Prepare the Data

In [3]:
# I assume you have runned airbnb_data_preprocessing.py and the resulting file is at 
# DATA_PATH/'airbnb/listings_processed.csv'
DF_airbnb = pd.read_csv(DATA_PATH/'airbnb/listings_processed.csv')
DF_airbnb = DF_airbnb[DF_airbnb.description.apply(lambda x: len(x.split(' '))>=10)]
out_dir = DATA_PATH/'airbnb/wide_deep_data/'

# WIDE
crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in DF_airbnb.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender'] +\
    already_dummies

#DEEP_DENSE
embeddings_cols = [(c, 16) for c in DF_airbnb.columns if 'catg' in c] + [('neighbourhood_cleansed', 64)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
standardize_cols = ['security_deposit', 'extra_people']

# DEEP_TEXT
text_col = 'description'
word_vectors_path = 'data/glove.6B/glove.6B.300d.txt'

# DEEP_IMAGE
img_id = 'id'
img_path = DATA_PATH/'airbnb/property_picture'

#TARGET
target = 'yield'

Prepare the dataset, easy with `prepare_data_airbnb`

In [4]:
wd_dataset_airbnb = prepare_data_airbnb(
    # let's use only 5000 observations (not all of them will have images, so we might end with 4900+)
    df = DF_airbnb.sample(5000),
    img_id = img_id,
    img_path = img_path,
    text_col = text_col,
    max_vocab = 20000,
    min_freq = 2,
    maxlen = 170,
    word_vectors_path = word_vectors_path,
    embeddings_cols = embeddings_cols,
    continuous_cols = continuous_cols,
    standardize_cols = standardize_cols,
    target = target,
    wide_cols = wide_cols,
    crossed_cols = crossed_cols,
    already_dummies = already_dummies,
    out_dir = out_dir,
    scale=True,
    seed=1
    )

Reading Images from data/airbnb/property_picture


  1%|          | 41/5000 [00:00<00:12, 405.03it/s]

Resizing


100%|██████████| 5000/5000 [00:12<00:00, 384.31it/s]


Our vocabulary contains 12433 words
Indexing word vectors...
Loaded 400000 word vectors
Preparing embeddings matrix...
6776 words in our vocabulary had glove vectors and appear more than the min frequency
Wide and Deep airbnb data preparation completed.


### 2.2 Define the parameter dictionary

In [56]:
# To understand what all these parameters mean, simple see demo1 and demo2 and the modules in widedeep.models
params = dict()
params['wide'] = dict(
    wide_dim = wd_dataset_airbnb['train']['wide'].shape[1]
    )
params['deep_dense'] = dict(
    embeddings_input = wd_dataset_airbnb['cat_embeddings_input'],
    embeddings_encoding_dict = wd_dataset_airbnb['cat_embeddings_encoding_dict'],
    continuous_cols = wd_dataset_airbnb['continuous_cols'],
    deep_column_idx = wd_dataset_airbnb['deep_column_idx'],
    hidden_layers = [64,32],
    dropout = [0.5]
    )
params['deep_text'] = dict(
    vocab_size = len(wd_dataset_airbnb['vocab'].itos),
    embedding_dim = wd_dataset_airbnb['word_embeddings_matrix'].shape[1],
    hidden_dim = 64,
    n_layers = 2,
    rnn_dropout = 0.5,
    spatial_dropout = 0.1,
    padding_idx = 1,
    attention = False,
    bidirectional = False,
    embedding_matrix = wd_dataset_airbnb['word_embeddings_matrix']
    )
params['deep_img'] = dict(
    pretrained = True,
    freeze=6,
    )

### 2.3 Build the Model

The model is built exactly as in the case of the adult dataset before

In [57]:
from widedeep.models.wide_deep import WideDeepLoader, WideDeep
model2 = WideDeep(output_dim=1, **params)

In [58]:
model2

WideDeep(
  (wide): Wide(
    (wlinear): Linear(in_features=213, out_features=1, bias=True)
  )
  (deep_dense): DeepDense(
    (emb_layer_neighbourhood_cleansed): Embedding(33, 64)
    (emb_layer_bathrooms_catg): Embedding(3, 16)
    (emb_layer_host_listings_count_catg): Embedding(4, 16)
    (emb_layer_minimum_nights_catg): Embedding(3, 16)
    (emb_layer_beds_catg): Embedding(4, 16)
    (emb_layer_bedrooms_catg): Embedding(4, 16)
    (emb_layer_guests_included_catg): Embedding(3, 16)
    (emb_layer_accommodates_catg): Embedding(3, 16)
    (dense): Sequential(
      (dense_layer_0): Sequential(
        (0): Linear(in_features=180, out_features=64, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.0)
      )
      (dense_layer_1): Sequential(
        (0): Linear(in_features=64, out_features=32, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.5)
      )
      (last_linear): Linear(in_features=32, out_features

### 2.4 Compile and Run

In [59]:
# For example...
optimizer=dict(
    wide=['Adam', 0.1],
    deep_dense=['Adam', 0.01],
    deep_text=['RMSprop', 0.01,0.1],
    deep_img= ['Adam', 0.01]
    )
lr_scheduler=dict(
    wide=['StepLR', 3, 0.1],
    deep_dense=['StepLR', 3, 0.1],
    deep_text=['MultiStepLR', [3,5,7], 0.1],
    deep_img=['MultiStepLR', [3,5,7], 0.1]
    )
# if you want just one optimizer and lr_scheduler call simply
# optimizer={'widedeep': ['Adam', 0.01]}
# lr_scheduler = {'widedeep': ['StepLR', 3, 0.1]}

In [60]:
model2.compile(method='regression', optimizer=optimizer, lr_scheduler=lr_scheduler)

In [61]:
model2 = model2.cuda()

In [62]:
# cv2 reads bgr
# mean=[0.485, 0.456, 0.406] #RGB
# std=[0.229, 0.224, 0.225]  #RGB
mean=[0.406, 0.456, 0.485] #RGB
std=[0.225, 0.224, 0.229]  #RGB
transform  = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])
train_set = WideDeepLoader(wd_dataset_airbnb['train'], transform, mode='train')
valid_set = WideDeepLoader(wd_dataset_airbnb['valid'], transform, mode='train')
test_set = WideDeepLoader(wd_dataset_airbnb['test'], transform, mode='test')
train_loader = torch.utils.data.DataLoader(dataset=train_set,
    batch_size=64,shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_set,
    batch_size=64,shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set,
    batch_size=32,shuffle=False)

In [63]:
model2.fit(n_epochs=5, train_loader=train_loader, eval_loader=valid_loader)

epoch 1: 100%|██████████| 47/47 [00:22<00:00,  2.05it/s, loss=118]
valid: 100%|██████████| 16/16 [00:04<00:00,  4.06it/s, loss=117]
epoch 2: 100%|██████████| 47/47 [00:22<00:00,  2.14it/s, loss=105]
valid: 100%|██████████| 16/16 [00:04<00:00,  4.20it/s, loss=158]
epoch 3: 100%|██████████| 47/47 [00:22<00:00,  2.15it/s, loss=99.2]
valid: 100%|██████████| 16/16 [00:03<00:00,  4.27it/s, loss=99]  
epoch 4: 100%|██████████| 47/47 [00:22<00:00,  2.13it/s, loss=97]  
valid: 100%|██████████| 16/16 [00:04<00:00,  4.06it/s, loss=99.6]
epoch 5: 100%|██████████| 47/47 [00:22<00:00,  2.15it/s, loss=94.7]
valid: 100%|██████████| 16/16 [00:04<00:00,  4.24it/s, loss=99]  


In [64]:
preds = model2.predict(test_loader)
y = wd_dataset_airbnb['test']['target']
print(np.sqrt(mean_squared_error(y, preds)))

predict: 100%|██████████| 32/32 [00:04<00:00,  6.73it/s]

105.88571074249793





### 2.5 Extract the learned embeddings for a given categorical feature

In [65]:
neighbourhood_cleansed_emb = model2.get_embeddings(col_name='neighbourhood_cleansed')

In [66]:
neighbourhood_cleansed_emb

{'Hammersmith and Fulham': array([-1.851022,  0.703244,  0.34214 , -1.215795, ..., -0.218133, -0.107084, -0.050649, -1.276854], dtype=float32),
 'Barnet': array([-1.739752, -0.935879,  0.335114, -1.109356, ..., -2.491122, -0.632074,  2.492794,  1.231859], dtype=float32),
 'Ealing': array([-0.89164 ,  0.005264, -2.254685,  0.572432, ..., -1.358266, -1.876953,  1.084197,  0.825844], dtype=float32),
 'Greenwich': array([-0.686692, -0.623536,  1.663162,  1.130035, ..., -0.359084, -0.609575,  0.304735, -1.042224], dtype=float32),
 'Lambeth': array([-0.047345, -0.697568,  0.793924, -0.18951 , ...,  0.230893, -0.170741, -0.592736, -0.755723], dtype=float32),
 'Lewisham': array([-0.302935,  1.052123,  0.883626,  0.127071, ..., -0.047294, -0.667769,  1.237696,  1.278981], dtype=float32),
 'Richmond upon Thames': array([-1.108736,  0.175303, -1.596437, -0.13958 , ...,  0.557685,  0.076416, -0.171436,  1.561785], dtype=float32),
 'Wandsworth': array([-0.133121, -1.265229, -0.536881, -0.235154, ..

## 3. Multiclass Classification with the airbnb dataset using Wide, Deep_Dense and Deep_Text with one optimizers and one learning rate scheduler

Here we will fake a multiclass classification problem using the target `yield`

### 3.1. Set up and Prepare the Data

In [67]:
DF_airbnb['yield_cat'] = pd.cut(DF_airbnb['yield'], bins=[0.2, 65, 163, 600], labels=[0,1,2])
DF_airbnb.drop('yield', axis=1, inplace=True)

In [68]:
# The rest is the same
out_dir = DATA_PATH/'airbnb/wide_deep_data/'

# WIDE
crossed_cols = (['property_type', 'room_type'],)
already_dummies = [c for c in DF_airbnb.columns if 'amenity' in c] + ['has_house_rules']
wide_cols = ['is_location_exact', 'property_type', 'room_type', 'host_gender'] +\
    already_dummies

#DEEP_DENSE
embeddings_cols = [(c, 16) for c in DF_airbnb.columns if 'catg' in c] + [('neighbourhood_cleansed', 64)]
continuous_cols = ['latitude', 'longitude', 'security_deposit', 'extra_people']
standardize_cols = ['security_deposit', 'extra_people']

# DEEP_TEXT
text_col = 'description'
word_vectors_path = 'data/glove.6B/glove.6B.300d.txt'

# DEEP_IMAGE
img_id = 'id'
img_path = DATA_PATH/'airbnb/property_picture'

#TARGET
target = 'yield_cat'

In [69]:
wd_dataset_airbnb = prepare_data_airbnb(
    # let's use only 5000 observations (not all of them will have images, so we might end with 4900+)
    df = DF_airbnb.sample(5000),
    img_id = img_id,
    img_path = img_path,
    text_col = text_col,
    max_vocab = 20000,
    min_freq = 2,
    maxlen = 170,
    word_vectors_path = word_vectors_path,
    embeddings_cols = embeddings_cols,
    continuous_cols = continuous_cols,
    standardize_cols = standardize_cols,
    target = target,
    wide_cols = wide_cols,
    crossed_cols = crossed_cols,
    already_dummies = already_dummies,
    out_dir = out_dir,
    scale=True,
    seed=1
    )

Reading Images from data/airbnb/property_picture


  1%|          | 41/5000 [00:00<00:12, 402.21it/s]

Resizing


100%|██████████| 5000/5000 [00:12<00:00, 387.15it/s]


Our vocabulary contains 12675 words
Indexing word vectors...
Loaded 400000 word vectors
Preparing embeddings matrix...
6786 words in our vocabulary had glove vectors and appear more than the min frequency
Wide and Deep airbnb data preparation completed.


In [70]:
# Let's drop the image dataset, this time "only" with Wide, Deep_Dense and Deep_Text
del wd_dataset_airbnb['train']['deep_img']
del wd_dataset_airbnb['valid']['deep_img']
del wd_dataset_airbnb['test']['deep_img']

### 3.2 Define the parameter dictionary

In [71]:
params = dict()
params['wide'] = dict(
    wide_dim = wd_dataset_airbnb['train']['wide'].shape[1]
    )
params['deep_dense'] = dict(
    embeddings_input = wd_dataset_airbnb['cat_embeddings_input'],
    embeddings_encoding_dict = wd_dataset_airbnb['cat_embeddings_encoding_dict'],
    continuous_cols = wd_dataset_airbnb['continuous_cols'],
    deep_column_idx = wd_dataset_airbnb['deep_column_idx'],
    hidden_layers = [64,32],
    dropout = [0.5]
    )
params['deep_text'] = dict(
    vocab_size = len(wd_dataset_airbnb['vocab'].itos),
    embedding_dim = wd_dataset_airbnb['word_embeddings_matrix'].shape[1],
    hidden_dim = 64,
    n_layers = 3,
    rnn_dropout = 0.5,
    spatial_dropout = 0.1,
    padding_idx = 1,
    attention = False,
    bidirectional = False,
    embedding_matrix = wd_dataset_airbnb['word_embeddings_matrix']
    )

### 3.3 Build the model

In [72]:
from widedeep.models.wide_deep import WideDeepLoader, WideDeep
# We have 3 classes
model3 = WideDeep(output_dim=3, **params)

In [73]:
model3

WideDeep(
  (wide): Wide(
    (wlinear): Linear(in_features=213, out_features=3, bias=True)
  )
  (deep_dense): DeepDense(
    (emb_layer_neighbourhood_cleansed): Embedding(33, 64)
    (emb_layer_bathrooms_catg): Embedding(3, 16)
    (emb_layer_host_listings_count_catg): Embedding(4, 16)
    (emb_layer_minimum_nights_catg): Embedding(3, 16)
    (emb_layer_beds_catg): Embedding(4, 16)
    (emb_layer_bedrooms_catg): Embedding(4, 16)
    (emb_layer_guests_included_catg): Embedding(3, 16)
    (emb_layer_accommodates_catg): Embedding(3, 16)
    (dense): Sequential(
      (dense_layer_0): Sequential(
        (0): Linear(in_features=180, out_features=64, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.0)
      )
      (dense_layer_1): Sequential(
        (0): Linear(in_features=64, out_features=32, bias=True)
        (1): LeakyReLU(negative_slope=0.01, inplace)
        (2): Dropout(p=0.5)
      )
      (last_linear): Linear(in_features=32, out_features

###  3.4 Compile and run

In [74]:
optimizer={'widedeep': ['Adam', 0.01]}
lr_scheduler = {'widedeep': ['StepLR', 3, 0.1]}

In [75]:
model3.compile(method='multiclass', optimizer=optimizer, lr_scheduler=lr_scheduler)

In [76]:
model3 = model3.cuda()

In [77]:
train_set = WideDeepLoader(wd_dataset_airbnb['train'], mode='train')
valid_set = WideDeepLoader(wd_dataset_airbnb['valid'], mode='train')
test_set = WideDeepLoader(wd_dataset_airbnb['test'], mode='test')
train_loader = torch.utils.data.DataLoader(dataset=train_set,
    batch_size=128,shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_set,
    batch_size=128,shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set,
    batch_size=32,shuffle=False)

In [78]:
train_set.input_types

['wide', 'deep_dense', 'deep_text', 'target']

In [79]:
model3.fit(n_epochs=5, train_loader=train_loader, eval_loader=valid_loader)

epoch 1: 100%|██████████| 24/24 [00:03<00:00,  7.05it/s, acc=0.547, loss=0.985]
valid: 100%|██████████| 8/8 [00:00<00:00, 37.03it/s, acc=0.588, loss=0.949]
epoch 2: 100%|██████████| 24/24 [00:03<00:00,  7.96it/s, acc=0.601, loss=0.936]
valid: 100%|██████████| 8/8 [00:00<00:00, 41.34it/s, acc=0.58, loss=0.951] 
epoch 3: 100%|██████████| 24/24 [00:02<00:00,  8.47it/s, acc=0.634, loss=0.904]
valid: 100%|██████████| 8/8 [00:00<00:00, 42.72it/s, acc=0.597, loss=0.936]
epoch 4: 100%|██████████| 24/24 [00:02<00:00,  8.47it/s, acc=0.663, loss=0.887]
valid: 100%|██████████| 8/8 [00:00<00:00, 43.66it/s, acc=0.581, loss=0.945]
epoch 5: 100%|██████████| 24/24 [00:02<00:00,  8.63it/s, acc=0.67, loss=0.876] 
valid: 100%|██████████| 8/8 [00:00<00:00, 43.78it/s, acc=0.576, loss=0.943]


Dont pay much attention to the results, this is just an artificial experiments for you to see how one would use it for multiclass classification. 

And with this, this is it, I guess you now have all the information to run as many experiments as you want combining all sorts of datasets