In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Evaluation script for the EmoRecCom dataset. This notebook saves binary (thresholded) results.
- Selecting an optimal threshold is an open problem.
    - A Study on Threshold Selection for Multi-label Classification (https://www.csie.ntu.edu.tw/~cjlin/papers/threshold.pdf)
- An interesting read about selecting thresholds: https://stats.stackexchange.com/questions/261756/multi-label-classification-problem-choosing-the-right-threshold-value-for-y-1/262416#262416?newreg=66ace64b745e4ecea88fd1db813efe6b

In [2]:
import sys

# Put the parent directory on the import path so that the `src.*` imports in
# the following cells can be resolved when the notebook runs from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "../" not in sys.path:
    sys.path.append("../")

In [16]:
import os
import numpy as np
import copy
import json
import cv2
import math
from tqdm import tqdm
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import IPython
from IPython.display import Image as img
from IPython.display import display

from PIL import Image

import torch
import torch.nn as nn
import torchvision
from torch.nn import functional as F
from torchmetrics import AUROC

from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import zero_one_loss

In [4]:
from src.datamodules.datasets.emoreccom import EmoRecComDataset
from src.models.bert_classifier_model import BertClassifierLitModel
from src.datamodules.datasets.dataset_modality import DatasetModality
from src.datamodules.emoreccom_datamodule import EmoRecComDataModule
from src.utils.text.text_utils import merge_comic_texts
from src.utils.text.goemotions_utils import goemotions_emotion_list, emoreccom_goemotions_emotion_mapping
from src.utils.metric.micro_auc import compute_micro_auc
from src.utils.emoreccom_label_transforms import normalize_and_take_top_n

Loading the model

In [11]:
# Selects which checkpoint file is loaded further down: the GoEmotions-pretrained
# one when True, the other checkpoint when False.
use_goemotions_pretrained = False

In [12]:
# Device the model and the tokenized inputs are moved to before inference.
device = "cpu"

In [129]:
# Build a classifier with 8 output classes without loading a fine-tuned
# checkpoint, and expose it under both names used in this notebook.
bert_emoreccom_classifier = untrained_classifier = BertClassifierLitModel(
    num_classes=8,
    num_train_steps=0,
)

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Pick the checkpoint file according to `use_goemotions_pretrained`.
# NOTE(review): both paths are absolute and machine-specific.
if use_goemotions_pretrained:
    checkpoint_path = "/home/gsoykan20/Desktop/self_development/emotion-recognition-drawings/logs/custom_runs/goemtions_pretrained_emoreccom_epoch_002.ckpt"
else:
    checkpoint_path = "/home/gsoykan20/Desktop/self_development/emotion-recognition-drawings/logs/custom_runs/13-kas-12-09-epoch_002.ckpt"

# source: https://pytorch-lightning.readthedocs.io/en/latest/common/weights_loading.html
# Rebinds `bert_emoreccom_classifier` to the model restored from the checkpoint.
bert_emoreccom_classifier = BertClassifierLitModel.load_from_checkpoint(
    checkpoint_path,
    pretrained_lit_model_for_body_checkpoint=None,
)

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Switch the model to evaluation mode and move it to `device`; both calls
# return the module, so the cell still displays the model summary.
bert_emoreccom_classifier.eval().to(device)

BertClassifierLitModel(
  (model): BertClassifier(
    (bert): SqueezeBertModel(
      (embeddings): SqueezeBertEmbeddings(
        (word_embeddings): Embedding(30528, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): SqueezeBertEncoder(
        (layers): ModuleList(
          (0): SqueezeBertModule(
            (attention): SqueezeBertSelfAttention(
              (query): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
              (key): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
              (value): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
              (dropout): Dropout(p=0.1, inplace=False)
              (softmax): Softmax(dim=-1)
              (matmul_qk): MatMulWrapper()
              (matmul_qkv): MatMulWrapper()
  

Computing accuracy and related metrics on the training data (i.e. the data with known labels)

In [97]:
# Text-only datamodule over the labelled EmoRecCom data, one sample per batch.
use_transformer_tokenizer = True
emoreccom_path = "/home/gsoykan20/Desktop/datasets/multimodal_emotion_recognition_on_comics_scenes/"  # "/userfiles/comics_grp/multimodal_emotion_recognition_on_comics_scenes/"

datamodule = EmoRecComDataModule(
    data_dir=emoreccom_path,
    modality=DatasetModality.Text,
    batch_size=1,
    tokenizer_max_len=80,
    use_label_transform=False,
    use_tokenizer_instead_text_preprocessor=use_transformer_tokenizer,
)

# Prepare the data first, then build the splits.
datamodule.prepare_data()
datamodule.setup()

In [163]:
# Keep the DataLoader objects themselves instead of wrapping them in iter().
# An iterator can be consumed only once, so re-running a loop cell over it
# would silently iterate over nothing; a DataLoader yields a fresh pass each
# time it is iterated and works unchanged in `for ... in enumerate(loader)`.
test_dataloader = datamodule.test_dataloader()
val_dataloader = datamodule.val_dataloader()
train_dataloader = datamodule.train_dataloader()

In [164]:
# metrics
micro_auc_metric = compute_micro_auc

In [165]:
# One independent accumulator list per metric; the evaluation loop appends one
# value per (sample, threshold) pair to each of them.
(
    micro_auc_metric_results,
    accuracy_score_results,
    jaccard_score_results,
    zero_one_loss_results,
) = ([] for _ in range(4))

In [166]:
# Evaluate the classifier on every test sample at nine decision thresholds and
# append one value per (sample, threshold) pair to each metric list.

# Test-set positions whose target vector is all ones (e.g. [1, 1, 1, 1, 1, 1, 1, 1]);
# micro-AUC came out as NaN for them, so they are excluded from every metric.
# NOTE(review): these positions are tied to the current order of the test set;
# re-check them if the data or the split changes.
nan_auc_sample_indices = {105, 349}

# 0.1, 0.2, ..., 0.9 -> nine thresholds. The summary cells reshape the result
# lists to (-1, 9), so the number of thresholds must stay in sync with them.
thresholds = np.arange(0.1, 1, 0.1)

for dataloader in [test_dataloader]:
    # Wrapping the loader (instead of the enumerate object) lets tqdm show a total.
    for idx, batch in enumerate(tqdm(dataloader)):
        # Skip before running the model so no forward pass is wasted on
        # samples that are excluded anyway.
        if idx in nan_auc_sample_indices:
            continue
        # Getting the predictions
        with torch.no_grad():
            transformer_inputs = batch[3]
            ids = transformer_inputs["ids"].to(device)
            mask = transformer_inputs["mask"].to(device)
            scores = bert_emoreccom_classifier.forward(ids, mask)
            # torch.sigmoid replaces the deprecated F.sigmoid; .cpu() keeps the
            # metric calls below working when `device` is not the CPU.
            predictions = torch.sigmoid(scores).cpu()
        (labels, polarities) = batch[2]
        labels = labels.int()
        # metric computation
        # NOTE(review): micro-AUC is computed on the thresholded (boolean)
        # predictions rather than on the raw probabilities - confirm this is intended.
        for threshold in thresholds:
            thresholded_predictions = (predictions > threshold)
            micro_auc_metric_results.append(micro_auc_metric(thresholded_predictions, labels))
            accuracy_score_results.append(accuracy_score(labels, thresholded_predictions))
            jaccard_score_results.append(jaccard_score(labels, thresholded_predictions, average='micro'))
            zero_one_loss_results.append(zero_one_loss(labels, thresholded_predictions))

500it [00:26, 19.17it/s]


It seems we have some NaN results!
Let's find out which ones they are.
Result: the NaNs come from samples whose target elements are all 1, e.g. [1, 1, 1, 1, 1, 1, 1, 1].

In [91]:
# Print every position in the micro-AUC result list that holds a NaN.
nan_positions = (
    position
    for position, value in enumerate(micro_auc_metric_results)
    if math.isnan(value)
)
for position in nan_positions:
    print(position)

105
349


In [60]:
# Look at the micro-AUC value stored at position 105, one of the NaN positions
# printed by the previous cell.
micro_auc_metric_results[105]

nan

In [64]:
# Inspect the raw test sample at position 105 to see why its micro-AUC is NaN.
datamodule.test_dataloader().dataset[105]

([],
 [],
 [array([1., 1., 1., 1., 1., 1., 1., 1.]),
  array([0.104, 0.146, 0.104, 0.229, 0.104, 0.104, 0.104, 0.   ])],
 {'ids': tensor([  101,  2204,  1998,  1045,  2097,  2156,  2008,  2027,  2024,  7349,
           1010,  1998,  2991,  3632,  2092,  1996,  2455,  1997,  2422,  2097,
           9530,  2022,  2256,  2629,  2000, 25728,  3288,  2041,  1996,  7282,
           3924,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
  'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [102]:
# Keep only the micro-AUC values that are not NaN.
filtered_results = [value for value in micro_auc_metric_results if not math.isnan(value)]

In [103]:
# Mean micro-AUC over the non-NaN results.
np.mean(filtered_results)

0.5809906291834004

Now we have scores for different metrics and different thresholds; let's show them.

In [167]:
# Mean micro-AUC per threshold: each row is one sample, the 9 columns are the
# thresholds 0.1 ... 0.9 in the order the evaluation loop appended them.
np.mean(np.reshape(micro_auc_metric_results, (-1, 9)), axis=0)

array([0.64963903, 0.63889367, 0.61112067, 0.57787101, 0.55285427,
       0.52044607, 0.5       , 0.5       , 0.5       ])

In [168]:
# Mean accuracy score per threshold: each row is one sample, the 9 columns are
# the thresholds 0.1 ... 0.9 in the order the evaluation loop appended them.
np.mean(np.reshape(accuracy_score_results, (-1, 9)), axis=0)

array([0.0502008 , 0.05421687, 0.04016064, 0.02610442, 0.01807229,
       0.0060241 , 0.        , 0.        , 0.        ])

In [169]:
# Mean micro-averaged Jaccard score per threshold: each row is one sample, the
# 9 columns are the thresholds 0.1 ... 0.9 in the order the loop appended them.
np.mean(np.reshape(jaccard_score_results, (-1, 9)), axis=0)

array([0.38394531, 0.3409017 , 0.27128514, 0.18999331, 0.12459839,
       0.04725569, 0.        , 0.        , 0.        ])

In [170]:
# Mean zero-one loss per threshold: each row is one sample, the 9 columns are
# the thresholds 0.1 ... 0.9 in the order the evaluation loop appended them.
np.mean(np.reshape(zero_one_loss_results, (-1, 9)), axis=0)

array([0.9497992 , 0.94578313, 0.95983936, 0.97389558, 0.98192771,
       0.9939759 , 1.        , 1.        , 1.        ])

# Export Private Test Results for Evaluation

In [5]:
# Text-only datamodule over the private test set: every one of the 2041 samples
# goes to the test split, one sample per batch.
use_transformer_tokenizer = True
emoreccom_path = "/home/gsoykan20/Desktop/datasets/multimodal_emotion_recognition_on_comics_scenes/"  # "/userfiles/comics_grp/multimodal_emotion_recognition_on_comics_scenes/"

datamodule = EmoRecComDataModule(
    data_dir=emoreccom_path,
    modality=DatasetModality.Text,
    use_private_test_set=True,
    train_val_test_split=(0, 0, 2041),
    batch_size=1,
    tokenizer_max_len=80,
    use_label_transform=False,
    use_tokenizer_instead_text_preprocessor=use_transformer_tokenizer,
)

# Prepare the data first, then build the splits.
datamodule.prepare_data()
datamodule.setup()


In [33]:
# Keep the DataLoader itself rather than a one-shot iterator, so the export
# loop can be re-run without first re-executing this cell.
test_dataloader = datamodule.test_dataloader()

In [34]:
# Dataset behind the test loader; the export loop reads file names from its
# `files` attribute.
test_dataset = datamodule.test_dataloader().dataset

In [35]:
# Rows of the submission file; the export loop appends one row per test sample.
results = []

In [36]:
# Build the submission rows for the private test set.
# Example row, followed by the column order of the submission file:
# 1	0_19_3	0	0	0	0	1	0	0	0
# id,image_id,angry,disgust,fear,happy,sad,surprise,neutral,other.

# Decision threshold applied to the sigmoid outputs to obtain 0/1 labels.
threshold = 0.4
# Start from an empty list so that re-running this cell does not append
# duplicate rows to a previous run's results.
results = []
# takes approximately 1 minute to compute results
for idx, batch in enumerate(tqdm(test_dataloader)):
    # Getting the predictions
    with torch.no_grad():
        transformer_inputs = batch[3]
        ids = transformer_inputs["ids"].to(device)
        mask = transformer_inputs["mask"].to(device)
        scores = bert_emoreccom_classifier.forward(ids, mask)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        predictions = torch.sigmoid(scores)
    # .cpu().numpy() also works when `device` is not the CPU, unlike np.array(tensor).
    thresholded_predictions = (predictions > threshold).cpu().numpy().astype('int')
    # 1-based row id; named `row_id` so the builtin `id` is not shadowed.
    row_id = idx + 1
    # NOTE(review): pairing files[idx] with the idx-th batch assumes batch_size=1
    # and an unshuffled test loader - confirm against the datamodule.
    image_id = test_dataset.files[idx].split('.')[0]
    # Row 0 of the batch is the only sample, since batches hold one sample each.
    results.append([row_id, image_id, *thresholded_predictions[0]])

2041it [00:59, 34.57it/s]


In [37]:
# Tabulate the submission rows: running id, image id, then one 0/1 column per emotion.
emotion_columns = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral', 'other']
results_df = pd.DataFrame(data=results, columns=['id', 'image_id', *emotion_columns])

In [38]:
# Write the submission next to the notebook, without a header row and without
# the DataFrame index.
results_df.to_csv('./results.csv', header=False, index=False)