# 0. Imports and Constants
- Select the user and the dataset version (in the `SELECT` block of the cell below) before proceeding

In [130]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append(os.path.join(os.getcwd(),"src"))
from vanilla_dataset import VanillaDataset
from vanilla_nn import VanillaNN
from trainer import Trainer
from eval import evaluate_model, write_eval_to_file, pretty_print_metrics
###################################################


####################### CONSTANTS ########################
# Dataset split identifiers
TRAIN = "train"
DEV = "dev"
TEST = "test"
GOLD = "gold"
SPLITS = [TRAIN, DEV, TEST, GOLD]

# Modality identifiers
TXT = "txt"
IMG = "img"

# Feature-extraction methods; each name is also used as a sub-directory and
# filename prefix when the feature pickles are loaded
FE_METHODS = [
    "txt_embeddings",
    "img_embeddings",
    "concat",
    "sum",
    "mean",
    "hadamard",
]
# Cosine variants are currently disabled:
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]
##########################################################

############## DATA SCIENCE & ML MODULES #################
from torch.utils.data import DataLoader
import torch
from sklearn.metrics import classification_report
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[2] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

# Root of the shared drive folder. The first two users sync it under
# /Users/<user>/Library/CloudStorage, every other user under /home/<user>/Insync.
if user in users[:2]:
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"
else:
    cw_dir = f"/home/{user}/Insync/check.worthiness@gmail.com/Google Drive"

# Everything below is derived from the drive root, so the layout is spelled out once.
data_dir = f"{cw_dir}/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"
features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
evals_dir = f"{models_dir}/gold_evals"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Load the Datasets

In [131]:
def _version_suffix(split):
    """Return the filename suffix for `split`.

    Gold files carry no dataset version in their name; every other split is
    suffixed with `_<dataset_version>`.
    """
    return "" if split == GOLD else f"_{dataset_version}"


# NOTE: allow_pickle=True unpickles the files, which can execute arbitrary code.
# Only point these paths at trusted data.

# split -> labels
split_to_labels = {
    split: np.load(
        f"{labels_dir}/{split}_labels{_version_suffix(split)}.pickle",
        allow_pickle=True,
    )
    for split in SPLITS
}

# feature method -> split -> features
method_to_split_to_data = {
    fe_method: {
        split: np.load(
            f"{features_dir}/{fe_method}/{fe_method}_{split}{_version_suffix(split)}.pickle",
            allow_pickle=True,
        )
        for split in SPLITS
    }
    for fe_method in FE_METHODS
}

# feature method -> split -> dataset pairing the features with that split's labels
method_to_split_to_dataset = {
    fe_method: {
        split: VanillaDataset(method_to_split_to_data[fe_method][split], split_to_labels[split])
        for split in SPLITS
    }
    for fe_method in FE_METHODS
}

# 2. Evaluate the best models

In this section, the best model for each feature method is evaluated on the gold test set.

- Every model is evaluated at several prediction thresholds
- Evaluation results are written to a txt file along with model properties
- Models come from the runs under "prototyping/CIMC-XX_hyperparam-search-vanillann/runs".

In [132]:
# Used for every evaluation
# Prediction thresholds at which each model is scored
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# Evaluation does not need shuffling: metrics are computed over the whole gold set,
# and a fixed order keeps repeated runs reproducible.
shuffle = False
# Output layer width, appended to each model's layer sizes
output_dim = [1]

## 2.1 Txt Embeddings

In [155]:
# Checkpoint to evaluate and the feature method it was trained on
model_name = "13-06-2023_01-03_txt_embeddings_32x16_lr_1e-05_batch-size_16_shuffled_f1_0.76"
feature_method = "txt_embeddings"

# Hidden layer sizes and batch size, matching the values in the checkpoint name
hidden_dims = [32, 16]
batch_size = 16

In [156]:
# Text file that receives this model's evaluation report
eval_file = "/".join((evals_dir, feature_method, f"{model_name}.txt"))

In [157]:
# Inspect dataset and dataloader
dataset = method_to_split_to_dataset[feature_method][GOLD]
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)
# Draw one batch and reuse it, so both printed shapes describe the same batch
# and only one iterator is created.
sample_batch = next(iter(dataloader))
print(f"Number of test examples: {len(dataset)}")
print(f"Shape of features in batch: {sample_batch[0].shape}")
print(f"Shape of labels in batch: {sample_batch[1].shape}")

Number of test examples: 736
Shape of features in batch: torch.Size([16, 768])
Shape of labels in batch: torch.Size([16])


In [158]:
# Instantiate model
# Path of the saved state dict (a file, despite the variable's name)
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width from the first example of `dataset`, then hidden, then output
input_dim = [len(dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU load on a machine without CUDA
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to inference mode; as the last expression this also displays the architecture
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [159]:
# Score the model at every threshold, one report section per threshold
report_sections = []
for threshold in thresholds:
    scores_dict = evaluate_model(model=model, dataloader=dataloader, confidence=threshold)
    report_sections.append(pretty_print_metrics(scores_dict, threshold) + "\n")
metrics_string = "".join(report_sections)

# Persist the combined report, then show it in the notebook
write_eval_to_file(file_path=eval_file, report_string=metrics_string)
print(metrics_string)

Threshold=0.1
0.0: {'precision': 0.949238578680203, 'recall': 0.4074074074074074, 'f1-score': 0.5701219512195123, 'support': 459}
1.0: {'precision': 0.49536178107606677, 'recall': 0.9638989169675091, 'f1-score': 0.6544117647058824, 'support': 277}
accuracy: 0.6168478260869565

Threshold=0.2
0.0: {'precision': 0.8654970760233918, 'recall': 0.644880174291939, 'f1-score': 0.7390761548064918, 'support': 459}
1.0: {'precision': 0.5862944162436549, 'recall': 0.8339350180505415, 'f1-score': 0.6885245901639345, 'support': 277}
accuracy: 0.7160326086956522

Threshold=0.3
0.0: {'precision': 0.828125, 'recall': 0.8082788671023965, 'f1-score': 0.8180815876515986, 'support': 459}
1.0: {'precision': 0.6944444444444444, 'recall': 0.7220216606498195, 'f1-score': 0.7079646017699115, 'support': 277}
accuracy: 0.7758152173913043

Threshold=0.4
0.0: {'precision': 0.796, 'recall': 0.8671023965141612, 'f1-score': 0.8300312825860271, 'support': 459}
1.0: {'precision': 0.7415254237288136, 'recall': 0.63176895

## 2.2 Concat Features

In [145]:
# Checkpoint to evaluate and the feature method it was trained on
model_name = "13-06-2023_03-03_concat_128x64x32_lr_0.0001_batch-size_64_shuffled_f1_0.75"
feature_method = "concat"

# Hidden layer sizes and batch size, matching the values in the checkpoint name
hidden_dims = [128, 64, 32]
batch_size = 64

In [146]:
# Text file that receives this model's evaluation report
eval_file = "/".join((evals_dir, feature_method, f"{model_name}.txt"))

In [147]:
# Inspect dataset and dataloader
dataset = method_to_split_to_dataset[feature_method][GOLD]
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)
# Draw one batch and reuse it, so both printed shapes describe the same batch
# and only one iterator is created.
sample_batch = next(iter(dataloader))
print(f"Number of test examples: {len(dataset)}")
print(f"Shape of features in batch: {sample_batch[0].shape}")
print(f"Shape of labels in batch: {sample_batch[1].shape}")

Number of test examples: 736
Shape of features in batch: torch.Size([64, 1536])
Shape of labels in batch: torch.Size([64])


In [148]:
# Instantiate model
# Path of the saved state dict (a file, despite the variable's name)
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width from the first example of `dataset`, then hidden, then output
input_dim = [len(dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU load on a machine without CUDA
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to inference mode; as the last expression this also displays the architecture
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=1536, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [149]:
# Score the model at every threshold, one report section per threshold
report_sections = []
for threshold in thresholds:
    scores_dict = evaluate_model(model=model, dataloader=dataloader, confidence=threshold)
    report_sections.append(pretty_print_metrics(scores_dict, threshold) + "\n")
metrics_string = "".join(report_sections)

# Persist the combined report, then show it in the notebook
write_eval_to_file(file_path=eval_file, report_string=metrics_string)
print(metrics_string)

Threshold=0.1
0.0: {'precision': 0.9023668639053254, 'recall': 0.664488017429194, 'f1-score': 0.7653701380175658, 'support': 459}
1.0: {'precision': 0.6130653266331658, 'recall': 0.8808664259927798, 'f1-score': 0.722962962962963, 'support': 277}
accuracy: 0.7459239130434783

Threshold=0.2
0.0: {'precision': 0.8640776699029126, 'recall': 0.775599128540305, 'f1-score': 0.817451205510907, 'support': 459}
1.0: {'precision': 0.6820987654320988, 'recall': 0.7978339350180506, 'f1-score': 0.7354409317803662, 'support': 277}
accuracy: 0.7839673913043478

Threshold=0.3
0.0: {'precision': 0.835920177383592, 'recall': 0.8213507625272332, 'f1-score': 0.8285714285714286, 'support': 459}
1.0: {'precision': 0.712280701754386, 'recall': 0.7328519855595668, 'f1-score': 0.7224199288256228, 'support': 277}
accuracy: 0.7880434782608695

Threshold=0.4
0.0: {'precision': 0.813141683778234, 'recall': 0.8627450980392157, 'f1-score': 0.8372093023255816, 'support': 459}
1.0: {'precision': 0.7469879518072289, 're

## 2.3 Mean Features

In [150]:
# Checkpoint to evaluate and the feature method it was trained on
model_name = "13-06-2023_05-33_mean_128x64x32x16_lr_0.001_batch-size_8_shuffled_f1_0.71"
feature_method = "mean"

# Hidden layer sizes and batch size, matching the values in the checkpoint name
hidden_dims = [128, 64, 32, 16]
batch_size = 8

In [151]:
# Text file that receives this model's evaluation report
eval_file = "/".join((evals_dir, feature_method, f"{model_name}.txt"))

In [152]:
# Inspect dataset and dataloader
dataset = method_to_split_to_dataset[feature_method][GOLD]
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)
# Draw one batch and reuse it, so both printed shapes describe the same batch
# and only one iterator is created.
sample_batch = next(iter(dataloader))
print(f"Number of test examples: {len(dataset)}")
print(f"Shape of features in batch: {sample_batch[0].shape}")
print(f"Shape of labels in batch: {sample_batch[1].shape}")

Number of test examples: 736
Shape of features in batch: torch.Size([8, 768])
Shape of labels in batch: torch.Size([8])


In [153]:
# Instantiate model
# Path of the saved state dict (a file, despite the variable's name)
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width from the first example of `dataset`, then hidden, then output
input_dim = [len(dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU load on a machine without CUDA
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to inference mode; as the last expression this also displays the architecture
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [154]:
# Score the model at every threshold, one report section per threshold
report_sections = []
for threshold in thresholds:
    scores_dict = evaluate_model(model=model, dataloader=dataloader, confidence=threshold)
    report_sections.append(pretty_print_metrics(scores_dict, threshold) + "\n")
metrics_string = "".join(report_sections)

# Persist the combined report, then show it in the notebook
write_eval_to_file(file_path=eval_file, report_string=metrics_string)
print(metrics_string)

Threshold=0.1
0.0: {'precision': 0.8773006134969326, 'recall': 0.6230936819172114, 'f1-score': 0.7286624203821657, 'support': 459}
1.0: {'precision': 0.5780487804878048, 'recall': 0.855595667870036, 'f1-score': 0.6899563318777293, 'support': 277}
accuracy: 0.7105978260869565

Threshold=0.2
0.0: {'precision': 0.8575581395348837, 'recall': 0.6427015250544662, 'f1-score': 0.734744707347447, 'support': 459}
1.0: {'precision': 0.5816326530612245, 'recall': 0.8231046931407943, 'f1-score': 0.6816143497757847, 'support': 277}
accuracy: 0.7105978260869565

Threshold=0.3
0.0: {'precision': 0.848314606741573, 'recall': 0.6579520697167756, 'f1-score': 0.7411042944785277, 'support': 459}
1.0: {'precision': 0.5868421052631579, 'recall': 0.8050541516245487, 'f1-score': 0.6788432267884323, 'support': 277}
accuracy: 0.7133152173913043

Threshold=0.4
0.0: {'precision': 0.8415300546448088, 'recall': 0.6710239651416122, 'f1-score': 0.7466666666666667, 'support': 459}
1.0: {'precision': 0.5918918918918918,