# 0. Imports and Constants
- Select user before proceeding

In [1]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append(os.path.join(os.getcwd(),"src"))
from vanilla_dataset import VanillaDataset
from vanilla_nn import VanillaNN
from trainer import Trainer
from eval import evaluate_model, write_eval_to_file, pretty_print_metrics, cross_validate_thresholds
###################################################


####################### CONSTANTS ########################
# Names of the individual splits and modalities.
TRAIN = "train"
DEV = "dev"
TEST = "test"
GOLD = "gold"
TXT = "txt"
IMG = "img"

# Every split for which labels and features are loaded below.
SPLITS = ["train", "dev", "test", "gold"]

# Feature-engineering methods; each one is used as a sub-directory name
# and as a file-name prefix when the feature pickles are loaded.
FE_METHODS = [
    "txt_embeddings",
    "img_embeddings",
    "concat",
    "sum",
    "mean",
    "hadamard",
]
# Additional `*_cos` variants, currently disabled:
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]

# Seed handed to KFold as random_state so the folds are reproducible.
RANDOM_SEED = 42
##########################################################

############## DATA SCIENCE & ML MODULES #################
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[2] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

# The first two users read the shared drive from a mount below /Users/<user>;
# any other selection falls back to the Insync folder below /home/jockl.
if user not in users[:2]:
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"
    data_dir = f"/home/jockl/Insync/check.worthiness@gmail.com/Google Drive/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"

else:
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"
    data_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_{version}"

# Inputs: features and labels live inside the versioned dataset folder.
labels_dir = f"{data_dir}/labels"
features_dir = f"{data_dir}/features"

# Outputs: trained checkpoints and the threshold cross-validation reports.
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
evals_dir = f"{models_dir}/threshold_cross_validation"

# 1. Load the Datasets

In [2]:
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load files from a trusted source.

# Labels per split. The gold file carries no dataset-version suffix; all other splits do.
split_to_labels = {
    split: np.load(
        f"{labels_dir}/{split}_labels.pickle"
        if split == "gold"
        else f"{labels_dir}/{split}_labels_{dataset_version}.pickle",
        allow_pickle=True,
    )
    for split in SPLITS
}

# Features per feature method and split, following the same naming rule as the labels.
method_to_split_to_data = {
    fe_method: {
        split: np.load(
            f"{features_dir}/{fe_method}/{fe_method}_{split}.pickle"
            if split == "gold"
            else f"{features_dir}/{fe_method}/{fe_method}_{split}_{dataset_version}.pickle",
            allow_pickle=True,
        )
        for split in SPLITS
    }
    for fe_method in FE_METHODS
}

# Pair every feature array with the labels of its split.
method_to_split_to_dataset = {
    fe_method: {
        split: VanillaDataset(split_to_data[split], split_to_labels[split])
        for split in SPLITS
    }
    for fe_method, split_to_data in method_to_split_to_data.items()
}

# 2. Cross-Validation of Best Models

In this section, we cross-validate the models that yielded the best validation accuracy during training. We cross-validate the best model for each of the following feature methods:

- Txt Embeddings
- Concat Features
- Mean Features

Those best models come from the runs saved and TensorBoard-logged under

- "prototyping/CIMC-XX_hyperparam-search-vanillann/runs".

The aim of the cross-validation is to optimize our models' prediction thresholds. The default prediction threshold the models were trained on is 0.5. Given the class imbalance of our dataset, however, it is reasonable to check if other thresholds perform better. Hence, the following routine is performed for every considered model:

- For each of the k cross-validation folds, the model is evaluated at every candidate threshold.
- We keep the best-performing threshold for each fold.
- We average the k best thresholds; this mean is the final best threshold.

Finally:

- Every model (with its best threshold according to cross-val) is evaluated on the gold test set.

In [3]:
# Settings shared by all cross-validation runs below
k = 6                                    # number of folds
shuffle = True                           # passed to both KFold and the gold DataLoader
output_dim = [1]                         # width of the output layer, appended to the layer sizes
thresholds = np.arange(10, 60, 5) / 100  # candidate thresholds 0.10, 0.15, ..., 0.55

## 2.1 Txt Embeddings

In [4]:
# Configuration of the best txt-embeddings model
feature_method = "txt_embeddings"  # selects the datasets and the models sub-directory
hidden_dims = [32, 16]             # hidden layer widths, cf. "32x16" in the model name
batch_size = 16                    # cf. "batch-size_16" in the model name
# Checkpoint file name without the ".pt" extension
model_name = "13-06-2023_01-03_txt_embeddings_32x16_lr_1e-05_batch-size_16_shuffled_f1_0.76"

In [5]:
# File to store eval data: one text report per feature method,
# named after the number of folds and the evaluated model.
eval_file = f"{evals_dir}/{feature_method}/{k}_fold_{model_name}.txt"

In [6]:
# Pool the train and dev splits into one dataset for k-fold cross-validation
train_dataset, dev_dataset = (
    method_to_split_to_dataset[feature_method][split_name] for split_name in (TRAIN, DEV)
)
splits = KFold(n_splits=k, shuffle=shuffle, random_state=RANDOM_SEED)
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Sanity check: the pooled size should be the sum of the two parts
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [7]:
# Instantiate model
# Despite its name, model_dir is the path of the checkpoint file itself.
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width (length of the first element of the first example,
# presumably the feature vector), then the hidden widths, then the output width.
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved on a GPU be restored on a machine
# without CUDA; the model built above is never moved off the CPU.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [8]:
# Threshold search on every fold: yields a printable per-fold report
# and the mean of the best thresholds found on the individual folds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.3
0.0: {'precision': 0.9125, 'recall': 0.8295454545454546, 'f1-score': 0.869047619047619, 'support': 264}
1.0: {'precision': 0.7727272727272727, 'recall': 0.8793103448275862, 'f1-score': 0.8225806451612904, 'support': 174}
accuracy: 0.8493150684931506

Best on fold 1: Threshold=0.35
0.0: {'precision': 0.9221789883268483, 'recall': 0.8172413793103448, 'f1-score': 0.8665447897623401, 'support': 290}
1.0: {'precision': 0.7071823204419889, 'recall': 0.8648648648648649, 'f1-score': 0.778115501519757, 'support': 148}
accuracy: 0.8333333333333334

Best on fold 2: Threshold=0.45
0.0: {'precision': 0.8829787234042553, 'recall': 0.89568345323741, 'f1-score': 0.8892857142857142, 'support': 278}
1.0: {'precision': 0.8141025641025641, 'recall': 0.79375, 'f1-score': 0.8037974683544304, 'support': 160}
accuracy: 0.8584474885844748

Best on fold 3: Threshold=0.3
0.0: {'precision': 0.9540229885057471, 'recall': 0.7955271565495208, 'f1-score': 0.867595818815331, 'support': 31

In [9]:
# Evaluate the best threshold on gold
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# NOTE(review): shuffling is probably unnecessary for evaluation — confirm before changing.
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=shuffle)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
# Drop a gold section appended by an earlier run of this cell, so that re-running
# the cell does not duplicate it in the report; on the first run this is a no-op.
report_string = report_string.partition("\n\nPERFORMANCE ON GOLD SET")[0]
report_string += f"\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.375
0.0: {'precision': 0.8032786885245902, 'recall': 0.8540305010893247, 'f1-score': 0.8278775079197466, 'support': 459}
1.0: {'precision': 0.7298387096774194, 'recall': 0.6534296028880866, 'f1-score': 0.6895238095238094, 'support': 277}
accuracy: 0.7785326086956522



## 2.2 Concat Features

In [10]:
# Configuration of the best concat-features model
feature_method = "concat"     # selects the datasets and the models sub-directory
hidden_dims = [128, 64, 32]   # hidden layer widths, cf. "128x64x32" in the model name
batch_size = 64               # cf. "batch-size_64" in the model name
# Checkpoint file name without the ".pt" extension
model_name = "13-06-2023_03-03_concat_128x64x32_lr_0.0001_batch-size_64_shuffled_f1_0.75"

In [11]:
# File to store eval data: one text report per feature method,
# named after the number of folds and the evaluated model.
eval_file = f"{evals_dir}/{feature_method}/{k}_fold_{model_name}.txt"

In [12]:
# Pool the train and dev splits into one dataset for k-fold cross-validation
train_dataset, dev_dataset = (
    method_to_split_to_dataset[feature_method][split_name] for split_name in (TRAIN, DEV)
)
splits = KFold(n_splits=k, shuffle=shuffle, random_state=RANDOM_SEED)
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Sanity check: the pooled size should be the sum of the two parts
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [13]:
# Instantiate model
# Despite its name, model_dir is the path of the checkpoint file itself.
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width (length of the first element of the first example,
# presumably the feature vector), then the hidden widths, then the output width.
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved on a GPU be restored on a machine
# without CUDA; the model built above is never moved off the CPU.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=1536, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [14]:
# Threshold search on every fold: yields a printable per-fold report
# and the mean of the best thresholds found on the individual folds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.45
0.0: {'precision': 0.9581749049429658, 'recall': 0.9545454545454546, 'f1-score': 0.9563567362428842, 'support': 264}
1.0: {'precision': 0.9314285714285714, 'recall': 0.9367816091954023, 'f1-score': 0.9340974212034384, 'support': 174}
accuracy: 0.9474885844748858

Best on fold 1: Threshold=0.5
0.0: {'precision': 0.9423728813559322, 'recall': 0.9586206896551724, 'f1-score': 0.9504273504273504, 'support': 290}
1.0: {'precision': 0.916083916083916, 'recall': 0.8851351351351351, 'f1-score': 0.9003436426116838, 'support': 148}
accuracy: 0.9337899543378996

Best on fold 2: Threshold=0.5
0.0: {'precision': 0.950354609929078, 'recall': 0.9640287769784173, 'f1-score': 0.9571428571428572, 'support': 278}
1.0: {'precision': 0.9358974358974359, 'recall': 0.9125, 'f1-score': 0.9240506329113924, 'support': 160}
accuracy: 0.9452054794520548

Best on fold 3: Threshold=0.35
0.0: {'precision': 0.9796610169491525, 'recall': 0.9233226837060703, 'f1-score': 0.9506578947368421,

In [15]:
# Evaluate the best threshold on gold
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# NOTE(review): shuffling is probably unnecessary for evaluation — confirm before changing.
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=shuffle)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
# Drop a gold section appended by an earlier run of this cell, so that re-running
# the cell does not duplicate it in the report; on the first run this is a no-op.
report_string = report_string.partition("\n\nPERFORMANCE ON GOLD SET")[0]
report_string += f"\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.4666666666666666
0.0: {'precision': 0.8047808764940239, 'recall': 0.8801742919389978, 'f1-score': 0.8407908428720082, 'support': 459}
1.0: {'precision': 0.7649572649572649, 'recall': 0.6462093862815884, 'f1-score': 0.700587084148728, 'support': 277}
accuracy: 0.7921195652173914



## 2.3 Mean Features

In [16]:
# Configuration of the best mean-features model
feature_method = "mean"           # selects the datasets and the models sub-directory
hidden_dims = [128, 64, 32, 16]   # hidden layer widths, cf. "128x64x32x16" in the model name
batch_size = 8                    # cf. "batch-size_8" in the model name
# Checkpoint file name without the ".pt" extension
model_name = "13-06-2023_05-33_mean_128x64x32x16_lr_0.001_batch-size_8_shuffled_f1_0.71"

In [17]:
# File to store eval data: one text report per feature method,
# named after the number of folds and the evaluated model.
eval_file = f"{evals_dir}/{feature_method}/{k}_fold_{model_name}.txt"

In [18]:
# Pool the train and dev splits into one dataset for k-fold cross-validation
train_dataset, dev_dataset = (
    method_to_split_to_dataset[feature_method][split_name] for split_name in (TRAIN, DEV)
)
splits = KFold(n_splits=k, shuffle=shuffle, random_state=RANDOM_SEED)
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Sanity check: the pooled size should be the sum of the two parts
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [19]:
# Instantiate model
# Despite its name, model_dir is the path of the checkpoint file itself.
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"
# Layer sizes: input width (length of the first element of the first example,
# presumably the feature vector), then the hidden widths, then the output width.
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved on a GPU be restored on a machine
# without CUDA; the model built above is never moved off the CPU.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [20]:
# Threshold search on every fold: yields a printable per-fold report
# and the mean of the best thresholds found on the individual folds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.55
0.0: {'precision': 0.9879032258064516, 'recall': 0.928030303030303, 'f1-score': 0.9570312499999999, 'support': 264}
1.0: {'precision': 0.9, 'recall': 0.9827586206896551, 'f1-score': 0.9395604395604396, 'support': 174}
accuracy: 0.9497716894977168

Best on fold 1: Threshold=0.5
0.0: {'precision': 0.9819494584837545, 'recall': 0.9379310344827586, 'f1-score': 0.9594356261022928, 'support': 290}
1.0: {'precision': 0.8881987577639752, 'recall': 0.9662162162162162, 'f1-score': 0.9255663430420712, 'support': 148}
accuracy: 0.9474885844748858

Best on fold 2: Threshold=0.55
0.0: {'precision': 0.9925925925925926, 'recall': 0.9640287769784173, 'f1-score': 0.9781021897810218, 'support': 278}
1.0: {'precision': 0.9404761904761905, 'recall': 0.9875, 'f1-score': 0.9634146341463415, 'support': 160}
accuracy: 0.9726027397260274

Best on fold 3: Threshold=0.45
0.0: {'precision': 0.9931506849315068, 'recall': 0.9265175718849841, 'f1-score': 0.9586776859504132, 'support': 3

In [21]:
# Evaluate the best threshold on gold
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# NOTE(review): shuffling is probably unnecessary for evaluation — confirm before changing.
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=shuffle)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
# Drop a gold section appended by an earlier run of this cell, so that re-running
# the cell does not duplicate it in the report; on the first run this is a no-op.
report_string = report_string.partition("\n\nPERFORMANCE ON GOLD SET")[0]
report_string += f"\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.5166666666666667
0.0: {'precision': 0.8302872062663186, 'recall': 0.6928104575163399, 'f1-score': 0.7553444180522565, 'support': 459}
1.0: {'precision': 0.6005665722379604, 'recall': 0.7653429602888087, 'f1-score': 0.673015873015873, 'support': 277}
accuracy: 0.720108695652174

