# 0. Imports and Constants
- Select user before proceeding

In [20]:
############## AUTORELOAD MAGIC ###################
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `ml` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
from ml.datasets import VanillaDataset
from ml.models import VanillaNN
from ml.trainer import Trainer
from ml.eval import evaluate_model, write_eval_to_file, pretty_print_metrics, cross_validate_thresholds
###################################################


####################### CONSTANTS ########################
# Names of the dataset splits
TRAIN = "train"
DEV = "dev"
TEST = "test"
GOLD = "gold"
SPLITS = [TRAIN, DEV, TEST, GOLD]

# Short tags (not referenced in the cells visible in this notebook section)
TXT = "txt"
IMG = "img"

# Feature-extraction methods; each one names a sub-folder of the features directory
FE_METHODS = [
    "txt_embeddings",
    "img_embeddings",
    "concat",
    "sum",
    "mean",
    "hadamard",
    "paraphrase",
    "paraphrase_with_ocr",
]
# Additional `*_cos` variants, currently disabled:
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]

# Seed passed to KFold so the shuffled folds are reproducible
RANDOM_SEED = 42
##########################################################

############## DATA SCIENCE & ML MODULES #################
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[2] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

# Root of the shared drive, defined once per branch.
# The first two users read it from /Users/<user>/Library/CloudStorage/...;
# any other user from an Insync folder under /home/<user>.
if user in users[:2]:
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"
else:
    cw_dir = f"/home/{user}/Insync/check.worthiness@gmail.com/Google Drive"

# Everything else hangs off the drive root; the data folder is keyed by dataset_version in both branches.
data_dir = f"{cw_dir}/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"

features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
evals_dir = f"{models_dir}/threshold_cross_validation"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Load the Datasets

In [None]:
# NOTE: np.load(..., allow_pickle=True) unpickles the files, which can execute arbitrary code;
# only point labels_dir / features_dir at trusted data.

# Labels per split. Files of the gold split carry no dataset-version suffix.
split_to_labels = {}
for split in SPLITS:
    if split != "gold":
        labels_path = f"{labels_dir}/{split}_labels_{dataset_version}.pickle"
    else:
        labels_path = f"{labels_dir}/{split}_labels.pickle"
    split_to_labels[split] = np.load(labels_path, allow_pickle=True)

# Features per feature-extraction method and split, following the same naming rule.
method_to_split_to_data = {}
for fe_method in FE_METHODS:
    split_to_data = {}
    for split in SPLITS:
        if split != "gold":
            features_path = f"{features_dir}/{fe_method}/{fe_method}_{split}_{dataset_version}.pickle"
        else:
            features_path = f"{features_dir}/{fe_method}/{fe_method}_{split}.pickle"
        split_to_data[split] = np.load(features_path, allow_pickle=True)
    method_to_split_to_data[fe_method] = split_to_data

# Pair every feature set with the labels of its split.
method_to_split_to_dataset = {}
for fe_method, split_to_data in method_to_split_to_data.items():
    method_to_split_to_dataset[fe_method] = {
        split: VanillaDataset(split_to_data[split], split_to_labels[split])
        for split in SPLITS
    }

# 2. Cross-Validation of Best Models

In this section, we cross-validate the models that yielded the best validation accuracy during training. We do this for the best model of each of the following feature methods:

- Txt Embeddings
- Concat Features
- Mean Features
- Paraphrase Tweet Features
- Paraphrase Tweet + OCR

Those best models come from the runs saved and TensorBoard-logged under

- "data/runs".

The aim of the cross-validation is to optimize our models' prediction thresholds. The default prediction threshold the models were trained on is 0.5. Given the class imbalance of our dataset, however, it is reasonable to check if other thresholds perform better. Hence, the following routine is performed for every considered model:

- For each of the k cross-validation folds, the model is evaluated at every candidate threshold.
- We keep the best-performing threshold for each fold.
- We average the k best thresholds -> this is the final best threshold.

Finally:

- Every model (with its best threshold according to cross-val) is evaluated on the gold test set.

In [37]:
# Settings shared by every threshold cross-validation below
k = 3  # number of folds
shuffle = True  # shuffle the samples before KFold splits them
output_dim = [1]  # kept as a list so it can be concatenated with the other layer sizes
thresholds = np.arange(10, 60, 5) / 100  # candidate thresholds 0.10, 0.15, ..., 0.55

## 2.1 Txt Embeddings

In [30]:
# Best txt_embeddings run: feature method, then the layer sizes and batch size
# that match the values in the checkpoint name
feature_method = "txt_embeddings"
hidden_dims = [64, 32, 128]
batch_size = 8
model_name = "12-06-2023_23-58_txt_embeddings_64x32x128_lr_1e-06_batch-size_8_shuffled_f1_0.74"

In [31]:
# Report destination: <evals_dir>/<feature method>/<model name>/<k>_fold_<model name>.txt
eval_file = "/".join([evals_dir, feature_method, model_name, f"{k}_fold_{model_name}.txt"])

In [32]:
# Build the cross-validation pool: train and dev splits of the chosen feature method, concatenated
datasets_by_split = method_to_split_to_dataset[feature_method]
train_dataset, dev_dataset = datasets_by_split[TRAIN], datasets_by_split[DEV]
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Fold generator; seeded so the shuffled folds are the same on every run
splits = KFold(random_state=RANDOM_SEED, shuffle=shuffle, n_splits=k)

# Inspect datasets
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [33]:
# Rebuild the architecture and restore the trained weights
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"  # path of the checkpoint file
# Input size = length of the first element of the first sample (presumably the feature vector)
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [34]:
# Per fold, find the threshold with the best f1 score; keep the text report and the mean of those thresholds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.35
0.0: {'precision': 0.888235294117647, 'recall': 0.8176895306859205, 'f1-score': 0.8515037593984962, 'support': 554}
1.0: {'precision': 0.7240437158469946, 'recall': 0.8229813664596274, 'f1-score': 0.7703488372093024, 'support': 322}
accuracy: 0.819634703196347

Best on fold 1: Threshold=0.35
0.0: {'precision': 0.9090909090909091, 'recall': 0.8291032148900169, 'f1-score': 0.8672566371681416, 'support': 591}
1.0: {'precision': 0.7002967359050445, 'recall': 0.8280701754385965, 'f1-score': 0.7588424437299035, 'support': 285}
accuracy: 0.8287671232876712

Best on fold 2: Threshold=0.3
0.0: {'precision': 0.9423868312757202, 'recall': 0.7965217391304348, 'f1-score': 0.8633364750235626, 'support': 575}
1.0: {'precision': 0.699228791773779, 'recall': 0.9066666666666666, 'f1-score': 0.7895500725689405, 'support': 300}
accuracy: 0.8342857142857143

Mean of best thresholds: 0.3333333333333333
Mean of best f1-scores: 0.7729137845027155


In [35]:
# Evaluate on the gold split using the mean of the per-fold best thresholds
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# Evaluation needs no shuffling; a fixed order keeps the run deterministic
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results.
# The combined report goes into its own variable instead of `report_string +=`, so
# re-running this cell does not append the gold section to the saved report twice.
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
full_report_string = f"{report_string}\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=full_report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.3333333333333333
0.0: {'precision': 0.81419624217119, 'recall': 0.8496732026143791, 'f1-score': 0.8315565031982942, 'support': 459}
1.0: {'precision': 0.7315175097276264, 'recall': 0.6787003610108303, 'f1-score': 0.704119850187266, 'support': 277}
accuracy: 0.7853260869565217



## 2.2 Concat Features

In [43]:
# Best concat run: feature method, then the layer sizes and batch size
# that match the values in the checkpoint name
feature_method = "concat"
hidden_dims = [128, 64, 32, 16]
batch_size = 16
model_name = "13-06-2023_02-15_concat_128x64x32x16_lr_1e-06_batch-size_16_shuffled_f1_0.72"

In [44]:
# Report destination: <evals_dir>/<feature method>/<model name>/<k>_fold_<model name>.txt
eval_file = "/".join([evals_dir, feature_method, model_name, f"{k}_fold_{model_name}.txt"])

In [45]:
# Build the cross-validation pool: train and dev splits of the chosen feature method, concatenated
datasets_by_split = method_to_split_to_dataset[feature_method]
train_dataset, dev_dataset = datasets_by_split[TRAIN], datasets_by_split[DEV]
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Fold generator; seeded so the shuffled folds are the same on every run
splits = KFold(random_state=RANDOM_SEED, shuffle=shuffle, n_splits=k)

# Inspect datasets
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [46]:
# Rebuild the architecture and restore the trained weights
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"  # path of the checkpoint file
# Input size = length of the first element of the first sample (presumably the feature vector)
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=1536, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [47]:
# Per fold, find the threshold with the best f1 score; keep the text report and the mean of those thresholds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.35
0.0: {'precision': 0.9212598425196851, 'recall': 0.8447653429602888, 'f1-score': 0.8813559322033898, 'support': 554}
1.0: {'precision': 0.7663043478260869, 'recall': 0.8757763975155279, 'f1-score': 0.817391304347826, 'support': 322}
accuracy: 0.8561643835616438

Best on fold 1: Threshold=0.45
0.0: {'precision': 0.917098445595855, 'recall': 0.8984771573604061, 'f1-score': 0.9076923076923078, 'support': 591}
1.0: {'precision': 0.797979797979798, 'recall': 0.8315789473684211, 'f1-score': 0.8144329896907218, 'support': 285}
accuracy: 0.8767123287671232

Best on fold 2: Threshold=0.4
0.0: {'precision': 0.9287054409005628, 'recall': 0.8608695652173913, 'f1-score': 0.8935018050541517, 'support': 575}
1.0: {'precision': 0.7660818713450293, 'recall': 0.8733333333333333, 'f1-score': 0.8161993769470405, 'support': 300}
accuracy: 0.8651428571428571

Mean of best thresholds: 0.4000000000000001
Mean of best f1-scores: 0.8160078903285294


In [48]:
# Evaluate on the gold split using the mean of the per-fold best thresholds
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# Evaluation needs no shuffling; a fixed order keeps the run deterministic
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results.
# The combined report goes into its own variable instead of `report_string +=`, so
# re-running this cell does not append the gold section to the saved report twice.
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
full_report_string = f"{report_string}\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=full_report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.4000000000000001
0.0: {'precision': 0.8048780487804879, 'recall': 0.8627450980392157, 'f1-score': 0.832807570977918, 'support': 459}
1.0: {'precision': 0.7418032786885246, 'recall': 0.6534296028880866, 'f1-score': 0.6948176583493281, 'support': 277}
accuracy: 0.7839673913043478



## 2.3 Mean Features

In [49]:
# Best mean run: feature method, then the layer sizes and batch size
# that match the values in the checkpoint name
feature_method = "mean"
hidden_dims = [128, 64, 32]
batch_size = 8
model_name = "13-06-2023_05-44_mean_128x64x32_lr_1e-06_batch-size_8_shuffled_f1_0.67"

In [50]:
# Report destination: <evals_dir>/<feature method>/<model name>/<k>_fold_<model name>.txt
eval_file = "/".join([evals_dir, feature_method, model_name, f"{k}_fold_{model_name}.txt"])

In [51]:
# Build the cross-validation pool: train and dev splits of the chosen feature method, concatenated
datasets_by_split = method_to_split_to_dataset[feature_method]
train_dataset, dev_dataset = datasets_by_split[TRAIN], datasets_by_split[DEV]
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Fold generator; seeded so the shuffled folds are the same on every run
splits = KFold(random_state=RANDOM_SEED, shuffle=shuffle, n_splits=k)

# Inspect datasets
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [52]:
# Rebuild the architecture and restore the trained weights
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"  # path of the checkpoint file
# Input size = length of the first element of the first sample (presumably the feature vector)
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [53]:
# Per fold, find the threshold with the best f1 score; keep the text report and the mean of those thresholds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.3
0.0: {'precision': 0.9102844638949672, 'recall': 0.7509025270758123, 'f1-score': 0.8229475766567755, 'support': 554}
1.0: {'precision': 0.6706443914081146, 'recall': 0.8726708074534162, 'f1-score': 0.7584345479082322, 'support': 322}
accuracy: 0.795662100456621

Best on fold 1: Threshold=0.4
0.0: {'precision': 0.8981818181818182, 'recall': 0.8358714043993232, 'f1-score': 0.8659070990359333, 'support': 591}
1.0: {'precision': 0.7024539877300614, 'recall': 0.8035087719298246, 'f1-score': 0.7495908346972178, 'support': 285}
accuracy: 0.8253424657534246

Best on fold 2: Threshold=0.3
0.0: {'precision': 0.9368421052631579, 'recall': 0.7739130434782608, 'f1-score': 0.8476190476190476, 'support': 575}
1.0: {'precision': 0.675, 'recall': 0.9, 'f1-score': 0.7714285714285714, 'support': 300}
accuracy: 0.8171428571428572

Mean of best thresholds: 0.3333333333333333
Mean of best f1-scores: 0.7598179846780071


In [54]:
# Evaluate on the gold split using the mean of the per-fold best thresholds
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# Evaluation needs no shuffling; a fixed order keeps the run deterministic
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results.
# The combined report goes into its own variable instead of `report_string +=`, so
# re-running this cell does not append the gold section to the saved report twice.
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
full_report_string = f"{report_string}\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=full_report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.3333333333333333
0.0: {'precision': 0.8347826086956521, 'recall': 0.8366013071895425, 'f1-score': 0.8356909684439608, 'support': 459}
1.0: {'precision': 0.7282608695652174, 'recall': 0.7256317689530686, 'f1-score': 0.7269439421338155, 'support': 277}
accuracy: 0.7948369565217391



## 2.4 Paraphrase Model Tweet Features

In [55]:
# Best paraphrase run: feature method, then the layer sizes and batch size
# that match the values in the checkpoint name
feature_method = "paraphrase"
hidden_dims = [32, 16]
batch_size = 16
model_name = "10-07-2023_22-48_paraphrase_32x16_lr_1e-06_batch-size_16_shuffled_f1_0.60"

In [56]:
# Report destination: <evals_dir>/<feature method>/<model name>/<k>_fold_<model name>.txt
eval_file = "/".join([evals_dir, feature_method, model_name, f"{k}_fold_{model_name}.txt"])

In [57]:
# Build the cross-validation pool: train and dev splits of the chosen feature method, concatenated
datasets_by_split = method_to_split_to_dataset[feature_method]
train_dataset, dev_dataset = datasets_by_split[TRAIN], datasets_by_split[DEV]
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Fold generator; seeded so the shuffled folds are the same on every run
splits = KFold(random_state=RANDOM_SEED, shuffle=shuffle, n_splits=k)

# Inspect datasets
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [58]:
# Rebuild the architecture and restore the trained weights
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"  # path of the checkpoint file
# Input size = length of the first element of the first sample (presumably the feature vector)
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=384, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [59]:
# Per fold, find the threshold with the best f1 score; keep the text report and the mean of those thresholds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best on fold 0: Threshold=0.35
0.0: {'precision': 0.8059149722735675, 'recall': 0.7870036101083032, 'f1-score': 0.7963470319634702, 'support': 554}
1.0: {'precision': 0.6477611940298508, 'recall': 0.6739130434782609, 'f1-score': 0.6605783866057839, 'support': 322}
accuracy: 0.7454337899543378

Best on fold 1: Threshold=0.4
0.0: {'precision': 0.8456375838926175, 'recall': 0.8527918781725888, 'f1-score': 0.8491996630160068, 'support': 591}
1.0: {'precision': 0.6892857142857143, 'recall': 0.6771929824561403, 'f1-score': 0.6831858407079646, 'support': 285}
accuracy: 0.795662100456621

Best on fold 2: Threshold=0.35
0.0: {'precision': 0.8419117647058824, 'recall': 0.7965217391304348, 'f1-score': 0.8185880250223414, 'support': 575}
1.0: {'precision': 0.6465256797583081, 'recall': 0.7133333333333334, 'f1-score': 0.6782884310618067, 'support': 300}
accuracy: 0.768

Mean of best thresholds: 0.3666666666666667
Mean of best f1-scores: 0.6740175527918518


In [60]:
# Evaluate on the gold split using the mean of the per-fold best thresholds
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# Evaluation needs no shuffling; a fixed order keeps the run deterministic
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results.
# The combined report goes into its own variable instead of `report_string +=`, so
# re-running this cell does not append the gold section to the saved report twice.
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
full_report_string = f"{report_string}\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=full_report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.3666666666666667
0.0: {'precision': 0.7340425531914894, 'recall': 0.9019607843137255, 'f1-score': 0.8093841642228738, 'support': 459}
1.0: {'precision': 0.7383720930232558, 'recall': 0.4584837545126354, 'f1-score': 0.5657015590200446, 'support': 277}
accuracy: 0.7350543478260869



## 2.5 Paraphrase Model Tweet + OCR Features

In [62]:
# Best paraphrase_with_ocr run: feature method, then the layer sizes and batch size
# that match the values in the checkpoint name
feature_method = "paraphrase_with_ocr"
hidden_dims = [32, 16]
batch_size = 16
model_name = "11-07-2023_00-12_paraphrase_with_ocr_32x16_lr_1e-06_batch-size_16_shuffled_f1_0.63"

In [63]:
# Report destination: <evals_dir>/<feature method>/<model name>/<k>_fold_<model name>.txt
eval_file = "/".join([evals_dir, feature_method, model_name, f"{k}_fold_{model_name}.txt"])

In [64]:
# Build the cross-validation pool: train and dev splits of the chosen feature method, concatenated
datasets_by_split = method_to_split_to_dataset[feature_method]
train_dataset, dev_dataset = datasets_by_split[TRAIN], datasets_by_split[DEV]
cross_val_dataset = ConcatDataset([train_dataset, dev_dataset])

# Fold generator; seeded so the shuffled folds are the same on every run
splits = KFold(random_state=RANDOM_SEED, shuffle=shuffle, n_splits=k)

# Inspect datasets
print(f"Number of train examples: {len(train_dataset)}")
print(f"Number of dev examples: {len(dev_dataset)}")
print(f"Number of cross-val examples: {len(cross_val_dataset)}")

Number of train examples: 2356
Number of dev examples: 271
Number of cross-val examples: 2627


In [65]:
# Rebuild the architecture and restore the trained weights
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"  # path of the checkpoint file
# Input size = length of the first element of the first sample (presumably the feature vector)
input_dim = [len(cross_val_dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only machine;
# load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=384, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [66]:
# Per fold, find the threshold with the best f1 score; keep the text report and the mean of those thresholds
report_string, mean_best_threshold = cross_validate_thresholds(
    model=model,
    cross_val_dataset=cross_val_dataset,
    splits=splits,
    thresholds=thresholds,
    batch_size=batch_size,
)
print(report_string)

Best on fold 0: Threshold=0.3
0.0: {'precision': 0.8343949044585988, 'recall': 0.7093862815884476, 'f1-score': 0.766829268292683, 'support': 554}
1.0: {'precision': 0.6024691358024692, 'recall': 0.7577639751552795, 'f1-score': 0.671251719394773, 'support': 322}
accuracy: 0.7271689497716894

Best on fold 1: Threshold=0.35
0.0: {'precision': 0.8655616942909761, 'recall': 0.7952622673434856, 'f1-score': 0.8289241622574955, 'support': 591}
1.0: {'precision': 0.6366366366366366, 'recall': 0.743859649122807, 'f1-score': 0.6860841423948221, 'support': 285}
accuracy: 0.7785388127853882

Best on fold 2: Threshold=0.3
0.0: {'precision': 0.8763557483731019, 'recall': 0.7026086956521739, 'f1-score': 0.7799227799227799, 'support': 575}
1.0: {'precision': 0.5869565217391305, 'recall': 0.81, 'f1-score': 0.680672268907563, 'support': 300}
accuracy: 0.7394285714285714

Mean of best thresholds: 0.31666666666666665
Mean of best f1-scores: 0.6793360435657195


In [67]:
# Evaluate on the gold split using the mean of the per-fold best thresholds
eval_dataset = method_to_split_to_dataset[feature_method][GOLD]
# Evaluation needs no shuffling; a fixed order keeps the run deterministic
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
metrics_dict = evaluate_model(model=model, dataloader=eval_dataloader, confidence=mean_best_threshold)

# Print and save results.
# The combined report goes into its own variable instead of `report_string +=`, so
# re-running this cell does not append the gold section to the saved report twice.
gold_result_string = f"PERFORMANCE ON GOLD SET\n{pretty_print_metrics(metrics_dict, mean_best_threshold)}"
full_report_string = f"{report_string}\n\n{gold_result_string}"
write_eval_to_file(file_path=eval_file, report_string=full_report_string)
print(gold_result_string)

PERFORMANCE ON GOLD SET
Threshold=0.31666666666666665
0.0: {'precision': 0.7854166666666667, 'recall': 0.8213507625272332, 'f1-score': 0.8029818956336527, 'support': 459}
1.0: {'precision': 0.6796875, 'recall': 0.628158844765343, 'f1-score': 0.6529080675422139, 'support': 277}
accuracy: 0.748641304347826

