# 0. Imports and Constants
- Select user before proceeding

In [84]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
from ml.datasets import VanillaDataset
from ml.models import VanillaNN
from ml.trainer import Trainer
from ml.eval import get_tweet_ids_and_pred_from_model, create_submission_file_from_model
###################################################


####################### CONSTANTS ########################
# Individual split names first, then the ordered list of all of them.
TRAIN, DEV, TEST, GOLD = "train", "dev", "test", "gold"
SPLITS = [TRAIN, DEV, TEST, GOLD]

# Short string tags "txt" / "img".
TXT, IMG = "txt", "img"

# Feature-extraction method identifiers; the loading cell uses each one as both
# a sub-directory name and a file-name prefix under the features directory.
FE_METHODS = [
    "txt_embeddings",
    "img_embeddings",
    "concat",
    "sum",
    "mean",
    "hadamard",
    "paraphrase",
    "paraphrase_with_ocr",
]

RANDOM_SEED = 42
##########################################################

############## DATA SCIENCE & ML MODULES #################
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[2] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

# Resolve the shared-drive root (cw_dir) and the versioned dataset folder (data_dir).
# The first two entries of `users` share one path layout keyed by the user name;
# any other selection falls through to the fixed /home/jockl layout.
if user not in users[:2]:
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"
    data_dir = f"/home/jockl/Insync/check.worthiness@gmail.com/Google Drive/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"

else:
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"
    data_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_{version}"

# Inputs stored inside the dataset folder.
labels_dir = f"{data_dir}/labels"
tweet_ids_dir = f"{data_dir}/tweet_ids"
features_dir = f"{data_dir}/features"

# Model checkpoints, evaluation results and submission files stored under the drive root.
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
submissions_dir = f"{cw_dir}/models/submission_files"
evals_dir = f"{models_dir}/threshold_cross_validation"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Load the Datasets

In [75]:
# Every mapping below is keyed by split name. File names carry the dataset
# version suffix for all splits except "gold", whose files have no suffix.

# split -> labels
split_to_labels = {
    split: np.load(
        f"{labels_dir}/{split}_labels.pickle"
        if split == "gold"
        else f"{labels_dir}/{split}_labels_{dataset_version}.pickle",
        allow_pickle=True,
    )
    for split in SPLITS
}

# split -> tweet ids
split_to_tweet_ids = {
    split: np.load(
        f"{tweet_ids_dir}/{split}_tweet_ids.pickle"
        if split == "gold"
        else f"{tweet_ids_dir}/{split}_tweet_ids_{dataset_version}.pickle",
        allow_pickle=True,
    )
    for split in SPLITS
}

# feature-extraction method -> split -> features
method_to_split_to_data = {
    fe_method: {
        split: np.load(
            f"{features_dir}/{fe_method}/{fe_method}_{split}.pickle"
            if split == "gold"
            else f"{features_dir}/{fe_method}/{fe_method}_{split}_{dataset_version}.pickle",
            allow_pickle=True,
        )
        for split in SPLITS
    }
    for fe_method in FE_METHODS
}

# feature-extraction method -> split -> VanillaDataset(features, labels, tweet ids)
method_to_split_to_dataset = {
    fe_method: {
        split: VanillaDataset(split_to_data[split], split_to_labels[split], split_to_tweet_ids[split])
        for split in SPLITS
    }
    for fe_method, split_to_data in method_to_split_to_data.items()
}

# 2. Create the Submission File From a Model

In [87]:
# Set model properties

# Which checkpoint to load and which feature set it belongs to.
model_name = "13-06-2023_05-33_mean_128x64x32x16_lr_0.001_batch-size_8_shuffled_f1_0.71"
run_id = model_name  # the run id is simply the checkpoint name
feature_method = "mean"

# Layer widths, kept as lists because they are concatenated with the input
# width when the network is instantiated.
hidden_dims = [128, 64, 32, 16]
output_dim = [1]

# Inference settings.
batch_size = 8
confidence = 0.33333  # passed as `confidence` to the ml.eval helpers; presumably the decision threshold - confirm there

In [77]:
# Set up gold dataset and gold dataloader
dataset = method_to_split_to_dataset[feature_method][GOLD]
# shuffle=False keeps the batches in dataset order.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

# Fetch the first batch once and reuse it for all shape reports, instead of
# building a fresh iterator (and re-collating the batch) for every print.
# Each batch is indexed as (features, labels, tweet_ids).
first_batch = next(iter(dataloader))
print(f"Number of test examples: {len(dataset)}")
print(f"Shape of features in batch: {first_batch[0].shape}")
print(f"Shape of labels in batch: {first_batch[1].shape}")
print(f"Shape of tweet_ids in batch: {first_batch[2].shape}")

Number of test examples: 736
Shape of features in batch: torch.Size([8, 768])
Shape of labels in batch: torch.Size([8])
Shape of tweet_ids in batch: torch.Size([8])


In [78]:
# Instantiate the model
# NOTE: despite its name, `model_dir` is the path of the checkpoint *file*.
model_dir = f"{models_dir}/{feature_method}/{model_name}.pt"

# Layer widths for the network: input width (length of the first example's
# feature vector), then the hidden widths, then the output width.
input_dim = [len(dataset[0][0])]
init_params = input_dim + hidden_dims + output_dim
model = VanillaNN(init_params)

# map_location="cpu" lets a checkpoint saved from a CUDA device load on a
# machine without a GPU; the model above is created on the CPU anyway.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(model_dir, map_location="cpu"))

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

VanillaNN(
  (sequence): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [81]:
# Set submission file
# Target path: <submissions_dir>/<model_name>/subtask1A_english.tsv
submission_file = "/".join([submissions_dir, model_name, "subtask1A_english.tsv"])

In [90]:
# Collect the (tweet_id, prediction) pairs for the gold set into a list so the
# notebook displays them as the cell output.
[
    *get_tweet_ids_and_pred_from_model(
        model=model,
        dataloader=dataloader,
        confidence=confidence,
    )
]

[(1196687191747039232, 'No'),
 (981622723934859264, 'Yes'),
 (1473613189573058561, 'No'),
 (1222918922535604227, 'Yes'),
 (1164297159807852547, 'No'),
 (1221942799165247488, 'No'),
 (1386219570571194368, 'No'),
 (1222491887313018880, 'Yes'),
 (1222938447805079553, 'Yes'),
 (946308084368060416, 'Yes'),
 (1222483271679971330, 'Yes'),
 (1380548489046761473, 'No'),
 (1222394564725465088, 'Yes'),
 (1022193728247746561, 'Yes'),
 (1294608049899020293, 'No'),
 (1222672493292507140, 'Yes'),
 (1379012469075673089, 'Yes'),
 (1225418650569650177, 'Yes'),
 (974661838641270784, 'Yes'),
 (1474716201729277953, 'No'),
 (1432186935125159936, 'No'),
 (1247504779154243584, 'Yes'),
 (1047418463323545600, 'No'),
 (1215040506390073344, 'Yes'),
 (1222512061856567296, 'Yes'),
 (1128792024751255556, 'No'),
 (1039844252338073600, 'Yes'),
 (1340989825541935104, 'No'),
 (1379898742011260932, 'No'),
 (1381239983353311240, 'No'),
 (1245125842449154048, 'Yes'),
 (1034002753906003973, 'Yes'),
 (1249939118013874177, 'N

In [92]:
# Run the loaded model over the gold dataloader and write the submission file
# to `submission_file`, tagged with `run_id`.
create_submission_file_from_model(
    model=model,
    dataloader=dataloader,
    confidence=confidence,
    file_path=submission_file,
    run_id=run_id,
)

Created submission file at: /home/jockl/Insync/check.worthiness@gmail.com/Google Drive/models/submission_files/13-06-2023_05-33_mean_128x64x32x16_lr_0.001_batch-size_8_shuffled_f1_0.71/subtask1A_english.tsv
