## 0. Imports and Constants
- Select the user and the dataset version in the `SELECT` block of the cell below before proceeding

In [17]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import json
from PIL import Image
import os
import sys
import copy
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append(os.path.join(os.getcwd(),"src"))
from vanilla_dataset import VanillaDataset
from vanilla_nn import VanillaNN
from trainer import Trainer
###################################################


####################### CONSTANTS ########################
SPLITS = ["train", "dev", "test"]
TRAIN, DEV, TEST, TXT, IMG = "train", "dev", "test", "txt", "img"
FE_METHODS = ["txt_embeddings", "img_embeddings", "concat", "sum", "mean", "hadamard"]
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]
##########################################################

############## DATA SCIENCE & ML MODULES #################
from transformers import CLIPTokenizerFast, CLIPProcessor, CLIPModel
import torch
import pandas as pd
from scipy import stats
from sklearn.svm import SVC
from sklearn.metrics import classification_report
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[2] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

if user in users[:2]:
    data_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_{version}"
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"

else:
    data_dir = f"/home/jockl/Insync/check.worthiness@gmail.com/Google Drive/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"

features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = f"{cw_dir}/models/vanillannWithVal"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Load the Datasets

In [13]:
split_to_labels = {split: 
                   np.load(f"{labels_dir}/{split}_labels_{dataset_version}.pickle", allow_pickle=True) 
                   for split in SPLITS}

method_to_split_to_data = {fe_method: {
                                    split: 
                                    np.load(f"{features_dir}/{fe_method}/{fe_method}_{split}_{dataset_version}.pickle", allow_pickle=True) 
                                    for split in SPLITS
                                    } 
                            for fe_method in FE_METHODS}

method_to_split_to_dataset = {fe_method: {
                                        split:
                                        VanillaDataset(method_to_split_to_data[fe_method][split], split_to_labels[split]) 
                                        for split in SPLITS
                                        } 
                                for fe_method in FE_METHODS}

# 2. Perform hyperparameter search on a model architecture with a selected method

In [4]:
# Select method 
dataset_method = "txt_embeddings"
dataset_dict = method_to_split_to_dataset[dataset_method]

# Select model architecture and model class
model = VanillaNN

# Instantiate a trainer on the selected model class, model architecture, and dataset
trainer_1 = Trainer(model, dataset_method, dataset_dict)

In [5]:
# Train model with selected architecture on the selected method
model_init_params = [64, 32]
learning_rates = [1e-5 , 5e-5, 1e-4]
batch_sizes = [8, 16]
device = "cpu"
trainer_1.hyperparameter_search(model_init_params = model_init_params,
                                device = device,
                                learning_rates = learning_rates,
                                batch_sizes = batch_sizes,
                                num_epochs = 500)

 12%|█▏        | 61/500 [00:48<05:45,  1.27it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Early quitting at epoch: 61


 15%|█▌        | 76/500 [00:19<01:46,  3.98it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Early quitting at epoch: 76


  3%|▎         | 16/500 [00:09<04:33,  1.77it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Early quitting at epoch: 16


  5%|▍         | 23/500 [00:06<02:04,  3.82it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Early quitting at epoch: 23


  2%|▏         | 10/500 [00:04<03:58,  2.06it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Early quitting at epoch: 10


  3%|▎         | 13/500 [00:03<02:03,  3.95it/s]

Early quitting at epoch: 13





In [14]:
# Select another method 
dataset_method = "concat"
dataset_dict = method_to_split_to_dataset[dataset_method]

# Select model architecture and model class
model = VanillaNN

# Instantiate a trainer on the selected model class, model architecture, and dataset
trainer_2 = Trainer(model, dataset_method, dataset_dict)

In [15]:
# Train model with selected architecture on the selected method
model_init_params = [64,32]
learning_rates = [1e-6]
batch_sizes = [16]
device = "cpu"
trainer_2.hyperparameter_search(model_init_params = model_init_params,
                                device = device,
                                learning_rates = learning_rates,
                                batch_sizes = batch_sizes,
                                num_epochs = 500)

 72%|███████▏  | 360/500 [02:42<01:03,  2.21it/s]

Early quitting at epoch: 360





# 3. Save trained models

In [16]:
models_dir = f"{cw_dir}/models/tests_jonas"
trainer_2.save_trained_models(models_dir)

# 4. Evaluate model on gold test set