In [50]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import json
from PIL import Image
import os
import sys
import copy
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
 ##################################################

############## TASK-SPECIFIC MODULES ##############
# Make the `src` directory under the current working directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_src_dir = os.path.join(os.getcwd(), "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
#import
###################################################


############## DATA SCIENCE & ML MODULES #################
from transformers import CLIPTokenizerFast, CLIPProcessor, CLIPModel
import torch
import pandas as pd
from scipy import stats
from sklearn.svm import SVC
from sklearn.metrics import classification_report
##########################################################


####################### CONSTANTS ########################
# Known user names; the selected one decides which Drive mount point is used below.
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
# Dataset splits; used as keys/path components when loading labels and features.
SPLITS = ["train", "dev", "test"]
TRAIN, DEV, TEST, TXT, IMG = "train", "dev", "test", "txt", "img"
# Names of the engineered feature sets; each one is also a sub-directory of
# features_dir and a file-name prefix.
FE_METHODS = ["txt_embeddings", "img_embeddings", "concat", "sum", "mean", "hadamard"]
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]
##########################################################


####################### SELECT ###########################
user = users[1] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

# Root of the shared Drive folder. The first two users share the macOS
# CloudStorage layout; any other selection falls back to the Insync mount
# under /home/jockl.
if user in users[:2]:
    # NOTE: this value keeps its trailing slash so that any code concatenating
    # onto cw_dir directly continues to see the same string.
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/"
else:
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"

# Derive every other directory from cw_dir. os.path.join copes with the
# optional trailing slash, so models_dir no longer contains a doubled "//"
# on the macOS branch, and the Drive prefix is written only once per branch.
data_dir = os.path.join(cw_dir, "data", f"CT23_1A_checkworthy_multimodal_english_{dataset_version}")
features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = os.path.join(cw_dir, "models", "vanillann")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# Load the pickled labels for every split in SPLITS, keyed by split name.
# (Presumably each file holds a NumPy array; np.load returns whatever object
# was pickled -- TODO confirm.)
split_to_labels = {
    split_name: np.load(
        f"{labels_dir}/{split_name}_labels_{dataset_version}.pickle",
        allow_pickle=True,
    )
    for split_name in SPLITS
}
# Load every engineered feature matrix: outer key is the split, inner key is
# the feature-engineering method from FE_METHODS.
split_to_method_to_matrix = {
    split_name: {
        method_name: np.load(
            f"{features_dir}/{method_name}/{method_name}_{split_name}_{dataset_version}.pickle",
            allow_pickle=True,
        )
        for method_name in FE_METHODS
    }
    for split_name in SPLITS
}

In [55]:
# Fit one SVC per (kernel, feature set) pair on the train split and print a
# classification report for its predictions on the test split.
kernels = ['linear', 'rbf', 'poly']
# The [:2] slice means only 'linear' and 'rbf' are evaluated; 'poly' is skipped.
for kernel in kernels[:2]:
    for method in FE_METHODS:
        # fit() returns the estimator itself, so clf is the trained model.
        clf = SVC(kernel=kernel, C=1, random_state=0).fit(
            split_to_method_to_matrix[TRAIN][method],
            split_to_labels[TRAIN],
        )
        predicted = clf.predict(split_to_method_to_matrix[TEST][method])
        labels = split_to_labels[TEST]
        print(f"Features: {method}, kernel: {kernel} \n")
        print(classification_report(labels, predicted))

Features: txt_embeddings, kernel: linear 

              precision    recall  f1-score   support

           0       0.85      0.80      0.83       374
           1       0.62      0.70      0.66       174

    accuracy                           0.77       548
   macro avg       0.74      0.75      0.74       548
weighted avg       0.78      0.77      0.77       548

Features: img_embeddings, kernel: linear 

              precision    recall  f1-score   support

           0       0.81      0.70      0.75       374
           1       0.50      0.66      0.57       174

    accuracy                           0.68       548
   macro avg       0.66      0.68      0.66       548
weighted avg       0.71      0.68      0.69       548

Features: concat, kernel: linear 

              precision    recall  f1-score   support

           0       0.84      0.79      0.81       374
           1       0.59      0.67      0.63       174

    accuracy                           0.75       548
   macr