# 0. Imports and costants

In [1]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import json
from PIL import Image
import os
import sys
import copy
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
##################################################

############## DATA SCIENCE & ML MODULES ##########
from scipy.spatial.distance import cosine
###################################################

############## CONSTANTS ##########################
user = "patriziopalmisano" #"onurdenizguler"
dataset_directory = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_v2"
###################################################

# 1. Load and separate embeddings

Load embeddings from train/dev/test sets and separate them into two lists each based on the respective label

In [2]:
keys = ['train', 'dev', 'test']
embeddings_txt_yes = {}
embeddings_txt_no = {}
embeddings_img_yes = {}
embeddings_img_no = {}

for key in keys:
    embeddings_file = os.path.join(dataset_directory, f"embeddings_{key}_v2.pickle")
    with open(embeddings_file, 'rb') as f:
        embeddings_dict = pickle.load(f)
    
    jsonl_file = os.path.join(dataset_directory, f"CT23_1A_checkworthy_multimodal_english_{key}.jsonl")
    with open(jsonl_file, 'r') as f:
        json_data = f.readlines()
    
    mask = []

    for json_object in json_data:
        tweet = json.loads(json_object)
        class_label = tweet['class_label']

        if class_label == 'Yes':
            mask.append(True)
        else:
            mask.append(False)
            
    embeddings_txt_yes[key] = [embedding for embedding, mask_value in zip(embeddings_dict['txt'], mask) if mask_value]
    embeddings_txt_no[key] = [embedding for embedding, mask_value in zip(embeddings_dict['txt'], mask) if not mask_value]
    embeddings_img_yes[key] = [embedding for embedding, mask_value in zip(embeddings_dict['img'], mask) if mask_value]
    embeddings_img_no[key] = [embedding for embedding, mask_value in zip(embeddings_dict['img'], mask) if not mask_value]

    print("Length of " + key)
    print(len(embeddings_txt_yes[key]))
    print(len(embeddings_txt_no[key]))
    print(len(embeddings_img_yes[key]))
    print(len(embeddings_img_no[key]))

Length of train
820
1536
820
1536
Length of dev
87
184
87
184
Length of test
174
374
174
374


# 2. Compare embeddings

In [3]:
similarity_yes = []
similarity_no = []
for key in keys:
    # Calculate cosine similarity between images and texts of the 'Yes' class
    similarity_yes = np.mean([cosine(embeddings_img_yes[key][i], embeddings_txt_yes[key][i]) for i in range(len(embeddings_img_yes[key]))])

    # Calculate cosine similarity between images and texts of the 'No' class
    similarity_no = np.mean([cosine(embeddings_img_no[key][i], embeddings_txt_no[key][i]) for i in range(len(embeddings_img_no[key]))])
    
    print(key + ": ")
    print("yes: " + str(similarity_yes))
    print("no: " + str(similarity_no))

train: 
yes: 0.7470897004190015
no: 0.7546117095528947
dev: 
yes: 0.7413280693621471
no: 0.7521881027876035
test: 
yes: 0.7426705300893591
no: 0.7587956940147648


# 3. Conclusions
No significant difference is present between the cosine similarity of the Yes and No classes of tweets