In [1]:
from collections import deque

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import nltk
from torch.utils.data import Dataset
import pickle

from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *



In [2]:
class TestDataSet(Dataset):
    """Dataset that pairs each dataframe row with pre-computed image and text features.

    Every entry of ``image_features_files`` / ``text_features_files`` is a pickle
    file holding a mapping. Image mappings are indexed by the row's ``image``
    value and text mappings by the row's ``id`` value.

    Args:
        df: DataFrame providing the ``id``, ``cleaned_text`` and ``image`` columns.
        image_features_files: Paths of pickled image-feature mappings.
        text_features_files: Paths of pickled text-feature mappings.
    """

    def __init__(self, df, image_features_files, text_features_files):
        super(TestDataSet, self).__init__()
        self.data_df = df

        # One mapping per file, kept in the order the paths were given so that
        # position i corresponds to the ``*_features_{i}`` key of each sample.
        self.image_features_dicts = [self._load_pickle(path) for path in image_features_files]
        self.text_features_dicts = [self._load_pickle(path) for path in text_features_files]

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at ``path``."""
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at feature files produced by a trusted pipeline.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return the feature dictionary for row ``idx``.

        The result contains ``image_features_{i}`` for every image mapping,
        ``text_features_{i}`` for every text mapping, plus ``id`` and ``text``.
        """
        row = self.data_df.iloc[idx]
        sample_id = row['id']
        text = row['cleaned_text']
        image_name = row['image']

        features = {}
        for position, image_features in enumerate(self.image_features_dicts):
            features[f'image_features_{position}'] = image_features[image_name]

        # Iterate over the text mappings themselves: the number of text feature
        # sets is independent of the number of image feature sets.
        for position, text_features in enumerate(self.text_features_dicts):
            features[f'text_features_{position}'] = text_features[sample_id]

        features['id'] = sample_id
        features['text'] = text

        return features

### ViT + OpenAI Small

In [78]:
from modules.nn.MultiModal import MultiModalBaseline

# Baseline model sized for 512-d image features and 1536-d text features.
vit_openai_small = MultiModalBaseline(img_feature_size=512, text_feature_size=1536)
# Alternative checkpoint (disabled): models/subtask2a/ViT-OpenAI-Small/vit_openai_small.pt
# map_location="cpu" lets the checkpoint load on hosts without the device it was
# saved from; every evaluation in this notebook runs on CPU.
vit_openai_small.load_state_dict(
    torch.load("models/subtask2a/MultiModal-OpenAI-Small/splendid-sweep-4.pth", map_location="cpu")
)

<All keys matched successfully>

### ViT + OpenAI Large

In [81]:
from modules.nn.MultiModal import MultiModalBaseline

# Baseline model sized for 512-d image features and 3072-d text features.
vit_openai_large = MultiModalBaseline(img_feature_size=512, text_feature_size=3072)
# Alternative checkpoint (disabled): models/subtask2a/ViT-OpenAI-Large/vit_openai_large.pt
# map_location="cpu" lets the checkpoint load on hosts without the device it was
# saved from; every evaluation in this notebook runs on CPU.
vit_openai_large.load_state_dict(
    torch.load("models/subtask2a/MultiModal-OpenAI-Large/fresh-sweep-3.pth", map_location="cpu")
)

<All keys matched successfully>

### ViT + OpenAI Large + NER

In [65]:
from modules.nn.MultiModal import MultiModalNER

# NOTE(review): the positional sizes presumably are image (512), text (3072) and
# NER (768) feature dimensions — confirm against MultiModalNER's signature.
openai_large_ner = MultiModalNER(512, 3072, 768)

# map_location="cpu" lets the checkpoint load on hosts without the device it was
# saved from; every evaluation in this notebook runs on CPU.
openai_large_ner.load_state_dict(
    torch.load("models/subtask2a/MultiModal-OpenAI-Large-NER/polar-sweep-1.pth", map_location="cpu")
)

<All keys matched successfully>

In [6]:
from torch.utils.data import DataLoader

In [12]:
import torch
import numpy as np
import json
from tqdm import tqdm
import subprocess

def evaluate_models(models, dataloader, pred_file_path, gold_file_path, 
                    evaluator_script_path, id2leaf_label, device, validation=False, format=None,
                    threshold=0.3):
    """Run a soft-voting ensemble over ``dataloader``, write predictions, optionally score them.

    For every batch each model is called on its own feature set
    (``image_features_{i}`` / ``text_features_{i}`` for the model at position
    ``i``), the per-level outputs are averaged across models, and levels 3-5
    are thresholded and converted to labels with ``get_labels``.

    Args:
        models: Sequence of models. The model at position 2, if present, is
            called with three inputs: ``text_features_1``, ``image_features_2``
            and ``text_features_2`` (the latter passed as NER features).
        dataloader: Iterable of batches (dicts) containing ``id`` plus the
            feature keys above; with ``validation=True`` also
            ``level_1_target`` .. ``level_5_target``.
        pred_file_path: Destination of the JSON prediction file.
        gold_file_path: Gold label file handed to the scorer; ``None`` skips scoring.
        evaluator_script_path: Scorer script executed with ``python3``.
        id2leaf_label: Passed through to ``get_labels``.
        device: Device on which models and features are placed.
        validation: When true, accumulate the hierarchical loss and return its
            mean over batches.
        format: Passed through to ``get_labels``.
        threshold: Averaged outputs strictly above this value count as positive.

    Returns:
        Mean per-batch loss when ``validation`` is true, otherwise ``None``.
    """
    num_levels = 5       # number of per-level outputs taken from every model
    ner_model_index = 2  # position in ``models`` of the three-input (NER) model

    # Inputs are moved to ``device`` below, so the models must live there too;
    # evaluation mode disables training-only behaviour.
    for model in models:
        model.to(device)
        model.eval()
    
    predictions = []
    total_loss = 0 if validation else None
    # The loss object is only needed (and only built) when a loss is requested.
    HL = (
        HierarchicalLoss(id2label=id2label_subtask_2a, hierarchical_labels=hierarchy_subtask_2a,
                         persuasion_techniques=persuasion_techniques_2a, device=device)
        if validation else None
    )
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            raw_ids = batch['id']
            # ``id`` arrives either as a plain list or as a tensor, depending on its type.
            ids = raw_ids if isinstance(raw_ids, list) else raw_ids.detach().cpu().numpy().tolist()
            
            # Collect predictions from all models
            batch_predictions = []
            for idx, model in enumerate(models):
                img_features = batch[f'image_features_{idx}'].to(device)
                if idx != ner_model_index:
                    text_features = batch[f'text_features_{idx}'].to(device)
                    batch_predictions.append(model(text_features, img_features))
                else:
                    # The NER model reuses the previous slot's text embedding and
                    # takes its own slot's features as the extra NER input.
                    text_features = batch[f'text_features_{idx - 1}'].to(device)
                    ner_features = batch[f'text_features_{idx}'].to(device)
                    batch_predictions.append(model(text_features, img_features, ner_features))
            
            # Soft voting: average each level's output across models. The outputs
            # are averaged and thresholded as-is (no softmax/sigmoid is applied here).
            # NOTE(review): this presumes the models already emit probabilities — confirm.
            avg_preds = [
                torch.stack([model_preds[level] for model_preds in batch_predictions]).mean(0)
                for level in range(num_levels)
            ]
            
            pred_1, pred_2, pred_3, pred_4, pred_5 = avg_preds
            
            if validation:
                targets = [batch[f'level_{level}_target'].to(device) for level in range(1, num_levels + 1)]
                
                dloss = HL.calculate_dloss(avg_preds, targets)
                lloss = HL.calculate_lloss(avg_preds, targets)
                total_loss += (dloss + lloss).item()

            # Threshold predictions for classification
            pred_3 = (pred_3 > threshold).int().cpu().numpy()
            pred_4 = (pred_4 > threshold).int().cpu().numpy()
            pred_5 = (pred_5 > threshold).int().cpu().numpy()
            predictions += get_labels(id2leaf_label, ids, pred_3, pred_4, pred_5, format)

    # Writing JSON data
    with open(pred_file_path, 'w') as f:
        json.dump(predictions, f, indent=4)
    
    # Score against the gold labels with the external scorer, when available.
    if gold_file_path is not None:
        command = ["python3", evaluator_script_path, "--gold_file_path", gold_file_path, "--pred_file_path", pred_file_path]
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode == 0:
            print("Output:\n", result.stdout)
        else:
            print("Error:\n", result.stderr)
    
    if validation:
        return total_loss / len(dataloader)


### MultiModal OpenAI Large and MultiModal OpenAI Small

In [82]:
# `ar` test split: score the two-model ensemble (small + large text embeddings).
ar_pred_file_path = './Predictions/subtask2a/ar_predictions_subtask2a.txt'
ar_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_ar.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

# One entry per model, in the same order as `models` below; both models read
# the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/ar_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/ar_test_images_features.pkl']

text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/ar_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/ar_test_text_features.pkl']

ar_test_data = process_json(ar_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)
ar_test_dataset = TestDataSet(ar_test_data, image_features_files, text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large],
                dataloader=ar_test_dataloader, pred_file_path=ar_pred_file_path,
                gold_file_path=ar_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=5, validation=False, threshold=0.35)

100%|██████████| 2/2 [00:00<00:00, 23.77it/s]


Output:
 f1_h=0.53272	prec_h=0.49157	rec_h=0.58140


In [83]:
# `bg` test split: score the two-model ensemble (small + large text embeddings).
bg_pred_file_path = './Predictions/subtask2a/bg_predictions_subtask2a.txt'
bg_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_bg.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

bg_test_data = process_json(bg_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)

# One entry per model, in the same order as `models` below; both models read
# the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl']

text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/bg_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/bg_test_text_features.pkl']

bg_test_dataset = TestDataSet(df=bg_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large],
                dataloader=bg_test_dataloader, pred_file_path=bg_pred_file_path,
                gold_file_path=bg_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 7/7 [00:00<00:00, 26.37it/s]


Output:
 f1_h=0.65156	prec_h=0.65382	rec_h=0.64931


In [84]:
# `md` test split: score the two-model ensemble (small + large text embeddings).
md_pred_file_path = './Predictions/subtask2a/md_predictions_subtask2a.txt'
md_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_md.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

md_test_data = process_json(md_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)

# One entry per model, in the same order as `models` below.
# NOTE(review): image features use the `nm` prefix while labels and text
# features use `md` — confirm they describe the same split.
image_features_files = ['ImageFeatures/CLIP-ViT/nm_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/nm_test_images_features.pkl']
text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/md_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/md_test_text_features.pkl']

md_test_dataset = TestDataSet(df=md_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large],
                dataloader=md_test_dataloader, pred_file_path=md_pred_file_path,
                gold_file_path=md_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 5/5 [00:00<00:00, 26.15it/s]


Output:
 f1_h=0.69425	prec_h=0.74139	rec_h=0.65275


In [55]:
# `en` test split (unlabeled): write two-model ensemble predictions only.
# gold_file_path=None below means the local scorer is not run.
en_pred_file_path = './Predictions/subtask2a/en_predictions_subtask2a.txt'

evaluator_script = './scorer-baseline/subtask_1_2a.py'

en_test_data = process_test_json('test_data/english/en_subtask2a_test_unlabeled.json')

# One entry per model, in the same order as `models` below; both models read
# the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/english_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/english_test_images_features.pkl']

text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/en_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/en_test_text_features.pkl']

en_test_dataset = TestDataSet(df=en_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large],
                dataloader=en_test_dataloader, pred_file_path=en_pred_file_path,
                evaluator_script_path=evaluator_script, gold_file_path=None,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 24/24 [00:02<00:00, 11.69it/s]


Score for the above submission: f1_h=0.68318	prec_h=0.71913	rec_h=0.65065

### MultiModal OpenAI Large + MultiModal OpenAI Small + OpenAI Large with NER

In [85]:
# `ar` test split: score the three-model ensemble (small, large, large + NER).
ar_pred_file_path = './Predictions/subtask2a/ar_predictions_subtask2a.txt'
ar_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_ar.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

# One entry per model, in the same order as `models` below; all three models
# read the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/ar_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/ar_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/ar_test_images_features.pkl']

# The third entry holds the NER features consumed by the third model.
text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/ar_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/ar_test_text_features.pkl',
                       'TextFeatures/subtask2a/multilingual-ner/ar_test_text_features.pkl']

ar_test_data = process_json(ar_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)
ar_test_dataset = TestDataSet(ar_test_data, image_features_files, text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large, openai_large_ner],
                dataloader=ar_test_dataloader, pred_file_path=ar_pred_file_path,
                gold_file_path=ar_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=5, validation=False, threshold=0.35)

100%|██████████| 2/2 [00:00<00:00,  8.68it/s]


Output:
 f1_h=0.53378	prec_h=0.53601	rec_h=0.53156


In [57]:
# `bg` test split: score the three-model ensemble (small, large, large + NER).
bg_pred_file_path = './Predictions/subtask2a/bg_predictions_subtask2a.txt'
bg_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_bg.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

bg_test_data = process_json(bg_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)

# One entry per model, in the same order as `models` below; all three models
# read the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl']

# The third entry holds the NER features consumed by the third model.
text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/bg_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/bg_test_text_features.pkl',
                       'TextFeatures/subtask2a/multilingual-ner/bg_test_text_features.pkl']

bg_test_dataset = TestDataSet(df=bg_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large, openai_large_ner],
                dataloader=bg_test_dataloader, pred_file_path=bg_pred_file_path,
                gold_file_path=bg_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 7/7 [00:00<00:00, 14.48it/s]


Output:
 f1_h=0.65638	prec_h=0.67555	rec_h=0.63828


In [58]:
# `md` test split: score the three-model ensemble (small, large, large + NER).
md_pred_file_path = './Predictions/subtask2a/md_predictions_subtask2a.txt'
md_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask2a_md.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

md_test_data = process_json(md_gold_file_path, techniques_to_level_2a, hierarchy_subtask_2a)

# One entry per model, in the same order as `models` below.
# NOTE(review): image features use the `nm` prefix while labels and text
# features use `md` — confirm they describe the same split.
image_features_files = ['ImageFeatures/CLIP-ViT/nm_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/nm_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/nm_test_images_features.pkl']

# The third entry holds the NER features consumed by the third model.
text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/md_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/md_test_text_features.pkl',
                       'TextFeatures/subtask2a/multilingual-ner/md_test_text_features.pkl']

md_test_dataset = TestDataSet(df=md_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large, openai_large_ner],
                dataloader=md_test_dataloader, pred_file_path=md_pred_file_path,
                gold_file_path=md_gold_file_path, evaluator_script_path=evaluator_script,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 5/5 [00:00<00:00, 14.03it/s]


Output:
 f1_h=0.69844	prec_h=0.76676	rec_h=0.64130


In [60]:
# `en` test split (unlabeled): write three-model ensemble predictions only.
# gold_file_path=None below means the local scorer is not run.
en_pred_file_path = './Predictions/subtask2a/en_predictions_subtask2a.txt'

evaluator_script = './scorer-baseline/subtask_1_2a.py'

en_test_data = process_test_json('test_data/english/en_subtask2a_test_unlabeled.json')

# One entry per model, in the same order as `models` below; all three models
# read the same image feature file.
image_features_files = ['ImageFeatures/CLIP-ViT/english_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/english_test_images_features.pkl',
                        'ImageFeatures/CLIP-ViT/english_test_images_features.pkl']

# The third entry holds the NER features consumed by the third model.
text_features_files = ['TextFeatures/subtask2a/text-embedding-3-small/en_test_text_features.pkl',
                       'TextFeatures/subtask2a/text-embedding-3-large/en_test_text_features.pkl',
                       'TextFeatures/subtask2a/multilingual-ner/en_test_text_features.pkl']

en_test_dataset = TestDataSet(df=en_test_data, image_features_files=image_features_files,
                              text_features_files=text_features_files)

# shuffle=False: shuffling serves no purpose at evaluation time, and a fixed
# order keeps the written prediction file identical between runs.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=64, shuffle=False)

evaluate_models(models=[vit_openai_small, vit_openai_large, openai_large_ner],
                dataloader=en_test_dataloader, pred_file_path=en_pred_file_path,
                evaluator_script_path=evaluator_script, gold_file_path=None,
                device=torch.device('cpu'),
                id2leaf_label=id2leaf_label_subtask_2a, format=None, validation=False, threshold=0.3)

100%|██████████| 24/24 [00:01<00:00, 13.97it/s]


The score for the above submission is f1_h=0.69666	prec_h=0.73742	rec_h=0.66018