In [1]:
from collections import deque

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import nltk
from torch.utils.data import Dataset
import pickle

from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *

# SubTask 1

In [2]:
class DataSet(Dataset):
    """Labelled dataset that pairs DataFrame rows with pre-computed text features.

    Each item is a dict holding the row's ``id``, its ``cleaned_text``, the
    cached feature entry for that id, and one multi-hot target tensor per
    hierarchy level (``Level 1`` ... ``Level 5``).

    Args:
        df: DataFrame with the columns ``id``, ``cleaned_text`` and
            ``Level 1`` ... ``Level 5`` (each level cell is a collection of labels).
        labels_at_level: mapping ``'Level k' -> {label: index}`` used to place
            each label in the target vector.
        features_file: path to a pickle file containing a mapping ``id -> features``.
    """

    def __init__(self, df, labels_at_level, features_file):
        super().__init__()
        self.data_df = df
        self.labels_at_level = labels_at_level
        self.features_file = features_file
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at feature files produced by a trusted pipeline.
        with open(features_file, 'rb') as handle:
            self.features_dict = pickle.load(handle)

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return the sample dict for positional row ``idx``."""
        row = self.data_df.iloc[idx]
        sample_id = row['id']
        sample_text = row['cleaned_text']

        # One multi-hot target per hierarchy level, keyed 'level_<k>_target'.
        targets = {
            f'level_{depth}_target': self.encode(row[f'Level {depth}'], depth)
            for depth in range(1, 6)
        }

        return {
            'id': sample_id,
            'text': sample_text,
            'text_features': self.features_dict[sample_id],
            **targets,
        }

    def encode(self, labels, level):
        """Build the multi-hot target for ``labels`` at hierarchy ``level``.

        The vector has one slot per known label of that level plus a final
        extra slot, which is set only when ``labels`` is empty.
        """
        label_to_index = self.labels_at_level[f'Level {level}']
        target = torch.zeros(len(label_to_index) + 1)

        if len(labels) == 0:
            # No label at this level: flag the trailing slot.
            target[-1] = 1
            return target

        for name in labels:
            target[label_to_index[name]] = 1

        return target

In [3]:
class TestDataSet(Dataset):
    """Unlabelled dataset that pairs DataFrame rows with pre-computed text features.

    Args:
        df: DataFrame with the columns ``id`` and ``cleaned_text``.
        features_file: path to a pickle file containing a mapping ``id -> features``.
    """

    def __init__(self, df, features_file):
        super().__init__()
        self.data_df = df
        self.features_file = features_file
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at feature files produced by a trusted pipeline.
        with open(features_file, 'rb') as handle:
            self.features_dict = pickle.load(handle)

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return ``{'id', 'text', 'text_features'}`` for positional row ``idx``."""
        row = self.data_df.iloc[idx]
        sample_id = row['id']
        return {
            'id': sample_id,
            'text': row['cleaned_text'],
            'text_features': self.features_dict[sample_id],
        }

In [4]:
def evaluate_model(model, dataloader, pred_file_path, gold_file_path,
                   evaluator_script_path, id2leaf_label, format=None, validation=False,
                   HL=None, batchsize=None, threshold=0.6):
    """Run ``model`` over ``dataloader``, write its predictions and optionally score them.

    Args:
        model: network called as ``model(embeddings)`` that returns five
            per-level outputs ``(pred_1, ..., pred_5)``.
        dataloader: iterable of dict batches with the keys ``'id'`` and
            ``'text_features'``; when ``validation`` is true each batch must
            also carry ``'level_1_target'`` ... ``'level_5_target'``.
        pred_file_path: file the decoded predictions are dumped to as JSON.
        gold_file_path: gold-label file handed to ``evaluate_h``. When ``None``
            the predictions are written and the function returns without scoring.
        evaluator_script_path: unused; kept so existing call sites keep working.
        id2leaf_label: mapping forwarded to ``get_labels``.
        format: forwarded unchanged to ``get_labels``.
        validation: when true, the hierarchical loss is accumulated as well.
        HL: loss object exposing ``calculate_dloss`` and ``calculate_lloss``;
            required when ``validation`` is true.
        batchsize: unused; kept for backward compatibility. The loss is averaged
            over the number of samples actually evaluated, so a partial final
            batch no longer inflates the denominator.
        threshold: level 3-5 outputs strictly greater than this value are
            turned into positive predictions.

    Returns:
        ``(prec_h, rec_h, f1_h, mean_loss)`` when ``validation`` is true and a
        gold file was given; ``None`` otherwise.

    Raises:
        ValueError: if ``validation`` is true but ``HL`` is ``None``.

    Note:
        ``device``, ``get_labels`` and ``evaluate_h`` are resolved from the
        notebook's global namespace.
    """
    # Imported locally: the notebook only imports json in cells that come later.
    import json

    if validation and HL is None:
        raise ValueError("validation=True requires a loss object to be passed as HL")

    model.eval()
    predictions = []
    total_loss = 0.0
    samples_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            batch_ids = batch['id']
            # Collated ids arrive either as a plain list or as a tensor.
            if isinstance(batch_ids, list):
                ids = batch_ids
            else:
                ids = batch_ids.detach().numpy().tolist()
            samples_seen += len(ids)

            embeddings = batch['text_features'].to(device)
            pred_1, pred_2, pred_3, pred_4, pred_5 = model(embeddings)

            if validation:
                level_preds = [pred_1, pred_2, pred_3, pred_4, pred_5]
                level_targets = [batch[f'level_{level}_target'].to(device) for level in range(1, 6)]

                dloss = HL.calculate_dloss(level_preds, level_targets)
                lloss = HL.calculate_lloss(level_preds, level_targets)
                total_loss += (dloss + lloss).detach().cpu().item()

            # Only levels 3-5 are decoded into labels.
            pred_3 = (pred_3.cpu().detach().numpy() > threshold).astype(int)
            pred_4 = (pred_4.cpu().detach().numpy() > threshold).astype(int)
            pred_5 = (pred_5.cpu().detach().numpy() > threshold).astype(int)

            predictions += get_labels(id2leaf_label, ids, pred_3, pred_4, pred_5, format)

    # Writing JSON data
    with open(pred_file_path, 'w') as f:
        json.dump(predictions, f, indent=4)

    if gold_file_path is None:
        return

    prec_h, rec_h, f1_h = evaluate_h(pred_file_path, gold_file_path)
    print("f1_h={:.5f}\tprec_h={:.5f}\trec_h={:.5f}".format(f1_h, prec_h, rec_h))
    if validation:
        # max(..., 1) guards against an empty dataloader.
        return prec_h, rec_h, f1_h, total_loss / max(samples_seen, 1)
        
    

In [6]:
from torch.utils.data import DataLoader

# Labelled SemEval-2024 subtask 1 splits (paths relative to the notebook).
train_json = './semeval2024_dev_release/subtask1/train.json'
validation_json = './semeval2024_dev_release/subtask1/validation.json'

# process_json, techniques_to_level_1 and hierarchy_1 come from the star imports
# at the top of the notebook. DataSet indexes the result with .iloc, so a
# DataFrame is expected here -- TODO confirm against utils.
train_data = process_json(train_json, techniques_to_level_1, hierarchy_1)
validation_data = process_json(validation_json, techniques_to_level_1, hierarchy_1)


# Pair each split with its pre-computed mBERT feature pickle; both datasets are
# consumed later by the training function.
training_dataset = DataSet(train_data, indexed_persuasion_techniques_1, 
                           './TextFeatures/subtask1a/mBERT/train_text_features.pkl')
validation_dataset = DataSet(validation_data, indexed_persuasion_techniques_1, 
                             './TextFeatures/subtask1a/mBERT/validation_text_features.pkl')

In [5]:
# Ask the project helper for a device (the recorded cell output shows it reporting MPS).
device = get_device()

# The helper's choice is overridden right away, so everything below runs on the CPU.
# NOTE(review): confirm that forcing CPU is intentional; if it is, the call above is redundant.
device = torch.device('cpu')

Using MPS


In [8]:
import wandb

# Authenticate this session with Weights & Biases before registering the sweep.
wandb.login()

# Search space of the sweep. Every entry under 'parameters' is read back from
# wandb.config by the training function.
sweep_config = dict(
    method='bayes',  # Bayesian optimisation over the ranges below
    metric=dict(name='val_loss', goal='minimize'),
    parameters=dict(
        learning_rate=dict(min=1e-5, max=1e-3),
        batch_size=dict(values=[128, 256]),
        optimizer=dict(values=['adam']),
        beta1=dict(min=0.85, max=0.95),  # only relevant for Adam
        # An SGD 'momentum' range (0.8 - 0.99) is not swept: 'sgd' is not among
        # the optimizer values above.
        alpha=dict(min=0.5, max=1.0),
        beta=dict(min=0.5, max=1.0),
        threshold=dict(min=0.6, max=0.8),
    ),
)

# Registers a new sweep; to attach to an existing one, assign its ID instead
# (e.g. sweep_id = '44uz6ydx').
sweep_id = wandb.sweep(sweep_config, project="FineTuning-mBERT-200-epoch")

[34m[1mwandb[0m: Currently logged in as: [33miqbal_shaik[0m ([33mphoenix_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: lftu2ej4
Sweep URL: https://wandb.ai/phoenix_nlp/FineTuning-mBERT-200-epoch/sweeps/lftu2ej4


In [9]:
# Directory prefix for saved weights; the training function appends
# '<run name>.pth' directly, so the trailing slash is required.
models_dir = './models/subtask1a/mBERT-200/'
# Number of epochs each sweep run trains for.
num_epochs = 200

In [10]:
from tqdm import tqdm
import json
import subprocess
from subtask_1_2a import *

from modules.nn.mBERT import mBERT

def train():
    """Train one ``mBERT`` model for a single wandb sweep run and save its weights.

    Hyper-parameters (``batch_size``, ``learning_rate``, ``optimizer``,
    ``beta1``/``momentum``, ``alpha``, ``beta``, ``threshold``) are read from
    ``wandb.config``. After every epoch the model is scored on the validation
    set with ``evaluate_model`` and the metrics are logged to wandb. The final
    state dict is written to ``f"{models_dir}{wandb.run.name}.pth"``.

    Relies on notebook globals: ``training_dataset``, ``validation_dataset``,
    ``device``, ``num_epochs``, ``models_dir``, ``id2label_1``, ``hierarchy_1``,
    ``persuasion_techniques_1`` and ``id2leaf_label``.

    Raises:
        ValueError: if ``config.optimizer`` is neither ``'adam'`` nor ``'sgd'``.
    """
    # Imported locally: nothing else in the notebook imports os.
    import os

    wandb.init()

    # Use WandB configurations
    config = wandb.config
    batch_size = config.batch_size
    learning_rate = config.learning_rate

    train_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation gains nothing from shuffling; a fixed order keeps the
    # prediction file reproducible between epochs and runs.
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    model = mBERT()
    model.to(device)

    if config.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=learning_rate,
            betas=(config.beta1, 0.999)
        )
    elif config.optimizer == 'sgd':
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=learning_rate,
            momentum=config.momentum
        )
    else:
        # Fail here rather than with an AttributeError on None inside the loop.
        raise ValueError(f"Unsupported optimizer: {config.optimizer!r}")

    HL = HierarchicalLoss(id2label=id2label_1, hierarchical_labels=hierarchy_1,
                          persuasion_techniques=persuasion_techniques_1, device=device,
                          alpha=config.alpha, beta=config.beta, threshold=config.threshold)

    # Loop-invariant paths, hoisted out of the epoch loop.
    val_pred_file_path = './Predictions/val_predictions_subtask1.json'
    val_gold_file_path = './semeval2024_dev_release/subtask1/validation.json'
    evaluator_script = './scorer-baseline/subtask_1_2a.py'

    # Create output directories up front so a missing folder cannot abort the
    # run after training has finished.
    pred_dir = os.path.dirname(val_pred_file_path)
    if pred_dir:
        os.makedirs(pred_dir, exist_ok=True)
    if models_dir:
        os.makedirs(models_dir, exist_ok=True)

    # The loader does not drop the last batch, so each epoch sees every sample once.
    num_train_samples = max(len(training_dataset), 1)

    for epoch in range(num_epochs):
        # evaluate_model switches the model to eval mode, so re-enable training mode.
        model.train()
        running_loss = 0.0
        for batch in train_dataloader:
            embeddings = batch['text_features'].to(device)
            targets = [batch[f'level_{level}_target'].to(device) for level in range(1, 6)]

            optimizer.zero_grad()
            pred_1, pred_2, pred_3, pred_4, pred_5 = model(embeddings)
            level_preds = [pred_1, pred_2, pred_3, pred_4, pred_5]

            dloss = HL.calculate_dloss(level_preds, targets)
            lloss = HL.calculate_lloss(level_preds, targets)

            total_loss = lloss + dloss

            total_loss.backward()
            optimizer.step()

            running_loss += total_loss.detach().item()

        # Average over the samples actually seen, not len(loader) * batch_size,
        # which over-counts when the last batch is partial.
        running_loss /= num_train_samples

        # NOTE(review): evaluate_model runs with its default threshold here;
        # config.threshold is only passed to HierarchicalLoss. Confirm this is intended.
        prec_h, rec_h, f1_h, validation_loss = evaluate_model(model, validation_dataloader, val_pred_file_path,
                                         val_gold_file_path, evaluator_script, id2leaf_label,
                                         validation=True, HL=HL, batchsize=batch_size)

        if epoch % 50 == 49:
            print(f'[{epoch+1}/{num_epochs}]')
            print("f1_h={:.5f}\tprec_h={:.5f}\trec_h={:.5f}".format(f1_h, prec_h, rec_h))

        # One log call per epoch: each wandb.log call advances the run's step,
        # so separate calls would place one epoch's metrics on different steps.
        wandb.log({
            "epoch": epoch,
            "train_loss": running_loss,
            "val_loss": validation_loss,
            "h_precision": prec_h,
            "h_recall": rec_h,
            "h_f1-score": f1_h,
        })

    torch.save(model.state_dict(), f"{models_dir}{wandb.run.name}.pth")
    # wandb.join() is deprecated in favour of wandb.finish().
    wandb.finish()

In [11]:
def main():
    """Start a wandb agent that runs ``train`` for up to 20 runs of the sweep ``sweep_id``."""
    wandb.agent(sweep_id, function=train, count=20)


if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Agent Starting Run: 3847loy7 with config:
[34m[1mwandb[0m: 	alpha: 0.6907160287074812
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5609451396964716
[34m[1mwandb[0m: 	beta1: 0.9119580642548668
[34m[1mwandb[0m: 	learning_rate: 0.00025252807109908345
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7710184089072092


[50/200]
f1_h=0.44412	prec_h=0.66815	rec_h=0.33260
[100/200]
f1_h=0.48864	prec_h=0.61632	rec_h=0.40478
[150/200]
f1_h=0.52488	prec_h=0.58087	rec_h=0.47874
[200/200]
f1_h=0.50782	prec_h=0.56652	rec_h=0.46014


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▄▅▅▆▆▇▇▇▇▇▇█████▇██▇███████████████████
h_precision,▁▇████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▇▇▇▇▇▇▇▇▇▇▇
h_recall,▁▃▃▄▅▄▆▅▆▇▆▆▇▇▇▇▇▆▇█▇█▇▇▇█▇█████▇█▇▇▇▇▇▇
train_loss,█▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁
val_loss,█▅▄▃▃▃▃▄▃▄▃▄▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▃▂▁▁▂▂▂▂▁▂▃▂

0,1
epoch,199.0
h_f1-score,0.50782
h_precision,0.56652
h_recall,0.46014
train_loss,1.33679
val_loss,4.48727


[34m[1mwandb[0m: Agent Starting Run: c22tos9z with config:
[34m[1mwandb[0m: 	alpha: 0.95772054344116
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.9232250756169256
[34m[1mwandb[0m: 	beta1: 0.8820015212800097
[34m[1mwandb[0m: 	learning_rate: 0.0002334758964183666
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.6084638898462581


[50/200]
f1_h=0.49436	prec_h=0.62842	rec_h=0.40744
[100/200]
f1_h=0.50803	prec_h=0.58633	rec_h=0.44818
[150/200]
f1_h=0.48371	prec_h=0.55716	rec_h=0.42737
[200/200]
f1_h=0.52390	prec_h=0.54441	rec_h=0.50487


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▄▅▆▆▆▆▇▇▇▇▇█▇▇█▇▇█▇████████████████████
h_precision,▁███▇▇▇▆▇▇▇▇▆▇▇▆▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
h_recall,▁▃▄▄▅▅▄▆▆▆▆▆▇▆▆▇▆▇▇▇▇▇▇▇█▇▇█▇██▇▇▇▇▇▇▇▇█
train_loss,█▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
val_loss,█▃▂▂▂▁▂▁▃▁▁▂▂▂▂▃▂▂▄▁▂▂▃▂▂▂▂▃▂▂▂▂▂▂▄▂▂▂▃▃

0,1
epoch,199.0
h_f1-score,0.5239
h_precision,0.54441
h_recall,0.50487
train_loss,1.16977
val_loss,7.38166


[34m[1mwandb[0m: Agent Starting Run: vgd6j6dy with config:
[34m[1mwandb[0m: 	alpha: 0.9147558374225776
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.8785164538883192
[34m[1mwandb[0m: 	beta1: 0.9044540477206732
[34m[1mwandb[0m: 	learning_rate: 0.0009607166616819854
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7456376984546297


[50/200]
f1_h=0.52219	prec_h=0.58899	rec_h=0.46900
[100/200]
f1_h=0.48780	prec_h=0.61779	rec_h=0.40301
[150/200]
f1_h=0.50639	prec_h=0.55585	rec_h=0.46501
[200/200]
f1_h=0.50861	prec_h=0.56186	rec_h=0.46457


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▄▄▇▆▆▇▇▇▇▇████████▇██▇▇▇█▇█▇███▇▇▇█▇▇██
h_precision,▂▅█▂▅▅▃▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▃▃▃▂▃▁▂▂▂▂▁▂▂▁▂▂▂▂
h_recall,▁▃▃▆▅▅▆▆▆▆▆▇▇██▇██▇▇██▇▇▇█▆█▇▇█▇▇▇▇█▇▇█▇
train_loss,█▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁
val_loss,█▅▃▃▃▄▃▃▃▃▅▃▃▃▃▃▃▃▂▂▂▂▃▂▁▂▁▂▂▂▂▁▂▁▂▁▂▂▁▁

0,1
epoch,199.0
h_f1-score,0.50861
h_precision,0.56186
h_recall,0.46457
train_loss,2.57776
val_loss,6.85464


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: z3q6cajj with config:
[34m[1mwandb[0m: 	alpha: 0.590279264545452
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.6461561942017204
[34m[1mwandb[0m: 	beta1: 0.9196321389903744
[34m[1mwandb[0m: 	learning_rate: 7.713525403661424e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7339877663261996


[50/200]
f1_h=0.38044	prec_h=0.65059	rec_h=0.26882
[100/200]
f1_h=0.45488	prec_h=0.64396	rec_h=0.35164
[150/200]
f1_h=0.49259	prec_h=0.59672	rec_h=0.41940
[200/200]
f1_h=0.49506	prec_h=0.57767	rec_h=0.43313


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▃▄▅▅▅▅▆▅▆▆▆▇▇▇▇▇▇▇██▇██████▇███▇██████
h_precision,▁▁▆▇████▇██▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▆▇
h_recall,▁▁▂▃▄▄▄▄▅▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇██
train_loss,██▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_loss,██▆▆▄▃▃▂▂▂▂▃▃▂▂▂▃▃▂▃▂▃▃▃▂▁▂▂▂▃▃▂▃▂▂▁▂▂▂▁

0,1
epoch,199.0
h_f1-score,0.49506
h_precision,0.57767
h_recall,0.43313
train_loss,2.78615
val_loss,5.20493


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3q002z39 with config:
[34m[1mwandb[0m: 	alpha: 0.7506952354988633
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5138291870972287
[34m[1mwandb[0m: 	beta1: 0.9071820579300244
[34m[1mwandb[0m: 	learning_rate: 0.0002281542631937321
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7534216296440339


[50/200]
f1_h=0.44373	prec_h=0.65598	rec_h=0.33525
[100/200]
f1_h=0.50699	prec_h=0.54508	rec_h=0.47387
[150/200]
f1_h=0.50864	prec_h=0.58569	rec_h=0.44951
[200/200]
f1_h=0.52282	prec_h=0.57841	rec_h=0.47697


VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.14875239923224567, max=1.…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▄▄▆▆▆▆▇▆▇▇▇▇▇█▇████▇█▇▇▇▇██████████████
h_precision,▁▇█▇███▇█▇▇▇▇▇▇▇▆▇▇▇▇▇▇▆▇▆▆▇▆▇▇▇▆▇▇▆▆▆▆▇
h_recall,▁▃▃▅▄▄▅▆▄▆▅▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇██▇▇
train_loss,█▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁
val_loss,█▅▃▄▃▃▄▃▄▄▄▃▃▄▃▃▃▂▂▂▃▂▂▃▂▂▂▂▂▂▁▂▂▁▂▂▂▂▂▁

0,1
epoch,199.0
h_f1-score,0.52282
h_precision,0.57841
h_recall,0.47697
train_loss,1.27615
val_loss,3.87842


[34m[1mwandb[0m: Agent Starting Run: 8t3ds83x with config:
[34m[1mwandb[0m: 	alpha: 0.8677869267809326
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5059341829943669
[34m[1mwandb[0m: 	beta1: 0.9027912215750044
[34m[1mwandb[0m: 	learning_rate: 0.00012293892506413828
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.6877841600738422


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011148506488896398, max=1.0…

[50/200]
f1_h=0.37907	prec_h=0.67808	rec_h=0.26306
[100/200]
f1_h=0.49055	prec_h=0.59077	rec_h=0.41940
[150/200]
f1_h=0.48263	prec_h=0.59468	rec_h=0.40611
[200/200]
f1_h=0.51295	prec_h=0.57874	rec_h=0.46058


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▅▅▆▆▆▆▆▇▇▆▇▇▇▇▇▇█▇█▇▇▇█████▇███▇██████
h_precision,▁▁▇██████▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
h_recall,▁▁▄▄▅▅▅▅▅▆▆▅▆▆▆▇▆▇▇▆▇▇▇▇▇▇▇██▇▇▇█▇█▇▇▇██
train_loss,██▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁
val_loss,█▇▇▃▂▁▂▂▃▁▃▃▃▂▃▃▄▂▂▃▁▂▂▃▂▁▃▁▁▂▂▁▂▂▂▁▁▁▂▁

0,1
epoch,199.0
h_f1-score,0.51295
h_precision,0.57874
h_recall,0.46058
train_loss,1.5806
val_loss,3.99516


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3341j3i6 with config:
[34m[1mwandb[0m: 	alpha: 0.7904404970059192
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.577806359706779
[34m[1mwandb[0m: 	beta1: 0.8543993772492797
[34m[1mwandb[0m: 	learning_rate: 2.074851088199188e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.787412798836406


[50/200]
f1_h=0.24049	prec_h=0.66004	rec_h=0.14703
[100/200]
f1_h=0.33345	prec_h=0.69620	rec_h=0.21922
[150/200]
f1_h=0.38418	prec_h=0.67295	rec_h=0.26882
[200/200]
f1_h=0.40507	prec_h=0.67111	rec_h=0.29008


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▆▁▁▁▁▁▂▃▄▄▅▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇▇█████
h_precision,▃▁▁▁▁▁▇▇▇▇▇██████████████▇█▇███████▇▇▇▇▇
h_recall,█▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
train_loss,████▆▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_loss,████▅▄▄▄▄▄▄▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▂▁

0,1
epoch,199.0
h_f1-score,0.40507
h_precision,0.67111
h_recall,0.29008
train_loss,4.33904
val_loss,4.96276


[34m[1mwandb[0m: Agent Starting Run: kijbqvnu with config:
[34m[1mwandb[0m: 	alpha: 0.8449557538786105
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5238261098206018
[34m[1mwandb[0m: 	beta1: 0.9495473942931524
[34m[1mwandb[0m: 	learning_rate: 7.729019970085519e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7800005241374003


[50/200]
f1_h=0.38788	prec_h=0.65784	rec_h=0.27502
[100/200]
f1_h=0.46695	prec_h=0.63075	rec_h=0.37068
[150/200]
f1_h=0.45139	prec_h=0.62045	rec_h=0.35474
[200/200]
f1_h=0.47948	prec_h=0.60775	rec_h=0.39593


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▂▃▅▅▆▆▆▅▆▆▆▆▇▇▇▇▇▇██▇▇████████████████
h_precision,▁▁▆▇██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▇▇▇▇
h_recall,▁▁▂▂▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇█▇▇▇▇▇████▇██▇
train_loss,▇█▇▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁
val_loss,▅█▅▅▄▄▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▂▁

0,1
epoch,199.0
h_f1-score,0.47948
h_precision,0.60775
h_recall,0.39593
train_loss,2.42743
val_loss,4.28473


[34m[1mwandb[0m: Agent Starting Run: kkx37jb0 with config:
[34m[1mwandb[0m: 	alpha: 0.7894307959645357
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.5185437578472254
[34m[1mwandb[0m: 	beta1: 0.9450773951338626
[34m[1mwandb[0m: 	learning_rate: 4.1941852010293864e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7791764590302983


[50/200]
f1_h=0.37296	prec_h=0.67439	rec_h=0.25775
[100/200]
f1_h=0.46889	prec_h=0.67051	rec_h=0.36050
[150/200]
f1_h=0.46904	prec_h=0.61494	rec_h=0.37910
[200/200]
f1_h=0.48574	prec_h=0.58562	rec_h=0.41497


VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.15477888730385164, max=1.…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▃▄▅▆▆▆▆▆▇▆▆▇▇▇▇██▇▇██▇█▇██████▇███████
h_precision,▁▁▇▇██████▇████▇██▇██▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
h_recall,▁▁▃▃▄▄▅▅▅▅▆▅▅▆▆▆▆▇▇▇▆▇▇▇▇▇▇▇▇▇██▇███████
train_loss,██▇▆▆▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val_loss,██▅▄▃▃▃▂▂▂▂▂▂▂▂▃▂▃▂▂▃▂▂▃▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁

0,1
epoch,199.0
h_f1-score,0.48574
h_precision,0.58562
h_recall,0.41497
train_loss,2.55291
val_loss,4.25294


[34m[1mwandb[0m: Agent Starting Run: ye1aikcf with config:
[34m[1mwandb[0m: 	alpha: 0.8392937146792514
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.5339939946285632
[34m[1mwandb[0m: 	beta1: 0.9499741014246776
[34m[1mwandb[0m: 	learning_rate: 3.525676834479531e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.765536675376864


[50/200]
f1_h=0.34646	prec_h=0.70325	rec_h=0.22985
[100/200]
f1_h=0.41420	prec_h=0.68330	rec_h=0.29717
[150/200]
f1_h=0.44458	prec_h=0.67942	rec_h=0.33038
[200/200]
f1_h=0.48643	prec_h=0.61859	rec_h=0.40080


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▂▃▄▄▅▆▆▆▆▆▆▇▆▇▇▆▇▇▇▇█▇▇▇█▇██▇▇█████▇██
h_precision,▁▁▆▇▇█████████████▇█▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
h_recall,▁▁▂▂▃▃▄▅▅▅▅▅▅▆▆▆▆▅▆▆▆▆█▇▇▇▇▇▇▇▇▆▇██▇▇▇██
train_loss,▇█▇▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
val_loss,▅█▄▄▄▃▂▂▂▂▁▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▁▂▃▁▁

0,1
epoch,199.0
h_f1-score,0.48643
h_precision,0.61859
h_recall,0.4008
train_loss,2.88469
val_loss,4.47264


[34m[1mwandb[0m: Agent Starting Run: 54jwt0c4 with config:
[34m[1mwandb[0m: 	alpha: 0.8941070471424057
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5181915616888193
[34m[1mwandb[0m: 	beta1: 0.9422587530422476
[34m[1mwandb[0m: 	learning_rate: 6.563138535930895e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7214368443873793


[50/200]
f1_h=0.36516	prec_h=0.67221	rec_h=0.25066
[100/200]
f1_h=0.41383	prec_h=0.65703	rec_h=0.30204
[150/200]
f1_h=0.46302	prec_h=0.61420	rec_h=0.37157
[200/200]
f1_h=0.50386	prec_h=0.60049	rec_h=0.43401


VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.1487320082248115, max=1.0…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▁▁▂▃▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇█████▇███████
h_precision,▁▁▆▇▇▇██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
h_recall,▁▁▁▂▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▆▇▇█▇▇▇▇▇███▇█
train_loss,██▇▇▆▆▆▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁
val_loss,██▆▆▆▄▃▃▂▂▁▂▃▂▃▂▃▂▂▃▃▃▃▃▂▂▃▂▂▁▂▂▄▂▁▂▁▁▁▁

0,1
epoch,199.0
h_f1-score,0.50386
h_precision,0.60049
h_recall,0.43401
train_loss,2.47316
val_loss,4.21044


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9p217tg0 with config:
[34m[1mwandb[0m: 	alpha: 0.8864253517248777
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beta: 0.5157787788308356
[34m[1mwandb[0m: 	beta1: 0.9415484280060994
[34m[1mwandb[0m: 	learning_rate: 3.952402814278007e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7992183713854618


[50/200]
f1_h=0.33266	prec_h=0.69382	rec_h=0.21878


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


### Evaluation

### multilingual BERT

In [39]:
from modules.nn.mBERT import mBERT
from torch.utils.data import DataLoader

# Rebuild the architecture and restore the weights of one sweep run.
model = mBERT()
device = torch.device('cpu')

# map_location remaps tensors saved from another device onto the CPU, so the
# checkpoint loads even on a machine without the accelerator it was saved from.
model.load_state_dict(torch.load('./models/subtask1a/mBERT-200/fanciful-sweep-1.pth',
                                 map_location=device))

<All keys matched successfully>

#### Bulgarian

In [12]:
from tqdm import tqdm
import json
import subprocess
from subtask_1_2a import *

# Bulgarian test set, scored with the currently loaded model.
# NOTE: the prediction path is shared with the other model sections, so each
# run overwrites the previous model's Bulgarian predictions.
bulgarian_pred_file_path = './Predictions/bulgarian_predictions_subtask1.txt'
bulgarian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

bg_test_data = process_test_json(bulgarian_gold_file_path)

bg_test_dataset = TestDataSet(bg_test_data, './TextFeatures/subtask1a/mBERT/bg_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, bg_test_dataloader, bulgarian_pred_file_path, bulgarian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.42486	prec_h=0.41434	rec_h=0.43593


#### Macedonian

In [13]:
# Macedonian test set, scored with the currently loaded model.
macedonian_pred_file_path = './Predictions/macedonian_predictions_subtask1.txt'
macedonian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_md.json'

md_test_data = process_test_json(macedonian_gold_file_path)

md_test_dataset = TestDataSet(md_test_data, './TextFeatures/subtask1a/mBERT/md_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, md_test_dataloader, macedonian_pred_file_path, macedonian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.40270	prec_h=0.44678	rec_h=0.36654


#### Arabic

In [14]:
# Arabic test set, scored with the currently loaded model.
arabian_pred_file_path = './Predictions/arabian_predictions_subtask1.txt'
arabian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'

ar_test_data = process_test_json(arabian_gold_file_path)

ar_test_dataset = TestDataSet(ar_test_data, './TextFeatures/subtask1a/mBERT/ar_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=128, shuffle=False)

# format=5 is forwarded to get_labels; the other languages use the default.
evaluate_model(model, ar_test_dataloader, arabian_pred_file_path, arabian_gold_file_path, evaluator_script,
               id2leaf_label, format=5, validation=False, threshold=0.3)

f1_h=0.26642	prec_h=0.27757	rec_h=0.25614


#### English

In [10]:
# English unlabeled test set: predictions are only written, not scored
# (no gold file is passed to evaluate_model).
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data, './TextFeatures/subtask1a/mBERT/en_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False)

0.48871	0.64459	0.39354

In [41]:
# English dev set with gold labels, scored with the currently loaded model.
en_dev_pred_file_path = './Predictions/mBERT_en_dev_predictions_subtask1.txt'
en_dev_gold_file_path = './dev_gold_labels/dev_subtask1_en.json'

en_dev_test_data = process_test_json(en_dev_gold_file_path)

en_dev_test_dataset = TestDataSet(en_dev_test_data, './TextFeatures/subtask1a/mBERT/en_dev_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
en_dev_test_dataloader = DataLoader(en_dev_test_dataset, batch_size=16, shuffle=False)

evaluate_model(model, en_dev_test_dataloader, en_dev_pred_file_path, en_dev_gold_file_path,
               evaluator_script, id2leaf_label, validation=False)

f1_h=0.47108	prec_h=0.63688	rec_h=0.37378


### XLM-RoBERTa

In [37]:
from modules.nn.XLMRoBERTa import XLMRoBERTa

# Rebuild the architecture and restore the weights of one sweep run.
model = XLMRoBERTa()
device = torch.device('cpu')

# map_location remaps tensors saved from another device onto the CPU, so the
# checkpoint loads even on a machine without the accelerator it was saved from.
model.load_state_dict(torch.load('./models/subtask1a/XLM-RoBERTa/expert-sweep-1.pth',
                                 map_location=device))

<All keys matched successfully>

#### Bulgarian

In [24]:
# Bulgarian test set, scored with the currently loaded model.
# NOTE: the prediction path is shared with the other model sections, so each
# run overwrites the previous model's Bulgarian predictions.
bulgarian_pred_file_path = './Predictions/bulgarian_predictions_subtask1.txt'
bulgarian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

bg_test_data = process_test_json(bulgarian_gold_file_path)

bg_test_dataset = TestDataSet(bg_test_data, './TextFeatures/subtask1a/XLM-RoBERTa/bg_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, bg_test_dataloader, bulgarian_pred_file_path, bulgarian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.23920	prec_h=0.31744	rec_h=0.19191


#### Macedonian

In [25]:
# Macedonian test set, scored with the currently loaded model.
macedonian_pred_file_path = './Predictions/macedonian_predictions_subtask1.txt'
macedonian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_md.json'

md_test_data = process_test_json(macedonian_gold_file_path)

md_test_dataset = TestDataSet(md_test_data, './TextFeatures/subtask1a/XLM-RoBERTa/md_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, md_test_dataloader, macedonian_pred_file_path, macedonian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.25316	prec_h=0.24823	rec_h=0.25830


#### Arabic

In [26]:
# Arabic test set, scored with the currently loaded model.
arabian_pred_file_path = './Predictions/arabian_predictions_subtask1.txt'
arabian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'

ar_test_data = process_test_json(arabian_gold_file_path)

ar_test_dataset = TestDataSet(ar_test_data, './TextFeatures/subtask1a/XLM-RoBERTa/ar_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=128, shuffle=False)

# format=5 is forwarded to get_labels; the other languages use the default.
evaluate_model(model, ar_test_dataloader, arabian_pred_file_path, arabian_gold_file_path, evaluator_script,
               id2leaf_label, format=5, validation=False, threshold=0.3)

f1_h=0.32424	prec_h=0.28533	rec_h=0.37544


#### English

In [27]:
# English unlabeled test set: predictions are only written, not scored
# (no gold file is passed to evaluate_model).
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data, './TextFeatures/subtask1a/XLM-RoBERTa/en_test_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False)

0.46873	0.62090	0.37647

In [38]:
# English dev set with gold labels, scored with the currently loaded model.
en_dev_pred_file_path = './Predictions/en_dev_predictions_subtask1.txt'
en_dev_gold_file_path = './dev_gold_labels/dev_subtask1_en.json'

en_dev_test_data = process_test_json(en_dev_gold_file_path)

en_dev_test_dataset = TestDataSet(en_dev_test_data, './TextFeatures/subtask1a/XLM-RoBERTa/en_dev_text_features.pkl')
# shuffle=False: a fixed order keeps the written prediction file reproducible.
en_dev_test_dataloader = DataLoader(en_dev_test_dataset, batch_size=16, shuffle=False)

evaluate_model(model, en_dev_test_dataloader, en_dev_pred_file_path, en_dev_gold_file_path,
               evaluator_script, id2leaf_label, validation=False)

f1_h=0.45310	prec_h=0.60863	rec_h=0.36087


### text-embedding-3-small

In [26]:
from modules.nn.OpenAiSmall import OpenAiSmall

# Rebind `model` to an OpenAiSmall instance; the evaluation cells below score
# whatever `model` currently refers to.
model = OpenAiSmall()
device = torch.device('cpu')

# map_location remaps the checkpoint tensors onto `device`, so a checkpoint
# saved during a GPU run still loads on a CPU-only machine.
model.load_state_dict(torch.load('./models/subtask1a/openAI-Small/crisp-sweep-2.pth', map_location=device))

<All keys matched successfully>

#### Bulgarian

In [29]:
# Bulgarian test split, scored with the text-embedding-3-small text features.
# `model` and `id2leaf_label` come from earlier cells.
bulgarian_pred_file_path = './Predictions/bulgarian_predictions_subtask1.txt'
bulgarian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
# Scorer script path; the later evaluation cells in this section reuse this variable.
evaluator_script = './scorer-baseline/subtask_1_2a.py'

# The gold-label JSON is also the source of the test examples.
bg_test_data = process_test_json(bulgarian_gold_file_path)


bg_test_dataset = TestDataSet(bg_test_data, './TextFeatures/subtask1a/text-embedding-3-small/bg_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, bg_test_dataloader, bulgarian_pred_file_path, bulgarian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.43489	prec_h=0.45970	rec_h=0.41263


#### North Macedonian

In [30]:
# North Macedonian test split, scored with the text-embedding-3-small text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
macedonian_pred_file_path = './Predictions/macedonian_predictions_subtask1.txt'
macedonian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_md.json'

# The gold-label JSON is also the source of the test examples.
md_test_data = process_test_json(macedonian_gold_file_path)

md_test_dataset = TestDataSet(md_test_data, './TextFeatures/subtask1a/text-embedding-3-small/md_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, md_test_dataloader, macedonian_pred_file_path, macedonian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.38950	prec_h=0.47849	rec_h=0.32841


#### Arabic

In [31]:
# Arabic test split, scored with the text-embedding-3-small text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
arabian_pred_file_path = './Predictions/arabian_predictions_subtask1.txt'
arabian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'

# The gold-label JSON is also the source of the test examples.
ar_test_data = process_test_json(arabian_gold_file_path)

ar_test_dataset = TestDataSet(ar_test_data, './TextFeatures/subtask1a/text-embedding-3-small/ar_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=128, shuffle=False)

# NOTE(review): `format=5` is only passed for the Arabic split — confirm its
# meaning in evaluate_model.
evaluate_model(model, ar_test_dataloader, arabian_pred_file_path, arabian_gold_file_path, evaluator_script, 
               id2leaf_label, format=5, validation=False, threshold=0.3)

f1_h=0.37881	prec_h=0.34911	rec_h=0.41404


#### English

In [32]:
# English test split (unlabeled input), predicted with the text-embedding-3-small text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data, './TextFeatures/subtask1a/text-embedding-3-small/en_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

# No gold-label file is supplied for this split, hence the `None` argument.
evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False)

0.50858	0.73861	0.38780

In [27]:
# English dev split, scored against its gold labels with the text-embedding-3-small text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
en_dev_pred_file_path = './Predictions/en_dev_predictions_subtask1.txt'
en_dev_gold_file_path = './dev_gold_labels/dev_subtask1_en.json'

# The gold-label JSON is also the source of the dev examples.
en_dev_test_data = process_test_json(en_dev_gold_file_path)

en_dev_test_dataset = TestDataSet(en_dev_test_data,
                                  './TextFeatures/subtask1a/text-embedding-3-small/en_dev_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
en_dev_test_dataloader = DataLoader(en_dev_test_dataset, batch_size=16, shuffle=False)

evaluate_model(model, en_dev_test_dataloader, en_dev_pred_file_path, en_dev_gold_file_path,
               evaluator_script, id2leaf_label, validation=False)

f1_h=0.49366	prec_h=0.73151	rec_h=0.37253


### text-embedding-3-large

In [33]:
from modules.nn.OpenAiLarge import OpenAiLarge

# Rebind `model` to an OpenAiLarge instance; the evaluation cells below score
# whatever `model` currently refers to.
model = OpenAiLarge()
device = torch.device('cpu')

# map_location remaps the checkpoint tensors onto `device`, so a checkpoint
# saved during a GPU run still loads on a CPU-only machine.
model.load_state_dict(torch.load('./models/subtask1a/openAI-Large/trim-sweep-4.pth', map_location=device))

<All keys matched successfully>

#### Bulgarian

In [35]:
# Bulgarian test split, scored with the text-embedding-3-large text features.
# `model` and `id2leaf_label` come from earlier cells.
bulgarian_pred_file_path = './Predictions/bulgarian_predictions_subtask1.txt'
bulgarian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
# Scorer script path; the later evaluation cells in this section reuse this variable.
evaluator_script = './scorer-baseline/subtask_1_2a.py'

# The gold-label JSON is also the source of the test examples.
bg_test_data = process_test_json(bulgarian_gold_file_path)


bg_test_dataset = TestDataSet(bg_test_data, './TextFeatures/subtask1a/text-embedding-3-large/bg_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, bg_test_dataloader, bulgarian_pred_file_path, bulgarian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.44044	prec_h=0.43883	rec_h=0.44206


#### North Macedonian

In [36]:
# North Macedonian test split, scored with the text-embedding-3-large text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
macedonian_pred_file_path = './Predictions/macedonian_predictions_subtask1.txt'
macedonian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_md.json'

# The gold-label JSON is also the source of the test examples.
md_test_data = process_test_json(macedonian_gold_file_path)

md_test_dataset = TestDataSet(md_test_data, './TextFeatures/subtask1a/text-embedding-3-large/md_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, md_test_dataloader, macedonian_pred_file_path, macedonian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.38544	prec_h=0.58838	rec_h=0.28659


#### Arabic

In [37]:
# Arabic test split, scored with the text-embedding-3-large text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
arabian_pred_file_path = './Predictions/arabian_predictions_subtask1.txt'
arabian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'

# The gold-label JSON is also the source of the test examples.
ar_test_data = process_test_json(arabian_gold_file_path)

ar_test_dataset = TestDataSet(ar_test_data, './TextFeatures/subtask1a/text-embedding-3-large/ar_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=128, shuffle=False)

# NOTE(review): `format=5` is only passed for the Arabic split — confirm its
# meaning in evaluate_model.
evaluate_model(model, ar_test_dataloader, arabian_pred_file_path, arabian_gold_file_path, evaluator_script, 
               id2leaf_label, format=5, validation=False, threshold=0.3)

f1_h=0.38660	prec_h=0.30550	rec_h=0.52632


#### English

In [38]:
# English test split (unlabeled input), predicted with the text-embedding-3-large text features.
# `model`, `evaluator_script` and `id2leaf_label` come from earlier cells.
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data, './TextFeatures/subtask1a/text-embedding-3-large/en_test_text_features.pkl')
# Shuffling serves no purpose at evaluation time; a fixed order keeps the
# batch order deterministic across runs.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

# No gold-label file is supplied for this split, hence the `None` argument.
evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False)

0.52069	0.75674	0.39689