In [13]:
from collections import deque

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import nltk
from torch.utils.data import Dataset
import pickle

from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *

In [14]:
class DataSet(Dataset):
    """Labelled samples: precomputed text/NER features plus one multi-hot target per hierarchy level.

    Every item is built from one row of ``df`` (columns ``id``, ``cleaned_text`` and
    ``Level 1`` .. ``Level 5``) and from two pickled lookups indexed by the row's ``id``.

    Args:
        df: frame with the columns listed above; each ``Level k`` cell holds the
            collection of label names active at that level (may be empty).
        labels_at_level: mapping ``'Level k'`` -> mapping of label name -> target index.
        features_file: path of the pickle holding the ``id`` -> text-feature lookup.
        ner_features_file: path of the pickle holding the ``id`` -> NER-feature lookup.
    """

    # Levels encoded per sample; the frame needs a 'Level k' column for each of them.
    NUM_LEVELS = 5

    def __init__(self, df, labels_at_level, features_file, ner_features_file):
        super(DataSet, self).__init__()
        self.data_df = df
        self.labels_at_level = labels_at_level
        self.features_file = features_file
        self.features_dict = None
        # NOTE(security): pickle.load can execute arbitrary code - only point these
        # paths at feature files you produced yourself.
        with open(features_file, 'rb') as f:
            self.features_dict = pickle.load(f)

        with open(ner_features_file, 'rb') as f:
            self.ner_features_dict = pickle.load(f)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx`` as a dict.

        Keys: ``id``, ``text``, ``text_features``, ``ner_features`` and
        ``level_1_target`` .. ``level_5_target``.

        Raises:
            KeyError: if the row's id is missing from either feature lookup, or a
                label is not present in ``labels_at_level``.
        """
        # Fetch the row once instead of re-slicing the frame for every column.
        row = self.data_df.iloc[idx]
        sample_id = row['id']  # local name chosen so the builtin `id` is not shadowed

        item = {'id': sample_id,
                'text': row['cleaned_text'],
                'text_features': self.features_dict[sample_id],
                'ner_features': self.ner_features_dict[sample_id]}
        for level in range(1, self.NUM_LEVELS + 1):
            item[f'level_{level}_target'] = self.encode(row[f'Level {level}'], level)
        return item

    def encode(self, labels, level):
        """Multi-hot encode ``labels`` for hierarchy level ``level``.

        The returned 1-D float tensor has one slot per label known at that level plus
        a trailing slot that is set when the sample has no label at this level
        (``labels`` empty or ``None``).

        Raises:
            KeyError: if a label is not known at this level.
        """
        label_to_idx = self.labels_at_level[f'Level {level}']
        target = torch.zeros(len(label_to_idx) + 1)

        # Treat a missing value the same way as an empty collection.
        if labels is None or len(labels) == 0:
            target[-1] = 1
            return target

        for label in labels:
            target[label_to_idx[label]] = 1
        return target

In [15]:
class TestDataSet(Dataset):
    """Unlabelled counterpart of the training dataset: ids, text and precomputed features only.

    Rows come from ``df`` (columns ``id`` and ``cleaned_text``); the text and NER
    features are looked up by id in two pickled mappings loaded at construction time.
    """

    def __init__(self, df, features_file, ner_features_file):
        super().__init__()
        self.data_df = df
        self.features_file = features_file
        self.features_dict = None

        # Load both id -> feature lookups eagerly so __getitem__ is a plain dict access.
        with open(features_file, 'rb') as text_handle:
            self.features_dict = pickle.load(text_handle)
        with open(ner_features_file, 'rb') as ner_handle:
            self.ner_features_dict = pickle.load(ner_handle)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return ``id``, ``text``, ``text_features`` and ``ner_features`` for row ``idx``."""
        row = self.data_df.iloc[idx]
        sample_id = row['id']

        sample = {'id': sample_id, 'text': row['cleaned_text']}
        sample['text_features'] = self.features_dict[sample_id]
        sample['ner_features'] = self.ner_features_dict[sample_id]
        return sample

In [16]:
from modules.nn.OpenAiLarge import *

In [20]:
def evaluate_model(model, dataloader, pred_file_path, gold_file_path,
                   evaluator_script_path, id2leaf_label, format=None, validation=False, HL=None, batchsize=None,
                   threshold=0.6):
    """Run ``model`` over ``dataloader``, write the predictions as JSON and optionally score them.

    The model is called as ``model(text_features, ner_features)`` and must return five
    per-level outputs. The level 3-5 outputs are binarised with ``> threshold`` and
    handed to ``get_labels`` together with the batch ids. Inputs are moved to the
    notebook-level ``device`` global.

    Args:
        model: network to evaluate; switched to eval mode.
        dataloader: yields dict batches with ``id``, ``text_features``, ``ner_features``
            and, when ``validation`` is true, ``level_1_target`` .. ``level_5_target``.
        pred_file_path: where the predictions are written as JSON.
        gold_file_path: gold labels for ``evaluate_h``; ``None`` skips scoring.
        evaluator_script_path: unused, kept for call compatibility.
        id2leaf_label: passed through to ``get_labels``.
        format: passed through to ``get_labels``.
        validation: also accumulate the hierarchical loss (needs ``HL``).
        HL: loss object exposing ``calculate_dloss`` / ``calculate_lloss``.
        batchsize: unused, kept for call compatibility.
        threshold: cut-off applied to the level 3-5 outputs.

    Returns:
        ``None`` when ``gold_file_path`` is ``None`` (even if ``validation`` is true).
        ``(prec_h, rec_h, f1_h, mean_loss_per_batch)`` when ``validation`` is true.
        ``None`` otherwise (the metrics are only printed).

    Raises:
        ValueError: if ``validation`` is true but no ``HL`` object was supplied.
    """
    # Local import: the notebook only imports json in a later cell, so relying on the
    # global would make this function depend on cell execution order.
    import json

    # Fail before any work is done rather than with an AttributeError on None mid-loop.
    if validation and HL is None:
        raise ValueError("evaluate_model(validation=True) requires an HL loss object")

    model.eval()
    predictions = []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            # Ids arrive either as a list or as a tensor; the latter is converted
            # to a plain list before being handed to get_labels.
            if isinstance(batch['id'], list):
                ids = batch['id']
            else:
                ids = batch['id'].tolist()

            embeddings = batch['text_features'].to(device)
            ner_embeddings = batch['ner_features'].to(device)

            pred_1, pred_2, pred_3, pred_4, pred_5 = model(embeddings, ner_embeddings)

            if validation:
                targets = [batch[f'level_{level}_target'].to(device) for level in range(1, 6)]
                preds = [pred_1, pred_2, pred_3, pred_4, pred_5]

                # Each loss gets its own list objects, as in the original call pattern.
                dloss = HL.calculate_dloss(list(preds), list(targets))
                lloss = HL.calculate_lloss(list(preds), list(targets))

                total_loss += (dloss + lloss).item()

            # Only the three deepest levels are used to derive the output labels.
            pred_3, pred_4, pred_5 = (
                (pred.cpu().detach().numpy() > threshold).astype(int)
                for pred in (pred_3, pred_4, pred_5)
            )

            predictions += get_labels(id2leaf_label, ids, pred_3, pred_4, pred_5, format)

    # Writing JSON data
    with open(pred_file_path, 'w') as f:
        json.dump(predictions, f, indent=4)

    # Without gold labels there is nothing to score against.
    if gold_file_path is None:
        return

    prec_h, rec_h, f1_h = evaluate_h(pred_file_path, gold_file_path)
    print("f1_h={:.5f}\tprec_h={:.5f}\trec_h={:.5f}".format(f1_h, prec_h, rec_h))
    if validation:
        return prec_h, rec_h, f1_h, total_loss / (len(dataloader))
        
    

In [6]:
from torch.utils.data import DataLoader

# Labelled SemEval-2024 subtask 1 splits.
train_json = './semeval2024_dev_release/subtask1/train.json'
validation_json = './semeval2024_dev_release/subtask1/validation.json'

# Both splits go through the same preprocessing with the same label hierarchy.
train_data = process_json(train_json, techniques_to_level_1, hierarchy_1)
validation_data = process_json(validation_json, techniques_to_level_1, hierarchy_1)

# Each dataset pairs a split with its precomputed text and NER feature pickles.
training_dataset = DataSet(
    df=train_data,
    labels_at_level=indexed_persuasion_techniques_1,
    features_file='./TextFeatures/subtask1a/text-embedding-3-large/train_text_features.pkl',
    ner_features_file='./TextFeatures/subtask1a/multilingual-ner/train_text_features.pkl',
)
validation_dataset = DataSet(
    df=validation_data,
    labels_at_level=indexed_persuasion_techniques_1,
    features_file='./TextFeatures/subtask1a/text-embedding-3-large/validation_text_features.pkl',
    ner_features_file='./TextFeatures/subtask1a/multilingual-ner/validation_text_features.pkl',
)

In [7]:
# Ask the project helper for a device (the recorded run printed "Using MPS",
# presumably from this call - TODO confirm) ...
device = get_device()

# ... but immediately override it: everything below runs on the CPU.
device = torch.device('cpu')

Using MPS


In [8]:
import wandb

# Authenticate with Weights & Biases before registering the sweep.
wandb.login()

# Hyper-parameter search space, built up one entry at a time.
search_space = {}
search_space['learning_rate'] = {'min': 1e-5, 'max': 1e-4}
search_space['batch_size'] = {'values': [128, 256]}
search_space['optimizer'] = {'values': ['adam']}
search_space['beta1'] = {'min': 0.8, 'max': 0.95}  # relevant for Adam
# 'momentum' (relevant for SGD only, range 0.8-0.99) is not swept because
# 'optimizer' is restricted to 'adam' above.
search_space['alpha'] = {'min': 0.65, 'max': 1.0}
search_space['beta'] = {'min': 0.5, 'max': 1.0}
search_space['threshold'] = {'min': 0.65, 'max': 0.9}

# Bayesian optimisation that minimises the logged validation loss.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': search_space,
}

sweep_id = wandb.sweep(sweep_config, project="FineTuning-openAI-Large-m-NER")
# To resume an existing sweep instead, assign its id directly, e.g. sweep_id = '44uz6ydx'

[34m[1mwandb[0m: Currently logged in as: [33miqbal_shaik[0m ([33mphoenix_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: kd6j74uy
Sweep URL: https://wandb.ai/phoenix_nlp/FineTuning-openAI-Large-m-NER/sweeps/kd6j74uy


In [9]:
# Directory prefix for saved checkpoints; train() appends "<wandb run name>.pth" to it.
models_dir = './models/subtask1a/openAI-Large-m-NER/'
# Number of passes over the training set in every sweep run.
num_epochs = 100

In [10]:
from tqdm import tqdm
import json
import subprocess
from subtask_1_2a import *

def train():
    """Train and validate one sweep trial, logging to Weights & Biases.

    Reads the hyper-parameters from ``wandb.config`` (``batch_size``, ``learning_rate``,
    ``optimizer``, ``beta1`` or ``momentum``, ``alpha``, ``beta``, ``threshold``),
    trains a fresh ``OpenAiLargeMultiLingualNER`` for ``num_epochs`` epochs on
    ``training_dataset``, evaluates on ``validation_dataset`` after every epoch and
    finally saves the weights to ``{models_dir}{run name}.pth``.

    Relies on the notebook globals ``training_dataset``, ``validation_dataset``,
    ``device``, ``num_epochs``, ``models_dir`` and the label structures passed to
    ``HierarchicalLoss`` and ``evaluate_model``.

    Raises:
        ValueError: if ``config.optimizer`` is neither ``'adam'`` nor ``'sgd'``.
    """
    import os  # local import: only needed to create the output directories

    wandb.init()

    # Use WandB configurations
    config = wandb.config
    batch_size = config.batch_size
    learning_rate = config.learning_rate

    # Loop-invariant paths, defined once instead of on every epoch.
    val_pred_file_path = './Predictions/val_predictions_subtask1.json'
    val_gold_file_path = './semeval2024_dev_release/subtask1/validation.json'
    evaluator_script = './scorer-baseline/subtask_1_2a.py'

    # Create the output locations up front so a missing directory cannot
    # discard a finished training run at save time.
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(os.path.dirname(val_pred_file_path), exist_ok=True)

    train_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
    # Shuffling brings nothing for evaluation; a fixed order keeps the written
    # prediction file reproducible from epoch to epoch.
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    model = OpenAiLargeMultiLingualNER()
    model.to(device)

    if config.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=learning_rate,
            betas=(config.beta1, 0.999)
        )
    elif config.optimizer == 'sgd':
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=learning_rate,
            momentum=config.momentum
        )
    else:
        # Previously this fell through with optimizer=None and crashed on the first step.
        raise ValueError(f"Unsupported optimizer: {config.optimizer!r}")

    HL = HierarchicalLoss(id2label=id2label_1, hierarchical_labels=hierarchy_1,
                          persuasion_techniques=persuasion_techniques_1, device=device,
                          alpha=config.alpha, beta=config.beta, threshold=config.threshold)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for batch_idx, batch in enumerate(train_dataloader):

            y_1, y_2, y_3 = batch['level_1_target'], batch['level_2_target'], batch['level_3_target']
            y_4, y_5 = batch['level_4_target'], batch['level_5_target']

            embeddings = batch['text_features'].to(device)
            ner_embeddings = batch['ner_features'].to(device)

            y_1, y_2, y_3, y_4, y_5 = y_1.to(device), y_2.to(device), y_3.to(device), y_4.to(device), y_5.to(device)

            optimizer.zero_grad()
            pred_1, pred_2, pred_3, pred_4, pred_5 = model(embeddings, ner_embeddings)

            dloss = HL.calculate_dloss([pred_1, pred_2, pred_3, pred_4, pred_5], [y_1, y_2, y_3, y_4, y_5])
            lloss = HL.calculate_lloss([pred_1, pred_2, pred_3, pred_4, pred_5], [y_1, y_2, y_3, y_4, y_5])

            total_loss = lloss + dloss

            total_loss.backward()
            optimizer.step()

            running_loss += total_loss.detach().item()

        # Mean loss per batch for this epoch.
        running_loss /= (len(train_dataloader))

        # NOTE(review): evaluate_model runs with its default threshold here;
        # config.threshold is only passed to HierarchicalLoss - confirm this is intended.
        prec_h, rec_h, f1_h, validation_loss = evaluate_model(model, validation_dataloader, val_pred_file_path,
                                         val_gold_file_path, evaluator_script, id2leaf_label,
                                         validation=True, HL=HL, batchsize=batch_size)

        if epoch % 50 == 49:
            print(f'[{epoch+1}/{num_epochs}]')
            print("f1_h={:.5f}\tprec_h={:.5f}\trec_h={:.5f}".format(f1_h, prec_h, rec_h))

        # One log call per epoch: every wandb.log call advances the step counter, so
        # separate calls would scatter one epoch's metrics over several steps.
        wandb.log({"epoch": epoch,
                   "train_loss": running_loss,
                   "val_loss": validation_loss,
                   "h_precision": prec_h,
                   "h_recall": rec_h,
                   "h_f1-score": f1_h})

    torch.save(model.state_dict(), f"{models_dir}{wandb.run.name}.pth")
    # wandb.finish() is the supported replacement for the deprecated wandb.join().
    wandb.finish()

In [12]:
def main():
    """Start a sweep agent that runs train() for five trials of the sweep in ``sweep_id``."""
    wandb.agent(sweep_id, train, count=5)

# Run the sweep when this cell is executed.
if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Agent Starting Run: ac89s9hl with config:
[34m[1mwandb[0m: 	alpha: 0.9717156696240714
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.6164075655850356
[34m[1mwandb[0m: 	beta1: 0.9311024175323688
[34m[1mwandb[0m: 	learning_rate: 1.710783097020877e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7322145301229332


[50/100]
f1_h=0.43920	prec_h=0.71202	rec_h=0.31754
[100/100]
f1_h=0.56899	prec_h=0.68300	rec_h=0.48760


VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.14632822044251392, max=1.…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▅▁▁▁▂▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████
h_precision,▂▁▁█▅▆▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
h_recall,█▁▁▁▂▂▂▂▃▂▃▃▃▃▃▃▃▃▄▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▅▅▅▄▅
train_loss,█▇▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
val_loss,█▆▅▆▅▅▄▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,99.0
h_f1-score,0.56899
h_precision,0.683
h_recall,0.4876
train_loss,444.84526
val_loss,588.67854


[34m[1mwandb[0m: Agent Starting Run: yy6gaj1r with config:
[34m[1mwandb[0m: 	alpha: 0.9654478666765854
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.7870273949525248
[34m[1mwandb[0m: 	beta1: 0.9476979307645412
[34m[1mwandb[0m: 	learning_rate: 2.919434743481965e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.70148548183942


[50/100]
f1_h=0.53352	prec_h=0.71728	rec_h=0.42471
[100/100]
f1_h=0.59170	prec_h=0.67890	rec_h=0.52436


VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.1464474935470724, max=1.0…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▆▁▂▃▄▄▄▅▅▆▆▅▇▆▇▇▇▇▇▇▇▇▇▇██▇▇████████████
h_precision,▄▁█▇▇▇▇▇▇▇███████████████████▇▇██▇█▇▇▇█▇
h_recall,█▁▁▂▂▃▃▃▄▄▄▃▅▄▅▅▅▅▅▅▅▅▅▅▅▆▅▅▆▆▆▆▆▆▆▆▆▆▆▆
train_loss,█▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
val_loss,█▇▆▇▆▆▅▅▄▅▄▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂

0,1
epoch,99.0
h_f1-score,0.5917
h_precision,0.6789
h_recall,0.52436
train_loss,414.77332
val_loss,731.39096


[34m[1mwandb[0m: Agent Starting Run: t9vsveu7 with config:
[34m[1mwandb[0m: 	alpha: 0.8486148832967625
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.5394001868359992
[34m[1mwandb[0m: 	beta1: 0.9350183224876876
[34m[1mwandb[0m: 	learning_rate: 3.098721504881375e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.8493841086753553


[50/100]
f1_h=0.53029	prec_h=0.71194	rec_h=0.42250
[100/100]
f1_h=0.56872	prec_h=0.65106	rec_h=0.50487


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▆▁▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇█▇████████▇██████████
h_precision,▄▁▇▇▇▇██████████████████████████▇▇█▇██▇▇
h_recall,█▁▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▆
train_loss,██▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁
val_loss,█▇▆▆▅▅▄▄▄▄▃▃▂▃▃▂▂▂▃▂▂▂▂▂▂▂▂▁▂▂▂▁▁▂▁▂▁▁▁▂

0,1
epoch,99.0
h_f1-score,0.56872
h_precision,0.65106
h_recall,0.50487
train_loss,335.86248
val_loss,554.74922


[34m[1mwandb[0m: Agent Starting Run: ms8ny34x with config:
[34m[1mwandb[0m: 	alpha: 0.8265316301970744
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.5711067839857358
[34m[1mwandb[0m: 	beta1: 0.9426835775816698
[34m[1mwandb[0m: 	learning_rate: 2.4594431269102348e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7162944071458639


[50/100]
f1_h=0.50299	prec_h=0.70471	rec_h=0.39105
[100/100]
f1_h=0.55962	prec_h=0.69657	rec_h=0.46767


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▅▁▁▂▃▄▅▄▅▅▅▆▅▆▆▆▇▇▇▇▇▇▇█▇▇▇█████████████
h_precision,▃▁█▆▆▆▆▆▆▆▇▆▇▆▆▇▆▆▇▆▆▆▆▆▆▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆
h_recall,█▁▁▂▂▂▂▂▃▃▃▃▃▃▄▃▄▄▄▄▄▄▄▅▄▄▄▅▄▅▅▅▅▅▅▅▅▄▅▅
train_loss,█▇▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
val_loss,█▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,99.0
h_f1-score,0.55962
h_precision,0.69657
h_recall,0.46767
train_loss,344.51751
val_loss,530.54694


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: u3qk64cb with config:
[34m[1mwandb[0m: 	alpha: 0.8363820893739565
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beta: 0.5231395996281678
[34m[1mwandb[0m: 	beta1: 0.936562515625026
[34m[1mwandb[0m: 	learning_rate: 1.6470120713361317e-05
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	threshold: 0.7558571606564068


[50/100]
f1_h=0.46572	prec_h=0.69982	rec_h=0.34898
[100/100]
f1_h=0.56219	prec_h=0.68358	rec_h=0.47741


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
h_f1-score,▅▁▁▁▂▃▄▄▄▅▅▅▆▅▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇█▇██▇████
h_precision,▂▁▁█▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▇▆
h_recall,█▁▁▁▁▂▂▂▂▂▃▂▃▃▃▃▃▃▃▄▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
train_loss,█▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_loss,██▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▂▁▁

0,1
epoch,99.0
h_f1-score,0.56219
h_precision,0.68358
h_recall,0.47741
train_loss,402.28136
val_loss,507.50011


### Evaluation

### text-embedding-3-large + BERT-base-NER (English only)

In [29]:
model = OpenAiLargeNER()

# map_location pins the checkpoint tensors to the CPU, so loading does not depend on
# the device the weights were saved from being available on this machine.
model.load_state_dict(torch.load('models/subtask1a/openAI-Large-NER/misty-sweep-4.pth',
                                 map_location=torch.device('cpu')))

<All keys matched successfully>

#### English

In [22]:
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'
# Defined here so the cell also runs on a fresh kernel; otherwise the name is only
# created by a later evaluation cell.
evaluator_script = './scorer-baseline/subtask_1_2a.py'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data,
                              './TextFeatures/subtask1a/text-embedding-3-large/en_test_text_features.pkl',
                              './TextFeatures/subtask1a/BERT-NER/en_test_text_features.pkl')
# No shuffling at evaluation time: keeps the order of the written predictions reproducible.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

# gold_file_path=None: the predictions are only written to disk, nothing is scored.
evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False,
               threshold=0.3)

0.62898	0.66361	0.59779

### text-embedding-3-large + multilingual BERT NER

In [23]:
from modules.nn.OpenAiLarge import OpenAiLargeMultiLingualNER

model = OpenAiLargeMultiLingualNER()
device = torch.device('cpu')

# map_location=device makes the checkpoint load onto the CPU regardless of the
# device it was saved from.
model.load_state_dict(torch.load('./models/subtask1a/openAI-Large-m-NER/gallant-sweep-7.pth',
                                 map_location=device))

<All keys matched successfully>

#### Bulgarian

In [24]:
bulgarian_pred_file_path = './Predictions/bulgarian_predictions_subtask1.txt'
bulgarian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
evaluator_script = './scorer-baseline/subtask_1_2a.py'

# The gold-label file doubles as the source of the test samples.
bg_test_data = process_test_json(bulgarian_gold_file_path)


bg_test_dataset = TestDataSet(bg_test_data,
                              './TextFeatures/subtask1a/text-embedding-3-large/bg_test_text_features.pkl',
                              './TextFeatures/subtask1a/multilingual-ner/bg_test_text_features.pkl')
# No shuffling at evaluation time: keeps the order of the written predictions reproducible.
bg_test_dataloader = DataLoader(bg_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, bg_test_dataloader, bulgarian_pred_file_path, bulgarian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.46682	prec_h=0.52446	rec_h=0.42060


#### Macedonian

In [25]:
macedonian_pred_file_path = './Predictions/macedonian_predictions_subtask1.txt'
macedonian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_md.json'

# The gold-label file doubles as the source of the test samples.
md_test_data = process_test_json(macedonian_gold_file_path)

md_test_dataset = TestDataSet(md_test_data,
                              './TextFeatures/subtask1a/text-embedding-3-large/md_test_text_features.pkl',
                              './TextFeatures/subtask1a/multilingual-ner/md_test_text_features.pkl'
                              )
# No shuffling at evaluation time: keeps the order of the written predictions reproducible.
md_test_dataloader = DataLoader(md_test_dataset, batch_size=64, shuffle=False)

evaluate_model(model, md_test_dataloader, macedonian_pred_file_path, macedonian_gold_file_path,
               evaluator_script, id2leaf_label, validation=False, threshold=0.3)

f1_h=0.44731	prec_h=0.58300	rec_h=0.36285


#### Arabic

In [26]:
arabian_pred_file_path = './Predictions/arabian_predictions_subtask1.txt'
arabian_gold_file_path = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'

# The gold-label file doubles as the source of the test samples.
ar_test_data = process_test_json(arabian_gold_file_path)

ar_test_dataset = TestDataSet(ar_test_data,
                              './TextFeatures/subtask1a/text-embedding-3-large/ar_test_text_features.pkl',
                              './TextFeatures/subtask1a/multilingual-ner/ar_test_text_features.pkl'
                              )
# No shuffling at evaluation time: keeps the order of the written predictions reproducible.
ar_test_dataloader = DataLoader(ar_test_dataset, batch_size=128, shuffle=False)

# format=5 is forwarded to get_labels; this is the only evaluation cell that sets it.
evaluate_model(model, ar_test_dataloader, arabian_pred_file_path, arabian_gold_file_path, evaluator_script,
               id2leaf_label, format=5, validation=False, threshold=0.3)

f1_h=0.32889	prec_h=0.44848	rec_h=0.25965


#### English

In [27]:
en_pred_file_path = './Predictions/en_predictions_subtask1.txt'

en_test_data = process_test_json('./test_data/english/en_subtask1_test_unlabeled.json')

en_test_dataset = TestDataSet(en_test_data,
                              './TextFeatures/subtask1a/text-embedding-3-large/en_test_text_features.pkl',
                              './TextFeatures/subtask1a/multilingual-ner/en_test_text_features.pkl')
# No shuffling at evaluation time: keeps the order of the written predictions reproducible.
en_test_dataloader = DataLoader(en_test_dataset, batch_size=16, shuffle=False)

# gold_file_path=None: the predictions are only written to disk, nothing is scored.
evaluate_model(model, en_test_dataloader, en_pred_file_path, None, evaluator_script, id2leaf_label, validation=False,
               threshold=0.3)

0.63941	0.65488	0.62465