In [25]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


In [26]:
train_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\train.json'
val_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\validation.json'
dev_unlabel_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\dev_unlabeled.json'
ar_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\ar_subtask1_test_unlabeled.json'
bg_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\bg_subtask1_test_unlabeled.json'
mk_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\mk_subtask1_test_unlabeled.json'
en_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\en_subtask1_test_unlabeled.json'

with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open(dev_unlabel_json_path, 'r', encoding='utf-8') as f:
    dev_unlabel_data = json.load(f)
with open(ar_json_path, 'r', encoding='utf-8') as f:
    ar_data = json.load(f)
with open(bg_json_path, 'r', encoding='utf-8') as f:
    bg_data = json.load(f)
with open(mk_json_path, 'r', encoding='utf-8') as f:
    mk_data = json.load(f)
with open(en_json_path, 'r', encoding='utf-8') as f:
    en_data = json.load(f)


In [27]:
mlb = MultiLabelBinarizer()
train_texts = [item['text'] for item in train_json_data]
train_labels = mlb.fit_transform([item['labels'] for item in train_json_data])
val_texts = [item['text'] for item in val_data]
val_labels = mlb.transform([item['labels'] for item in val_data])


In [56]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [29]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)


In [57]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(mlb.classes_), problem_type="multi_label_classification")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
def compute_metrics(p):
    # Convert predictions to multilabel format
    preds = (p.predictions > 0.5).astype(int)
    
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [59]:
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")


Train Dataset Size: 7000
Validation Dataset Size: 500


In [60]:
trainer.train()

Step,Training Loss
500,0.3081
1000,0.2351
1500,0.2262
2000,0.2287
2500,0.2251
3000,0.215
3500,0.2199
4000,0.2094
4500,0.2071
5000,0.2002


TrainOutput(global_step=8750, training_loss=0.2044293491908482, metrics={'train_runtime': 1567.2624, 'train_samples_per_second': 44.664, 'train_steps_per_second': 5.583, 'total_flos': 1.349176059e+16, 'train_loss': 0.2044293491908482, 'epoch': 10.0})

In [61]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.24761347472667694,
 'eval_accuracy': 0.208,
 'eval_f1': 0.22597866390507823,
 'eval_precision': 0.37971374089869964,
 'eval_recall': 0.18951132300357568,
 'eval_runtime': 4.1195,
 'eval_samples_per_second': 121.373,
 'eval_steps_per_second': 15.293,
 'epoch': 10.0}

In [74]:
def predict_labels(texts, ids, model, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt', max_length=512)
    dataset = Dataset(encodings, [[0] * len(mlb.classes_)] * len(texts))  # Dummy labels since we are predicting
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)
    predicted_probs = predictions.predictions
    predicted_labels = (predicted_probs > 0.5).astype(int)  # Apply threshold to get predicted labels
    return [{"id": id_, "labels": labels} for id_, labels in zip(ids, mlb.inverse_transform(predicted_labels))]

ar_texts = [item['text'] for item in ar_data]
bg_texts = [item['text'] for item in bg_data]
mk_texts = [item['text'] for item in mk_data]
en_texts = [item['text'] for item in en_data]
dev_texts = [item['text'] for item in dev_unlabel_data]

ar_ids = [item['id'] for item in ar_data]
bg_ids = [item['id'] for item in bg_data]
mk_ids = [item['id'] for item in mk_data]
en_ids = [item['id'] for item in en_data]
dev_ids = [item['id'] for item in dev_unlabel_data]

ar_predictions = predict_labels(ar_texts, ar_ids, model, tokenizer)
bg_predictions = predict_labels(bg_texts, bg_ids, model, tokenizer)
mk_predictions = predict_labels(mk_texts, mk_ids, model, tokenizer)
en_predictions = predict_labels(en_texts, en_ids, model, tokenizer)
dev_predictions = predict_labels(dev_texts, dev_ids, model, tokenizer)

# Save predictions to JSON files
with open('C:\\Users\\harih\\Downloads\\bert\\predictions_ar.json', 'w') as f:
    json.dump(ar_predictions, f)
with open('C:\\Users\\harih\\Downloads\\bert\\predictions_bg.json', 'w') as f:
    json.dump(bg_predictions, f)
with open('C:\\Users\\harih\\Downloads\\bert\\predictions_mk.json', 'w') as f:
    json.dump(mk_predictions, f)
with open('C:\\Users\\harih\\Downloads\\bert\\predictions_en.json', 'w') as f:
    json.dump(en_predictions, f)
with open('C:\\Users\\harih\\Downloads\\bert\\predictions_dev.json', 'w') as f:
    json.dump(dev_predictions, f)

print(f"AR Predictions: {ar_predictions}")
print(f"BG Predictions: {bg_predictions}")
print(f"MK Predictions: {mk_predictions}")
print(f"EN Predictions: {en_predictions}")
print(f"DEV Predictions: {dev_predictions}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


AR Predictions: [{'id': '00001', 'labels': ('Smears',)}, {'id': '00002', 'labels': ()}, {'id': '00003', 'labels': ()}, {'id': '00004', 'labels': ()}, {'id': '00005', 'labels': ()}, {'id': '00006', 'labels': ('Smears',)}, {'id': '00008', 'labels': ()}, {'id': '00009', 'labels': ()}, {'id': '00011', 'labels': ()}, {'id': '00012', 'labels': ('Smears',)}, {'id': '00016', 'labels': ('Smears',)}, {'id': '00017', 'labels': ()}, {'id': '00019', 'labels': ('Smears',)}, {'id': '00020', 'labels': ()}, {'id': '00021', 'labels': ()}, {'id': '00022', 'labels': ('Smears',)}, {'id': '00023', 'labels': ()}, {'id': '00024', 'labels': ('Smears',)}, {'id': '00025', 'labels': ('Smears',)}, {'id': '00027', 'labels': ()}, {'id': '00028', 'labels': ()}, {'id': '00030', 'labels': ('Smears',)}, {'id': '00032', 'labels': ()}, {'id': '00034', 'labels': ('Smears',)}, {'id': '00035', 'labels': ()}, {'id': '00037', 'labels': ()}, {'id': '00039', 'labels': ('Smears',)}, {'id': '00040', 'labels': ('Smears',)}, {'id': 

In [75]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_bg.json -p C:\Users\harih\Downloads\bert\predictions_bg.json


f1_h=0.19551	prec_h=0.41223	rec_h=0.12814


In [76]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_ar.json -p C:\Users\harih\Downloads\bert\predictions_ar.json


f1_h=0.23113	prec_h=0.35252	rec_h=0.17193


In [77]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\dev_gold_labels\dev_gold_labels\dev_subtask1_en.json -p C:\Users\harih\Downloads\bert\predictions_dev.json

f1_h=0.10596	prec_h=0.45642	rec_h=0.05994


In [78]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_md.json -p C:\Users\harih\Downloads\bert\predictions_mk.json


f1_h=0.18898	prec_h=0.32727	rec_h=0.13284
