In [9]:
# Cell 1: Import necessary libraries
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


In [10]:
# Cell 2: Load the data
train_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\train.json'
val_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\validation.json'
dev_unlabel_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\dev_unlabeled.json'
ar_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\ar_subtask1_test_unlabeled.json'
bg_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\bg_subtask1_test_unlabeled.json'
mk_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\mk_subtask1_test_unlabeled.json'
en_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\en_subtask1_test_unlabeled.json'

with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open(dev_unlabel_json_path, 'r', encoding='utf-8') as f:
    dev_unlabel_data = json.load(f)
with open(ar_json_path, 'r', encoding='utf-8') as f:
    ar_data = json.load(f)
with open(bg_json_path, 'r', encoding='utf-8') as f:
    bg_data = json.load(f)
with open(mk_json_path, 'r', encoding='utf-8') as f:
    mk_data = json.load(f)
with open(en_json_path, 'r', encoding='utf-8') as f:
    en_data = json.load(f)


In [11]:
# Cell 3: Preprocess the data (adjusted)
# Extract texts and labels from the JSON data
def extract_texts_and_labels(json_data):
    texts = [item['text'] for item in json_data]
    labels = [item['labels'] for item in json_data]
    return texts, labels

train_texts, train_labels = extract_texts_and_labels(train_json_data)
val_texts, val_labels = extract_texts_and_labels(val_data)

# Binarize the labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_labels)
val_labels = mlb.transform(val_labels)

# Create a custom dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()  # Ensure labels are float tensors
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)


In [12]:
# Cell 4: Tokenizer and model initialization (adjusted)
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = BertForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=len(mlb.classes_))

# Ensure the model's output logits are properly shaped for multi-label classification
model.config.problem_type = "multi_label_classification"


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Cell 5: Define the compute_metrics function (adjusted)
def compute_metrics(p):
    preds = (p.predictions > 0.3).astype(int)  # Adjust the threshold to 0.3
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')  # Use 'micro' average for multi-label
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [18]:
# Cell 6: Training arguments and trainer (adjusted)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)




In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.21886,0.186,0.240828,0.571429,0.152563
2,0.182000,0.219911,0.19,0.271493,0.56391,0.178784
3,0.178900,0.220958,0.194,0.288014,0.577061,0.191895
4,0.170800,0.222209,0.194,0.303775,0.576667,0.206198
5,0.168700,0.223217,0.198,0.304196,0.570492,0.20739
6,0.161800,0.225299,0.196,0.301821,0.55414,0.20739
7,0.156300,0.226819,0.192,0.310663,0.525568,0.220501
8,0.152700,0.230147,0.19,0.317434,0.511936,0.230036
9,0.152700,0.231497,0.198,0.329011,0.513924,0.241955
10,0.147800,0.233857,0.194,0.333333,0.53562,0.241955


TrainOutput(global_step=21900, training_loss=0.11684841399867785, metrics={'train_runtime': 1114.7432, 'train_samples_per_second': 313.974, 'train_steps_per_second': 19.646, 'total_flos': 327514950000000.0, 'train_loss': 0.11684841399867785, 'epoch': 50.0})

In [20]:
trainer.evaluate()

{'eval_loss': 0.2919755280017853,
 'eval_accuracy': 0.184,
 'eval_f1': 0.3790087463556851,
 'eval_precision': 0.4878048780487805,
 'eval_recall': 0.3098927294398093,
 'eval_runtime': 2.473,
 'eval_samples_per_second': 202.185,
 'eval_steps_per_second': 3.235,
 'epoch': 50.0}

In [21]:
# Cell 8: Prediction function (adjusted)
def predict_labels(texts, ids, model, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt', max_length=512)
    dataset = Dataset(encodings, [[0] * len(mlb.classes_)] * len(texts))  # Dummy labels since we are predicting
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)
    predicted_probs = predictions.predictions
    predicted_labels = (predicted_probs > 0.3).astype(int)  # Adjust the threshold to 0.3
    return [{"id": id_, "labels": labels} for id_, labels in zip(ids, mlb.inverse_transform(predicted_labels))]


In [23]:
# Cell 9: Predict and save the results
ar_texts = [item['text'] for item in ar_data]
bg_texts = [item['text'] for item in bg_data]
mk_texts = [item['text'] for item in mk_data]
en_texts = [item['text'] for item in en_data]
dev_texts = [item['text'] for item in dev_unlabel_data]

ar_ids = [item['id'] for item in ar_data]
bg_ids = [item['id'] for item in bg_data]
mk_ids = [item['id'] for item in mk_data]
en_ids = [item['id'] for item in en_data]
dev_ids = [item['id'] for item in dev_unlabel_data]

ar_predictions = predict_labels(ar_texts, ar_ids, model, tokenizer)
bg_predictions = predict_labels(bg_texts, bg_ids, model, tokenizer)
mk_predictions = predict_labels(mk_texts, mk_ids, model, tokenizer)
en_predictions = predict_labels(en_texts, en_ids, model, tokenizer)
dev_predictions = predict_labels(dev_texts, dev_ids, model, tokenizer)

with open('C:\\Users\\harih\\Downloads\\tinybert\\predictions_ar.json', 'w') as f:
    json.dump(ar_predictions, f)
with open('C:\\Users\\harih\\Downloads\\tinybert\\predictions_bg.json', 'w') as f:
    json.dump(bg_predictions, f)
with open('C:\\Users\\harih\\Downloads\\tinybert\\predictions_mk.json', 'w') as f:
    json.dump(mk_predictions, f)
with open('C:\\Users\\harih\\Downloads\\tinybert\\predictions_en.json', 'w') as f:
    json.dump(en_predictions, f)
with open('C:\\Users\\harih\\Downloads\\tinybert\\predictions_dev.json', 'w') as f:
    json.dump(dev_predictions, f)

print(f"AR Predictions: {ar_predictions}")
print(f"BG Predictions: {bg_predictions}")
print(f"MK Predictions: {mk_predictions}")
print(f"EN Predictions: {en_predictions}")
print(f"DEV Predictions: {dev_predictions}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


AR Predictions: [{'id': '00001', 'labels': ('Name calling/Labeling', 'Thought-terminating cliché')}, {'id': '00002', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00003', 'labels': ('Name calling/Labeling', 'Thought-terminating cliché')}, {'id': '00004', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00005', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00006', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00008', 'labels': ('Name calling/Labeling', 'Thought-terminating cliché')}, {'id': '00009', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00011', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00012', 'labels': ('Name calling/Labeling', 'Repetition', 'Thought-terminating cliché')}, {'id': '00016', 'labels': ('Name calling/Labe

In [24]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_bg.json -p C:\Users\harih\Downloads\tinybert\predictions_bg.json


f1_h=0.29873	prec_h=0.40312	rec_h=0.23728


In [27]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_ar.json -p C:\Users\harih\Downloads\tinybert\predictions_ar.json


f1_h=0.27689	prec_h=0.19332	rec_h=0.48772


In [28]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\dev_gold_labels\dev_gold_labels\dev_subtask1_en.json -p C:\Users\harih\Downloads\tinybert\predictions_dev.json

f1_h=0.48558	prec_h=0.64096	rec_h=0.39084


In [29]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_md.json -p C:\Users\harih\Downloads\tinybert\predictions_mk.json


f1_h=0.37085	prec_h=0.44852	rec_h=0.31611
