In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\train.json'
val_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\validation.json'
dev_unlabel_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\dev_unlabeled.json'
ar_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\ar_subtask1_test_unlabeled.json'
bg_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\bg_subtask1_test_unlabeled.json'
mk_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\mk_subtask1_test_unlabeled.json'
en_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\en_subtask1_test_unlabeled.json'

with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open(dev_unlabel_json_path, 'r', encoding='utf-8') as f:
    dev_unlabel_data = json.load(f)
with open(ar_json_path, 'r', encoding='utf-8') as f:
    ar_data = json.load(f)
with open(bg_json_path, 'r', encoding='utf-8') as f:
    bg_data = json.load(f)
with open(mk_json_path, 'r', encoding='utf-8') as f:
    mk_data = json


In [3]:
mlb = MultiLabelBinarizer()
train_texts = [item['text'] for item in train_json_data]
train_labels = mlb.fit_transform([item['labels'] for item in train_json_data])
val_texts = [item['text'] for item in val_data]
val_labels = mlb.transform([item['labels'] for item in val_data])


In [4]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)


In [6]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=len(mlb.classes_), problem_type="multi_label_classification")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def compute_metrics(p):
    # Convert predictions to multilabel format
    preds = (p.predictions > 0.5).astype(int)
    
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=25,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [8]:
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")


Train Dataset Size: 7000
Validation Dataset Size: 500


In [9]:
import torch

torch.cuda.empty_cache()
print("GPU cache cleared.")
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Additional information if using GPU
if device.type == 'cuda':
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Memory Allocated: {torch.cuda.memory_allocated(0)/1024**3:.1f} GB")
    print(f"Memory Cached: {torch.cuda.memory_reserved(0)/1024**3:.1f} GB")


GPU cache cleared.
Using device: cuda
GPU Name: NVIDIA GeForce RTX 4080 Laptop GPU
Memory Allocated: 0.5 GB
Memory Cached: 0.5 GB


In [10]:
trainer.train()

Step,Training Loss
500,0.3254
1000,0.2311
1500,0.2186
2000,0.2147
2500,0.2046
3000,0.1822
3500,0.1813
4000,0.1501
4500,0.1426
5000,0.1219


TrainOutput(global_step=21875, training_loss=0.0645168306187221, metrics={'train_runtime': 3084.8067, 'train_samples_per_second': 56.73, 'train_steps_per_second': 7.091, 'total_flos': 2.0879370897e+16, 'train_loss': 0.0645168306187221, 'epoch': 25.0})

In [11]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6049890518188477,
 'eval_accuracy': 0.19,
 'eval_f1': 0.3821861163770747,
 'eval_precision': 0.458284587047519,
 'eval_recall': 0.3420738974970203,
 'eval_runtime': 3.0252,
 'eval_samples_per_second': 165.278,
 'eval_steps_per_second': 20.825,
 'epoch': 25.0}

In [12]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Load the data
train_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\train.json'
val_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\validation.json'
dev_unlabel_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\dev_unlabeled.json'
ar_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\ar_subtask1_test_unlabeled.json'
bg_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\bg_subtask1_test_unlabeled.json'
mk_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\mk_subtask1_test_unlabeled.json'
en_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\en_subtask1_test_unlabeled.json'

# Correctly load the data from JSON files
with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open(dev_unlabel_json_path, 'r', encoding='utf-8') as f:
    dev_unlabel_data = json.load(f)
with open(ar_json_path, 'r', encoding='utf-8') as f:
    ar_data = json.load(f)
with open(bg_json_path, 'r', encoding='utf-8') as f:
    bg_data = json.load(f)
with open(mk_json_path, 'r', encoding='utf-8') as f:
    mk_data = json.load(f)
with open(en_json_path, 'r', encoding='utf-8') as f:
    en_data = json.load(f)

# Print the type and content of mk_data to ensure it is loaded correctly
print(type(mk_data))
print(mk_data)

# Proceed with processing the data
ar_texts = [item['text'] for item in ar_data]
bg_texts = [item['text'] for item in bg_data]
mk_texts = [item['text'] for item in mk_data]
en_texts = [item['text'] for item in en_data]
dev_texts = [item['text'] for item in dev_unlabel_data]

ar_ids = [item['id'] for item in ar_data]
bg_ids = [item['id'] for item in bg_data]
mk_ids = [item['id'] for item in mk_data]
en_ids = [item['id'] for item in en_data]
dev_ids = [item['id'] for item in dev_unlabel_data]

# Define the prediction function
def predict_labels(texts, ids, model, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt', max_length=512)
    dataset = Dataset(encodings, [[0] * len(mlb.classes_)] * len(texts))  # Dummy labels since we are predicting
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)
    predicted_probs = predictions.predictions
    predicted_labels = (predicted_probs > 0.5).astype(int)  # Apply threshold to get predicted labels
    return [{"id": id_, "labels": labels} for id_, labels in zip(ids, mlb.inverse_transform(predicted_labels))]

# Predict and save the results
ar_predictions = predict_labels(ar_texts, ar_ids, model, tokenizer)
bg_predictions = predict_labels(bg_texts, bg_ids, model, tokenizer)
mk_predictions = predict_labels(mk_texts, mk_ids, model, tokenizer)
en_predictions = predict_labels(en_texts, en_ids, model, tokenizer)
dev_predictions = predict_labels(dev_texts, dev_ids, model, tokenizer)

# Save predictions to JSON files
with open('C:\\Users\\harih\\Downloads\\distilbert\\predictions25_ar.json', 'w') as f:
    json.dump(ar_predictions, f)
with open('C:\\Users\\harih\\Downloads\\distilbert\\predictions25_bg.json', 'w') as f:
    json.dump(bg_predictions, f)
with open('C:\\Users\\harih\\Downloads\\distilbert\\predictions25_mk.json', 'w') as f:
    json.dump(mk_predictions, f)
with open('C:\\Users\\harih\\Downloads\\distilbert\\predictions25_en.json', 'w') as f:
    json.dump(en_predictions, f)
with open('C:\\Users\\harih\\Downloads\\distilbert\\predictions25_dev.json', 'w') as f:
    json.dump(dev_predictions, f)

print(f"AR Predictions: {ar_predictions}")
print(f"BG Predictions: {bg_predictions}")
print(f"MK Predictions: {mk_predictions}")
print(f"EN Predictions: {en_predictions}")
print(f"DEV Predictions: {dev_predictions}")


<class 'list'>
[{'id': 'mk_memes_2', 'text': 'Нещата започват да излизат извън\nконтрол!! \n'}, {'id': 'mk_memes_3', 'text': 'МОЈАТА МАСКАТЕ ШТИТИ ТЕБЕ,\nТВОЈАТА МАСКА МЕ ШТИТИ МЕНЕ.\n\nТОА ГОВНО НЕ ШТИТИ НИ ОД ПРАШИНА КАМОЛИ ОД\nВИРУСИ, И ТВОИТЕ ГОСПОДАРИ го КОРИСТАТ КАКО\nПСИХОЛОШКО ОРУЖЈЕ ЗА ДА ДОЗНААТ КОЛКАВ ДЕЛ\nоД НАСЕЛЕНИЕТО БЕСПОГОВОРНО СЛУША НАРЕДБИ.\n'}, {'id': 'mk_memes_4', 'text': 'ЕДНАШ ГИ ПОБЕДИВМЕ "ДОБРОСОСЕДИТЕ".\n\nKЕ ГИ ПОБЕДИМЕ ПАК!\n11 ОКТОМВРИ\nНЕ ГО СЛАВИМЕ ЗАЕДНИЧКИ\n'}, {'id': 'mk_memes_5', 'text': 'ВОСОЧНА ФИГУРА НА ГОЦЕ\nДЕЛЧЕВ ОД МУЗЕЈОТ ВО ВАРНА\n\n'}, {'id': 'mk_memes_6', 'text': 'И после им рековме се е во главата\nод Груевски гласајте за нас нема да\nменуваме име ќе имате 500 евра\nплата! И ни поверуваа хахахахаха\n'}, {'id': 'mk_memes_7', 'text': 'Традиционна българска двойкa!\nМъж и жена!\n'}, {'id': 'mk_memes_9', 'text': 'Рамо до Рамо\nсо ФАШИСТИТЕ\n#WeAreLosers\n'}, {'id': 'mk_memes_10', 'text': 'НА ОВА МЕСТО НА 16 ЈУМИ 1943 г.\nОД СТРАНА НА БУГАРСКИТ

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


AR Predictions: [{'id': '00001', 'labels': ()}, {'id': '00002', 'labels': ()}, {'id': '00003', 'labels': ()}, {'id': '00004', 'labels': ('Loaded Language', 'Name calling/Labeling', 'Smears')}, {'id': '00005', 'labels': ('Loaded Language', 'Smears')}, {'id': '00006', 'labels': ('Loaded Language', 'Whataboutism')}, {'id': '00008', 'labels': ()}, {'id': '00009', 'labels': ()}, {'id': '00011', 'labels': ('Loaded Language', 'Smears')}, {'id': '00012', 'labels': ('Smears',)}, {'id': '00016', 'labels': ()}, {'id': '00017', 'labels': ()}, {'id': '00019', 'labels': ('Smears',)}, {'id': '00020', 'labels': ('Name calling/Labeling',)}, {'id': '00021', 'labels': ()}, {'id': '00022', 'labels': ()}, {'id': '00023', 'labels': ('Loaded Language', 'Smears')}, {'id': '00024', 'labels': ()}, {'id': '00025', 'labels': ()}, {'id': '00027', 'labels': ()}, {'id': '00028', 'labels': ()}, {'id': '00030', 'labels': ('Causal Oversimplification', 'Smears')}, {'id': '00032', 'labels': ('Smears',)}, {'id': '00034', 

In [13]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_bg.json -p C:\Users\harih\Downloads\distilbert\predictions25_bg.json


f1_h=0.38358	prec_h=0.42506	rec_h=0.34948


In [14]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_ar.json -p C:\Users\harih\Downloads\distilbert\predictions25_ar.json


f1_h=0.25152	prec_h=0.29808	rec_h=0.21754


In [15]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\dev_gold_labels\dev_gold_labels\dev_subtask1_en.json -p C:\Users\harih\Downloads\distilbert\predictions25_dev.json

f1_h=0.52775	prec_h=0.64361	rec_h=0.44724


In [16]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_md.json -p C:\Users\harih\Downloads\distilbert\predictions25_mk.json


f1_h=0.30422	prec_h=0.35585	rec_h=0.26568
