In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\train.json'
val_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\validation.json'
dev_unlabel_json_path = 'C:\\Users\\harih\\Downloads\\annotations_v2\\semeval2024_dev_release\\subtask1\\dev_unlabeled.json'
ar_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\ar_subtask1_test_unlabeled.json'
bg_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\bg_subtask1_test_unlabeled.json'
mk_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\mk_subtask1_test_unlabeled.json'
en_json_path = 'C:\\Users\\harih\\Downloads\\datasets\\test_data\\en_subtask1_test_unlabeled.json'

with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)
with open(dev_unlabel_json_path, 'r', encoding='utf-8') as f:
    dev_unlabel_data = json.load(f)
with open(ar_json_path, 'r', encoding='utf-8') as f:
    ar_data = json.load(f)
with open(bg_json_path, 'r', encoding='utf-8') as f:
    bg_data = json.load(f)
with open(mk_json_path, 'r', encoding='utf-8') as f:
    mk_data = json.load(f)
with open(en_json_path, 'r', encoding='utf-8') as f:
    en_data = json.load(f)

df_val = pd.DataFrame(val_data)
df_train = pd.DataFrame(train_json_data)

df_train['len'] = df_train['labels'].apply(lambda x: len(x))
df_train = df_train[df_train['len'] > 0].drop('len', axis=1)


In [3]:
def split_combined_labels(labels):
    new_labels = []
    for label in labels:
        if '/' in label:
            new_labels.extend(label.split('/'))
        else:
            new_labels.append(label)
    return new_labels

df_train['labels'] = df_train['labels'].apply(split_combined_labels)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_train['labels'])
train_texts, val_texts, train_labels, val_labels = train_test_split(df_train['text'], y, test_size=0.1)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

class PersuasionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # Ensure labels are float
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])



In [5]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_dataset = PersuasionDataset(train_encodings, train_labels)
val_dataset = PersuasionDataset(val_encodings, val_labels)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(mlb.classes_))
model.to(device)  # Move model to the correct device

predicted_labels = []
actual_labels = []

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)

    for p, l in zip(preds, labels):
        predicted_labels.append(p)
        actual_labels.append(l)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
trainer.train()
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

predicted_labels = mlb.inverse_transform(np.array(predicted_labels))
actual_labels = mlb.inverse_transform(np.array(actual_labels))


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2745,0.274912,0.006969,0.048036,0.126189,0.032011
2,0.2576,0.255636,0.020906,0.084507,0.407294,0.068198
3,0.2474,0.242981,0.083624,0.261771,0.539908,0.196938
4,0.2304,0.241445,0.094077,0.307853,0.505081,0.235212
5,0.1718,0.248944,0.151568,0.394237,0.533694,0.331942
6,0.1576,0.258877,0.167247,0.407139,0.542567,0.380654
7,0.1449,0.264475,0.156794,0.429804,0.572173,0.380654
8,0.1254,0.287976,0.155052,0.425129,0.536181,0.376479
9,0.1195,0.306471,0.172474,0.456419,0.580506,0.392484
10,0.0817,0.313142,0.188153,0.479622,0.596617,0.4238


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation results: {'eval_loss': 0.7520573735237122, 'eval_accuracy': 0.1794425087108014, 'eval_f1': 0.5082838600523156, 'eval_precision': 0.567247592845393, 'eval_recall': 0.4759916492693111, 'eval_runtime': 2.1513, 'eval_samples_per_second': 266.817, 'eval_steps_per_second': 33.468, 'epoch': 100.0}


In [8]:
def save_predictions(data, file_path, model, tokenizer, device):
    texts = [entry['text'] for entry in data]
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(device)  # Move to device
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**encodings)
    predictions = torch.sigmoid(outputs.logits).cpu().numpy()  # Move back to CPU
    predicted_labels = mlb.inverse_transform((predictions > 0.5).astype(int))
    
    predictions_json = []
    for i, entry in enumerate(data):
        # Ensure the labels are in the correct format and handle empty labels
        labels = list(predicted_labels[i]) if i < len(predicted_labels) else []
        if not labels:
            labels = [""]
        prediction_entry = {
            "id": entry['id'],
            "labels": labels
        }
        predictions_json.append(prediction_entry)
    
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(predictions_json, f, indent=4)
    print(f"Predictions saved to {file_path}")


In [9]:
save_predictions(val_data, 'predictions_val.json', model, tokenizer, device)
save_predictions(dev_unlabel_data, 'roberta100_predictions_dev_unlabel.json', model, tokenizer, device)
save_predictions(ar_data, 'roberta100_predictions_ar.json', model, tokenizer, device)
save_predictions(bg_data, 'roberta100_predictions_bg.json', model, tokenizer, device)
save_predictions(mk_data, 'roberta100_predictions_mk.json', model, tokenizer, device)
save_predictions(en_data, 'roberta100_predictions_en.json', model, tokenizer, device)


Predictions saved to predictions_val.json
Predictions saved to roberta100_predictions_dev_unlabel.json
Predictions saved to roberta100_predictions_ar.json
Predictions saved to roberta100_predictions_bg.json
Predictions saved to roberta100_predictions_mk.json
Predictions saved to roberta100_predictions_en.json


In [10]:

!python C:\Users\harih\Downloads\subtask_1_2a.py -g C:\Users\harih\Downloads\gold_labels_ar_bg_md_version2\test_subtask1_md.json -p C:\Users\harih\Downloads\Roberta_multi\roberta100_predictions_mk.json

f1_h=0.33476	prec_h=0.33252	rec_h=0.33702




In [17]:
save_predictions(ar_data, 'roberta100_predictions_ar.json', model, tokenizer, device)

Predictions saved to roberta100_predictions_ar.json
