In [7]:
import pandas as pd
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizerFast, Trainer, TrainingArguments

import torch.nn as nn
import torch
import numpy as np
import os
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from preprocess import preprocess_data, get_dataset_and_labels
from sklearn.model_selection import train_test_split

In [8]:
max_sequence_length = 512
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_sequence_length)
model = RobertaForSequenceClassification.from_pretrained('./results/twitter-rumour-classification_roberta_512_batch16_grad1/', local_files_only=True).to("cuda")

In [9]:
def convert_label(label):
    if label == "rumour":
        return 1
    elif label == "non-rumour":
        return 0
    else:
        raise Exception("label classes must be 'rumour' or 'non-rumour'")


def convert_label_to_rumour(label):
    if label == 1:
        return "rumour"
    elif label == 0:
        return "non-rumour"
    else:
        raise Exception("label classes must be 1 or 0")


def get_labels(label_path, sourceIds):
    with open(label_path) as f:
        labels = json.load(f)
    corresponding_labels = [labels[id] for id in sourceIds]
    numeric_labels = [convert_label(label) for label in corresponding_labels]

    return numeric_labels

In [10]:
test_path = "./project_data/test.data.jsonl"
test_texts, sourceIds = preprocess_data(data_path=test_path, max_sequence_length=max_sequence_length) 

In [16]:
test_encodings = tokenizer(test_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)

In [17]:
class TestDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.tokenized_texts.items()}

test_dataset = TestDataset(test_encodings)

In [20]:
from transformers import default_data_collator

label_ids: torch.Tensor = None
preds: torch.Tensor = None

with torch.no_grad():
    dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

    for batch in tqdm(dataloader):

        batch['input_ids'] = batch['input_ids'].cuda()
            
        predictions = model(input_ids=batch['input_ids']
                                   )
        
        predictions = predictions[0]

        if preds is None:
            preds = predictions.detach().sigmoid()
        else:
            preds = torch.cat((preds, predictions.detach()), dim=0)


        # if label_ids is None:
        #     label_ids = batch["labels"].detach()
        # else:
        #     label_ids = torch.cat((label_ids, batch["labels"].detach()), dim=0)
        

100%|██████████| 73/73 [00:10<00:00,  6.73it/s]


In [21]:
predictions = np.argmax(preds.to("cpu"), axis=1)

In [22]:
predictions_dict = {sourceId: convert_label_to_rumour(prediction) for sourceId, prediction in zip(sourceIds, predictions)}

In [23]:
with open("test-output.json", "w") as outputfile:
    json.dump(predictions_dict, outputfile)

In [24]:
data_val_path = './project_data/dev.data.jsonl'
val_labels_path = './project_data/dev.label.json'

val_texts, val_sourceIds = preprocess_data(data_path=data_val_path, max_sequence_length=max_sequence_length)
val_labels = get_labels(val_labels_path, val_sourceIds)
val_encodings = tokenizer(val_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)
val_dataset = TestDataset(val_encodings)

In [25]:
label_ids: torch.Tensor = None
val_preds: torch.Tensor = None

with torch.no_grad():
    dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

    for batch in tqdm(dataloader):

        batch['input_ids'] = batch['input_ids'].cuda()
            
        val_predictions = model(input_ids=batch['input_ids']
                                   )
        
        val_predictions = val_predictions[0]

        if val_preds is None:
            val_preds = val_predictions.detach().sigmoid()
        else:
            val_preds = torch.cat((val_preds, val_predictions.detach()), dim=0)


100%|██████████| 73/73 [00:10<00:00,  6.97it/s]


In [26]:


from sklearn.metrics import precision_recall_fscore_support
val_predictions = np.argmax(val_preds.to("cpu"), axis=1)
p, r, f, _ = precision_recall_fscore_support(val_predictions, val_labels, pos_label=1, average="binary")

In [27]:
print("Precision: ", p)
print("Recall: ", r)
print("F1: ", f)

Precision:  0.8288770053475936
Recall:  0.842391304347826
F1:  0.8355795148247979
