In [1]:
import pandas as pd
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizerFast, Trainer, TrainingArguments
import torch.nn as nn
import torch
import numpy as np
import os
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from preprocess import preprocess_data, get_dataset_and_labels
from sklearn.model_selection import train_test_split

In [2]:
config = RobertaConfig()

config

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [3]:
torch.cuda.is_available()

True

In [4]:
max_sequence_length = 512
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"

In [5]:
# load model and tokenizer and define length of the text sequence
model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                           gradient_checkpointing=False,
                                                           )
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_sequence_length)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [6]:
model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [7]:
def convert_label(label):
    if label == "rumour":
        return 1
    elif label == "non-rumour":
        return 0
    else:
        raise Exception("label classes must be 'rumour' or 'non-rumour'")


def get_labels(label_path, sourceIds):
    with open(label_path) as f:
        labels = json.load(f)
    corresponding_labels = [labels[id] for id in sourceIds]
    numeric_labels = [convert_label(label) for label in corresponding_labels]

    return numeric_labels

In [8]:
data_path = "./project_data/train.data.jsonl"
labels_path = "./project_data/train.label.json"

data_val_path = './project_data/dev.data.jsonl'
val_labels_path = './project_data/dev.label.json'

test_path = "./project_data/test.data.jsonl"

texts, labels = get_dataset_and_labels(data_path=data_path, label_path=labels_path, max_sequence_length=max_sequence_length)
val_texts, val_sourceIds = preprocess_data(data_path=data_val_path, max_sequence_length=max_sequence_length)
val_labels = get_labels(val_labels_path, val_sourceIds)

test_texts, sourceIds = preprocess_data(data_path=test_path, max_sequence_length=max_sequence_length) 

In [9]:
# train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=.2)
train_texts = texts
train_labels = labels

In [10]:
train_encodings = tokenizer(train_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)
val_encodings = tokenizer(val_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)
test_encodings = tokenizer(test_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)

In [11]:
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class TestDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.tokenized_texts.items()}

train_dataset = TwitterDataset(train_encodings, train_labels)
val_dataset = TestDataset(val_encodings)

In [12]:
# class weights: class x / size of largest class

def make_weights(train_labels):
    unique_labels = np.unique(train_labels)
    class_sizes_dict = {x: (np.array(train_labels) == x).sum() for x in unique_labels}
    max_class_size = max(class_sizes_dict.values())
    weights = torch.tensor([max_class_size / class_sizes_dict[x] for x in unique_labels]).float()

    return weights

In [13]:
class_weights = make_weights(train_labels).cuda()
print(class_weights)

tensor([1.0000, 1.9318], device='cuda:0')


In [14]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"]
        outputs = model(input_ids=inputs["input_ids"])
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        try:
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.long())
        except Exception as err:
            print("\n")
            print(labels)
            print("\n")
            print(labels.float().view(-1, self.model.config.num_labels))
            print("\n")
            print(logits.view(-1, self.model.config.num_labels))
            print(err)

        return (loss, outputs) if return_outputs else loss

In [15]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = './roberta_results',
    num_train_epochs=7,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 1,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "no",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = True,
    logging_dir='./logs',
    dataloader_num_workers = 0,
    do_eval=False,
    run_name = 'roberta-classification'
)

trainer = WeightedTrainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,
    tokenizer=tokenizer         # training dataset
    # eval_dataset=val_dataset           # evaluation dataset
)

trainer.train()

.8096439585085506e-05, 'epoch': 4.78}
 68%|██████▊   | 2785/4067 [06:45<03:06,  6.88it/s]{'loss': 0.0158, 'learning_rate': 1.79843005326605e-05, 'epoch': 4.79}
 69%|██████▊   | 2793/4067 [06:46<03:05,  6.87it/s]{'loss': 0.0048, 'learning_rate': 1.7872161480235495e-05, 'epoch': 4.81}
 69%|██████▉   | 2801/4067 [06:48<03:04,  6.87it/s]{'loss': 0.3357, 'learning_rate': 1.7760022427810486e-05, 'epoch': 4.82}
 69%|██████▉   | 2809/4067 [06:49<03:01,  6.94it/s]{'loss': 0.1345, 'learning_rate': 1.764788337538548e-05, 'epoch': 4.83}
 69%|██████▉   | 2817/4067 [06:50<03:01,  6.88it/s]{'loss': 0.166, 'learning_rate': 1.7535744322960472e-05, 'epoch': 4.85}
 69%|██████▉   | 2825/4067 [06:51<03:01,  6.83it/s]{'loss': 0.2941, 'learning_rate': 1.7423605270535463e-05, 'epoch': 4.86}
 70%|██████▉   | 2833/4067 [06:52<03:02,  6.75it/s]{'loss': 0.256, 'learning_rate': 1.7311466218110458e-05, 'epoch': 4.87}
 70%|██████▉   | 2841/4067 [06:53<02:59,  6.84it/s]{'loss': 0.2003, 'learning_rate': 1.719932716568

TrainOutput(global_step=4067, training_loss=0.284460890784224, metrics={'train_runtime': 590.0191, 'train_samples_per_second': 6.893, 'epoch': 7.0})

In [14]:
trainer.save_model('./results/twitter-rumour-classification_roberta_512_batch4_grad16')

In [14]:
# model = LongformerForSequenceClassification.from_pretrained('./results/twitter-rumour-classification_1024_batch8_grad8//', local_files_only=True).to("cuda")

In [16]:
# from transformers import default_data_collator

# label_ids: torch.Tensor = None
# preds: torch.Tensor = None

# with torch.no_grad():
#     dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

#     for batch in tqdm(dataloader):

#         batch['input_ids'] = batch['input_ids'].cuda()
            
#         predictions = model(input_ids=batch['input_ids']
#                                    )
        
#         predictions = predictions[0]

#         if preds is None:
#             preds = predictions.detach().sigmoid()
#         else:
#             preds = torch.cat((preds, predictions.detach()), dim=0)


#         # if label_ids is None:
#         #     label_ids = batch["labels"].detach()
#         # else:
#         #     label_ids = torch.cat((label_ids, batch["labels"].detach()), dim=0)

100%|██████████| 73/73 [00:05<00:00, 14.25it/s]


In [17]:
# from sklearn.metrics import precision_recall_fscore_support
# predictions = np.argmax(preds.to("cpu"), axis=1)
# p, r, f, _ = precision_recall_fscore_support(predictions, val_labels, pos_label=1, average="binary")

In [18]:
# print("Precision: ", p)
# print("Recall: ", r)
# print("F1: ", f)

Precision:  0.8021390374331551
Recall:  0.8379888268156425
F1:  0.8196721311475409


In [None]:
# import datasets
# imdbtrain, imdbtest_data = datasets.load_dataset('imdb', split =['train', 'test'], 
                                             )

In [35]:
# imdbtrain[0]

{'label': 1,
 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'}