In [1]:
import pandas as pd
from transformers import LongformerTokenizerFast, LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig

import torch.nn as nn
import torch
import numpy as np
import os
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from preprocess import preprocess_data, get_dataset_and_labels
from sklearn.model_selection import train_test_split

In [2]:
config = LongformerConfig()

config

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [3]:
torch.cuda.is_available()

True

In [4]:
max_sequence_length = 256
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

In [5]:
# load model and tokenizer and define length of the text sequence
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                           gradient_checkpointing=False,
                                                           attention_window = int(max_sequence_length/2))
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length = max_sequence_length)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [6]:
model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.5.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [7]:
def convert_label(label):
    if label == "rumour":
        return 1
    elif label == "non-rumour":
        return 0
    else:
        raise Exception("label classes must be 'rumour' or 'non-rumour'")


def get_labels(label_path, sourceIds):
    with open(label_path) as f:
        labels = json.load(f)
    corresponding_labels = [labels[id] for id in sourceIds]
    numeric_labels = [convert_label(label) for label in corresponding_labels]

    return numeric_labels

In [8]:
data_path = "./project_data/train.data.jsonl"
labels_path = "./project_data/train.label.json"

data_val_path = './project_data/dev.data.jsonl'
val_labels_path = './project_data/dev.label.json'

test_path = "./project_data/test.data.jsonl"

texts, labels = get_dataset_and_labels(data_path=data_path, label_path=labels_path, max_sequence_length=max_sequence_length)
val_texts, val_sourceIds = preprocess_data(data_path=data_val_path, max_sequence_length=max_sequence_length)
val_labels = get_labels(val_labels_path, val_sourceIds)

test_texts, sourceIds = preprocess_data(data_path=test_path, max_sequence_length=max_sequence_length) 

In [9]:
# train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=.2)
train_texts = texts
train_labels = labels

In [10]:
train_encodings = tokenizer(train_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)
val_encodings = tokenizer(val_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)
test_encodings = tokenizer(test_texts, padding = 'max_length', truncation=True, max_length = max_sequence_length)

In [11]:
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class TestDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.tokenized_texts.items()}

train_dataset = TwitterDataset(train_encodings, train_labels)
val_dataset = TestDataset(val_encodings)

In [11]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 5,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8,    
    per_device_eval_batch_size= 16,
    evaluation_strategy = "no",
    do_eval= False,
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=200,
    weight_decay=0.01,
    logging_steps = 4,
    fp16 = True,
    logging_dir='./logs',
    dataloader_num_workers = 0,
    run_name = 'longformer-twitter-classification-rumour'
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    # eval_dataset=val_dataset           # evaluation dataset
)

trainer.train()

  1%|          | 4/360 [00:09<14:34,  2.46s/it]{'loss': 0.7326, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.06}
  2%|▏         | 8/360 [00:18<13:02,  2.22s/it]{'loss': 0.719, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.11}
  3%|▎         | 12/360 [00:26<12:33,  2.16s/it]{'loss': 0.7164, 'learning_rate': 3e-06, 'epoch': 0.17}
  4%|▍         | 16/360 [00:35<12:21,  2.16s/it]{'loss': 0.7107, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.22}
  6%|▌         | 20/360 [00:43<11:54,  2.10s/it]{'loss': 0.6929, 'learning_rate': 5e-06, 'epoch': 0.28}
  7%|▋         | 24/360 [00:52<11:41,  2.09s/it]{'loss': 0.6723, 'learning_rate': 6e-06, 'epoch': 0.33}
  8%|▊         | 28/360 [01:00<11:31,  2.08s/it]{'loss': 0.6423, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.39}
  9%|▉         | 32/360 [01:08<11:23,  2.08s/it]{'loss': 0.6653, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.44}
 10%|█         | 36/360 [01:17<11:15,  2.08s/it]{'loss': 0.6232, 'learning_rate': 9e-06, 

TrainOutput(global_step=360, training_loss=0.2956389978114102, metrics={'train_runtime': 759.9207, 'train_samples_per_second': 0.474, 'epoch': 4.99})

In [12]:
trainer.save_model('./results/twitter-rumour-classification')

In [14]:
model = LongformerForSequenceClassification.from_pretrained('./results/twitter-rumour-classification/', local_files_only=True).to("cuda")

In [16]:
from transformers import default_data_collator

label_ids: torch.Tensor = None
preds: torch.Tensor = None

with torch.no_grad():
    dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

    for batch in tqdm(dataloader):

        batch['input_ids'] = batch['input_ids'].cuda()
            
        predictions = model(input_ids=batch['input_ids']
                                   )
        
        predictions = predictions[0]

        if preds is None:
            preds = predictions.detach().sigmoid()
        else:
            preds = torch.cat((preds, predictions.detach()), dim=0)


        # if label_ids is None:
        #     label_ids = batch["labels"].detach()
        # else:
        #     label_ids = torch.cat((label_ids, batch["labels"].detach()), dim=0)
        

100%|██████████| 73/73 [00:10<00:00,  7.07it/s]


In [17]:
preds

tensor([[ 0.7422,  0.2826],
        [ 0.9566,  0.0341],
        [ 0.9702,  0.0269],
        ...,
        [ 1.6122, -1.9080],
        [-0.0710, -0.2444],
        [ 2.9138, -3.1941]], device='cuda:0')

In [22]:
from sklearn.metrics import precision_recall_fscore_support
predictions = np.argmax(preds.to("cpu"), axis=1)
p, r, f, _ = precision_recall_fscore_support(predictions, val_labels, pos_label=1, average="binary")

In [23]:
print("Precision: ", p)
print("Recall: ", r)
print("F1: ", f)

Precision:  0.7967914438502673
Recall:  0.8713450292397661
F1:  0.8324022346368715


In [24]:
predictions
test

tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,

In [None]:
import datasets
imdbtrain, imdbtest_data = datasets.load_dataset('imdb', split =['train', 'test'], 
                                             )

In [35]:
imdbtrain[0]

{'label': 1,
 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'}