In [1]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from collections import defaultdict
from tqdm import tqdm
from transformers import BigBirdTokenizer, BigBirdModel, BertTokenizer, AutoModel, AutoTokenizer, TFAutoModel, pipeline, AdamW, BertConfig, BertModel, get_linear_schedule_with_warmup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
import argparse
import numba
import util

In [2]:
device = torch.device("cuda")
device

device(type='cuda')

In [3]:
def load_data(file_format, has_output=True):
  raw_lines = None
  with open(f"{file_format}.data.jsonl", "r") as f:
    raw_lines = f.readlines()

  raw_labels = None
  if has_output == True:
    with open(f"{file_format}.label.json", "r") as f:
      raw_labels = json.loads(f.read())

  return ([json.loads(line) for line in raw_lines], raw_labels)

In [4]:
def get_xy(tweets, labels):
  assert len(tweets) == len(labels)
  tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
  X = []
  Y = np.zeros(len(labels))
  for i, tweet_set in tqdm(enumerate(tweets)):
    arr = []
    for tweet in tweet_set:
      tweet = ' '.join(tokenizer.tokenize(tweet['text']))
      arr.append(tweet)
    X.append('.'.join(arr))
    if labels != None:
      Y[i] = 1 if labels[tweet_set[0]['id_str']] == 'rumour' else 0
  return X, Y

In [5]:
class TweetDataset(Dataset):
  def __init__(self, texts, targets, tokenizer, max_len):
    self.texts = texts
    self.targets = targets
    self.tokenzer = tokenizer
    self.max_len = max_len
    
  def __len__(self):
    return len(self.texts)
  
  def __getitem__(self, item):
    review = self.texts[item]
    encoding = tokenizer.encode_plus(
        review,
        max_length=self.max_len,
        add_special_tokens=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )

    return {
        'input_ids': encoding['input_ids'][0],
        'attention_mask': encoding['attention_mask'][0],
        'targets': torch.tensor(self.targets[item], dtype=torch.long)
    }

In [6]:
def create_data_loader(X, Y, tokenizer, max_len, batch_size):
  ds = TweetDataset(
      texts = X,
      targets = Y,
      tokenizer = tokenizer,
      max_len = max_len
  )
  return DataLoader(ds, batch_size=batch_size, num_workers=0)

In [7]:

class RumourClassifier(nn.Module):
  def __init__(self, n_classes):
    super(RumourClassifier, self).__init__()
    self.bert = BigBirdModel.from_pretrained("google/bigbird-roberta-base")
    self.cls_layer = nn.Linear(self.bert.config.hidden_size, 1)
  
  def forward(self, input_ids, attention_mask):
    output = self.bert(
        input_ids=input_ids, 
        attention_mask=attention_mask
    ).last_hidden_state
    
    cls_rep = output[:, 0]
    
    logits = self.cls_layer(cls_rep)
    
    
    return logits 




In [8]:
train_entries, train_labels = load_data("train")
trainX, trainY = get_xy(train_entries, train_labels)

dev_entries, dev_labels = load_data("dev")
devX, devY = get_xy(dev_entries, dev_labels)

4641it [00:07, 612.36it/s]
580it [00:00, 606.12it/s]


In [9]:
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

In [10]:
model = RumourClassifier(2)
model = model.to(device)

In [11]:
train_ds = create_data_loader(trainX, trainY, tokenizer, 1400, 1)
dev_ds = create_data_loader(devX, devY, tokenizer, 1400, 1)

In [13]:
data = next(iter(train_ds))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'targets': torch.tensor(self.targets[item], dtype=torch.long)


In [12]:
NO_EPOCHS = 20

In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_ds) * NO_EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.BCEWithLogitsLoss().to(device)

In [14]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

In [17]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps):

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        
        for it, data in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, labels = data['input_ids'].to(device), data['attention_mask'].to(device), data['targets'].to(device)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
            scheduler.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [18]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def evaluate(net, criterion, dataloader):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for data in dataloader:
            seq, attn_masks, labels = data['input_ids'].to(device), data['attention_mask'].to(device),data['targets'].to(device)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [None]:

train(model, loss_fn, optimizer, train_ds, dev_ds, NO_EPOCHS)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'targets': torch.tensor(self.targets[item], dtype=torch.long)


Iteration 0 of epoch 0 complete. Loss: 0.4924807846546173; Accuracy: 1.0; Time taken (s): 1.1453042030334473
Iteration 100 of epoch 0 complete. Loss: 0.9756990671157837; Accuracy: 0.0; Time taken (s): 38.87397360801697
Iteration 200 of epoch 0 complete. Loss: 0.4740241765975952; Accuracy: 1.0; Time taken (s): 38.74835205078125
Iteration 300 of epoch 0 complete. Loss: 1.155455231666565; Accuracy: 0.0; Time taken (s): 38.90450119972229
