In [121]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from collections import defaultdict
from tqdm import tqdm
from transformers import BertTokenizer, AutoModel, AutoTokenizer, TFAutoModel, pipeline, AdamW, BertConfig, BertModel, get_linear_schedule_with_warmup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
from emoji import demojize
import re
%matplotlib inline

In [150]:
device = torch.device("cpu")
device

device(type='cpu')

In [151]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [124]:
with open("./train.data.jsonl", "r") as f:
    raw_lines_train = f.readlines()
with open("./train.label.json", "r") as f:
    raw_labels_train = f.readlines()

with open("./dev.data.jsonl", "r") as f:
    raw_lines_dev = f.readlines()

with open("./dev.label.json", "r") as f:
    raw_labels_dev = f.readlines()

In [125]:

json_lines_train = [json.loads(line) for line in raw_lines_train]
json_labels_train = [json.loads(line) for line in raw_labels_train][0]

json_lines_dev = [json.loads(line) for line in raw_lines_dev]
json_labels_dev = [json.loads(line) for line in raw_labels_dev][0]

In [126]:
def getText(json_lines, json_labels):
    X = []
    Y = np.zeros(len(json_lines))
    
    for i, lines in enumerate(json_lines):
        X.append(list(map(lambda x: "<s>" + x['text'].replace('.', '') + "</s>", lines)))
        Y[i] = 1 if json_labels[lines[0]['id_str']] == 'rumour' else 0
    return X, Y

In [127]:
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [128]:
Xtrain, Ytrain = getText(json_lines_train, json_labels_train)

In [129]:
Xdev, Ydev = getText(json_lines_dev, json_labels_dev)

In [130]:
tokenizer.tokenize(Xtrain[0][0])

['<s>',
 'How',
 'to',
 'respond',
 'to',
 'the',
 'murderous',
 'attack',
 'on',
 'Charlie',
 'Heb@@',
 'do',
 '?',
 'Every',
 'newspaper',
 'in',
 'the',
 'free',
 'world',
 'should',
 'print',
 'this',
 'HTTPURL',
 '</s>']

In [131]:
nltk.download('wordnet')
nltk.download('stopwords')
def build_corpus_and_X(X, vectorizer=None):
  corpus = []
  newX = []
  lem = WordNetLemmatizer()
  tokenizer = TweetTokenizer(preserve_case=True, strip_handles=False, reduce_len=True)

  for XArray in tqdm(X):
    newArray = []
    for sent in XArray:
      newSent = []
      for tok in tokenizer.tokenize(sent.lower()):
        if tok not in stopwords.words('english') and tok != '.':
          newSent.append(tok)
      newSent = ' '.join(newSent)
      corpus.append(newSent)
      newArray.append(newSent)
    newX.append(newArray)
  if vectorizer is not None:
    vect = vectorizer.transform(corpus)
  else:
    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(corpus)
  return vectorizer, vect, newX

[nltk_data] Downloading package wordnet to C:\Users\Isitha
[nltk_data]     Subasinghe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Isitha
[nltk_data]     Subasinghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
vectorizer, vect, newX = build_corpus_and_X(Xtrain)

100%|██████████████████████████████████████████████████████████████████████████████| 4641/4641 [03:38<00:00, 21.26it/s]


In [133]:
newX[0]

['<s> respond murderous attack charlie hebdo ? every newspaper free world print http://tco/sc2ot63f6j </s>',
 '<s> @heresy_corner @krustyallslopp jews label anyone like anti-semite campaign person / company finished </s>',
 '<s> @heresy_corner @krustyallslopp one </s>',
 '<s> @heresy_corner #imcharliehebdo </s>',
 '<s> @krustyallslopp ditto </s>',
 '<s> @grizzly_stats @tom_wein innocent muslims ought find insulting atrocity committed name , sodding cartoon </s>',
 '<s> @heresy_corner @krustyallslopp yes , becomes </s>',
 '<s> @heresy_corner @krustyallslopp insult people nothing ? people genuinely offended drawings </s>',
 '<s> @krustyallslopp @heresy_corner neither ! think little actual muslims </s>',
 '<s> @berg_han ah , like jews bye bye @krustyallslopp </s>',
 '<s> @heresy_corner also kid along benign stuff wham like river shite ! </s>',
 '<s> @berg_han @heresy_corner good point </s>',
 '<s> @heresy_corner @pjfny ? http://tco/d2qcavkf2h </s>',
 "<s> @heresy_corner @krustyallslopp or

In [134]:
class TweetDataset(Dataset):
  def __init__(self, texts, targets, tokenizer, max_len):
    self.texts = texts
    self.targets = targets
    self.tokenzer = tokenizer
    self.max_len = max_len
    
  def __len__(self):
    return len(self.texts)
  
  def __getitem__(self, item):
    review = self.texts[item]
    encoding = tokenizer.encode_plus(
        review,
        max_length=self.max_len,
        add_special_tokens=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )

    return {
        'input_ids': encoding['input_ids'][0],
        'attention_mask': encoding['attention_mask'][0],
        'targets': torch.tensor(self.targets[item], dtype=torch.long)
    }

In [135]:
def create_data_loader(X, Y, tokenizer, max_len, batch_size):
  ds = TweetDataset(
      texts = X,
      targets = Y,
      tokenizer = tokenizer,
      max_len = max_len
  )
  return DataLoader(ds, batch_size=batch_size, num_workers=0)

In [136]:
newXtrain = [ (' '.join(i))[:500] for i in newX ]

In [137]:
newXtrain[0]

'<s> respond murderous attack charlie hebdo ? every newspaper free world print http://tco/sc2ot63f6j </s> <s> @heresy_corner @krustyallslopp jews label anyone like anti-semite campaign person / company finished </s> <s> @heresy_corner @krustyallslopp one </s> <s> @heresy_corner #imcharliehebdo </s> <s> @krustyallslopp ditto </s> <s> @grizzly_stats @tom_wein innocent muslims ought find insulting atrocity committed name , sodding cartoon </s> <s> @heresy_corner @krustyallslopp yes , becomes </s> <s'

In [138]:
_, vect, newXdev = build_corpus_and_X(Xdev, vectorizer=vectorizer)

100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [00:28<00:00, 20.48it/s]


In [139]:
newXdev = [ (' '.join(i))[:500] for i in newXdev ]

In [140]:
train_ds = create_data_loader(newXtrain, Ytrain, tokenizer, 512, 4)

In [141]:
test_ds = create_data_loader(newXdev, Ydev, tokenizer, 512, 4)

In [154]:
class RumourClassifier(nn.Module):
  def __init__(self, n_classes):
    super(RumourClassifier, self).__init__()
    self.bert = AutoModel.from_pretrained("vinai/bertweet-base")
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.softmax = nn.Softmax(dim=1)
  
  def forward(self, input_ids, attention_mask):
    output = self.bert(
        input_ids=input_ids, 
        attention_mask=attention_mask
    ).pooler_output

    output = self.drop(output)
    output = self.out(output)
    return self.softmax(output)
  

In [152]:
model = RumourClassifier(2)
model = model.to(device)

In [153]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_ds) * 150

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

In [155]:
def train_epoch(
    model, 
    dl, 
    loss_fn, 
    optimizer, 
    scheduler, 
    n_examples
):
  model = model.train()
  losses = []
  correct_preds = 0

  for d in dl:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)

    output = model(input_ids, attention_mask)

    _, pred = torch.max(output, dim=1)

    loss = loss_fn(output, targets)

    correct_preds += torch.sum(pred == targets)

    del pred
    del targets
    del input_ids
    del attention_mask
    del output

    losses.append(loss.item())

    loss.backward()

    # Gradient clipping hack
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  
  return correct_preds.double() / n_examples, np.mean(losses)




In [156]:
def eval_model(model, data_loader, loss_fn, n_examples):
  model = model.eval()
  losses = []
  correct_preds = 0 

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      output = model(input_ids, attention_mask)

      _, pred = torch.max(output, dim=1)

      loss = loss_fn(output, targets)

      correct_preds += torch.sum(pred == targets)

      losses.append(loss.item())

      del pred
      del targets
      del input_ids
      del attention_mask
      del output

  return correct_preds.double() / n_examples, np.mean(losses)



In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(150)):

  print(f'Epoch {epoch + 1}/{150}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_ds,    
    loss_fn, 
    optimizer, 
    scheduler,
    len(Ytrain) 
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    test_ds,
    loss_fn,
    len(Ydev) 
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_bertweet_model_state.bin')
    best_accuracy = val_acc

  0%|                                                                                          | 0/150 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'targets': torch.tensor(self.targets[item], dtype=torch.long)


Epoch 1/150
----------


In [149]:
min(Ytrain)

0.0