In [11]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from collections import defaultdict
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedShuffleSplit
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
from transformers import RobertaModel, RobertaTokenizer, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
%matplotlib inline

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
with open("./train.data.jsonl", "r") as f:
    raw_lines_train = f.readlines()
with open("./train.label.json", "r") as f:
    raw_labels_train = f.readlines()

with open("./dev.data.jsonl", "r") as f:
    raw_lines_dev = f.readlines()

with open("./dev.label.json", "r") as f:
    raw_labels_dev = f.readlines()

In [4]:
json_lines_train = [json.loads(line) for line in raw_lines_train]
json_labels_train = [json.loads(line) for line in raw_labels_train][0]

json_lines_dev = [json.loads(line) for line in raw_lines_dev]
json_labels_dev = [json.loads(line) for line in raw_labels_dev][0]

In [5]:
def getText(json_lines, json_labels):
    X = []
    Y = np.zeros(len(json_lines))
    
    for i, lines in enumerate(json_lines):
        X.append(list(map(lambda x: x['text'], lines)))
        Y[i] = 1 if json_labels[lines[0]['id_str']] == 'rumour' else 0
    return X, Y

In [6]:
Xtrain, Ytrain = getText(json_lines_train, json_labels_train)

In [7]:
Xdev, Ydev = getText(json_lines_dev, json_labels_dev)

In [8]:
Xtrain[0]

['How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j',
 "@Heresy_Corner @KrustyAllslopp \nJews label anyone they don't like as Anti-Semite and campaign until that person/company is finished.",
 '@Heresy_Corner @KrustyAllslopp \nNo one does.',
 '@Heresy_Corner #ImCharlieHebdo',
 '@KrustyAllslopp Ditto',
 '@Grizzly_Stats @tom_wein What innocent Muslims ought to find insulting is an atrocity committed in their name, not a sodding cartoon.',
 '@Heresy_Corner @KrustyAllslopp \nYes, until it becomes yours.',
 '@Heresy_Corner @KrustyAllslopp \nWhy insult people who have nothing to do with this? People are genuinely offended by such drawings.',
 '@KrustyAllslopp @Heresy_Corner \nAnd neither am I! I think this has little to do with actual Muslims.',
 "@berg_han Ah, you don't like Jews. Bye bye. @KrustyAllslopp",
 "@Heresy_Corner Also they kid you along with benign stuff then ... WHAM it's like a river of shite!",


In [12]:
def normalize(v):
    norm=np.linalg.norm(v, ord=1)
    if norm==0:
        norm=np.finfo(v.dtype).eps
    return v/norm

def normalised_child_score(data, accessor, modifier= lambda x: x, normalise=True, max_len=40):
    scores = np.zeros(max_len)
    i = 0
    
    for entry in data[1:max_len]:
        score = modifier(accessor(entry))
        scores[i] = score
        i += 1
    
    while i < max_len:
        scores[i] = 0
        i += 1
    
    if normalise==False:
        return scores
    
    
    return normalize(scores)
    
def get_feature(d, i, tweet_set):
    
    vec = np.zeros(7 + 40)
    i = 0
    x = []
    
    x.append(d['user'][i])
    x.append(d['fc'][0][i][0])
    x.append(d['rtc'][0][i][0])
    x.append(d['verified'][i])
    x.append(d['flc'][0][i][0])
    x.append(d['lc'][0][i][0])
    x.append(d['frc'][0][i][0])
    x = x + list(normalised_child_score(tweet_set, lambda x: x['user']['followers_count']))
    x = x + list(normalised_child_score(tweet_set, lambda x: x['user']['verified'], normalise=False))
    x = x + list(normalised_child_score(tweet_set, lambda x: x['favorite_count']))
    x = x + list(normalised_child_score(tweet_set, lambda x: x['retweet_count']))
    x = x + list(normalised_child_score(tweet_set, lambda x: x['user']['followers_count']))
    x = x + list(normalised_child_score(tweet_set, lambda x: x['user']['friends_count']))
    x = np.asarray(x)
    return x
    
def get_features(json_lines, json_labels):
    X = np.zeros((len(json_lines),247))
    Y = np.zeros(len(json_lines))
    
    accum = defaultdict(list)
    
    for i, tweet_set in enumerate(json_lines):
        accum['user'].append(int(tweet_set[0]['user']['id']))
        accum['favourite_count'].append(tweet_set[0]['favorite_count'])
        accum['retweet_count'].append(tweet_set[0]['retweet_count'])
        accum['verified'].append(tweet_set[0]['user']['verified'])
        accum['followers_count'].append(tweet_set[0]['user']['followers_count'])
        accum['listed_count'].append(tweet_set[0]['user']['listed_count'])
        accum['friends_count'].append(tweet_set[0]['user']['friends_count'])
    
    norm = {}
    
    norm['user']  = accum['user']
    
    fcscaler = MinMaxScaler()
    fcscaler.fit(np.asarray(accum['favourite_count']).reshape(-1, 1))
    norm['fc'] = (fcscaler.transform(np.asarray(accum['favourite_count']).reshape(-1, 1)), fcscaler)
    
    rtscaler = MinMaxScaler()
    rtscaler.fit(np.asarray(accum['retweet_count']).reshape(-1, 1))
    norm['rtc'] = (rtscaler.transform(np.asarray(accum['retweet_count']).reshape(-1, 1)), rtscaler)
    
    norm['verified'] = np.asarray(list(map(lambda x: int(x), accum['verified'])))
    
    flcscaler = MinMaxScaler()
    flcscaler.fit(np.asarray(accum['followers_count']).reshape(-1, 1))
    norm['flc'] = (flcscaler.transform(np.asarray(accum['followers_count']).reshape(-1, 1)), flcscaler)
    
    lcscaler = MinMaxScaler()
    lcscaler.fit(np.asarray(accum['listed_count']).reshape(-1, 1))
    norm['lc'] = (lcscaler.transform(np.asarray(accum['listed_count']).reshape(-1, 1)), lcscaler)
    
    frcscaler = MinMaxScaler()
    frcscaler.fit(np.asarray(accum['friends_count']).reshape(-1, 1))
    norm['frc'] = (frcscaler.transform(np.asarray(accum['friends_count']).reshape(-1, 1)), frcscaler)
    
    for i, tweet_set in enumerate(json_lines):
        X[i, :] = (get_feature(norm, i, tweet_set))
        Y[i] = 1 if json_labels[tweet_set[0]['id_str']] == 'rumour' else 0

    return X, Y
        
        

In [13]:
Xtrain, Ytrain = get_features(json_lines_train, json_labels_train)

In [14]:
Xdev, Ydev = get_features(json_lines_dev, json_labels_dev)

In [15]:
class TweetDataset(Dataset):
    def __init__(self, texts, others, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenzer = tokenizer
        self.others = others
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
  
    def __getitem__(self, item):
        review = ' '.join(tokenizer.tokenize(self.texts[item])[:470])
        encoding = tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'others': torch.tensor(self.others[item], dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.long)
        }
        

In [16]:
def create_data_loader(X, others, Y, tokenizer, max_len, batch_size):
    ds = TweetDataset(
        texts = X,
        others = others,
        targets = Y,
        tokenizer = tokenizer,
        max_len = max_len
      )
    return DataLoader(ds, batch_size=batch_size, num_workers=0)

In [17]:
def getText(json_lines, json_labels):
    X = []
    Y = np.zeros(len(json_lines))
    
    for i, lines in enumerate(json_lines):
        X.append(list(map(lambda x: x['text'], lines)))
        Y[i] = 1 if json_labels[lines[0]['id_str']] == 'rumour' else 0
    return X, Y

In [18]:
nXtrain, nYtrain = getText(json_lines_train, json_labels_train)

In [19]:
nXdev, nYdev = getText(json_lines_dev, json_labels_dev)

In [20]:
nltk.download('wordnet')
nltk.download('stopwords')
def build_corpus_and_X(X, vectorizer=None):
  corpus = []
  newX = []
  lem = WordNetLemmatizer()
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=False, reduce_len=True)

  for XArray in tqdm(X):
    newArray = []
    for sent in XArray:
      newSent = []
      for tok in tokenizer.tokenize(sent.lower()):
        if tok not in stopwords.words('english') and tok != '.':
          newSent.append(tok)
      newSent = ' '.join(newSent)
      corpus.append(newSent)
      newArray.append(newSent)
    newX.append(newArray)
  if vectorizer is not None:
    vect = vectorizer.transform(corpus)
  else:
    vectorizer = TfidfVectorizer()
    vect = vectorizer.fit_transform(corpus)
  return vectorizer, vect, newX

[nltk_data] Downloading package wordnet to C:\Users\Isitha
[nltk_data]     Subasinghe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Isitha
[nltk_data]     Subasinghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
vectorizer, vect, newX = build_corpus_and_X(nXtrain)

100%|██████████████████████████████████████████████████████████████████████████████| 4641/4641 [03:41<00:00, 20.92it/s]


In [22]:
_, vect, newXdev = build_corpus_and_X(nXdev, vectorizer=vectorizer)

100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [00:28<00:00, 20.14it/s]


In [23]:
class RumourClassifier(nn.Module):
  def __init__(self, n_classes):
    super(RumourClassifier, self).__init__()
    self.bert = BertModel.from_pretrained("bert-base-uncased")
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.softmax = nn.Softmax(dim=1)
  
  def forward(self, input_ids, attention_mask):
    output = self.bert(
        input_ids=input_ids, 
        attention_mask=attention_mask
    ).pooler_output

    output = self.drop(output)
    output = self.out(output)
    return self.softmax(output)

In [30]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")     

In [25]:
newX_ = [' '.join(a) for a in newX]

In [26]:
newXdev_ = [' '.join(a) for a in newXdev]

In [27]:
newX_[0]

"respond murderous attack charlie hebdo ? every newspaper free world print http://t.co/sc2ot63f6j @heresy_corner @krustyallslopp jews label anyone like anti-semite campaign person / company finished @heresy_corner @krustyallslopp one @heresy_corner #imcharliehebdo @krustyallslopp ditto @grizzly_stats @tom_wein innocent muslims ought find insulting atrocity committed name , sodding cartoon @heresy_corner @krustyallslopp yes , becomes @heresy_corner @krustyallslopp insult people nothing ? people genuinely offended drawings @krustyallslopp @heresy_corner neither ! think little actual muslims @berg_han ah , like jews bye bye @krustyallslopp @heresy_corner also kid along benign stuff ... wham like river shite ! @berg_han @heresy_corner good point @heresy_corner @pjfny ? http://t.co/d2qcavkf2h @heresy_corner @krustyallslopp organised jewry , mean , actual people otherwise i'd hating ancestors @theedwardian81 @heresy_corner ... : http://t.co/lmyxpmzw3v @heresy_corner @berg_han explored @berg_

In [44]:
train_ds = create_data_loader(newX_, Xtrain, Ytrain, tokenizer, 512, 4)

In [45]:
test_ds = create_data_loader(newXdev_, Xdev, Ydev, tokenizer, 512, 4)

In [33]:
model = RumourClassifier(2)
model = model.to(device)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))




In [188]:
data = next(iter(train_ds))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'targets': torch.tensor(self.targets[item], dtype=torch.long)


In [189]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
targets = data['targets'].to(device)
output = model(input_ids, attention_mask)
print(output)
print(output.shape)
print(input_ids.shape)
print(attention_mask.shape)
print(targets.shape)

tensor([[0.5568, 0.4432],
        [0.6460, 0.3540]], grad_fn=<SoftmaxBackward>)
torch.Size([2, 2])
torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2])


In [190]:
del input_ids
del attention_mask
del targets
del output
import gc
gc.collect()

885

In [34]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_ds) * 150

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

In [50]:
def train_epoch(
    model, 
    dl, 
    loss_fn, 
    optimizer, 
    scheduler, 
    n_examples
):
  model = model.train()
  losses = []
  correct_preds = 0

  for i, d in enumerate(dl):
    if i%100 == 0:
      print(i)
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)

    output = model(input_ids, attention_mask)

    _, pred = torch.max(output, dim=1)
    loss = loss_fn(output, targets)

    correct_preds += torch.sum(pred == targets)

    del pred
    del targets
    del input_ids
    del attention_mask
    del output

    losses.append(loss.item())

    loss.backward()

    # Gradient clipping hack
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  
  return correct_preds.double() / n_examples, np.mean(losses)


In [51]:
def eval_model(model, data_loader, loss_fn, n_examples):
  model = model.eval()
  losses = []
  correct_preds = 0 

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      output = model(input_ids, attention_mask)

      _, pred = torch.max(output, dim=1)

      loss = loss_fn(output, targets)

      correct_preds += torch.sum(pred == targets)

      losses.append(loss.item())

      del pred
      del targets
      del input_ids
      del attention_mask
      del output

  return correct_preds.double() / n_examples, np.mean(losses)

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(150)):

  print(f'Epoch {epoch + 1}/{150}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_ds,    
    loss_fn, 
    optimizer, 
    scheduler,
    len(Ytrain) 
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    test_ds,
    loss_fn,
    len(Ydev) 
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state1.bin')
    best_accuracy = val_acc

  'targets': torch.tensor(self.targets[item], dtype=torch.long)


Epoch 1/150
----------
0
100
200
300
400
500
600
700
800
900
1000
1100
Train loss 0.644572954024866 accuracy 0.6589097177332471
Val   loss 0.6286909202049519 accuracy 0.6775862068965517



  1%|▌                                                                             | 1/150 [06:10<15:20:42, 370.75s/it]

Epoch 2/150
----------
0
100
200
300
400
500
600
700
800
900
1000
1100
Train loss 0.6431218982256249 accuracy 0.658694246929541


  1%|█                                                                             | 2/150 [12:21<15:14:44, 370.84s/it]

Val   loss 0.6287589603456958 accuracy 0.6775862068965517

Epoch 3/150
----------
0
100
200
300


In [38]:
device

device(type='cuda')