In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell executes, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# References
- [BERT to the rescue!](https://towardsdatascience.com/bert-to-the-rescue-17671379687f)
- [Understanding BERT Part 2: BERT Specifics](https://medium.com/dissecting-bert/dissecting-bert-part2-335ff2ed9c73)

In [2]:
import io

import pandas as pd
import numpy as np

import tqdm
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Register the Nanum fonts installed on the system and make NanumGothic the
# default family so that non-Latin (Korean) text renders in figures.
font_dirs = ['/usr/share/fonts/truetype/nanum']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
if hasattr(font_manager.fontManager, 'addfont'):
    # `createFontList` was deprecated in Matplotlib 3.2 and removed in later
    # releases; `addfont` is its replacement for registering a font by path.
    for font_file in font_files:
        font_manager.fontManager.addfont(font_file)
else:
    # Older Matplotlib without `addfont`: fall back to the legacy API.
    font_list = font_manager.createFontList(font_files)
    font_manager.fontManager.ttflist.extend(font_list)
plt.rcParams['font.family'] = 'NanumGothic'

In [3]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from torchnlp.datasets import imdb_dataset
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators
# with one shared value so that runs are repeatable.
SEED = 321
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

In [5]:
# Load both IMDB splits, shuffle each one in place with the seeded Python RNG,
# then keep only a small prefix of each to keep the experiment fast.
N_TRAIN_EXAMPLES = 1000
N_TEST_EXAMPLES = 100

train_data, test_data = imdb_dataset(train=True, test=True)
for split in (train_data, test_data):
    rn.shuffle(split)
train_data, test_data = train_data[:N_TRAIN_EXAMPLES], test_data[:N_TEST_EXAMPLES]

In [7]:
# Inspect the first (shuffled) training example as loaded from the dataset.
train_data[0]

{'text': 'This review contains spoilers for those who are not aware of the details of the true story on which this movie is based.<br /><br />The right to be presumed "Innocent until proven guilty" is a basic entitlement of anyone in a civilised society; but according to Fred Schepisi\'s partisan but sadly convincing story of a famous Australian murder trial, it was not granted to Lindy Chamberlain, accused of killing her baby. The story suggesting her innocence was unlikely (a dingo was alleged to have taken it), but those implying her guilt even more so, and there was no solid evidence against her. But the Australian public was transfixed by the possibility of her guilt, and the deeply religious Chamberlains appeared creepy when appearing in the media (and the media themselves, of course, were anything but innocent in this process). So although cleared by an initial inquest, they were later prosecuted and convicted. Although Chamberlain was eventually released, this shamefully only f

In [6]:
# Turn each list of example dicts into two parallel tuples: the review texts
# and their sentiment labels.
train_texts, train_labels = zip(*((example['text'], example['sentiment']) for example in train_data))
test_texts, test_labels = zip(*((example['text'], example['sentiment']) for example in test_data))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(1000, 1000, 100, 100)

In [8]:
# Raw text of the first training review.
train_texts[0]

'This review contains spoilers for those who are not aware of the details of the true story on which this movie is based.<br /><br />The right to be presumed "Innocent until proven guilty" is a basic entitlement of anyone in a civilised society; but according to Fred Schepisi\'s partisan but sadly convincing story of a famous Australian murder trial, it was not granted to Lindy Chamberlain, accused of killing her baby. The story suggesting her innocence was unlikely (a dingo was alleged to have taken it), but those implying her guilt even more so, and there was no solid evidence against her. But the Australian public was transfixed by the possibility of her guilt, and the deeply religious Chamberlains appeared creepy when appearing in the media (and the media themselves, of course, were anything but innocent in this process). So although cleared by an initial inquest, they were later prosecuted and convicted. Although Chamberlain was eventually released, this shamefully only followed t

In [10]:
# Sentiment label of the first training review (a string such as 'pos').
train_labels[0]

'pos'

In [12]:
# Load the tokenizer that ships with the 'bert-base-uncased' checkpoint;
# do_lower_case=True lower-cases the input text before tokenising.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [13]:
# Quick demo: words missing from the vocabulary are split into sub-word pieces
# (continuation pieces are prefixed with '##').
tokenizer.tokenize('Hi my name is Dima')

['hi', 'my', 'name', 'is', 'dim', '##a']

In [14]:
def bert_tokenize(text, max_len=512):
    """Tokenise ``text`` and wrap it in BERT's special tokens.

    Args:
        text: Raw review text.
        max_len: Maximum total sequence length including '[CLS]' and '[SEP]';
            the word pieces are truncated to ``max_len - 2`` to leave room
            for the two special tokens.

    Returns:
        A list of token strings starting with '[CLS]' and ending with '[SEP]'.
    """
    return ['[CLS]'] + tokenizer.tokenize(text)[:max_len - 2] + ['[SEP]']


train_tokens = [bert_tokenize(text) for text in train_texts]
test_tokens = [bert_tokenize(text) for text in test_texts]

# The original bare `len(...)` expression sat mid-cell, so its value was
# silently discarded; print it so the sanity check is actually visible.
print(len(train_tokens), len(test_tokens))
print(train_tokens[0])

['[CLS]', 'this', 'review', 'contains', 'spoil', '##ers', 'for', 'those', 'who', 'are', 'not', 'aware', 'of', 'the', 'details', 'of', 'the', 'true', 'story', 'on', 'which', 'this', 'movie', 'is', 'based', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'the', 'right', 'to', 'be', 'presumed', '"', 'innocent', 'until', 'proven', 'guilty', '"', 'is', 'a', 'basic', 'en', '##ti', '##tlement', 'of', 'anyone', 'in', 'a', 'civil', '##ised', 'society', ';', 'but', 'according', 'to', 'fred', 'sc', '##he', '##pis', '##i', "'", 's', 'partisan', 'but', 'sadly', 'convincing', 'story', 'of', 'a', 'famous', 'australian', 'murder', 'trial', ',', 'it', 'was', 'not', 'granted', 'to', 'lin', '##dy', 'chamberlain', ',', 'accused', 'of', 'killing', 'her', 'baby', '.', 'the', 'story', 'suggesting', 'her', 'innocence', 'was', 'unlikely', '(', 'a', 'ding', '##o', 'was', 'alleged', 'to', 'have', 'taken', 'it', ')', ',', 'but', 'those', 'implying', 'her', 'guilt', 'even', 'more', 'so', ',', 'and', 'there', 'was',

In [16]:
# Map every token sequence to vocabulary ids, then right-pad / right-truncate
# all sequences to one fixed length so they stack into a 2-D integer array.
MAX_SEQ_LEN = 512
pad_kwargs = dict(maxlen=MAX_SEQ_LEN, truncating="post", padding="post", dtype="int")

train_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens], **pad_kwargs
)
test_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens], **pad_kwargs
)

train_tokens_ids.shape, test_tokens_ids.shape

((1000, 512), (100, 512))

In [17]:
# Padded id sequence of the first training review.
train_tokens_ids[0]

array([  101,  2023,  3319,  3397, 27594,  2545,  2005,  2216,  2040,
        2024,  2025,  5204,  1997,  1996,  4751,  1997,  1996,  2995,
        2466,  2006,  2029,  2023,  3185,  2003,  2241,  1012,  1026,
        7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,  2157,
        2000,  2022, 14609,  1000,  7036,  2127, 10003,  5905,  1000,
        2003,  1037,  3937,  4372,  3775, 24007,  1997,  3087,  1999,
        1037,  2942,  5084,  2554,  1025,  2021,  2429,  2000,  5965,
        8040,  5369, 18136,  2072,  1005,  1055, 14254,  2021, 13718,
       13359,  2466,  1997,  1037,  3297,  2827,  4028,  3979,  1010,
        2009,  2001,  2025,  4379,  2000, 11409,  5149, 13904,  1010,
        5496,  1997,  4288,  2014,  3336,  1012,  1996,  2466,  9104,
        2014, 12660,  2001,  9832,  1006,  1037, 22033,  2080,  2001,
        6884,  2000,  2031,  2579,  2009,  1007,  1010,  2021,  2216,
       20242,  2014,  8056,  2130,  2062,  2061,  1010,  1998,  2045,
        2001,  2053,

In [11]:
# Encode the string sentiments as booleans: True for 'pos', False otherwise.
# The means in the displayed tuple give the share of positive reviews per split.
train_y = np.array([label == 'pos' for label in train_labels], dtype=bool)
test_y = np.array([label == 'pos' for label in test_labels], dtype=bool)
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((1000,), (100,), 0.489, 0.5)

In [19]:
# Attention masks: 1.0 where the id is a real token, 0.0 where it is padding
# (padding positions hold id 0). Computed on the whole array at once and
# converted back to nested lists of Python floats.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()

In [21]:
# Attention mask of the first training review (one float per sequence position).
print(train_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [14]:
# Baseline to beat: word 1- to 3-gram counts fed into a logistic regression
# with default settings, trained on the raw texts and string labels.
baseline_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 3)),
    LogisticRegression(),
)
baseline_model = baseline_model.fit(train_texts, train_labels)



In [15]:
# Predict sentiment labels for the held-out test texts with the baseline.
baseline_predicted = baseline_model.predict(test_texts)

In [16]:
# Per-class precision / recall / F1 of the baseline on the test subset.
print(classification_report(test_labels, baseline_predicted))

              precision    recall  f1-score   support

         neg       0.78      0.78      0.78        50
         pos       0.78      0.78      0.78        50

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.78      0.78      0.78       100



In [17]:
class BertBinaryClassifier(nn.Module):
    """Binary classifier: pre-trained BERT encoder, dropout, one sigmoid unit.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        pretrained_model_name: Name (or path) of the pre-trained BERT weights
            to load. Defaults to 'bert-base-uncased'.
    """

    def __init__(self, dropout=0.1, pretrained_model_name='bert-base-uncased'):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(pretrained_model_name)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded encoder rather than hard-coding 768,
        # so checkpoints with a different hidden size work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the predicted probability of the positive class.

        Args:
            tokens: Batch of token-id sequences passed to the BERT encoder.
            masks: Optional attention mask forwarded to BERT as
                ``attention_mask``.

        Returns:
            Tensor with one sigmoid probability per example (last dim is 1).
        """
        # Only the pooled sentence representation is used; the per-token
        # encoder output is discarded.
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        return self.sigmoid(self.linear(self.dropout(pooled_output)))

In [18]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [19]:
bert_clf = BertBinaryClassifier()
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, which raises on machines without a GPU.
bert_clf = bert_clf.to(device)

In [20]:
# Sanity-check the raw encoder on three examples and look at output shapes.
x = torch.tensor(train_tokens_ids[:3]).to(device)
# Inspection only: skip autograd so no graph / activations are kept in memory.
with torch.no_grad():
    y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 512]), torch.Size([3, 512, 768]), torch.Size([3, 768]))

In [21]:
# Run the full (still untrained) classifier on the same three examples.
# Inspection only, so autograd is disabled to avoid holding a graph in memory.
with torch.no_grad():
    y = bert_clf(x)
y.cpu().numpy()

array([[0.495686  ],
       [0.41647682],
       [0.4135548 ]], dtype=float32)

In [22]:
# Drop the references to the sanity-check tensors, then ask PyTorch to release
# its cached CUDA memory.
x = y = pooled = None
torch.cuda.empty_cache()

In [23]:
# Number of examples per mini-batch for both the training and test loaders.
BATCH_SIZE = 4
# Number of full passes over the training subset.
EPOCHS = 1

In [24]:
# Wrap the padded id matrices and the attention masks as tensors.
train_tokens_tensor, test_tokens_tensor = (
    torch.tensor(ids) for ids in (train_tokens_ids, test_tokens_ids)
)
train_masks_tensor, test_masks_tensor = (
    torch.tensor(mask) for mask in (train_masks, test_masks)
)
# Labels become float column vectors (one row per example, a single column).
train_y_tensor, test_y_tensor = (
    torch.tensor(labels.reshape(-1, 1)).float() for labels in (train_y, test_y)
)

# GPU memory currently allocated by tensors, shown in millions of bytes.
f"{torch.cuda.memory_allocated(device) / 1000000}M"

'439.065088M'

In [25]:
def _build_loader(tokens, masks, labels, sampler_cls):
    """Bundle three aligned tensors into a dataset, a sampler and a batched loader."""
    dataset = TensorDataset(tokens, masks, labels)
    sampler = sampler_cls(dataset)
    return dataset, sampler, DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)


# Training batches are drawn in random order; test batches keep dataset order.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_tokens_tensor, train_masks_tensor, train_y_tensor, RandomSampler
)
test_dataset, test_sampler, test_dataloader = _build_loader(
    test_tokens_tensor, test_masks_tensor, test_y_tensor, SequentialSampler
)

In [26]:
# Collect the named parameters of the whole classifier into one optimizer
# group. The original read them from `bert_clf.sigmoid`; nn.Sigmoid has no
# parameters, so the resulting group was empty.
param_optimizer = list(bert_clf.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [27]:
# Adam over every parameter of the classifier (encoder and head alike), with
# a small learning rate for fine-tuning.
LEARNING_RATE = 3e-6
optimizer = Adam(params=bert_clf.parameters(), lr=LEARNING_RATE)

In [28]:
# Release unused cached CUDA memory before training starts.
torch.cuda.empty_cache()

In [29]:
# Fine-tune the classifier. For every mini-batch: forward pass, binary
# cross-entropy loss, backward pass, gradient clipping, optimizer step.

# The loss module is stateless, so build it once rather than on every step.
loss_func = nn.BCELoss()
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        # The classifier ends in a sigmoid, so these are probabilities (the
        # input BCELoss expects), not raw logits.
        probas = bert_clf(token_ids, masks)
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()

        # Clear gradients left over from the previous step before backprop.
        bert_clf.zero_grad()
        batch_loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        # Replace the previous progress lines rather than appending new ones.
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        # One-based step counter against the true (integer) batch count, plus
        # the running mean loss for the epoch so far.
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

Epoch:  1
249/250.0 loss: 0.6401839823722839 


In [30]:
# Evaluate on the test subset: collect the predicted probabilities, the
# thresholded class predictions and the mean batch loss.
bert_clf.eval()
bert_predicted = []
# NOTE: despite the name these are sigmoid probabilities, not raw logits; the
# name is kept so that any later cells referring to it still work.
all_logits = []
# Build the loss module once instead of once per batch.
loss_func = nn.BCELoss()
test_loss = 0.0
with torch.no_grad():
    for batch_data in test_dataloader:
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        probas = bert_clf(token_ids, masks)
        # Accumulate the batch loss (previously computed and thrown away).
        loss = loss_func(probas, labels)
        test_loss += loss.item()

        numpy_probas = probas.cpu().numpy()
        # A probability above 0.5 counts as a positive prediction.
        bert_predicted.extend(numpy_probas[:, 0] > 0.5)
        all_logits.extend(numpy_probas[:, 0])

# Mean loss per batch; guard against an empty loader.
test_loss /= max(len(test_dataloader), 1)


In [31]:
# Fraction of test reviews predicted positive (mean of the boolean predictions).
np.mean(bert_predicted)

0.56

In [32]:
# Per-class precision / recall / F1 of the fine-tuned BERT classifier against
# the boolean test labels.
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.82      0.72      0.77        50
        True       0.75      0.84      0.79        50

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.78      0.78      0.78       100

