## BERT to the rescue
 - Following the tutorial at https://towardsdatascience.com/bert-to-the-rescue-17671379687f

In [1]:
import sys
import warnings
import numpy as np
import random as rn
import torch
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchnlp.datasets import imdb_dataset
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from IPython.display import clear_output

In [2]:
# Silence library warnings so they do not drown out the notebook's own output.
warnings.filterwarnings("ignore")

# Single seed shared by every random number generator used in this notebook,
# so changing it in one place re-seeds Python, NumPy and PyTorch consistently.
SEED = 321
rn.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## Prepare the Data

In [3]:
# Fetch both IMDB splits, shuffle each in place (train first, then test, so the
# seeded RNG is consumed in a fixed order) and keep only a small subset.
train_data, test_data = imdb_dataset(train=True, test=True)
for split in (train_data, test_data):
    rn.shuffle(split)
train_data, test_data = train_data[:1000], test_data[:100]

In [4]:
# Separate each split into parallel tuples of review texts and sentiment labels.
train_texts, train_labels = zip(*((d['text'], d['sentiment']) for d in train_data))
test_texts, test_labels = zip(*((d['text'], d['sentiment']) for d in test_data))
# Sanity check: texts and labels must have matching lengths within each split.
tuple(map(len, (train_texts, train_labels, test_texts, test_labels)))

(1000, 1000, 100, 100)

## Baseline

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [6]:
# Bag-of-n-grams (uni- to tri-grams) + logistic regression as a reference point
# for the BERT classifier trained below.
baseline_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 3)),
    LogisticRegression(),
)
baseline_model.fit(train_texts, train_labels)

baseline_predicted = baseline_model.predict(test_texts)
print(classification_report(test_labels, baseline_predicted))

              precision    recall  f1-score   support

         neg       0.82      0.84      0.83        50
         pos       0.84      0.82      0.83        50

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100



## BERT Model
 - Load BERT model using transformers
 - Initialize model

In [7]:
# Name of the pretrained checkpoint shared by the encoder and its tokenizer.
MODEL_NAME = "bert-base-cased"
# Load the pretrained encoder first, then the tokenizer from the same
# checkpoint name, so the vocabulary is the one the encoder was trained with.
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

100%|██████████| 435779157/435779157 [00:19<00:00, 21800697.88B/s]
100%|██████████| 213450/213450 [00:00<00:00, 887369.50B/s]


In [8]:
class BertBinaryClassifier(nn.Module):
    """Binary classifier: BERT encoder -> dropout -> linear layer -> sigmoid.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        bert: Encoder module to wrap. When ``None`` the notebook-level
            ``model`` is used, which keeps ``BertBinaryClassifier()`` working
            exactly as before.
        pad_token_id: Token id that marks padding. It is used to build an
            attention mask when the caller does not supply one. The default
            of 0 matches the default padding value of ``pad_sequence``.
    """

    def __init__(self, dropout=0.1, bert=None, pad_token_id=0):
        super(BertBinaryClassifier, self).__init__()
        self.bert = model if bert is None else bert
        self.pad_token_id = pad_token_id
        # Read the encoder width from its config when available; fall back to
        # the previously hard-coded 768 otherwise.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the predicted probability of the positive class.

        Args:
            tokens: Integer tensor of token ids, one padded sequence per row.
            masks: Optional attention mask with the same shape as ``tokens``
                (non-zero = attend). When omitted, every position whose id
                differs from ``pad_token_id`` is attended to.

        Returns:
            Tensor with one sigmoid probability per input row (last dim 1).
        """
        if masks is None:
            # Previously the mask was silently dropped, so BERT attended to
            # the padding added when batching sequences of different length.
            masks = (tokens != self.pad_token_id).long()
        outputs = self.bert(tokens, attention_mask=masks)
        # Index instead of tuple-unpacking: works for plain tuples and for the
        # dict-like output objects, whose iteration yields keys, not tensors.
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [9]:
bert_clf = BertBinaryClassifier()

## Train BERT Model

In [10]:
# Training hyperparameters: mini-batch size and number of passes over the data.
BATCH_SIZE, EPOCHS = 4, 10

In [11]:
def encode_texts(texts, max_length=512):
    """Tokenize each text with the notebook-level ``tokenizer``.

    Args:
        texts: Iterable of raw strings.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        List with one tensor of token ids per text (special tokens included).
    """
    return [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_tensors="pt",
        )["input_ids"][0]  # drop the leading batch dimension of size 1
        for text in texts
    ]


def encode_labels(labels):
    """Map string labels to a float column tensor: 1.0 for "pos", else 0.0."""
    return torch.tensor([label == "pos" for label in labels]).view(-1, 1).float()


# Train and test go through exactly the same encoding path.
train_token_ids = encode_texts(train_texts)
# batch_first=True yields one padded sequence per row (same values as the
# previous ``pad_sequence(...).T``); padding uses pad_sequence's default of 0.
train_token_tensor = pad_sequence(train_token_ids, batch_first=True)
train_y_tensor = encode_labels(train_labels)

test_token_ids = encode_texts(test_texts)
test_token_tensor = pad_sequence(test_token_ids, batch_first=True)
test_y_tensor = encode_labels(test_labels)

Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (738 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1416 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1232 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (915 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (815 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (675 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (742 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (699 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (980 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (825 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (871 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (862 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (698 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1232 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (976 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

In [12]:
# Pair token tensors with their labels for each split.
train_dataset = TensorDataset(train_token_tensor, train_y_tensor)
test_dataset = TensorDataset(test_token_tensor, test_y_tensor)

# Training batches are drawn in random order; evaluation keeps the original
# order so predictions line up with ``test_labels``.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    sampler=test_sampler,
)

In [13]:
# Binary cross-entropy on the classifier's sigmoid probabilities.
criterion = nn.BCELoss()
# Adam over every parameter of the classifier (encoder and head alike).
optimizer = Adam(params=bert_clf.parameters(), lr=3e-6)

In [None]:
# ``from_pretrained`` hands back the encoder in evaluation mode, so its dropout
# layers stay disabled unless training mode is switched on explicitly.
bert_clf.train()

train_loss = 0  # stays defined even when EPOCHS == 0
for epoch in range(EPOCHS):
    train_loss = 0
    for text, cls in train_dataloader:
        optimizer.zero_grad()
        output = bert_clf(text)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Report progress once per epoch; previously the accumulated loss was
    # computed but never shown. max(..., 1) guards an empty dataloader.
    mean_loss = train_loss / max(len(train_dataloader), 1)
    print(f"epoch {epoch + 1}/{EPOCHS} - mean train loss: {mean_loss:.4f}")
    

In [None]:
def train_func(sub_train_):
    """Run one training pass over ``sub_train_`` and report loss and F1.

    NOTE(review): this function relies on ``generate_batch``, ``DEVICE`` and
    ``scheduler``, none of which are defined in the visible notebook, and it
    calls ``model(text, offsets)`` rather than ``bert_clf``. It looks like a
    leftover from a different tutorial - confirm before using it.

    Args:
        sub_train_: Dataset to train on; must support ``len()`` and be
            accepted by ``DataLoader``.

    Returns:
        Tuple ``(loss, f1)`` where ``loss`` is the summed batch loss divided
        by ``len(sub_train_)`` and ``f1`` is the F1 score of the predictions.
    """
    # Imported locally: the notebook's import cell does not bring in f1_score.
    from sklearn.metrics import f1_score

    # Train the model
    train_loss = 0
    true_batches = []
    pred_batches = []
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for text, offsets, cls in data:
        optimizer.zero_grad()
        text, offsets, cls = text.to(DEVICE), offsets.to(DEVICE), cls.to(DEVICE)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Bring tensors back to the CPU before converting: ``.numpy()`` raises
        # on CUDA tensors. Batches are collected in lists and joined once,
        # instead of re-copying the whole array with np.append every step.
        true_batches.append(cls.detach().cpu().numpy().ravel())
        pred_batches.append(output.detach().argmax(1).cpu().numpy().ravel())

    # Adjust the learning rate
    scheduler.step()

    y_true = np.concatenate(true_batches) if true_batches else np.array([])
    y_pred = np.concatenate(pred_batches) if pred_batches else np.array([])
    return train_loss / len(sub_train_), f1_score(y_true, y_pred)


In [18]:
# Debugging leftover removed: `optimizer.zero_grad??` only displayed the library
# source of the optimizer's zero_grad method and is not part of the analysis.