<b>Following the Coursera "Sentiment Analysis with BERT" tutorial</b>
https://www.coursera.org/learn/sentiment-analysis-bert/home/welcome

In [1]:
import random
from collections import defaultdict
from textwrap import wrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from matplotlib import rc
from pylab import rcParams
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

<b>Data Preprocessing</b>

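Read in the data. The CSV header line is read as an ordinary data row (it is dropped in the next cell), and the default integer index is later named `id`.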

In [2]:
# Placeholder names for the ten CSV columns: 'col1' ... 'col10'.
col_names = [f'col{i}' for i in range(1, 11)]

# header=None makes pandas treat the first line of each file as a data row;
# both files are read with the same options.
csv_options = dict(header=None, names=col_names, low_memory=False)
train = pd.read_csv("train.csv", **csv_options)
val = pd.read_csv("valid.csv", **csv_options)

In [3]:
def _keep_sentiment_and_text(raw):
    """Reduce a raw split to de-duplicated ``sentiment`` / ``text`` columns.

    Keeps 'col3' and 'col4', drops the first row (the CSV header line that was
    read as data because the files are loaded with ``header=None``), names the
    index 'id', removes duplicate rows and renames the columns.

    An explicit ``.copy()`` is taken so that the later column assignments
    operate on an independent frame instead of a slice of the raw data
    (which would trigger pandas' SettingWithCopy warning).
    """
    subset = raw[['col3', 'col4']].iloc[1:].copy()
    subset.index.names = ['id']
    subset = subset.drop_duplicates()
    subset.columns = ['sentiment', 'text']
    return subset


train = _keep_sentiment_and_text(train)
val = _keep_sentiment_and_text(val)

Create numeric labels for each class and add to train/val.

In [4]:
# Distinct sentiment values found in the training split.
possible_labels = train['sentiment'].unique()

In [5]:
# Map every sentiment value to its position in possible_labels.
label_dict = {label: position for position, label in enumerate(possible_labels)}

In [6]:
def _encode_sentiment(sentiments, mapping, split_name):
    """Translate sentiment values into their integer codes.

    ``Series.map`` is used instead of ``Series.replace``: ``replace`` leaves
    any value missing from ``mapping`` untouched (a string hiding in the label
    column) and depends on pandas downcasting the object column to integers.
    Here unknown values are reported immediately and the result is always
    an int64 column.

    Raises:
        ValueError: if ``sentiments`` contains a value absent from ``mapping``.
    """
    codes = sentiments.map(mapping)
    missing = codes.isna()
    if missing.any():
        unseen = sorted(set(sentiments[missing].astype(str)))
        raise ValueError(
            f"{split_name} contains sentiment values with no numeric label: {unseen}"
        )
    return codes.astype('int64')


train['label'] = _encode_sentiment(train.sentiment, label_dict, 'train')
val['label'] = _encode_sentiment(val.sentiment, label_dict, 'val')

In [7]:
# Tag every row with the split it belongs to.
train = train.assign(data_type='train')
val = val.assign(data_type='val')

<b>Loading Tokenizer</b>

In [8]:
# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

<b>Encoding Data</b>

In [9]:
# Report the longest text of each split, measured in characters (len of the string).
for split_name, frame in (("train", train), ("val", val)):
    print(f"Longest in {split_name}: ", frame['text'].map(len).max())

Longest in train:  587
Longest in val:  725


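Assign `max_length`, the maximum number of tokens kept per example. Note that the lengths printed above are character counts, not token counts.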

In [10]:
# Cap passed as `max_length` to tokenizer.batch_encode_plus in the next cell;
# with truncation=True there, longer inputs are cut down to this length.
max_length = 512

In [11]:
def _encode_texts(texts):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Every sequence is padded or truncated to ``max_length``, special tokens
    are added, and attention masks are returned alongside the token ids.

    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``
    flag; it belongs to the same API generation as the ``truncation=True``
    argument this notebook already uses.
    """
    return tokenizer.batch_encode_plus(
        list(texts),  # hand the tokenizer a plain list of strings
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


encoded_data_train = _encode_texts(train[train.data_type=='train'].text.values)
encoded_data_val = _encode_texts(val[val.data_type=='val'].text.values)

<b>Assign inputs</b>

In [12]:
# Token ids and attention masks come straight from the tokenizer output;
# labels are read from the rows of each dataframe tagged with that split.
train_rows = train.loc[train['data_type'] == 'train']
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows['label'].values)

val_rows = val.loc[val['data_type'] == 'val']
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows['label'].values)

<b>Assign datasets</b>

In [13]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
val_tensors = (input_ids_val, attention_masks_val, labels_val)

dataset_train = TensorDataset(*train_tensors)
dataset_val = TensorDataset(*val_tensors)

<b>Loading Pretrained BERT</b>

In [14]:
# Sequence-classification model with one output label per entry in label_dict;
# attention maps and hidden states are not requested in the outputs.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(label_dict),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<b>Creating Data Loaders</b>

In [15]:
batch_size = 32

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation is read sequentially: shuffling brings no benefit for evaluation,
# and a fixed order keeps the returned predictions aligned with the rows of
# dataset_val and identical from run to run.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size
)

<b>Set Up Optimizer and Scheduler</b>

In [16]:
# AdamW over all model parameters with a small learning rate and epsilon.
optimizer = AdamW(params=model.parameters(), lr=1e-5, eps=1e-8)

In [17]:
epochs = 4

# The schedule spans (batches per epoch) x (epochs) steps and has no warm-up phase.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_training_steps,
    num_warmup_steps=0,
)

<b>Define Performance Metrics</b>

In [18]:
def f1_score_func(pred, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Relies on ``f1_score`` from ``sklearn.metrics``, which must be imported in
    the notebook's import cell (the original import list omitted it, so calling
    this function raised ``NameError``).

    Args:
        pred: 2-D array of per-class scores; the predicted class of each row
            is the index of its largest value (arg-max over axis 1).
        labels: true class ids; any array-like is accepted and flattened.

    Returns:
        The F1 score computed with ``average='weighted'``.
    """
    preds_flat = np.argmax(pred, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [19]:
def accuracy_per_class(preds, labels, label_map=None):
    """Print, for every class present in ``labels``, how many rows were predicted correctly.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the arg-max over axis 1.
        labels: true class ids (array-like, flattened before use).
        label_map: optional ``{class name: class id}`` mapping used to print
            readable class names. Defaults to the notebook-level ``label_dict``.

    Output format per class::

        Class: <name>
        Accuracy: <correct>/<total>

    (The original format string contained a stray ``)`` and printed
    ``<correct>)/<total>``.)
    """
    if label_map is None:
        label_map = label_dict
    label_dict_inverse = {v: k for k, v in label_map.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()

    for label in np.unique(labels_flat):
        # Restrict to the rows whose true class is `label`.
        of_this_class = labels_flat == label
        y_preds = preds_flat[of_this_class]
        y_true = labels_flat[of_this_class]
        n_correct = int((y_preds == label).sum())
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {n_correct}/{len(y_true)}\n')

<b>Create Training Loop</b>

In [20]:
seed_val = 17

# Seed Python's RNG, NumPy, and PyTorch (CPU and all CUDA devices) with the same value.
# Note: this cell runs after the model and the data loaders have been created,
# so the seed only governs what happens from here on.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [21]:
# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model.to(device)

print(device)

cpu


In [22]:
def evaluate(dataloader_val):
    """Run the notebook-level ``model`` over ``dataloader_val`` without gradient tracking.

    Each batch is expected to provide, in order, the input ids, the attention
    mask and the labels; all tensors are moved to ``device`` first.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the sum of the
        per-batch losses divided by the number of batches, the logits of all
        batches concatenated along axis 0, and the labels concatenated the
        same way (both as NumPy arrays).
    """
    model.eval()

    batch_losses = []
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        moved = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(
                input_ids=moved[0],
                attention_mask=moved[1],
                labels=moved[2],
            )

        # outputs[0] is the loss, outputs[1] the logits.
        batch_losses.append(outputs[0].item())
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(moved[2].cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)

    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Fine-tuning loop: one pass over dataloader_train per epoch, with gradient
# clipping, an optimizer step and a scheduler step after every batch.
# A checkpoint is written at the end of each epoch.
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as reported by the model. The previous code
        # divided it by len(batch), which is the number of tensors in the
        # batch tuple (3), not the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    torch.save(model.state_dict(), f'final_BERT_ft_epoch{epoch}.model')

    # f-string so the epoch number is actually interpolated
    # (the old call printed the literal text "Epoch {epoch}").
    tqdm.write(f'\nEpoch {epoch}')

    # Report the mean per-batch training loss, which was accumulated but never shown.
    loss_train_avg = loss_train_total / max(len(dataloader_train), 1)
    tqdm.write(f'Training loss: {loss_train_avg}')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=552.0, style=ProgressStyle(description_widt…


Epoch {epoch}


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=552.0, style=ProgressStyle(description_widt…


Epoch {epoch}


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=552.0, style=ProgressStyle(description_widt…