In [1]:
import pandas as pd
import numpy as np
from pandas.io.sql import read_sql
import os
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

In [2]:
import transformers
transformers.__version__

'2.0.0'

In [3]:
# define the directory paths where we saved the imdb reviews
train_neg = 'datasets/aclImdb/train/neg'
train_pos = 'datasets/aclImdb/train/pos'

test_neg = 'datasets/aclImdb/test/neg'
test_pos = 'datasets/aclImdb/test/pos'

In [4]:
# for another implementation see https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
from sklearn.utils import shuffle

def load_directory_data(directory):
    data = {}
    data["review"] = []
    data["sentiment"] = []
    
    float_dict ={'pos': 1, 'neg': 0}
  
    for file_path in os.listdir(directory):
        with open(os.path.join(directory, file_path), "r") as f:
            data["review"].append(f.read())
    
    data["sentiment"] = re.match('.*\/([^-]*)', directory).group(1)    
    data = pd.DataFrame.from_dict(data)
    data["sentiment"]= data["sentiment"].map(float_dict)
    return data



In [5]:
# laod training and test data using a shuffle function
training_data = pd.concat([load_directory_data(train_neg), load_directory_data(train_pos)])
training_data = shuffle(training_data)
test_data = pd.concat([load_directory_data(test_neg), load_directory_data(test_pos)])
test_data = shuffle(test_data)
print(f'Lenght of training data is {len(training_data):,} and test data is {len(test_data):,}')

Lenght of training data is 25,000 and test data is 25,000


In [6]:
# check the data was parsed correctly
training_data.head(2)

Unnamed: 0,review,sentiment
5494,I would rather have 20 root canals than go thr...,0
3821,what can i say about this film that hasnt alre...,1


In [7]:
# now use an iterator class that we will feed into python for training
# Next step is define a class that takes the text and labels, tokenizes the text 
# using the bert tokenizer, converts tokens to ids, pads the sentences to make sure they are the same
# size as the model allows; if they are longer it trims them else it pads them with 0.
# finallly feeds themn to the classifier.

from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_seq_length = 256

class Data_Processing(object):
    def __init__(self, text_column, label_column):
        
        # define the text column from the dataframe
        self.text_column = text_column.tolist()
    
        #self.label_column = pd.Categorical(pd.factorize(label_column)[0])
    
        # define the label column and transform it to list
        self.label_column = label_column.tolist()
    
# iter method to get each element at the time and tokenize it using bert        
  
    def __getitem__(self, index):
        
        tokenized_text = tokenizer.tokenize(self.text_column[index])
        # Account for [CLS] and [SEP] with "- 2"

        # check for the sequence lenght taking into consideration the 
        # fact that we need to include the SEP special token and the CLS special
        # tokens. 
        if len(tokenized_text) > max_seq_length - 2:
            tokenized_text = tokenized_text[0:(max_seq_length - 2)]

        # We add the CLS token at the beginning of the tokenized sequence and the 
        # SEP token at the end.
        tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]

        # convert the inputs to ids (dict looking)
        input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

        # We define the size of the input mask to correspon to the lenght of the inputs
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.


        #attention_masks.append(seq_mask) 

        #input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))

        # 
        input_ids += padding

        #input_mask += padding

        attention_masks = [1 if x>0 else 0 for x in input_ids] 

        assert len(input_ids) == max_seq_length
        assert len(attention_masks) == max_seq_length

        #print(ids_review)
        input_ids = torch.tensor(input_ids)

        labels = self.label_column[index] 

        #list_of_labels = [torch.from_numpy(np.array(labels)).squeeze(0)]
        list_of_labels = torch.tensor(labels)
        attention_masks = torch.tensor(attention_masks)
        return input_ids, list_of_labels, attention_masks
  
    def __len__(self):
        return len(self.text_column)

In [8]:
# reference for dataloaders https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
import torch
batch_size = 4

# create a class to process the traininga and test data
training_data = Data_Processing(training_data['review'], training_data['sentiment'])

test_data =  Data_Processing(test_data['review'], test_data['sentiment'])

# use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=10),
                    'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=10)
                   }

dataset_sizes = {'train':len(training_data),
                 'val':len(test_data)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [9]:
example = next(iter(dataloaders_dict.get('train')))
example

[tensor([[  101, 16524,  2143,  ...,     0,     0,     0],
         [  101,  1045,  2481,  ...,     0,     0,     0],
         [  101,  6548,  3899,  ...,     0,     0,     0],
         [  101,  1045,  2481,  ...,  1055,  3145,   102]]),
 tensor([1, 0, 0, 0]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]])]

In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_seq_length = 512

In [11]:
from transformers import BertConfig
config = BertConfig(num_labels=2)

In [12]:
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

In [13]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",config=config)

In [14]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
# define the metrics to evaluate
def log_metrics(y_pred, y_true):
    print('Accuracy:', accuracy_score(y_true,y_pred))
    #print('MCC:', matthews_corrcoef(y_true,y_pred))
    print('AUC score:', roc_auc_score(y_true, y_pred))

In [16]:
import logging

logging.basicConfig(level = logging.INFO, filename ='bert_imdb_classifier_512.txt', filemode ='w', 
                   format = '%(name)s -%(levelname)s - %(message)s')

In [17]:
import torch.optim as optim 
from torch.optim import lr_scheduler, AdamW
criterion = nn.CrossEntropyLoss()


lrmain = 2e-5

optim1 = optim.AdamW(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
   ])

optimizer_ft = optim1

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1)

In [18]:
import torch.nn.functional as F
#import torch.nn.functional as F
import copy 
import time
number_steps = 16
print(number_steps)

def train_model(model, criterion, optimizer, scheduler, num_epochs=3):
    best_eval_acc = 0.0
    for epoch in range(num_epochs):
        #zero the model gradients
        model.zero_grad()
        print(f'starting epoch {epoch+1} out of {num_epochs}')
        training_loss = []
        training_accuracy = []
        val_loss = []
        val_accuracy = []
        outputs_ = []
        labels_ = []

        epoch_loss = 0.0

        # track number of batches and number of iterations
        counter = 0
        iterations = 0

        # Iterate over data, feeding inputs, attention masks and labels
        model.train()
        for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['train']):
            # add a counter that will register how many examples we have fed to the
            # model
            counter+= batch_size
            iterations+=1
            # move the sequences, labels and masks to the GPU
            inputs = inputs.to(device) 
            label = label.to(device)
            attention_mask = attention_mask.to(device)

            # feed the sequences to the model, specifying the attention mask
            loss, outputs = model(inputs, attention_mask=attention_mask, labels=label)
            outputs1 = outputs
            
            # feed the logits returned by the model to the softmax to classify the function
            outputs = F.softmax(outputs,dim=1)

            # calculate the loss function, squeeze the labels so their shapes are compatible
            loss_manual = criterion(outputs, label.squeeze(0))
            
            # divide the loss by the number of steps
            loss_reg = loss / number_steps 

            #add the loss to the epoch loss
            epoch_loss += loss_reg
            training_loss.append(loss_reg)

            loss.backward()
      
            # accumulate gradients and update every x batches
            if (i+1) % number_steps == 0:
                
                optimizer.step()                            # Now we can do an optimizer step
                model.zero_grad()                           # Reset gradients tensors
      
            # only present the information 
            if counter%1000 == 0:
                # get the predictions and the true labels out of the GPU
                preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
                true1 = np.array(label.squeeze(0).cpu().data.numpy())
         
        # get the accurary score
                training_accuracy.append(accuracy_score(preds1,true1))
            
                print(f'current training loss is {epoch_loss/iterations} and accuracy is {np.mean(training_accuracy):,.2%}')
                
        with torch.no_grad():
            model.eval()
            counter_val = 0
            iterations_val = 0
            
            for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['val']):
                counter_val += batch_size
                iterations_val += 1
      
                # move the sequences, labels and masks to the GPU
                inputs = inputs.to(device) 
                label = label.to(device)
                attention_mask = attention_mask.to(device)
          
              # feed the sequences to the model, specifying the attention mask
                loss_eval, outputs = model(inputs, attention_mask=attention_mask, labels = label)

              # feed the logits returned by the model to the softmax to classify the function
                outputs = F.softmax(outputs,dim=1)
                
              # calculate the loss function, squeeze the labels so their shapes are compatible
                loss_eval_manual = criterion(outputs, label.squeeze(0))
                val_loss.append(loss_eval)
                
                preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
                true1 = np.array(label.squeeze(0).cpu().data.numpy())
      
                # get the accurary score
                val_accuracy.append(accuracy_score(preds1,true1))

                if counter_val % 1000 == 0:
                    # get the predictions and the true labels out of the GPU for validation
                    
                    print(f'current validation loss is {np.sum(val_loss)/iterations_val} and accuracy is {np.mean(val_accuracy):,.2%}')
                              
        print(f'For epoch {epoch+1} training loss is {np.sum(training_loss)/iterations}, \
        training accuracy is {np.mean(training_accuracy):,.2%}, Validation \
        loss is {np.sum(val_loss)/iterations_val} and validation accuracy is {np.mean(val_accuracy):,.2%}')
        eval_acc = np.mean(val_accuracy)
        if eval_acc >= best_eval_acc:
            best_eval_acc = eval_acc
            print(f'saving the model with validation accuracy of {eval_acc:,.2%} ')
            torch.save(model.state_dict(), 'bert_imdb_classification_state_dict_512.pth')
            torch.save(optimizer_ft.state_dict(), 'bert_imdb_classification_optimiser_512.pth')
        else:
            print(f'model did not improve')
        
        logging.info(f'We completed epoch {epoch+1} with a training loss of {np.sum(training_loss)/iterations} \
        a training accuracy of {np.mean(training_accuracy):,.2%}, Validation \
        loss is {np.sum(val_loss)/iterations_val} and validation accuracy is {np.mean(val_accuracy):,.2%}')
      
    return model

16


In [19]:
model.to(device)
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=3)

starting epoch 1 out of 3
current training loss is 0.04403327777981758 and accuracy is 25.00%
current training loss is 0.04156672582030296 and accuracy is 50.00%
current training loss is 0.03717075660824776 and accuracy is 58.33%
current training loss is 0.03300055116415024 and accuracy is 68.75%
current training loss is 0.030756933614611626 and accuracy is 75.00%
current training loss is 0.02848183736205101 and accuracy is 75.00%
current training loss is 0.026581810787320137 and accuracy is 78.57%
current training loss is 0.02508373372256756 and accuracy is 78.12%
current training loss is 0.02409145049750805 and accuracy is 80.56%
current training loss is 0.023241417482495308 and accuracy is 77.50%
current training loss is 0.022398877888917923 and accuracy is 79.55%
current training loss is 0.021794715896248817 and accuracy is 81.25%
current training loss is 0.021110640838742256 and accuracy is 80.77%
current training loss is 0.020723877474665642 and accuracy is 80.36%
current trainin

current training loss is 0.004900208208709955 and accuracy is 100.00%
current training loss is 0.0049104331992566586 and accuracy is 98.08%
current training loss is 0.004861780907958746 and accuracy is 98.21%
current training loss is 0.0048882910050451756 and accuracy is 96.67%
current training loss is 0.004928573966026306 and accuracy is 95.31%
current training loss is 0.004909077193588018 and accuracy is 95.59%
current training loss is 0.004930620081722736 and accuracy is 95.83%
current training loss is 0.00500698946416378 and accuracy is 96.05%
current training loss is 0.005010789725929499 and accuracy is 96.25%
current training loss is 0.00502385850995779 and accuracy is 96.43%
current training loss is 0.005031574983149767 and accuracy is 95.45%
current training loss is 0.0049986992962658405 and accuracy is 95.65%
current training loss is 0.004966124892234802 and accuracy is 95.83%
current training loss is 0.004941725172102451 and accuracy is 96.00%
current validation loss is 0.190

In [20]:
#len(model_ft1[1][0])
