In [3]:
import sqlite3 as lite
import pandas as pd
import numpy as np
from pandas.io.sql import read_sql
import os
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

--2019-10-02 14:42:06--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2019-10-02 14:42:28 (3.65 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [5]:
!tar -xzf aclImdb_v1.tar.gz

In [6]:
# define the directory paths of the imdb reviews
train_neg = 'aclImdb/train/neg'
train_pos = 'aclImdb/train/pos'

test_neg = 'aclImdb/test/neg'
test_pos = 'aclImdb/test/pos'

In [9]:
# for another implementation see https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
from sklearn.utils import shuffle

def load_directory_data(directory):
  data = {}
  data["review"] = []
  data["sentiment"] = []
  
  float_dict ={'pos': 1, 'neg': 0}
  
  for file_path in os.listdir(directory):
    with open(os.path.join(directory, file_path), "r") as f:
      data["review"].append(f.read())
      
                               
          #re.match('.*\/([^-]*)', file_path.group(1))
  data["sentiment"] = re.match('.*\/([^-]*)', directory).group(1)    
  data = pd.DataFrame.from_dict(data)
  data["sentiment"]= data["sentiment"].map(float_dict)
  return data



In [10]:
# laod training and test data using a shuffle function

training_data = pd.concat([load_directory_data(train_neg), load_directory_data(train_pos)])
training_data = shuffle(training_data)

test_data = pd.concat([load_directory_data(test_neg), load_directory_data(test_pos)])
test_data = shuffle(test_data)
print(f'Lenght of training data is {len(training_data):,} and test data is {len(test_data):,}')

Lenght of training data is 25,000 and test data is 25,000


In [11]:
# now use an iterator class that we will feed into python for training
# Next step is define a class that takes the text and labels, tokenizes the text 
# using the bert tokenizer, converts tokens to ids, pads the sentences to make sure they are the same
# size as the model allows; if they are longer it trims them else it pads them with 0.
# finallly feeds themn to the classifier.

from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_seq_length = 256

class Data_Processing(object):
  
  def __init__(self, text_column, label_column):
    
    # define the text column from the dataframe
    self.text_column = text_column.tolist()
    
    #self.label_column = pd.Categorical(pd.factorize(label_column)[0])
    
    # define the label column and transform it to list
    self.label_column = label_column.tolist()
    
# iter method to get each element at the time and tokenize it using bert        
  
  def __getitem__(self, index):
    
    tokenized_text = tokenizer.tokenize(self.text_column[index])
    
    # Account for [CLS] and [SEP] with "- 2"
    
    # check for the sequence lenght taking into consideration the 
    # fact that we need to include the SEP special token and the CLS special
    # tokens. 
    if len(tokenized_text) > max_seq_length - 2:
      tokenized_text = tokenized_text[0:(max_seq_length - 2)]
    
    # We add the CLS token at the beginning of the tokenized sequence and the 
    # SEP token at the end.
    tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]
    
    # convert the inputs to ids (dict looking)
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

    # We define the size of the input mask to correspon to the lenght of the inputs
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    
    
    #attention_masks.append(seq_mask) 

    #input_mask = [1] * len(input_ids)
    
    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    
    # 
    input_ids += padding
    
    #input_mask += padding

    attention_masks = [1 if x>0 else 0 for x in input_ids] 
       
    assert len(input_ids) == max_seq_length
    assert len(attention_masks) == max_seq_length
       
    #print(ids_review)
    input_ids = torch.tensor(input_ids)
        
    labels = self.label_column[index] 
    
    #list_of_labels = [torch.from_numpy(np.array(labels)).squeeze(0)]
    list_of_labels = torch.tensor(labels)
    attention_masks = torch.tensor(attention_masks)
    return input_ids, list_of_labels, attention_masks
  
  def __len__(self):
        return len(self.text_column)

100%|██████████| 231508/231508 [00:00<00:00, 2773559.70B/s]


In [12]:
# reference for dataloaders https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
import torch
batch_size = 8

# create a class to process the traininga and test data
training_data = Data_Processing(training_data['review'].iloc[0:7000], training_data['sentiment'].iloc[0:7000])

test_data =  Data_Processing(test_data['review'].iloc[0:7000], test_data['sentiment'].iloc[0:7000])

# use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=10),
                   'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=10)
                   }

dataset_sizes = {'train':len(training_data),
                'val':len(test_data)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [13]:
# define the model
# Taken from Huggin Face implementation available here: 

class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier. Default = 2.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_labels].
    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    num_labels = 2
    model = BertForSequenceClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, num_labels=2, output_attentions = False):
      
      super(BertForSequenceClassification, self).__init__()
      #super(BertForMultiLabelSequenceClassification, self).__init__(config)

      #super(BertForSequenceClassification, self).__init__()
      #self.output_attentions = output_attentions
      self.num_labels = num_labels
      self.bert = BertModel.from_pretrained('bert-base-uncased')
      self.dropout= nn.Dropout(config.hidden_dropout_prob)
      self.classifier = nn.Linear(config.hidden_size, num_labels)
      nn.init.xavier_normal_(self.classifier.weight)
    
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
      
      _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
      
      pooled_output = self.dropout(pooled_output)
      
      logits = self.classifier(pooled_output)

      return logits
    
    def freeze_bert_encoder(self):
      for param in self.bert.parameters():
        param.requires_grad = False
      
    def unfreeze_bert_encoder(self):
      for param in self.bert.parameters():
        
        param.requires_grad = True

In [14]:
from transformers import BertConfig
#from pytorch_transformers import BertForSequenceClassification
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

config = BertConfig(num_labels=2)

model = BertForSequenceClassification(config=config, num_labels=2)
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

100%|██████████| 313/313 [00:00<00:00, 171812.22B/s]
100%|██████████| 440473133/440473133 [01:37<00:00, 4517376.73B/s]


In [16]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
# define the metrics to evaluate
def log_metrics(y_pred, y_true):
    print('Accuracy:', accuracy_score(y_true,y_pred))
    #print('MCC:', matthews_corrcoef(y_true,y_pred))
    print('AUC score:', roc_auc_score(y_true, y_pred))
    
    # to do implement F1 score in pitchfork

In [17]:
## to do 
import torch.nn.functional as F
#import torch.nn.functional as F
import copy 
import time
number_steps = 2
print(number_steps)

def train_model(model, criterion, optimizer, scheduler, num_epochs=3):

  best_eval_acc = 0.0
  for epoch in range(num_epochs):
    #zero the model gradients
    model.zero_grad()
    print(f'starting epoch {epoch+1} out of {num_epochs}')
    training_loss = []
    training_accuracy = []
    val_loss = []
    val_accuracy = []

    epoch_loss = 0.0
    
    # track number of batches and number of iterations
    counter = 0
    iterations = 0
    
    # Iterate over data, feeding inputs, attention masks and labels
    model.train()
    for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['train']):
      # add a counter that will register how many examples we have fed to the
      # model
      counter+= batch_size
      iterations+=1
      # move the sequences, labels and masks to the GPU
      inputs = inputs.to(device) 
      label = label.to(device)
      attention_mask = attention_mask.to(device)
      
      # feed the sequences to the model, specifying the attention mask
      outputs = model(inputs, attention_mask=attention_mask)
      
      # feed the logits returned by the model to the softmax to classify the function
      outputs = F.softmax(outputs,dim=1)
      
      # calculate the loss function, squeeze the labels so their shapes are compatible
      loss = criterion(outputs, label.squeeze(0))
      
      # divide the loss by the number of steps
      loss_reg = loss.item() / number_steps 
      
      #add the loss to the epoch loss
      epoch_loss += loss_reg
      training_loss.append(loss_reg)
       
      loss.backward()
      
      # accumulate gradients and update every 2 batches
      if (i+1) % number_steps == 0:
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
      
      # only present the information 
      if counter%800 == 0:
        # get the predictions and the true labels out of the GPU
        preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
        true1 = np.array(label.squeeze(0).cpu().data.numpy())
      
      # get the accurary score
        training_accuracy.append(accuracy_score(preds1,true1))
        print(f'current training loss is {epoch_loss/iterations} and accuracy is {np.mean(training_accuracy):,.2%}')
        
        #print(f'loss is {accuracy_score(preds1,true1):,.2%}')
        #print(f'The average accuracy is {np.mean(accuracy_):,.2%} and the current loss is {loss}')
      
      #if epoch_loss<previous_loss:
      #  print(f'saving the model with epoch loss {epoch_loss} of and accuracy of {np.mean(accuracy_):,.2%}')
      #  torch.save(model.state_dict(), 'bert_imdb.pth')
      #  torch.save(optimiser.state_dict(), 'bert_imdb_optimiser.pth')
          
    with torch.no_grad():
          model.eval()
          counter_val = 0
          iterations_val = 0
          for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['val']):
            counter_val += batch_size
            iterations_val += 1
      
            # move the sequences, labels and masks to the GPU
            inputs = inputs.to(device) 
            label = label.to(device)
            attention_mask = attention_mask.to(device)
          
          # feed the sequences to the model, specifying the attention mask
            outputs = model(inputs, attention_mask=attention_mask)
          
          # feed the logits returned by the model to the softmax to classify the function
            outputs = F.softmax(outputs,dim=1)
          
          # calculate the loss function, squeeze the labels so their shapes are compatible
            loss_eval = criterion(outputs, label.squeeze(0))
            val_loss.append(loss_eval.item())

            if counter_val % 800 == 0:
              # get the predictions and the true labels out of the GPU for validation
              preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
              true1 = np.array(label.squeeze(0).cpu().data.numpy())
      
              # get the accurary score
              val_accuracy.append(accuracy_score(preds1,true1))
              print(f'current validation loss is {np.sum(val_loss)/iterations_val} and accuracy is {np.mean(val_accuracy):,.2%}')
      

                        
    print(f'For epoch {epoch+1} training loss is {np.sum(training_loss)/iterations}, training accuracy is {np.mean(training_accuracy):,.2%}, Validation loss is {np.sum(val_loss)/iterations_val} and validation accuracy is {np.mean(val_accuracy):,.2%}')
    eval_acc = np.mean(val_accuracy)
    if eval_acc >= best_eval_acc:
      best_eval_acc = eval_acc
      print(f'saving the model with validation accuracy of {eval_acc:,.2%} ')
      torch.save(model.state_dict(), 'bert_imdb.pth')
      torch.save(optimizer_ft.state_dict(), 'bert_imdb_optimiser.pth')
    else:
      print(f'model did not improve')
      
  return model

2


In [18]:
# reference this post https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784
import torch.optim as optim 
from torch.optim import lr_scheduler

lrlast = .001
lrmain = .00001
optim1 = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

#optim1 = optim.Adam(model.parameters(), lr=0.001)#,momentum=.9)
# Observe that all parameters are being optimized
optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1) 

In [19]:
model.to(device)
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=3)

starting epoch 1 out of 3
current training loss is 0.3374501758813858 and accuracy is 50.00%
current training loss is 0.3006599934399128 and accuracy is 68.75%
current training loss is 0.27448181589444476 and accuracy is 75.00%
current training loss is 0.26478370897471903 and accuracy is 78.12%
current training loss is 0.2575100645422935 and accuracy is 80.00%
current training loss is 0.2517114018648863 and accuracy is 79.17%
current training loss is 0.24642134528074947 and accuracy is 80.36%
current training loss is 0.24260049467906356 and accuracy is 82.81%
current validation loss is 0.4007155841588974 and accuracy is 100.00%
current validation loss is 0.41173031345009803 and accuracy is 100.00%
current validation loss is 0.41504809727271397 and accuracy is 100.00%
current validation loss is 0.4193042891472578 and accuracy is 93.75%
current validation loss is 0.4190378712415695 and accuracy is 92.50%
current validation loss is 0.4202676913142204 and accuracy is 91.67%
current validat

In [37]:
# load the model weights and optimizer
model_ft1.load_state_dict(torch.load('bert_imdb.pth'))

<All keys matched successfully>

In [59]:
# define a prediction function
def prediction_example(text_sequence=None, model=None, max_seq_length=256, labels = True):
    
    tokenized_text = tokenizer.tokenize(text_sequence)
    
    # Account for [CLS] and [SEP] with "- 2"
    
    # check for the sequence lenght taking into consideration the 
    # fact that we need to include the SEP special token and the CLS special
    # tokens. 
    if len(tokenized_text) > max_seq_length - 2:
        tokenized_text = tokenized_text[0:(max_seq_length - 2)]
    
    # We add the CLS token at the beginning of the tokenized sequence and the 
    # SEP token at the end.
    tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]
    
    # convert the inputs to ids (dict looking)
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        
    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    
    input_ids += padding
    
    #input_mask += padding

    attention_masks = [1 if x>0 else 0 for x in input_ids] 
       
    assert len(input_ids) == max_seq_length
    assert len(attention_masks) == max_seq_length
       
    #print(ids_review)
    input_ids = torch.tensor(input_ids)
    
    #list_of_labels = [torch.from_numpy(np.array(labels)).squeeze(0)]
    
    attention_masks = torch.tensor(attention_masks)
    
    # set model to eval
    model.eval()
    
    inputs = input_ids.to(device) 
    attention_mask = attention_masks.to(device)
    
    prediction = model_ft1(inputs.unsqueeze(0), attention_mask=attention_mask)
    result = torch.argmax(prediction,dim=1).cpu().data.numpy()
    
    label_values = {"Positive": 1, "Negative": 0}

    if labels:
        for key, value in label_values.items():
            if result == value:
                print (key)
    
    return result
    
    

In [60]:
text_to_predict = prediction_example(text_sequence = 'One of the greatests movies in the spagethi western genre. \
Wonderful performances', model = model_ft1, max_seq_length = 256, labels = True)

Positive


torch.Size([256])

AttributeError: 'NoneType' object has no attribute 'keys'

array([1])