In [5]:
import sqlite3 as lite
import pandas as pd
import numpy as np
from pandas.io.sql import read_sql
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os


In [4]:
# Download the IMDB movie-review sentiment archive (aclImdb v1, about 80 MB) from Stanford.
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

--2019-07-26 14:34:05--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2019-07-26 14:34:38 (2.43 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [11]:
# Unpack the downloaded archive; the reviews are read from the ./aclImdb tree by the cells below.
!tar -xzf aclImdb_v1.tar.gz

 aclImdb		 fit_head.pth
 aclImdb_v1.tar.gz	 itos.pkl
'bert imdb.ipynb'	 models
 database.sqlite	'pitchfork bert.ipynb'
 data_clas_pitch.pkl	 pitchfork_classification_script.ipynb
 data_lm_pitchfork.pkl	 pitchfork_language_model.ipynb
 fine_tuned_enc.pth	 scrape_npr.ipynb
 fine_tuned.pth		 Untitled.ipynb


In [16]:

# Locations of the IMDB review folders, one (negative, positive) pair per split.
train_neg, train_pos = 'aclImdb/train/neg', 'aclImdb/train/pos'
test_neg, test_pos = 'aclImdb/test/neg', 'aclImdb/test/pos'

In [23]:
# for another implementation see https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
from sklearn.utils import shuffle
import re
def load_directory_data(directory):
    """Load every review file in ``directory`` into a labelled DataFrame.

    The sentiment label comes from the name of the directory itself:
    ``.../pos`` is mapped to 1 and ``.../neg`` to 0.

    Args:
        directory: path of a folder whose last component is ``pos`` or ``neg``.
            A trailing path separator is tolerated.

    Returns:
        pandas.DataFrame with one row per file, a ``review`` column holding the
        file contents and a ``sentiment`` column holding 0 or 1. Rows follow the
        sorted file-name order so repeated runs produce the same frame.

    Raises:
        ValueError: if the directory name is neither ``pos`` nor ``neg``.
    """
    float_dict = {'pos': 1, 'neg': 0}

    # normpath drops a trailing separator, so 'x/pos/' and 'x/pos' give the same label.
    label_name = os.path.basename(os.path.normpath(directory))
    if label_name not in float_dict:
        # Fail loudly instead of silently producing NaN labels.
        raise ValueError(
            "cannot derive a sentiment label from directory {!r}; "
            "expected its name to be one of {}".format(directory, sorted(float_dict))
        )

    reviews = []
    # os.listdir order is arbitrary; sort it for reproducibility.
    for file_name in sorted(os.listdir(directory)):
        # Decode explicitly as UTF-8 rather than relying on the platform's default encoding.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            reviews.append(f.read())

    # The scalar label is broadcast to every row.
    return pd.DataFrame({"review": reviews, "sentiment": float_dict[label_name]})

# Merge the negative and positive reviews of each split into one frame and shuffle the rows so
# that the two classes are interleaved. ignore_index=True gives every row a unique index label
# (each per-directory frame starts counting at 0), and a fixed random_state makes the shuffled
# order, and therefore any subset sliced from it later, reproducible across kernel restarts.
shuffle_seed = 42

training_data = pd.concat([load_directory_data(train_neg), load_directory_data(train_pos)], ignore_index=True)
training_data = shuffle(training_data, random_state=shuffle_seed)

test_data = pd.concat([load_directory_data(test_neg), load_directory_data(test_pos)], ignore_index=True)
test_data = shuffle(test_data, random_state=shuffle_seed)
print(f'Lenght of training data is {len(training_data):,} and test data is {len(test_data):,}')

Lenght of training data is 25,000 and test data is 25,000


In [24]:
# Now wrap the data in an indexable dataset class that we will feed into PyTorch for training.
# The next step is to define a class that takes the texts and labels, tokenizes each text
# using the BERT tokenizer, converts the tokens to ids, and makes every sequence the same length:
# sequences longer than the model allows are trimmed, shorter ones are padded with 0.
# Finally, it feeds them to the classifier.
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
# 'bert-base-uncased' has a lower-case-only vocabulary, so the tokenizer has to lower-case its
# input as well; with do_lower_case=False capitalised words do not match the vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Fixed number of token ids every review is truncated or padded to.
max_seq_length = 256

class Data_Processing(Dataset):
    """Map-style dataset that turns raw reviews into fixed-length BERT input ids.

    Each item is tokenized on access, truncated so that the special tokens still
    fit, wrapped as ``[CLS] tokens [SEP]``, converted to ids and right-padded
    with 0 up to the target length.

    Args:
        text_column: object with ``.tolist()`` (e.g. a pandas Series) holding the texts.
        label_column: object with ``.tolist()`` holding one label per text.
        bert_tokenizer: optional tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``. When omitted, the module-level
            ``tokenizer`` is looked up at item-access time.
        max_len: optional target sequence length. When omitted, the module-level
            ``max_seq_length`` is looked up at item-access time.

    Raises:
        ValueError: if the two columns do not have the same length.
    """

    CLS_TOKEN = '[CLS]'
    SEP_TOKEN = '[SEP]'
    # Id used for padding; callers can rebuild the attention mask as ``input_ids != 0``.
    PAD_ID = 0

    def __init__(self, text_column, label_column, bert_tokenizer=None, max_len=None):
        self.text_column = text_column.tolist()
        self.label_column = label_column.tolist()
        if len(self.text_column) != len(self.label_column):
            raise ValueError(
                'text_column and label_column must have the same length, got {} and {}'.format(
                    len(self.text_column), len(self.label_column)))
        self.bert_tokenizer = bert_tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(input_ids, label)`` tensors for the review at ``index``."""
        # Resolve the fallbacks lazily so the globals only need to exist once items are read.
        tok = self.bert_tokenizer if self.bert_tokenizer is not None else tokenizer
        seq_len = self.max_len if self.max_len is not None else max_seq_length

        tokenized_text = tok.tokenize(self.text_column[index])

        # Keep two slots free for [CLS] and [SEP].
        tokenized_text = tokenized_text[:max(seq_len - 2, 0)]

        # The classifier reads the pooled output at the first position, so the
        # sequence has to start with [CLS] (and end with [SEP]).
        tokenized_text = [self.CLS_TOKEN] + tokenized_text + [self.SEP_TOKEN]

        input_ids = tok.convert_tokens_to_ids(tokenized_text)

        # Pad on the right up to the fixed sequence length.
        input_ids = list(input_ids) + [self.PAD_ID] * (seq_len - len(input_ids))
        assert len(input_ids) == seq_len

        return torch.tensor(input_ids), torch.tensor(self.label_column[index])

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.text_column)

In [25]:
import torch
batch_size = 16
# Number of (already shuffled) reviews taken from each split to keep training time manageable.
subset_size = 5000

# Bind the datasets to their own names instead of overwriting the DataFrames:
# re-running this cell would otherwise try to index a dataset object by column name.
train_dataset = Data_Processing(training_data['review'].iloc[:subset_size],
                                training_data['sentiment'].iloc[:subset_size])

val_dataset = Data_Processing(test_data['review'].iloc[:subset_size],
                              test_data['sentiment'].iloc[:subset_size])

# Only the training batches need a fresh order every epoch; evaluation order is irrelevant.
dataloaders_dict = {'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                    'val': DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
                    }

dataset_sizes = {'train': len(train_dataset),
                 'val': len(val_dataset)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [26]:
# define the model
# Adapted from the Hugging Face implementation of BertForSequenceClassification (source link missing in the original notebook).

class BertForSequenceClassification(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    The encoder is always loaded from the ``bert-base-uncased`` checkpoint.
    Unlike the Hugging Face class of the same name, ``forward`` never computes
    a loss: it always returns the raw logits and the caller applies its own
    criterion.

    Params:
        `config`: a BertConfig instance; only `hidden_dropout_prob` is read from it.
            The size of the classification head is taken from the loaded encoder.
        `num_labels`: the number of classes for the classifier. Default = 2.
        `output_attentions`: accepted for signature compatibility; not used.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary. Items in the batch should
            begin with the special "CLS" token.
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length]
            with the token type indices selected in [0, 1].
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length]
            with values in [0, 1]. When omitted, it is derived from `input_ids`:
            positions holding the padding id 0 are masked out.
        `labels`: accepted for signature compatibility; not used.
    Outputs:
        The classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    model = BertForSequenceClassification(BertConfig(), num_labels=2)
    logits = model(input_ids, attention_mask=input_mask)
    ```
    """

    # Vocabulary id of [PAD] in the bert-base-uncased checkpoint loaded below.
    PAD_TOKEN_ID = 0

    def __init__(self, config, num_labels=2, output_attentions=False):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Size the head from the encoder that was actually loaded, so the two
        # cannot disagree when `config` does not describe the checkpoint.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return logits of shape [batch_size, num_labels] for `input_ids`."""
        if attention_mask is None:
            # Without a mask the encoder would attend to the zero padding.
            attention_mask = (input_ids != self.PAD_TOKEN_ID).long()

        # Keyword arguments keep the call independent of the encoder's parameter order.
        encoder_outputs = self.bert(input_ids,
                                    token_type_ids=token_type_ids,
                                    attention_mask=attention_mask)
        # Index 1 is the pooled output in the original two-value unpacking.
        pooled_output = encoder_outputs[1]

        pooled_output = self.dropout(pooled_output)

        return self.classifier(pooled_output)

    def freeze_bert_encoder(self):
        """Stop gradient updates for every encoder parameter (train the head only)."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [27]:
from pytorch_transformers import BertConfig

# A default BertConfig is handed to the wrapper, which builds the classifier around the
# pre-trained 'bert-base-uncased' encoder with a two-class head.
config = BertConfig()
model = BertForSequenceClassification(config, num_labels=2)

100%|██████████| 313/313 [00:00<00:00, 123199.81B/s]
100%|██████████| 440473133/440473133 [02:29<00:00, 2950420.77B/s]


In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
# define the metrics to evaluate
def log_metrics(y_pred, y_true):
    """Print the accuracy and the ROC AUC of ``y_pred`` against ``y_true``.

    Args:
        y_pred: array-like of predicted class labels.
        y_true: array-like of ground-truth class labels.

    The AUC is only defined when ``y_true`` contains both classes; otherwise a
    notice is printed instead of letting the metric call fail.
    """
    print('Accuracy:', accuracy_score(y_true, y_pred))
    if len(np.unique(y_true)) < 2:
        # Can happen on a small batch that holds a single class.
        print('AUC score: undefined (only one class present in y_true)')
    else:
        print('AUC score:', roc_auc_score(y_true, y_pred))

    

In [None]:
import torch.nn.functional as F
import copy 
import time

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Fine-tune ``model`` and return it with the best validation weights loaded.

    Every epoch runs a training pass and a validation pass over the
    module-level ``dataloaders_dict`` (keys ``'train'`` and ``'val'``) on the
    module-level ``device``. The weights with the lowest validation loss are
    kept in memory and also written to ``bert_model_test.pth``.

    Args:
        model: module mapping a batch of input ids to raw, unnormalised logits.
        criterion: loss taking ``(logits, labels)``; it is assumed to return the
            mean over the batch (the default for ``nn.CrossEntropyLoss``).
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object, carrying the best validation weights.
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            # Per-phase accumulators, so training numbers never leak into validation.
            loss_sum = 0.0
            n_seen = 0
            batch_preds = []
            batch_labels = []

            for inputs, label in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                label = label.to(device)

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    logits = model(inputs)
                    # The criterion gets raw logits: CrossEntropyLoss applies
                    # log-softmax itself, so a prior softmax would flatten the loss.
                    loss = criterion(logits, label)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weight the batch-mean loss by the batch size so a shorter final
                # batch does not skew the per-sample average.
                n_batch = inputs.size(0)
                loss_sum += loss.item() * n_batch
                n_seen += n_batch

                # argmax over logits equals argmax over softmax probabilities.
                batch_preds.append(torch.argmax(logits, dim=1).detach().cpu())
                batch_labels.append(label.detach().cpu())

            if is_train:
                # Step the schedule after the epoch's optimizer updates, not before them.
                scheduler.step()

            if n_seen == 0:
                print(f'{phase}: no batches, skipping metrics')
                continue

            epoch_loss = loss_sum / n_seen
            print(f'{phase} loss: {epoch_loss:.4f}')

            # Report metrics over the whole phase rather than just the last batch.
            preds = torch.cat(batch_preds).numpy()
            true = torch.cat(batch_labels).numpy()
            log_metrics(preds, true)

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
import torch.optim as optim 
from torch.optim import lr_scheduler

# Two learning rates: a small one for the pre-trained encoder and a larger one
# for the freshly initialised classification layer.
lrlast = 1e-3
lrmain = 1e-5

optim1 = optim.Adam([
    {"params": model.bert.parameters(), "lr": lrmain},
    {"params": model.classifier.parameters(), "lr": lrlast},
])

# The encoder and the classifier head are both optimized.
optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Decay every learning rate by a factor of 0.1 every 2 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1)

In [None]:
# Move the model onto the selected device, then fine-tune it for three epochs.
model = model.to(device)
model_ft1 = train_model(
    model,
    criterion,
    optimizer_ft,
    exp_lr_scheduler,
    num_epochs=3,
)

In [28]:
import numpy as np

# Scratch check, independent of the model: mean of a small list (12 / 7).
test1 = [1,1,2,3,1,3,1]
np.mean(test1)

1.7142857142857142