In [1]:
import pandas as pd
import numpy as np
import os
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

In [2]:
# laod training and test data using a shuffle function
from sklearn.utils import shuffle

training_data = pd.read_csv('../data/fine_tune_training_dataset.csv')
training_data = shuffle(training_data)
test_data = pd.read_csv('../data/fine_tune_test_dataset.csv')
test_data = shuffle(test_data)
print(f'Lenght of training data is {len(training_data):,} and test data is {len(test_data):,}')

Lenght of training data is 5,479 and test data is 625


In [3]:
test_data['len']=test_data['sentence_2'].apply(lambda x: len(x))
test_data[test_data['len']==5]

Unnamed: 0,sentence_1,sentence_2,label,len
182,There has not been 1 real tear out of #Shelli ...,#bb17,0,5
551,The Latest: More Homes Razed by Northern Calif...,phew!,1,5
494,Patience Jonathan,KS315,1,5
576,@rachelcaine,In...,1,5
438,am boy @Crash,.fuck,1,5
367,Full read,Here:,1,5
543,Good tips!,_????,1,5
22,Hiroshima survivors fight nuclear industry in ...,video,0,5
559,MH370: debris found on reunion island. ??,MH370,1,5
450,New item:,THATS,1,5


In [4]:
# we need to define a function to go over the tokens returned by the BERT tokenizer. We will skip special 
# characters and follow the proportion of masked tokens,
# 
# refer this file for the mask_token function https://github.com/google-research/bert/blob/master/create_pretraining_data.py

def mask_tokens(inputs):
    '''inputs: This the pair of sentences tokenized by the BERT tokenizer. This script returns 
    a the list of inputs and the masked labels. '''
    
    #copy the inputs
    labels = inputs.copy()
    
    # create a list of the indexes we are going to modify with a masked label
    index_to_replace = []
    # create a lost with the labels we are going to predict
    labels_filtered = []

    # create index of observations to replace, we skip masking the CLS and SEP special tokens
    for (index, element) in enumerate(labels):
        if element=='[CLS]'or element== '[SEP]':
            continue
        else:
            index_to_replace.append(index)
            labels_filtered.append(element)
        
    # get a list of the indices we are masking (85 percent mask, 10 percent random, 10 percent no change))
    # masked_indices= np.random.binomial(size=len(index_to_replace), n=2, p= 0.15)
    masked_indices = np.random.choice([0, 1], size=len(index_to_replace), p=[.85, .15])
    
    # create a list to store the labels of the masked tokens
    labels_masked_token = []
    # the masked indices contains the indices that we are going to mask, we iterate to get the index position 
    # of the indices we need to replace
    for index_position, index_bool in zip(index_to_replace, list(masked_indices)):
        if index_bool==1:
            labels_masked_token.append(index_position)
        else:
            continue 
    
    # once we have that letter we will replace those tokens with the masked token
    indices_replaced_mask_tokens = np.random.binomial(size=len(labels_masked_token), n=1, p= 0.8)
    
    # for the remaining 20 percent tokens half of the time we will replace them with a random token and the other
    # half we leave the original token in place.
    indices_replaced_mask_rand = np.random.binomial(size=len(labels_masked_token), n=1, p= 0.5) & ~indices_replaced_mask_tokens 
    
    # We now use our copy of the tokends, convert that into the ids of the vocabulary and start the replacement
    # process
    labels=tokenizer.convert_tokens_to_ids(labels)
    
    # We first replace the tokens masked with the id of the special [MASK] token in our vocabulary which 
    # corresponds to 100
    for boolean, true_index in zip(indices_replaced_mask_tokens, labels_masked_token):
        if boolean==1:
            labels[true_index]=100
    
    # For the tokens that need to be replaced with a random token in the range of the len of the vocabulary
    for boolean, true_index in zip(indices_replaced_mask_rand, labels_masked_token):
        if boolean==1:
            labels[true_index]= np.random.randint(0,len(tokenizer.vocab), size=1)[0]
    
    # create a list of the position masked tokens
    position_masked_tokens= []
    
    # we finally parse everything and add -1 to the tokens we did not mask.
    for index,token in enumerate(labels):
        if index in labels_masked_token:
            position_masked_tokens.append(tokenizer.convert_tokens_to_ids(inputs[index]))
        else:
            position_masked_tokens.append(-1)
    
    return labels, position_masked_tokens

In [5]:
# create our dataloader class
# create class to load a review, split into sentences, check if the sentences are in line with max lenght
# reference this post https://github.com/ceshine/pytorch-pretrained-BERT/blob/master/notebooks/Next%20Sentence%20Prediction.ipynb?source=post_page-----1dbfe6a66f1d----------------------
import torch
from transformers import BertTokenizer, BertModel, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_seq_length = 128

class Data_Processing(object):
    def __init__(self, sentence_1, sentence_2, label):
        self.sentence_1 = sentence_1.tolist()
        self.sentence_2 = sentence_2.tolist()
        self.label = label.tolist()
        
    # define the text column from the dataframe
        assert isinstance(self.sentence_1, list), 'Argument of wrong type!'
        assert isinstance(self.sentence_2, list), 'Argument of wrong type!'

        

    def __len__(self):
        return len(self.sentence_1) 

# iter method to get each element at the time and tokenize it using bert        

    def __getitem__(self, index):
        sentence_1_tokenized_text = tokenizer.tokenize(self.sentence_1[index])
        sentence_2_tokenized_text = tokenizer.tokenize(self.sentence_2[index])
        
        while True:
            total_length = len(sentence_1_tokenized_text) + len(sentence_2_tokenized_text)

            if total_length <= (max_seq_length-3):
                break
            if len(sentence_1_tokenized_text) > len(sentence_2_tokenized_text):
                sentence_1_tokenized_text.pop()
            else:
                sentence_2_tokenized_text.pop()

#sentence_1_tokenized_tex_ids = tokenizer.convert_tokens_to_ids(sentence_1_tokenized_tex_ids)
#sentence_2_tokenized_tex_ids = tokenizer.convert_tokens_to_ids(sentence_2_tokenized_tex_ids)

        inputs = ["[CLS]"] + sentence_1_tokenized_text + ["[SEP]"]
        inputs +=  sentence_2_tokenized_text + ["[SEP]"]
        # 
        segment_ids = [0] * (len(sentence_1_tokenized_text)+2)
        segment_ids += [1] * (len(sentence_2_tokenized_text) + 1)

        # get the inputs id's (with the masked tokens) as well as the labels of the masked tokens
        input_ids, masked_lm_labels = mask_tokens(inputs)

         # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        padding_masked = [-1] * (max_seq_length - len(input_ids))

        #add padding in case it is necessary
        input_ids += padding
        segment_ids += padding
        attention_masks = [1 if x>0 else 0 for x in input_ids]
        masked_lm_labels += padding_masked

        # Checkt the lenghts of the sequences
        assert len(input_ids) == max_seq_length
        assert inputs[0] == "[CLS]"
        assert input_ids[0]== 101
        assert len(attention_masks) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(masked_lm_labels) == max_seq_length

        # convert to torch tensor
        input_ids = torch.tensor(input_ids)
        segment_ids = torch.tensor(segment_ids)
        attention_masks = torch.tensor(attention_masks)
        masked_lm_labels = torch.tensor(masked_lm_labels)

        # get sequence label
        sequence_label = self.label[index]

        return input_ids, segment_ids, attention_masks, masked_lm_labels, sequence_label
  

In [6]:
# reference for dataloaders https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
%time
batch_size = 12

# create a class to process the traininga and test data
training_data = Data_Processing(training_data['sentence_1'],
                                training_data['sentence_2'],
                                training_data['label'])

test_data =  Data_Processing(test_data['sentence_1'], 
                             test_data['sentence_2'], 
                             test_data['label'])




CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [8]:
%time
# use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                   }

dataset_sizes = {'train':len(training_data),
                'val':len(test_data)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 6.68 µs
cuda:0


In [9]:
example = next(iter(dataloaders_dict.get('train')))
example

[tensor([[  101, 26539,   100,  ...,     0,     0,     0],
         [  101,  1030,  2032,  ...,     0,     0,     0],
         [  101,  8902, 25311,  ...,     0,     0,     0],
         ...,
         [  101,  2066,  2339,  ...,     0,     0,     0],
         [  101, 21318, 13149,  ...,     0,     0,     0],
         [  101,   100, 24459,  ...,     0,     0,     0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[  -1,   -1, 5488,  ...,   -1,   -1,   -1],
         [  -1,   -1,   -1,  ...,   -1,   -1,   -1],
         [  -1,   -1,   -1,  ...,   -1,   -1,   -1],
         ...,
     

In [10]:
from transformers import BertConfig
config = BertConfig()
from transformers import BertForPreTraining
model = BertForPreTraining.from_pretrained(
    "bert-base-uncased",config=config)

In [11]:
import torch.optim as optim 
from torch.optim import lr_scheduler, AdamW
#from transformers import WarmupLinearSchedule

lrlast = .001
#lrmain = .00001
lrmain = 2e-5


optim1 = optim.AdamW(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
   ])

#optim1 = optim.Adam(model.parameters(), lr=0.001)#,momentum=.9)
# Observe that all parameters are being optimized
optimizer_ft = optim1

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1) 

In [12]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score
import torch.nn.functional as F
number_steps= 4


def train_model(model, optimizer, scheduler, num_epochs=3):
    best_eval_acc = 0.0
    for epoch in range(num_epochs):
        #zero the model gradients
        model.zero_grad()
        print(f'starting epoch {epoch+1} out of {num_epochs}')
        training_loss = []
        training_accuracy_next_sentence = []
        training_accuracy_vocab = []
        val_loss = []
        val_accuracy_next_sentence = []
        val_accuracy_vocab = []

        epoch_loss = 0.0

        # track number of batches and number of iterations
        counter = 0
        iterations = 0
        counter_val = 0
        iterations_val = 0

        # Iterate over data, feeding inputs, attention masks and labels
        model.train()
        for i, (inputs, segment_ids, attention_masks, masked_lm_labels, sequence_label) in enumerate(dataloaders_dict['train']):
            # add a counter that will register how many examples we have fed to thee model
            counter += batch_size
            iterations += 1
            # move the sequences, labels and masks to the GPU
            inputs = inputs.to(device) 
              #print(inputs)
            segment_ids = segment_ids.to(device)
            #print(segment_ids)
            attention_masks = attention_masks.to(device)
            #print(attention_mask)

            masked_lm_labels = masked_lm_labels.to(device)
            sequence_label = sequence_label.to(device)

            # feed the sequences to the model, specifying the attention mask
            outputs = model(inputs, token_type_ids = segment_ids, attention_mask= attention_masks, masked_lm_labels=masked_lm_labels,
                         next_sentence_label=sequence_label)

            # feed the logits returned by the model to the softmax to classify the function
            prediction_scores = outputs[1]
            seq_relationship_scores = outputs[2]

            # TO DO Accuracy of model

            #
            loss_reg = outputs[0]/number_steps

            #add the loss to the epoch loss
            epoch_loss += loss_reg
            training_loss.append(loss_reg)

            outputs[0].backward()

            # accumulate gradients and update every 4 batches
            if (i+1) % number_steps == 0:
                optimizer.step()                            # Now we can do an optimizer step
                model.zero_grad()

          # only present the information 
            if counter% 10 == 0:
            # get the predictions and the true labels out of the GPU
                preds_next_sentence = torch.argmax(seq_relationship_scores,dim=1).cpu().data.numpy()
                true_next_sentence = np.array(sequence_label.squeeze(0).cpu().data.numpy())

            #print('here', true_next_sentence.shape, preds_next_sentence.shape)

                accuracy_next_sentence = accuracy_score(true_next_sentence,preds_next_sentence)
                training_accuracy_next_sentence.append(accuracy_next_sentence)


            # repeat with softmax for vocabulary
            #masked_predictions = F.softmax(prediction_scores,dim=1)
            #masked_predictions = torch.argmax(masked_predictions, 1).cpu().data.numpy()

            #true_vocab = np.array(masked_lm_labels.cpu().data.numpy())
            #print('now here', masked_predictions.shape, true_vocab.shape)
            #print(masked_predictions)
            #accuracy_vocab = accuracy_score(masked_predictions,true_vocab)
            #accuracy_vocab = sum(masked_predictions == true_vocab.squeeze(1))/len(true_vocab)
                print(f'current training loss is {np.sum(training_loss)/iterations} \
                and next sent accuracy is {np.mean(training_accuracy_next_sentence):,.2%}')


        with torch.no_grad():
            model.eval()
            counter_val = 0
            iterations_val = 0
            
            for i, (inputs, segment_ids, attention_masks, masked_lm_labels, sequence_label) in enumerate(dataloaders_dict['val']):
                counter_val += batch_size
                iterations_val += 1

                inputs = inputs.to(device) 
                #print(inputs)
                segment_ids = segment_ids.to(device)
                #print(segment_ids)
                attention_masks = attention_masks.to(device)
                #print(attention_mask)

                masked_lm_labels = masked_lm_labels.to(device)
                sequence_label = sequence_label.to(device)

                # feed the sequences to the model, specifying the attention mask
                outputs = model(inputs, token_type_ids = segment_ids, attention_mask= attention_masks, 
                                masked_lm_labels = masked_lm_labels,
                                next_sentence_label = sequence_label)

                prediction_scores = outputs[1]
                seq_relationship_scores = outputs[2]

                # calculate the loss function, squeeze the labels so their shapes are compatible
                loss_eval = outputs[0] 
                val_loss.append(loss_eval)

                # get the predictions and the true labels out of the GPU for validation
                preds_next_sentence = torch.argmax(seq_relationship_scores,dim=1).cpu().data.numpy()
                print(preds_next_sentence)
                true_next_sentence = np.array(sequence_label.squeeze(0).cpu().data.numpy())
                accuracy_next_sentence = accuracy_score(true_next_sentence,preds_next_sentence)

                val_accuracy_next_sentence.append(accuracy_next_sentence)

                if counter_val % 100 == 0:

                    print(f'current val loss is {np.sum(val_loss)/iterations_val} \
                    and next sent accuracy val is {np.mean(val_accuracy_next_sentence):,.2%}')
              
        print(f'For epoch {epoch+1} training loss is {np.sum(training_loss)/iterations}, \
        training accuracy is {np.mean(training_accuracy_next_sentence):,.2%}, Validation loss is {np.sum(val_loss)/iterations_val} \
        and validation accuracy is {np.mean(val_accuracy_next_sentence):,.2%}')

        eval_acc = np.mean(val_accuracy_next_sentence)
        if eval_acc >= best_eval_acc:
            best_eval_acc = eval_acc
            print(f'saving the model with validation accuracy of {eval_acc:,.2%} ')
            torch.save(model.state_dict(), '../data/bert_tweet_language_finetune_model.pth')
            #torch.save(model.state_dict(), 'bert_imdb_pretrain.pth')
            #torch.save(optimizer_ft.state_dict(), 'bert_imdb_optimiser_pretrain.pth')
            print('now saving as recommended by huggingface')
            #model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            #output_model_file = os.path.join("bert_pretrained_model.bin")
            #torch.save(model_to_save.state_dict(), output_model_file)

            # model loading weights

            model.load_state_dict(torch.load('../data/bert_tweet_language_finetune_model.pth'))
            #optimizer.load_state_dict(torch.load('bert_imdb_optimiser_pretrain.pth'))

        else:
            print(f'model did not improve')

      # load     
          #if epoch_loss<previous_loss:
          #  print(f'saving the model with epoch loss {epoch_loss} of and accuracy of {np.mean(accuracy_):,.2%}')
          #  torch.save(model.state_dict(), 'bert_imdb.pth')
          #  torch.save(optimiser.state_dict(), 'bert_imdb_optimiser.pth')
    return model

In [13]:
model.to(device)
model_ft1 = train_model(model, optimizer_ft, exp_lr_scheduler,
                       num_epochs=3)

starting epoch 1 out of 3
current training loss is 3.5915982723236084                 and next sent accuracy is 66.67%
current training loss is 3.1834590435028076                 and next sent accuracy is 58.33%
current training loss is 2.8794779777526855                 and next sent accuracy is 58.33%
current training loss is 2.653743267059326                 and next sent accuracy is 54.17%
current training loss is 2.490097999572754                 and next sent accuracy is 51.67%
current training loss is 2.3602707386016846                 and next sent accuracy is 52.78%
current training loss is 2.2567176818847656                 and next sent accuracy is 52.38%
current training loss is 2.165539264678955                 and next sent accuracy is 55.21%
current training loss is 2.0949466228485107                 and next sent accuracy is 56.48%
current training loss is 2.0257456302642822                 and next sent accuracy is 56.67%
current training loss is 1.9856339693069458    

current training loss is 1.2576628923416138                 and next sent accuracy is 62.31%
current training loss is 1.2552610635757446                 and next sent accuracy is 62.64%
[1 0 1 1 0 0 0 0 1 1 0 1]
[1 0 0 0 1 0 0 1 0 1 0 1]
[0 1 0 1 1 0 0 0 1 1 1 1]
[1 0 1 0 0 0 1 1 0 0 0 0]
[1 0 1 0 0 1 0 1 0 1 1 1]
[1 1 0 0 1 0 1 1 1 1 0 1]
[1 1 1 1 1 1 0 0 1 0 1 1]
[0 1 1 0 1 1 1 0 0 1 0 1]
[1 0 1 1 0 1 1 1 1 0 1 1]
[0 0 1 1 1 1 1 0 1 0 0 0]
[0 1 1 0 1 1 0 0 1 1 1 1]
[0 1 0 0 0 1 1 0 0 0 0 1]
[1 0 0 1 1 1 0 1 1 0 0 1]
[1 1 0 1 0 0 0 1 1 0 1 0]
[1 1 1 1 1 0 0 0 1 1 0 0]
[1 0 1 1 0 1 0 1 1 1 1 1]
[1 1 1 1 1 0 1 1 0 1 0 1]
[1 1 1 1 0 1 0 1 1 0 1 1]
[0 0 0 0 1 1 1 0 0 0 1 1]
[1 0 1 1 1 1 0 1 1 0 0 0]
[1 1 1 0 1 1 1 0 1 1 0 1]
[0 0 0 1 0 0 1 0 0 1 0 1]
[0 1 1 0 1 0 0 0 1 0 1 1]
[1 1 1 1 0 1 0 0 1 0 0 0]
[1 0 0 1 0 0 1 1 1 0 1 0]
current val loss is 4.2206268310546875                     and next sent accuracy val is 71.67%
[1 0 0 0 0 1 1 1 1 1 1 1]
[0 0 1 1 1 1 1 0 1 1 1 1]
[0 0 0 1 0 1 1 1

current training loss is 1.0658082962036133                 and next sent accuracy is 67.77%
current training loss is 1.066374659538269                 and next sent accuracy is 67.75%
current training loss is 1.0642855167388916                 and next sent accuracy is 67.50%
current training loss is 1.0633811950683594                 and next sent accuracy is 67.72%
current training loss is 1.0645768642425537                 and next sent accuracy is 67.48%
current training loss is 1.0649805068969727                 and next sent accuracy is 67.47%
current training loss is 1.0652960538864136                 and next sent accuracy is 67.79%
current training loss is 1.0636357069015503                 and next sent accuracy is 67.67%
current training loss is 1.0622297525405884                 and next sent accuracy is 67.76%
current training loss is 1.062690019607544                 and next sent accuracy is 67.64%
current training loss is 1.0618051290512085                 and next sen

current training loss is 0.9843931794166565                 and next sent accuracy is 75.18%
current training loss is 0.9858165383338928                 and next sent accuracy is 74.82%
current training loss is 0.9848302602767944                 and next sent accuracy is 74.31%
current training loss is 0.9849754571914673                 and next sent accuracy is 74.83%
current training loss is 0.9871969223022461                 and next sent accuracy is 74.67%
current training loss is 0.9877802133560181                 and next sent accuracy is 74.84%
current training loss is 0.9902846217155457                 and next sent accuracy is 74.68%
current training loss is 0.9907011389732361                 and next sent accuracy is 74.21%
current training loss is 0.993273138999939                 and next sent accuracy is 74.23%
current training loss is 0.9951943159103394                 and next sent accuracy is 74.39%
current training loss is 0.9945905208587646                 and next se

NameError: name 'datetime' is not defined

In [None]:
import requests
resp = requests.post('https://textbelt.com/text', {
  'phone': '4123131113',
  'message': 'BERT for pretraining is done',
  'key': 'textbelt',
})
print(resp.json())