In [1]:
import pandas as pd
import numpy as np
from pandas.io.sql import read_sql
import os
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

In [2]:
# define the directory paths where we saved the tweets
train = "../data/train.csv"
test = '../data/test.csv'
#'../data/train.csv', encoding = 'utf8')

In [3]:
# for another implementation see https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
from sklearn.utils import shuffle

def load_data(filename, test = False):
    
    data = pd.read_csv(filename)
    if test:
        data = data[['id','text']]
        
    else:
        data = data[['text','target']]
        
        
        
    return data

In [4]:
train_tweets = load_data(train)
train_tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# split dataset into training and validation 
from sklearn.model_selection import train_test_split
training_data, test_data = train_test_split(train_tweets,
                                     random_state = 42,
                                     stratify = train_tweets[['target']],
                                     test_size = 0.1)


In [6]:
# now use an iterator class that we will feed into python for training
# Next step is define a class that takes the text and labels, tokenizes the text 
# using the bert tokenizer, converts tokens to ids, pads the sentences to make sure they are the same
# size as the model allows; if they are longer it trims them else it pads them with 0.
# finallly feeds themn to the classifier.
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
config = BertConfig(num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_seq_length = 280

class Data_Processing(object):
    def __init__(self, text_column, label_column):
        
        # define the text column from the dataframe
        self.text_column = text_column.tolist()
    
        #self.label_column = pd.Categorical(pd.factorize(label_column)[0])
    
        # define the label column and transform it to list
        self.label_column = label_column.tolist()
    
# iter method to get each element at the time and tokenize it using bert        
  
    def __getitem__(self, index):
        
        tokenized_text = tokenizer.tokenize(self.text_column[index])
        # Account for [CLS] and [SEP] with "- 2"

        # check for the sequence lenght taking into consideration the 
        # fact that we need to include the SEP special token and the CLS special
        # tokens. 
        if len(tokenized_text) > max_seq_length - 2:
            tokenized_text = tokenized_text[0:(max_seq_length - 2)]

        # We add the CLS token at the beginning of the tokenized sequence and the 
        # SEP token at the end.
        tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]

        # convert the inputs to ids (dict looking)
        input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

        # We define the size of the input mask to correspon to the lenght of the inputs
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.


        #attention_masks.append(seq_mask) 

        #input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))

        # 
        input_ids += padding

        #input_mask += padding

        attention_masks = [1 if x>0 else 0 for x in input_ids] 

        assert len(input_ids) == max_seq_length
        assert len(attention_masks) == max_seq_length

        #print(ids_review)
        input_ids = torch.tensor(input_ids)

        labels = self.label_column[index] 

        #list_of_labels = [torch.from_numpy(np.array(labels)).squeeze(0)]
        list_of_labels = torch.tensor(labels)
        attention_masks = torch.tensor(attention_masks)
        return input_ids, list_of_labels, attention_masks
  
    def __len__(self):
        return len(self.text_column)

In [7]:
batch_size = 4

# create a class to process the traininga and test data
training_data = Data_Processing(training_data['text'], training_data['target'])

test_data =  Data_Processing(test_data['text'], test_data['target'])

# use the dataloaders class to load the data
dataloaders_dict = {'train': DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=10),
                    'val': DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=10)
                   }

dataset_sizes = {'train':len(training_data),
                 'val':len(test_data)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [8]:
example = next(iter(dataloaders_dict.get('train')))
example

[tensor([[  101,  2444, 14409,  ...,     0,     0,     0],
         [  101,  1045,  1005,  ...,     0,     0,     0],
         [  101,  1001,  1052,  ...,     0,     0,     0],
         [  101,  1030,  9915,  ...,     0,     0,     0]]),
 tensor([0, 0, 0, 0]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])]

In [10]:
state_dict = torch.load('../data/bert_tweet_language_finetune_model.pth')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",config = config, state_dict = state_dict)


In [11]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
from sklearn.metrics import accuracy_score, f1_score
# define the metrics to evaluate
def log_metrics(y_pred, y_true):
    print('Accuracy:', accuracy_score(y_true,y_pred))
    #print('MCC:', matthews_corrcoef(y_true,y_pred))
    print('F1 score:', f1_score(y_true, y_pred))

In [13]:
import logging
logging.basicConfig(level = logging.INFO, filename ='bert_tweet_classifier_280_finetune.txt', filemode ='w', 
                   format = '%(name)s -%(levelname)s - %(message)s')

In [14]:
import torch.optim as optim 
from torch.optim import lr_scheduler, AdamW
criterion = nn.CrossEntropyLoss()


lrmain = 2e-5

optim1 = optim.AdamW(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
   ])

optimizer_ft = optim1

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.1)

In [15]:
import torch.nn.functional as F
#import torch.nn.functional as F
import copy 
import time
number_steps = 20
print(number_steps)

def train_model(model, criterion, optimizer, scheduler, num_epochs=3):
    best_eval_acc = 0.0
    for epoch in range(num_epochs):
        #zero the model gradients
        model.zero_grad()
        print(f'starting epoch {epoch+1} out of {num_epochs}')
        training_loss = []
        training_accuracy = []
        val_loss = []
        val_accuracy = []
        outputs_ = []
        labels_ = []

        epoch_loss = 0.0

        # track number of batches and number of iterations
        counter = 0
        iterations = 0

        # Iterate over data, feeding inputs, attention masks and labels
        model.train()
        for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['train']):
            # add a counter that will register how many examples we have fed to the
            # model
            counter+= batch_size
            iterations+=1
            # move the sequences, labels and masks to the GPU
            inputs = inputs.to(device) 
            label = label.to(device)
            attention_mask = attention_mask.to(device)

            # feed the sequences to the model, specifying the attention mask
            loss, outputs = model(inputs, attention_mask=attention_mask, labels=label)
            outputs1 = outputs
            
            # feed the logits returned by the model to the softmax to classify the function
            outputs = F.softmax(outputs,dim=1)

            # calculate the loss function, squeeze the labels so their shapes are compatible
            loss_manual = criterion(outputs, label.squeeze(0))
            
            # divide the loss by the number of steps
            loss_reg = loss / number_steps 

            #add the loss to the epoch loss
            epoch_loss += loss_reg
            training_loss.append(loss_reg)

            loss.backward()
      
            # accumulate gradients and update every x batches
            if (i+1) % number_steps == 0:
                
                optimizer.step()                            # Now we can do an optimizer step
                model.zero_grad()                           # Reset gradients tensors
      
            # only present the information 
            if counter%1000 == 0:
                # get the predictions and the true labels out of the GPU
                preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
                true1 = np.array(label.squeeze(0).cpu().data.numpy())
         
        # get the accurary score
                training_accuracy.append(accuracy_score(preds1,true1))
            
                print(f'current training loss is {epoch_loss/iterations} and accuracy is {np.mean(training_accuracy):,.2%}')
                
        with torch.no_grad():
            model.eval()
            counter_val = 0
            iterations_val = 0
            
            for i, (inputs, label, attention_mask) in enumerate(dataloaders_dict['val']):
                counter_val += batch_size
                iterations_val += 1
      
                # move the sequences, labels and masks to the GPU
                inputs = inputs.to(device) 
                label = label.to(device)
                attention_mask = attention_mask.to(device)
          
              # feed the sequences to the model, specifying the attention mask
                loss_eval, outputs = model(inputs, attention_mask=attention_mask, labels = label)

              # feed the logits returned by the model to the softmax to classify the function
                outputs = F.softmax(outputs,dim=1)
                
              # calculate the loss function, squeeze the labels so their shapes are compatible
                loss_eval_manual = criterion(outputs, label.squeeze(0))
                val_loss.append(loss_eval)
                
                preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
                true1 = np.array(label.squeeze(0).cpu().data.numpy())
      
                # get the accurary score
                val_accuracy.append(accuracy_score(preds1,true1))

                if counter_val % 1000 == 0:
                    # get the predictions and the true labels out of the GPU for validation
                    
                    print(f'current validation loss is {np.sum(val_loss)/iterations_val} and accuracy is {np.mean(val_accuracy):,.2%}')
                              
        print(f'For epoch {epoch+1} training loss is {np.sum(training_loss)/iterations}, \
        training accuracy is {np.mean(training_accuracy):,.2%}, Validation \
        loss is {np.sum(val_loss)/iterations_val} and validation accuracy is {np.mean(val_accuracy):,.2%}')
        eval_acc = np.mean(val_accuracy)
        if eval_acc >= best_eval_acc:
            best_eval_acc = eval_acc
            print(f'saving the model with validation accuracy of {eval_acc:,.2%} ')
            torch.save(model.state_dict(), 'bert_tweet_classification_state_dict_280_finetuned.pth')
            #torch.save(optimizer_ft.state_dict(), 
            #           'bert_tweet_classification_state_dict_280_no_finetuned_optimizer.pth')
            #model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            #output_model_file = os.path.join("bert_tweet_classification_280_no_finetuned_model.bin")

        else:
            print(f'model did not improve')
        
        logging.info(f'We completed epoch {epoch+1} with a training loss of {np.sum(training_loss)/iterations} \
        a training accuracy of {np.mean(training_accuracy):,.2%}, Validation \
        loss is {np.sum(val_loss)/iterations_val} and validation accuracy is {np.mean(val_accuracy):,.2%}')
      
    return model

20


In [16]:
model.to(device)
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=3)

starting epoch 1 out of 3
current training loss is 0.031585946679115295 and accuracy is 75.00%
current training loss is 0.029187101870775223 and accuracy is 62.50%
current training loss is 0.02686399593949318 and accuracy is 75.00%
current training loss is 0.025872591882944107 and accuracy is 75.00%
current training loss is 0.024858733639121056 and accuracy is 75.00%
current training loss is 0.023933369666337967 and accuracy is 79.17%
For epoch 1 training loss is 0.023506755009293556,         training accuracy is 79.17%, Validation         loss is 0.3957637548446655 and validation accuracy is 82.72%
saving the model with validation accuracy of 82.72% 
starting epoch 2 out of 3
current training loss is 0.01754308119416237 and accuracy is 75.00%
current training loss is 0.01743929274380207 and accuracy is 87.50%
current training loss is 0.017386216670274734 and accuracy is 83.33%
current training loss is 0.017206910997629166 and accuracy is 87.50%
current training loss is 0.0175482332706

In [None]:
#model.load_state_dict(torch.load('bert_tweet_classification_state_dict_280_no_finetuned.pth'))

In [17]:
test_tweets = load_data(test, test = True)
test_tweets.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [18]:
class Data_Processing_predictions(object):
    def __init__(self, text_column, id_column):
        
        # define the text column from the dataframe
        self.text_column = text_column.tolist()
        self.id_column = id_column.tolist()
# iter method to get each element at the time and tokenize it using bert        
    def __getitem__(self, index):
        
        tokenized_text = tokenizer.tokenize(self.text_column[index])
        
        if len(tokenized_text) > max_seq_length - 2:
            tokenized_text = tokenized_text[0:(max_seq_length - 2)]

        # We add the CLS token at the beginning of the tokenized sequence and the 
        # SEP token at the end.
        tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]

        # convert the inputs to ids (dict looking)
        input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding

        attention_masks = [1 if x>0 else 0 for x in input_ids] 

        assert len(input_ids) == max_seq_length
        assert len(attention_masks) == max_seq_length
        #print(ids_review)
        ids = self.id_column[index]
        #list_of_ids = torch.tensor(ids)
        input_ids = torch.tensor(input_ids)
        #list_of_labels = [torch.from_numpy(np.array(labels)).squeeze(0)]
        attention_masks = torch.tensor(attention_masks)
         
        return input_ids, attention_masks, ids
    def __len__(self):
        return len(self.text_column)

In [19]:
batch_size = 4

# create a class to process the traininga and test data
prediction_data = Data_Processing_predictions(test_tweets['text'], test_tweets['id'])


# use the dataloaders class to load the data
dataloaders_dict_pred = {'pred': DataLoader(prediction_data, 
                                            batch_size=batch_size, shuffle=True, num_workers=10),
                   }


print(device)

cuda:0


In [20]:
example1 = next(iter(dataloaders_dict_pred.get('pred')))
example1

[tensor([[  101,  1996,  5469,  ...,     0,     0,     0],
         [  101,  3531, 21357,  ...,     0,     0,     0],
         [  101,  2034,  6869,  ...,     0,     0,     0],
         [  101,  2792,  1015,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([3251,  142, 5422, 7014])]

In [27]:
model_ft1.load_state_dict(torch.load('../models/bert_tweet_classification_state_dict_280_no_finetuned.pth'))
#os.path.join("../models/bert_tweet_classification_280_no_finetuned_model.bin")
#os.getcwd()

<All keys matched successfully>

In [28]:
def predictions_model(model = None):
    prediction_data_frame_list = []
    
    with torch.no_grad():
        model.eval()
        for i, (inputs, attention_mask, ids) in enumerate(dataloaders_dict_pred['pred']):
                
                inputs = inputs.to(device) 
                attention_mask = attention_mask.to(device)
          
              # feed the sequences to the model, specifying the attention mask
                outputs = model(inputs, attention_mask=attention_mask)
                
              # feed the logits returned by the model to the softmax to classify the function
                outputs = F.softmax(outputs[0],dim=1)
                
              # calculate the loss function, squeeze the labels so their shapes are compatible
                #loss_eval_manual = criterion(outputs, label.squeeze(0))
                
                preds1 = torch.argmax(outputs,dim=1).cpu().data.numpy()
                ids = ids.cpu().data.numpy()
                
                temp_data = pd.DataFrame(zip(ids,preds1), columns = ['id', 'target'])
                prediction_data_frame_list.append(temp_data)                
    
    prediction_df = pd.concat(prediction_data_frame_list)
    return prediction_df
                

In [29]:
predictions = predictions_model(model = model_ft1)

In [31]:
import datetime

date = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
date

'Feb 18 2020 12:38:24'

In [32]:
date = date.replace(" ", "_")
date

'Feb_18_2020_12:38:24'

In [33]:
predictions.to_csv('../data/'+date+'submission_bert_fine_tuned.csv', index = False)