# Using a Pre-Trained BERT Model for gathering hashtag Sentiment from Twitter

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 2.8MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 16.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 28.7MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [2]:
import os
import nltk
import torch
import time
import copy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import *
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import *
import torch.nn.functional as F
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from model import BertForSequenceClassification

I am going to be using the BERT pretrained weights for transfer learning to gather sentiment from tweets aggregated by hashtags. I will be training the BERT weights on an nltk corpus of tweets. I start by downloading the corpus.

In [3]:
nltk.download('twitter_samples')
twitter_samples.fileids()

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

I then parse the data into the negative and positive sentiment tweets and prepare them into a dataframe.

In [4]:
strings = np.concatenate((twitter_samples.strings('negative_tweets.json'),twitter_samples.strings('positive_tweets.json')), axis=0)
sentiment = np.concatenate((['negative']* len(twitter_samples.strings('negative_tweets.json')), ['positive']*len(twitter_samples.strings('positive_tweets.json'))), axis=0)
data = pd.DataFrame({'tweet': strings, 'sentiment': sentiment})

In [5]:
data

Unnamed: 0,tweet,sentiment
0,hopeless for tmr :(,negative
1,Everything in the kids section of IKEA is so c...,negative
2,@Hegelbon That heart sliding into the waste ba...,negative
3,"“@ketchBurning: I hate Japanese call him ""bani...",negative
4,"Dang starting next week I have ""work"" :(",negative
...,...,...
9995,"@chriswiggin3 Chris, that's great to hear :) D...",positive
9996,@RachelLiskeard Thanks for the shout-out :) It...,positive
9997,@side556 Hey! :) Long time no talk...,positive
9998,@staybubbly69 as Matt would say. WELCOME TO AD...,positive


The next thing to do is prepare the Dataset into a form that can be easily used for training. I will create a dataset class that tokenizes the strings to a specific length (in our case, 256).

The tokenizer is a `BertTokenizer` from the Transformers library.

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
tokenizer.tokenize(data.tweet[0])

['hopeless', 'for', 't', '##m', '##r', ':', '(']

Split the data into training and validation sets.

In [8]:
X = data['tweet']
y= data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()
y_train = pd.get_dummies(y_train).values.tolist()
y_test = pd.get_dummies(y_test).values.tolist()

Create a Tokenized dataset. We set the length of a tokenized string to 256. If it is longer or shorter, the string is cut off/padded to get it to a length of 256.

In [9]:
max_seq_length = 256

class TextDataset(Dataset):

    def __init__(self, data_list):
      self.x_y = data_list
        
    def __getitem__(self,index):
        
        tokenized_review = tokenizer.tokenize(self.x_y[0][index])
        if len(tokenized_review) > max_seq_length:
            tokenized_review = tokenized_review[:max_seq_length]
        ids_review  = tokenizer.convert_tokens_to_ids(tokenized_review)
        padding = [0] * (max_seq_length - len(ids_review))
        ids_review += padding
        assert len(ids_review) == max_seq_length
        ids_review = torch.tensor(ids_review)
        sentiment = self.x_y[1][index]       
        list_of_labels = [torch.from_numpy(np.array(sentiment))]
        
        return ids_review, list_of_labels[0]
    
    def __len__(self):
        return len(self.x_y[0])

Create the Bert model. We will use the pretrained weights from the `BertModel` Transformers model. 

In [11]:
model=BertForSequenceClassification()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


cuda:0


In [12]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Define our batch size, define our dataframes as `TextDataset` classes and create torch `DataLoaders` for training and validation sets.

In [13]:
batch_size = 16

train_lists = [X_train, y_train]
test_lists = [X_test, y_test]

training_dataset = TextDataset(train_lists)

test_dataset = TextDataset(test_lists)

dataloaders_dict = {
    'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
    'val':torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    }

dataset_sizes = {
    'train':len(train_lists[0]),
    'val':len(test_lists[0])
    }

Create a training loop. Display loss and accuracy metrics during training.

In [14]:
from IPython.display import clear_output, display
def train_model(model, criterion, optimizer, scheduler, num_epochs=10):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100
    status_df = pd.DataFrame({'Epoch':[], 'train_loss': [], 'train_acc': [], 'val_loss':[], 'val_acc': []})

    for epoch in range(num_epochs):
        print(epoch)
        clear_output(wait=True)
        display(status_df)
        epoch_string = '{}/{}'.format(epoch + 1, num_epochs )

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:

            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            sentiment_corrects = 0

            # Iterate over data.
            for batch in tqdm(dataloaders_dict[phase]):
                inputs = batch[0]
                sentiment = batch[1]

                inputs = inputs.to(device) 
                sentiment = sentiment.to(device)
                
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    
                    loss = criterion(outputs, torch.max(sentiment.float(), 1)[1])
                    
                    if phase == 'train':
                        
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)

                sentiment_corrects += torch.sum(torch.max(outputs, 1)[1] == torch.max(sentiment, 1)[1])

            epoch_loss = running_loss / dataset_sizes[phase]

            sentiment_acc = sentiment_corrects.double() / dataset_sizes[phase]

            if phase == 'train':
              train_loss = epoch_loss
              train_acc = '{:.4f}'.format(sentiment_acc)
            elif phase == 'val':
              val_loss = epoch_loss
              val_acc = '{:.4f}'.format(sentiment_acc)

            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
            
            scheduler.step()

        epoch_data = pd.Series([epoch_string, train_loss, train_acc, val_loss, val_acc], index=status_df.columns)
        status_df = status_df.append(epoch_data, ignore_index=True) 
    
    clear_output(wait=True)
    display(status_df) 
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    model.load_state_dict(best_model_wts)
    return model

Define the learning rate and optimizer used for updating the weights between epochs.

In [15]:
lrlast = .001
lrmain = .00001
optim1 = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},       
   ]
)

optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

Run the training loop.

In [16]:
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=10)

Unnamed: 0,Epoch,train_loss,train_acc,val_loss,val_acc
0,1/10,0.366165,0.9412,0.315254,0.998
1,2/10,0.315111,0.9981,0.314677,0.9984
2,3/10,0.313942,0.9993,0.314694,0.9984
3,4/10,0.314025,0.9992,0.314883,0.9984
4,5/10,0.313967,0.9993,0.314845,0.9984
5,6/10,0.313989,0.9992,0.314844,0.9984
6,7/10,0.313951,0.9993,0.314845,0.9984
7,8/10,0.314232,0.9991,0.314843,0.9984
8,9/10,0.314097,0.9992,0.314843,0.9984
9,10/10,0.313912,0.9993,0.314843,0.9984


Training complete in 21m 49s


In [17]:
torch.save(model_ft1, 'model.pth')