In [None]:
# !pip install datasets
# !pip install transformers

In [None]:
import os
import sys

import torch 
import torch.nn as nn
from tqdm import tqdm

from datasets import load_dataset
from torch.utils.data import DataLoader

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
torch.cuda.is_available()

True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd '/content/drive/MyDrive/'

/content/drive/MyDrive


In [None]:
# Random seed for reproducibility
import numpy as np
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f435a43c590>

In [None]:
financial_phrasebank = load_dataset('financial_phrasebank', 'sentences_50agree') 
financial_phrasebank



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

#### Training Examples: 
- 0: 'negative'
- 1: 'neutral'
- 2: 'positive'

In [None]:
print("Sentence:", financial_phrasebank['train'][0]['sentence'])
print("Label:", financial_phrasebank['train'][0]['label'])

Sentence: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Label: 1


In [None]:
print("Sentence:", financial_phrasebank['train'][20]['sentence'])
print('Label: Positive')
# print("Label:", financial_phrasebank['train'][20]['label'])

Sentence: Lifetree was founded in 2000 , and its revenues have risen on an average by 40 % with margins in late 30s .
Label: Positive


#### Create the train, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all sentences (and their labels) as the test set.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    financial_phrasebank['train']['sentence'],
    financial_phrasebank['train']['label'],
    test_size=0.2,
    random_state=0,
)

# Step 2: split 10% off the remaining training data to use as the validation set.
# The training names are deliberately rebound to the smaller, final training split.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    train_inputs,
    train_labels,
    test_size=0.1,
    random_state=0,
)



In [None]:
print("Sample sizes: \n Train: {} \n Validation: {} \n Test: {}".format(len(train_inputs), len(validation_inputs), len(test_inputs)))

Sample sizes: 
 Train: 3488 
 Validation: 388 
 Test: 970


In [None]:
financial_phrasebank_dict = {
    'train': {'sentence': train_inputs, 
              'label': train_labels
              },
    'valid': {'sentence': validation_inputs, 
              'label': validation_labels
              },
    'test': {'sentence': test_inputs, 
              'label': test_labels
             }
    }

In [None]:
financial_phrasebank_dict['train']['sentence'][20]

'The measures result from the statutory joint negotiations with employees which started in February and concerned all operations in the country .'


Original paper configurations: 
    
    Tokenizer = bert-base-cased,
    Bert model = bert-base-uncased,
    Num Train Epochs = 4.0,
    Max sequence length = 64,
    Train batch size = 32,
    Learning rate = 2e-5,
    Output mode ='classification'


#### Tokenize

Tokenize all of the sentences, pad them to a common length, and map the tokens to their word IDs.
For every sentence, BertTokenizer will:
  - (1) Tokenize the sentence.
  - (2) Prepend the `[CLS]` token to the start. - token id 101
  - (3) Append the `[SEP]` token to the end. - token id 102
  - (4) Map tokens to their IDs.
  - (5) Ensure all sentences are equal length. Pad sequences with 0 

In [None]:
# Build the tokenizer from the SAME checkpoint as the model that is fine-tuned
# later in this notebook ("bert-base-uncased"). Token IDs index into a
# checkpoint-specific vocabulary: IDs produced with the "bert-base-cased"
# vocabulary would be looked up in the wrong rows of the uncased model's
# embedding table, so the pretrained weights would see scrambled input.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [None]:
# Some of the common BERT tokens
print(tokenizer.sep_token, tokenizer.sep_token_id) # marker for ending of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id) # start of each sentence, so BERT knows we’re doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id) # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # tokens not found in training set 

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


In [None]:
# def encode_sentences(sentences): 
#     tokenized_datasets = []
#     for sentence in sentences:
#         encoded = tokenizer.encode_plus(sentence, max_length = 256, pad_to_max_length = True, truncation=True)
#         tokenized_datasets.append(encoded.token_type_ids)
#     return tokenized_datasets
# train_tokenized = encode_sentences(auditor_sentiment['train']['sentence'])
# test_itokenized = encode_sentences(auditor_sentiment['test']['sentence'])
# tokenized_datasets = {}
# for collection in auditor_sentiment:
#     tokenized_datasets[collection] = tokenizer(auditor_sentiment[collection]['sentence'], return_tensors='pt', padding=True)
#     tokenized_datasets[collection]['labels'] = auditor_sentiment[collection]['label']

In [None]:
def tokenize_datasets(data, max_length=64):
  """Tokenize every split in ``data`` with the notebook-level ``tokenizer``.

  Args:
    data: Mapping of split name -> mapping with a 'sentence' entry (the texts
      handed to the tokenizer) and a 'label' entry (copied through unchanged).
    max_length: Length every encoded sequence is padded / truncated to.
      Defaults to 64, the value that used to be hard-coded here.

  Returns:
    Dict of split name -> tokenizer output (requested with
    ``return_tensors='pt'``), extended with two extra entries:
      'label':  the split's labels exactly as given in ``data``.
      'tokens': one list of token strings per sentence, wrapped in
        "[CLS]" / "[SEP]". Unlike the encoded ids these lists are neither
        padded nor truncated, so their length can differ from ``max_length``.
  """
  tokenized_datasets = {}
  for collection in data:
    split = data[collection]

    # Fixed-length encoding so all rows of a split stack into one tensor.
    encoded = tokenizer(split['sentence'], padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
    encoded['label'] = split['label']

    # Human-readable tokens, kept only for inspection / debugging.
    encoded['tokens'] = [["[CLS]"] + tokenizer.tokenize(row) + ["[SEP]"] for row in split['sentence']]

    tokenized_datasets[collection] = encoded

  return tokenized_datasets

In [None]:
tokenized_datasets = tokenize_datasets(financial_phrasebank_dict)

In [None]:
tokenizer(financial_phrasebank_dict['train']['sentence'][0])

{'input_ids': [101, 1170, 1103, 7516, 1669, 117, 25128, 9570, 1564, 1821, 26237, 1389, 15469, 3547, 1177, 22871, 1320, 18119, 1116, 1717, 3112, 2686, 1114, 9468, 13505, 11470, 7582, 1107, 170, 3955, 4065, 123, 7300, 3443, 1111, 9987, 172, 5800, 1891, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
print("Tokens:", tokenized_datasets['train']['tokens'][0])
print(len(tokenized_datasets['train']['tokens'][0]))
print("Label:", tokenized_datasets['train']['label'][0])
print("Input:", tokenized_datasets['train']['input_ids'][0])
print(tokenized_datasets['train']['input_ids'][0].size())
print("Token Type Ids:", tokenized_datasets['train']['token_type_ids'][0])
print("Attention Mask:", tokenized_datasets['train']['attention_mask'][0])

Tokens: ['[CLS]', 'after', 'the', 'reporting', 'period', ',', 'bio', '##tie', 'north', 'am', '##eric', '##an', 'licensing', 'partner', 'so', '##max', '##on', 'pharmaceutical', '##s', 'announced', 'positive', 'results', 'with', 'na', '##lm', '##ef', '##ene', 'in', 'a', 'pilot', 'phase', '2', 'clinical', 'trial', 'for', 'smoking', 'c', '##ess', '##ation', '.', '[SEP]']
41
Label: 2
Input: tensor([  101,  1170,  1103,  7516,  1669,   117, 25128,  9570,  1564,  1821,
        26237,  1389, 15469,  3547,  1177, 22871,  1320, 18119,  1116,  1717,
         3112,  2686,  1114,  9468, 13505, 11470,  7582,  1107,   170,  3955,
         4065,   123,  7300,  3443,  1111,  9987,   172,  5800,  1891,   119,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
torch.Size([64])
Token Type Ids: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
tokenized_datasets['train'].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'label', 'tokens'])

In [None]:
# NOTE: len(sen) is taken on the raw sentence strings, so this is the longest
# sentence measured in characters, not in tokens.
print('Max train sentence length: ', max([len(sen) for sen in financial_phrasebank_dict['train']['sentence']]))

Max train sentence length:  308


#### Create dataset and dataloader


In [None]:
def data_loader(batch_size, tokenized_data):
  """Build one ``DataLoader`` per split of ``tokenized_data``.

  Args:
    batch_size: Number of examples per batch, used for every split.
    tokenized_data: Mapping of split name -> mapping holding the tensors
      'input_ids', 'token_type_ids', 'attention_mask' and a 'label' entry
      that ``torch.tensor`` can convert.

  Returns:
    Dict of split name -> DataLoader. Each batch is the 4-tuple
    (input_ids, token_type_ids, attention_mask, labels). The 'train' split is
    drawn with a RandomSampler; every other split keeps its original order
    (SequentialSampler).
  """
  def _build_loader(split_name, split):
    examples = TensorDataset(
        split['input_ids'],
        split['token_type_ids'],
        split['attention_mask'],
        torch.tensor(split['label']),
    )
    # Shuffle only while training; evaluation order stays deterministic.
    sampler_cls = RandomSampler if split_name == 'train' else SequentialSampler
    return DataLoader(examples, sampler=sampler_cls(examples), batch_size=batch_size)

  return {name: _build_loader(name, tokenized_data[name]) for name in tokenized_data}

In [None]:
data_loaders = data_loader(batch_size=32, tokenized_data=tokenized_datasets)

#### Loading the pre-trained BERT model from huggingface library: 
`BertForSequenceClassification` is the pretrained BERT model with a single linear classification layer on top.

In [None]:
model =  BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# model

In [None]:
# Run on the GPU whenever PyTorch reports at least one CUDA device; otherwise fall back to the CPU.
num_gpus = torch.cuda.device_count()
device = 'cuda' if num_gpus > 0 else 'cpu'

In [None]:
torch.cuda.is_available()

True

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
import numpy as np
def accuracy(preds, labels):
  """Fraction of rows whose highest-scoring class equals the true label.

  Args:
    preds: Torch tensor of per-class scores; the predicted class of each row
      is the argmax along axis 1.
    labels: Torch tensor of true class indices, one per row of ``preds``.

  Returns:
    Number of matching predictions divided by the number of labels.
  """
  # Detach from the autograd graph and move to host memory before using NumPy.
  predicted_classes = np.argmax(preds.detach().cpu().numpy(), axis=1).flatten()
  true_classes = labels.to('cpu').numpy().flatten()
  num_correct = np.sum(predicted_classes == true_classes)
  return num_correct / len(true_classes)

# Train

In [None]:
def train(model, optimizer, criterion, train_dataloader, val_dataloader, num_epochs, load_pretrained=False):
    """Fine-tune ``model`` and evaluate it on the validation data after every epoch.

    Relies on two notebook-level names: ``device`` (where each batch is moved
    to) and ``accuracy`` (per-batch accuracy helper).

    Args:
        model: Callable as ``model(input_ids, token_type_ids=..., attention_mask=...)``
            whose first output element holds the logits.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_dataloader: Yields (input_ids, token_type_ids, attention_mask, labels) batches.
        val_dataloader: Same batch layout, used for validation only.
        num_epochs: Number of passes over the data.
        load_pretrained: When True the training pass is skipped and every
            epoch only runs validation.

    Returns:
        Dict with the keys 'train_loss', 'train_acc', 'val_loss' and 'val_acc'.
        Each value is a list holding one entry per epoch: the mean of the
        per-batch values. When ``load_pretrained`` is True the two training
        lists stay empty (previously this case crashed with UnboundLocalError
        because the training metrics were appended without ever being set).
    """
    plot_cache = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        print("")
        print("Epoch:", epoch)

        if not load_pretrained:
            model.train()
            epoch_train_loss, epoch_train_acc = _run_epoch(model, criterion, train_dataloader, optimizer)

            print("")
            print("  Average training loss: {0:.2f}".format(epoch_train_loss))
            print("  Average training acc: {0:.2f}".format(epoch_train_acc))

            # Recorded inside the branch: these values only exist if training ran.
            plot_cache['train_loss'].append(epoch_train_loss)
            plot_cache['train_acc'].append(epoch_train_acc)

        model.eval()
        with torch.no_grad():
            epoch_val_loss, epoch_val_acc = _run_epoch(model, criterion, val_dataloader)

        print("")
        print("  Average validation loss: {0:.2f}".format(epoch_val_loss))
        print("  Average validation accuracy: {0:.2f}".format(epoch_val_acc))

        plot_cache['val_loss'].append(epoch_val_loss)
        plot_cache['val_acc'].append(epoch_val_acc)

    return plot_cache


def _run_epoch(model, criterion, dataloader, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(mean_loss, mean_accuracy)``.

    With an ``optimizer`` this is a training pass (zero the gradients,
    backpropagate, clip the gradient norm to 1.0, step). Without one the pass
    only computes metrics. The caller sets ``model.train()`` / ``model.eval()``
    and wraps evaluation in ``torch.no_grad()``.
    """
    total_loss = 0
    total_acc = 0
    num_batches = 0

    for batch in dataloader:
        if optimizer is not None:
            optimizer.zero_grad()
        num_batches += 1

        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)[0]
        loss = criterion(logits.view(-1, logits.size()[1]), b_labels.view(-1))

        if optimizer is not None:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        total_loss += loss.item()
        total_acc += accuracy(logits, b_labels)

    # Unweighted mean over batches (a smaller final batch counts as much as a full one).
    return total_loss / num_batches, total_acc / num_batches

In [None]:
plot_cache = train(model.to(device), optimizer, criterion, train_dataloader=data_loaders['train'], val_dataloader=data_loaders['valid'], num_epochs=5,  load_pretrained=False)


Epoch: 0

  Average training loss: 0.91
  Average training acc: 0.59

  Average validation loss: 0.82
  Average validation accuracy: 0.66

Epoch: 1

  Average training loss: 0.80
  Average training acc: 0.64

  Average validation loss: 0.81
  Average validation accuracy: 0.66

Epoch: 2

  Average training loss: 0.71
  Average training acc: 0.70

  Average validation loss: 0.74
  Average validation accuracy: 0.69

Epoch: 3

  Average training loss: 0.58
  Average training acc: 0.76

  Average validation loss: 0.72
  Average validation accuracy: 0.74

Epoch: 4

  Average training loss: 0.47
  Average training acc: 0.82

  Average validation loss: 0.77
  Average validation accuracy: 0.76


# Evaluate

In [None]:
# Evaluate the fine-tuned model on the test set: collect, batch by batch, the
# raw logits and the matching true labels.

# len() of a DataLoader is its number of batches, so the sentence count has to
# come from the underlying dataset.
print('Predicting labels for {:,} test sentences...'.format(len(data_loaders['test'].dataset)))

# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / labels per batch
pred_labels , true_labels = [], []

# Predict
for batch in data_loaders['test']:
  # Move every tensor of the batch to the compute device
  batch = tuple(t.to(device) for t in batch)
  # Unpack in the order the tensors were stored by data_loader()
  b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch

  # No gradients are needed for inference; skipping them saves memory and time
  with torch.no_grad():
    # Pass the segment ids exactly as train() does so both code paths call the
    # model the same way.
    outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  pred_labels.append(logits)
  true_labels.append(label_ids)

Predicting labels for 31 test sentences...


In [None]:
# 0: 'negative'
# 1: 'neutral'
# 2: 'positive'
# Class distribution of the test split: label 1 counts as neutral, label 2 as
# positive, and every remaining label as negative.
test_size = len(financial_phrasebank_dict['test']['label'])
neutral = sum(1 for num in financial_phrasebank_dict['test']['label'] if num == 1)
positive = sum(1 for num in financial_phrasebank_dict['test']['label'] if num == 2)
negative = test_size - neutral - positive

In [None]:
print('Positive samples: %d of %d (%.2f%%)' % (positive, test_size, (positive / test_size * 100.0)))

Positive samples: 267 of 970 (27.53%)


In [None]:
print('Length of one batch of predictions:' , len(pred_labels[0]), '\n' , pred_labels[0]) 

Length of one batch of predictions: 32 
 [[-1.9561164   1.4764671   0.47912   ]
 [-1.1707851   1.5321939  -0.30746183]
 [-1.7339723   2.6343248  -1.4860181 ]
 [-1.9766779   2.3522124  -0.7139114 ]
 [-1.4665638   1.9703399  -1.1904297 ]
 [ 1.8163798  -1.9582653   0.3811872 ]
 [-2.4837236   2.3419034  -0.42181006]
 [-1.9693336   2.7756677  -1.5953579 ]
 [-0.05415426 -1.5534953   1.8259779 ]
 [-1.0334909   0.60724586  0.5549024 ]
 [-1.9864109   2.8196588  -1.529021  ]
 [-1.8395462   2.780816   -1.6727744 ]
 [-2.1193264   2.1700437  -0.28301784]
 [-1.662284    1.8751233  -0.20562175]
 [ 1.022599   -0.9437102   0.58216697]
 [ 1.6325885  -1.3983743   0.04253381]
 [-2.107618    2.9093404  -1.1507095 ]
 [ 1.0961431  -1.6163067   1.1348453 ]
 [-2.5563602   2.5971236  -0.39279017]
 [-2.3535526   1.954251   -0.06912953]
 [-1.8995874   1.0790801   0.7390092 ]
 [-0.20021155 -1.4299909   1.6287733 ]
 [ 0.32332602 -0.74758273  0.9827371 ]
 [ 1.8204323  -1.4610221  -0.10632248]
 [-1.7332036   2.615190

In [None]:
print('Length of one batch of true labels:' , len(true_labels[0]), '\n' , true_labels[0]) 

Length of one batch of true labels: 32 
 [2 1 1 1 1 0 1 1 2 1 0 1 1 2 2 1 1 1 2 0 2 2 1 0 2 1 1 2 2 0 2 1]


In [None]:
# Every entry of pred_labels is one batch of logits with one column per class
# ("0", "1" and "2"). For each row keep the index of the largest logit, i.e.
# the predicted class label, giving one 1-D array of predictions per batch.
predictions = [np.argmax(batch_logits, axis=1).flatten() for batch_logits in pred_labels]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [None]:
actual = np.concatenate(true_labels, axis =0)
predictions = np.concatenate(predictions, axis =0)

In [None]:
print("Accuracy", accuracy_score(actual, predictions))
print("Precision", precision_score(actual, predictions, average='macro'))
print("F1 Score", f1_score(actual, predictions, average='macro'))

Accuracy 0.7061855670103093
Precision 0.6594896331738437
F1 Score 0.6140012738285491


References: 
- https://huggingface.co/FinanceInc

# Appendix

In [None]:
# class Dataset(torch.utils.data.Dataset):
#     def __init__(self, encodings):
#         # store encodings internally
#         self.encodings = encodings

#     def __len__(self):
#         # return the number of samples
#         return self.encodings['input_ids'].shape[0]

#     def __getitem__(self, i):
#         # return dictionary of input_ids, attention_mask, and labels for index i
#         return {key: tensor[i] for key, tensor in self.encodings.items()}

In [None]:
# encodings = {'train_encodings': {'input_ids': tokenized_datasets['train']['input_ids'], 
#                                 'token_type_ids': tokenized_datasets['train']['token_type_ids'], 
#                                 'attention_mask': tokenized_datasets['train']['attention_mask'], 
#                                 'labels': tokenized_datasets['train']['labels']
#                                  },
             
#              'val_encodings': {'input_ids': tokenized_datasets['validation']['input_ids'], 
#                                 'token_type_ids': tokenized_datasets['validation']['token_type_ids'], 
#                                 'attention_mask': tokenized_datasets['validation']['attention_mask'], 
#                                 'labels': tokenized_datasets['validation']['labels']
#                                  },

#              'test_encodings':{'input_ids': tokenized_datasets['test']['input_ids'], 
#                               'token_type_ids': tokenized_datasets['test']['token_type_ids'], 
#                               'attention_mask': tokenized_datasets['test']['attention_mask'], 
#                               'labels': tokenized_datasets['test']['labels']
#                                }
#               }

In [None]:
# datasets = {
#     'train_dataset': Dataset(encodings['train_encodings']),
#     'val_dataset': Dataset(encodings['val_encodings']),
#     'test_dataset': Dataset(encodings['test_encodings'])
#     }

In [None]:
# datasets

In [None]:
# print(len(datasets['train_dataset']))
# print(len(datasets['val_dataset']))

In [None]:
# dataloaders = {'loader_train': DataLoader(datasets['train_dataset'], batch_size=32, shuffle=True),
#                'loader_val': DataLoader(datasets['val_dataset'], batch_size=32, shuffle=True),
#                'loader_test': DataLoader(datasets['test_dataset'], batch_size=32, shuffle=True)}

In [None]:
# for batch in dataloaders['loader_train']:
#     for key, value in batch.items():
#         print(key, ":", value)
#     break

In [None]:
# print((tokenized_datasets['train']['input_ids']).size(0))
# print((tokenized_datasets['train']['token_type_ids']).size(0)) 
# print((tokenized_datasets['train']['attention_mask']).size(0))
# print(torch.tensor(tokenized_datasets['train']['labels']))

In [None]:
# def train(model, optimizer, criterion, train_dataloader, num_epochs, load_pretrained=False):
#     plot_cache = {'train_loss':[], 'train_acc': [], 'val_loss':[], 'val_acc': []}
#     train_losses = []
#     train_accs = []
#     val_losses = []
#     val_accs = []
    
#     for epoch in range(num_epochs):
#         print("Epoch:", epoch)
#         if not load_pretrained:
#           model.train() 
#             counter = 0
#             train_batch_loss = 0
#             train_batch_acc = 0

#             val_batch_loss = 0
#             val_batch_acc = 0 

            
#             for step, batch in enumerate(train_dataloader):
                
#                 b_input_ids = batch[0].to(device)
#                 b_token_type_ids = batch[1].to(device)
#                 b_input_mask = batch[2].to(device)
#                 b_labels = batch[3].to(device)

#                 optimizer.zero_grad()
#                 counter += 1

#                 logits = model(b_input_ids, token_type_ids=b_token_type_ids,attention_mask=b_input_mask)[0]
#                 # print(logits.size())   
#                 # print(b_labels.size())
#                 loss = criterion(logits.view(-1, logits.size()[1]), b_labels.view(-1))
#                 train_batch_loss += loss.item()

#                 #train batch accuracy: 
#                 train_batch_acc += accuracy(logits, b_labels)

#                 loss.backward()
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#                 optimizer.step()
            
#             epoch_train_loss = train_batch_loss / counter
#             print(counter)
#             train_losses.append(epoch_train_loss)

#             epoch_train_acc = train_batch_acc / counter
#             # print(counter)
#             train_accs.append(epoch_train_acc)

#             print("")
#             print("  Average training loss: {0:.2f}".format(epoch_train_loss))
#             print("  Average training acc: {0:.2f}".format(epoch_train_acc))

#         plot_cache['train_loss'].append(epoch_train_loss)
#         plot_cache['train_acc'].append(epoch_train_acc)
    
#     return plot_cache, model

In [None]:
# plot_cache, model = train(model.to(device), optimizer, criterion, train_dataloader=data_loaders['train'], num_epochs=5, load_pretrained=False)