<a href="https://colab.research.google.com/github/izabellakacprzak/twitter-location-ner/blob/master/CustomNERModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

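A custom Twitter location NER model built on the BERT token-classification model (`BertForTokenClassification`).
Training data should be provided in **sentences.csv**, one tweet per line: the tweet text, a comma, and then one tag (`I-LOC` or `O`) for each space-separated word of the tweet, with the tags themselves separated by single spaces. For example: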

```
Evansville #weather on November 13 2015 - 11/13/2015 http://t.co/l2Tc6nU3kI,I-LOC O O O O O O O O
```



In [None]:
###################################
#             IMPORTS             #
###################################

!pip install transformers
!pip install keras

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertForTokenClassification
import csv
import torch.nn as nn
import pandas as pd
import numpy as np
from transformers import AdamW

from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
###################################
#      EVALUATION FUNCTIONS       #
###################################

# Element-wise logistic function: maps any real value (or numpy array) into (0, 1)
def sigmoid(x):
    return 1/(1 + np.exp(-x))

# Computes a 2x2 confusion matrix from y_pred (predictions) and y_test (gold).
# Layout: confusion[gold][predicted], i.e. rows are the true class and columns
#   are the predicted class.
# y_pred is squashed with a sigmoid and rounded, so raw logits are thresholded
#   at 0. Values that are already class indices survive unchanged: sigmoid(0)
#   is exactly 0.5, which numpy rounds to 0 (round-half-to-even), and
#   sigmoid(1) rounds to 1.
# Note: y_pred and y_test should be numpy arrays of the same length
def confusion_matrix(y_pred, y_test):
  # For each prediction, convert to a tag of 0 or 1
  y_pred_tag = np.round(sigmoid(y_pred)).astype('int32')
  y_test = y_test.astype('int32')

  # Since this is binary classification, the confusion matrix is (2, 2).
  # The builtin int is used as dtype because the np.int alias no longer exists
  #   in current NumPy releases.
  confusion = np.zeros((2, 2), dtype=int)

  # Count each (gold, predicted) pair
  for gold_tag, pred_tag in zip(y_test, y_pred_tag):
    confusion[gold_tag, pred_tag] += 1

  return confusion

# Calculates the precision of one class from a confusion matrix laid out as
#   confusion[gold][predicted] (the layout confusion_matrix builds).
# Precision = examples correctly predicted as the class
#             / all examples predicted as the class.
# Note: class_label is the index of the class to score (0 or 1 for the
#   binary matrix used here).
# With numpy integer counts, a class that was never predicted gives nan
#   (0 / 0) instead of raising.
def precision(confusion, class_label):
  # The diagonal cell holds the examples of this class that were predicted correctly
  true_pos = confusion[class_label][class_label]
  # Column `class_label` holds every example predicted as this class
  predicted_as_class = sum(row[class_label] for row in confusion)

  return true_pos / predicted_as_class

# Calculates the recall of one class from a confusion matrix laid out as
#   confusion[gold][predicted] (the layout confusion_matrix builds).
# Recall = examples correctly predicted as the class
#          / all examples whose gold label is the class.
# Note: class_label is the index of the class to score (0 or 1 for the
#   binary matrix used here).
# With numpy integer counts, a class with no gold examples gives nan
#   (0 / 0) instead of raising.
def recall(confusion, class_label):
  # The diagonal cell holds the examples of this class that were predicted correctly
  true_pos = confusion[class_label][class_label]
  # Row `class_label` holds every example whose gold label is this class
  gold_in_class = sum(confusion[class_label])

  return true_pos / gold_in_class

# Calculates the F1 measure (harmonic mean of precision and recall) of one
#   class from a confusion matrix.
# Note: class_label is the index of the class to score (0 or 1 for the
#   binary matrix used here).
# Returns 0.0 when precision and recall are both 0 (the class has no true
#   positives); the plain formula would otherwise evaluate 0 / 0.
def f_one_measure(confusion, class_label):
  class_precision = precision(confusion, class_label)
  class_recall = recall(confusion, class_label)

  denominator = class_precision + class_recall
  # A nan precision or recall compares unequal to 0 and therefore still
  #   propagates as nan through the formula below
  if denominator == 0:
    return 0.0

  return (2 * class_precision * class_recall) / denominator

# Macro-averaged F1: the unweighted mean of the per-class F1 measures of
#   class 0 and class 1, both taken from the same confusion matrix
def avg_f_one_measure(confusion):
  per_class_scores = [f_one_measure(confusion, label) for label in (0, 1)]

  return sum(per_class_scores) / 2

# Accuracy of a 2x2 confusion matrix: the share of all counted examples that
#   sit on the diagonal (cells [0][0] and [1][1])
def accuracy(confusion):
  # Every cell of the matrix, summed row by row
  n_examples = sum(np.sum(row) for row in confusion)

  # Diagonal cells are the examples whose prediction matched the gold label
  n_correct = confusion[1][1] + confusion[0][0]

  return n_correct / n_examples

In [None]:
###################################
#          SET CONSTANTS          #
###################################

# Length every token / label sequence is padded or truncated to
MAX_SEQUENCE_LENGTH = 50
# Step size handed to the optimiser
LEARNING_RATE = 1e-7
# Number of sentences per batch in every data loader
BATCH_SIZE = 10
# Number of distinct tags the model predicts ("O" and "I-LOC")
UNIQUE_LABELS = 2
# Number of passes over the training data
EPOCHS = 5
# CSV file holding the tagged sentences
DATA_FILE = "sentences.csv"

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_gpu = torch.cuda.device_count()

# Only ask for the GPU name when a GPU exists: querying device 0 on a
#   CPU-only machine raises instead of returning a name
if torch.cuda.is_available():
  print(torch.cuda.get_device_name(0))
else:
  print("No GPU available, using the CPU")

In [None]:
###################################
#            TOKENIZER            #
###################################

# Splits every word of `sentence` into sub-word tokens with
#   `tokenizer.tokenize` and repeats the word's label once per sub-word, so
#   the returned token list and label list always have the same length.
# `sentence` and `text_labels` are walked in lockstep; if one is longer than
#   the other, the surplus items are ignored.
# Returns a (tokens, labels) tuple of two flat lists.
def tokenize_and_preserve_labels(tokenizer, sentence, text_labels):
  pieces, piece_labels = [], []

  for word, label in zip(sentence, text_labels):
    subwords = tokenizer.tokenize(word)

    # Keep the sub-words in sentence order
    pieces += subwords

    # One copy of the word-level label for each sub-word it produced
    piece_labels += [label] * len(subwords)

  return pieces, piece_labels

# A mapping of tags to indexes: "O" -> 0, "I-LOC" -> 1
tags2index = {t:i for i,t in enumerate(["O", "I-LOC"])}
# Cased BERT word-piece tokenizer (lower-casing disabled)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# Getting the pre-generated sentences: one row per sentence, the first
#   comma-separated field is the text and the second the space-separated tags.
# NOTE(review): quoting is disabled, so a comma inside the tweet text would
#   presumably shift the fields - confirm the data file never contains one.
df = pd.read_csv(DATA_FILE, delimiter=',', header=None, names=['texts', 'tags'], quoting=csv.QUOTE_NONE)
# Explicit shuffling is left disabled here
# df = shuffle(df)

# Words and tags are both split on single spaces, one tag expected per word
sentences = [sent.split(" ") for sent in df.texts.values]
labels = [tags.split(" ") for tags in df.tags.values]

# Word-piece tokenize every sentence, repeating each word's tag per sub-word
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(tokenizer, sent, labs)
    for sent, labs in zip(sentences, labels)
]

# Unzip the (tokens, labels) pairs; `labels` now holds sub-word level tags
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

# Pad (with id 0) or truncate the token ids to MAX_SEQUENCE_LENGTH, both at the end.
# NOTE(review): the ids come straight from the word pieces; no special tokens
#   are added in this cell - confirm that is intended.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_SEQUENCE_LENGTH, dtype="long", value=0.0,
                          truncating="post", padding="post")

# Pad / truncate the tag indexes the same way.
# NOTE(review): padding uses the index of "O", so padded positions cannot be
#   told apart from real "O" tokens by their label alone.
# NOTE(review): tags2index.get returns None for a tag other than "O"/"I-LOC".
tags = pad_sequences([[tags2index.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_SEQUENCE_LENGTH, value=tags2index["O"], padding="post",
                     dtype="long", truncating="post")


# Attention mask: 1.0 for every real token, 0.0 for every padded position
#   (positions whose token id is 0)
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# Split the dataset into train (80%) and a remainder (20%)
tr_inputs, rest_inputs, tr_tags, rest_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.2, train_size=0.8)
# The masks are split by a second call with the same seed and sizes; this
#   presumably yields the same partition as the call above so the masks stay
#   aligned with their inputs - TODO confirm
tr_masks, rest_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.2, train_size=0.8)

# Halve the remainder into validation and test sets (10% of the data each)
val_inputs, test_inputs, val_tags, test_tags = train_test_split(rest_inputs, rest_tags,
                                                            random_state=2018, test_size=0.5, train_size=0.5)
val_masks, test_masks, _, _ = train_test_split(rest_masks, rest_inputs,
                                             random_state=2018, test_size=0.5, train_size=0.5)

# Show the tag distribution of the training set and the raw validation tags
print(np.unique(tr_tags, return_counts=True))
print(val_tags)
# Creating tensors for training, validation and testing sentences, labels and attention masks
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
test_inputs = torch.tensor(test_inputs)

tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
test_tags = torch.tensor(test_tags)

tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
test_masks = torch.tensor(test_masks)

# Creating train, validation and test data loaders; each batch is an
#   (input ids, attention mask, tags) triple
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(val_inputs, val_masks, val_tags)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

# NOTE(review): the test set is drawn with a RandomSampler while validation
#   uses a SequentialSampler - confirm the random order is intended
test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

print("Data Loaders preparation complete")

In [None]:
###################################
#       OPTIMISER AND LOSS        #
###################################
# Pre-trained cased BERT for token classification, predicting UNIQUE_LABELS
#   tags per token; attention weights and hidden states are not returned
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=UNIQUE_LABELS,
    output_attentions=False,
    output_hidden_states=False,
)

# Plain SGD over every model parameter
optimiser = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# NOTE(review): the training loop in this file takes its loss from the model
#   output, so this criterion looks unused - confirm before relying on it
criterion = nn.BCEWithLogitsLoss()

In [None]:
##################################
#         TRAINING LOOP          #
##################################
# Move the model to the device selected earlier (GPU when available, else
#   CPU) instead of unconditionally requiring CUDA
model.to(device)

# Store the current epoch number (equals EPOCHS once training has finished)
epochs = 0

# Average training / validation loss of every epoch, in epoch order
loss_values, validation_loss_values = [], []

# Iterate for EPOCHS epochs
for epochs in range(1, EPOCHS + 1):
  print("Processing Epoch Number: {}".format(epochs))


  ## Training ##

  # Set the model to training mode
  model.train()

  # Tracking variables (tr = training, nb = number of)
  tr_loss = 0
  nb_tr_steps = 0

  # Train the data for one epoch
  for batch in train_dataloader:
    # Move the batch to the device and unpack it
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Clear out the gradients
    optimiser.zero_grad()

    # Forward pass; because labels are supplied, the first element of the
    #   output is the loss
    outputs = model(b_input_ids, token_type_ids=None,
                    attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]

    # Backward pass
    loss.backward()

    # Update parameters and take a step
    optimiser.step()

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_steps += 1

  # Average loss for this epoch
  avg_train_loss = tr_loss / nb_tr_steps
  loss_values.append(avg_train_loss)
  print(" Train loss: {}".format(avg_train_loss))


  ## Validation ##

  # Put model in evaluation mode
  model.eval()

  eval_loss = 0
  predictions, true_labels, token_masks = [], [], []

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Cast every tensor to long, move it to the device and unpack the batch
    b_input_ids, b_input_mask, b_labels = (t.long().to(device) for t in batch)

    # Don't compute or store gradients
    with torch.no_grad():
      # Forward pass: outputs[0] is the loss, outputs[1] the per-token logits
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask, labels=b_labels)

    # Move logits to the CPU and pick the highest-scoring tag of every token
    logits = outputs[1].detach().cpu().numpy()

    eval_loss += outputs[0].mean().item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(b_labels.to('cpu').numpy())
    token_masks.extend(b_input_mask.to('cpu').numpy())

  eval_loss = eval_loss / len(validation_dataloader)
  validation_loss_values.append(eval_loss)
  print("Validation loss: {}".format(eval_loss))

  # Score real tokens only. Padded positions carry the "O" label, so they can
  #   only be told apart through the attention mask; counting them would
  #   inflate the metrics.
  preds = np.array([p_i for p, m in zip(predictions, token_masks)
                        for p_i, m_i in zip(p, m) if m_i == 1])
  gold = np.array([l_i for l, m in zip(true_labels, token_masks)
                       for l_i, m_i in zip(l, m) if m_i == 1])

  # confusion_matrix thresholds its predictions; the 0/1 tag indexes passed
  #   here come out of that thresholding unchanged
  confusion = confusion_matrix(preds, gold)
  print("Confusion matrix: {}".format(confusion))
  print("Accuracy: {}".format(accuracy(confusion)))

In [None]:
##################################
#         FINAL TESTING          #
##################################

# Put model in evaluation mode
# Put model in evaluation mode
model.eval()

eval_loss = 0
predictions, true_labels, token_masks = [], [], []

# Evaluate the held-out test data once
for batch in test_dataloader:
  # Cast every tensor to long, move it to the device and unpack the batch
  b_input_ids, b_input_mask, b_labels = (t.long().to(device) for t in batch)

  # Don't compute or store gradients
  with torch.no_grad():
    # Forward pass: outputs[0] is the loss, outputs[1] the per-token logits
    outputs = model(b_input_ids, token_type_ids=None,
                    attention_mask=b_input_mask, labels=b_labels)

  # Move logits to the CPU and pick the highest-scoring tag of every token
  logits = outputs[1].detach().cpu().numpy()

  eval_loss += outputs[0].mean().item()
  predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
  true_labels.extend(b_labels.to('cpu').numpy())
  token_masks.extend(b_input_mask.to('cpu').numpy())

# Average over the test batches. The test loss is deliberately not appended
#   to validation_loss_values, which tracks the per-epoch validation loss.
eval_loss = eval_loss / len(test_dataloader)
print("Test loss: {}".format(eval_loss))

# Score real tokens only. Padded positions carry the "O" label, so they can
#   only be told apart through the attention mask; counting them would
#   inflate the metrics.
preds = np.array([p_i for p, m in zip(predictions, token_masks)
                      for p_i, m_i in zip(p, m) if m_i == 1])
gold = np.array([l_i for l, m in zip(true_labels, token_masks)
                     for l_i, m_i in zip(l, m) if m_i == 1])

# confusion_matrix thresholds its predictions; the 0/1 tag indexes passed
#   here come out of that thresholding unchanged
confusion = confusion_matrix(preds, gold)
print(confusion)

# Per-class and averaged metrics (class 0 = "O", class 1 = "I-LOC")
acc = accuracy(confusion)
f1_zero = f_one_measure(confusion, 0)
f1_location = f_one_measure(confusion, 1)
print("F1 zero: {}".format(f1_zero))
print("F1 location: {}".format(f1_location))
f1 = avg_f_one_measure(confusion)
print("Accuracy: {}".format(acc))
print("F1: {}".format(f1))