In [1]:
import numpy as np
import random
import torch

In [2]:
import spacy

In [3]:
print(torch.__version__)

1.10.0+cu102


In [4]:
import torchtext

In [5]:
torch.cuda.is_available()
torch.cuda.get_device_name(0)

'GeForce GTX 1050'

## Data Preparation

In [6]:
# Helper function, grab data from torchtext
#Field helps us handle how the data will be processed and sent to us.
#In this case, we set lower=True to get all words in lower case
#We will train and test our model using the UDPOS, which stands for universal dependency pos tagging
#This torchtext dataset is pre-labelled with UD pos tags
#Load the UDPOS dataset and then do the train test splitting
def get_UDPOS():
    Fields = (("text", torchtext.legacy.data.Field(lower=True)), 
              ("udtags", torchtext.legacy.data.Field(unk_token=None)), (None, None))
    train, val, test = torchtext.legacy.datasets.UDPOS.splits(Fields)
    return (train, val, test)

In [7]:
class PrepData():
    def __init__(self):
        self.text = torchtext.legacy.data.Field(lower=True)
        self.tags = torchtext.legacy.data.Field(unk_token=None)
        self.Fields = (("text", self.text), ("udtags", self.tags), (None, None))
        self.train = self.get_UDPOS()[0]
        self.val = self.get_UDPOS()[1]
        self.test = self.get_UDPOS()[2]
        self.trainIter = self.iterator()[0]
        self.valIter = self.iterator()[1]
        self.testIter = self.iterator()[2]
    def get_UDPOS(self):
        Train, Val, Test = torchtext.legacy.datasets.UDPOS.splits(self.Fields)
        return (Train, Val, Test)
    def vectorize_text(self):
        #Use pre-trained word vector from GloVe, 
        #Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download
        self.text.build_vocab(self.train, min_freq = 2, vectors = "glove.6B.50d")
    def vectorize_tags(self):
        self.tags.build_vocab(self.train)
    #BucketIterator can batch sentences of similar lengths together to minimize
    #the amount of padding needed
    #This function will define such bucket iterator to load batches of data from the UDP dataset
    def iterator(self):
        TrainIt, ValIt, TestIt = torchtext.legacy.data.BucketIterator.splits(
        (self.train, self.val, self.test),
        batch_size = 100,
        device = torch.device("cuda"))
        return (TrainIt, ValIt, TestIt)

In [8]:
#Initialize and prepare data
prepdata = PrepData()
text = prepdata.text
tags = prepdata.tags
Fields = prepdata.Fields
train, val, test = prepdata.train, prepdata.val, prepdata.test
trainIter, valIter, testIter = prepdata.trainIter, prepdata.valIter, prepdata.testIter
#prepdata.vectorize_text()
#prepdata.vectorize_tags()
text.build_vocab(train, min_freq = 2, vectors = "glove.6B.50d")
tags.build_vocab(train)

## Building Model

#### The design of our bi-lstm model looks like the plot below. The tokens will first enter an embedding layer for encoding. And then they will be fed to a bidirectional LSTM layer with pretrained weights. Last but not least, we can get predicted tags after going through the linear layer. Dropout layers are added to prevent overfitting.

In [9]:
from IPython.display import Image
Image(url= "PytorchBiLSTM.jpg", width=600, height=600)

In [10]:
# Here is the structure of the POSTagging neural network. 
# This class first initializes the variables and then the forward function implements the structure

class POSTagging(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, dropout, output_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx = 1)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_size, num_layers = 2, dropout=dropout, bidirectional=True)
        self.linear = torch.nn.Linear(hidden_size*2, output_size)
        self.dropout = torch.nn.Dropout(dropout)
    def forward(self, text):
        embedded_text = self.dropout(self.embedding(text))
        lstm_out, (hidden_state, cell_state) = self.lstm(embedded_text)
        return self.linear(self.dropout(lstm_out))

In [11]:
#initialize the network
net = POSTagging(len(text.vocab), 50, 64, 0.2, len(tags.vocab))
#load pretrained embedding values into the embedding layer
net.embedding.weight.data.copy_(text.vocab.vectors)
net.embedding.weight.data[1] = torch.zeros(50)
#define the loss to be the cross entropy loss, which works well with multi-class classification problem
criterion = torch.nn.CrossEntropyLoss(ignore_index = 0)
#throw the jobs to nvidia card
net = net.to('cuda')
criterion = criterion.to('cuda')
#We use adam for gradient descent to optimize the parameters
optimizer = torch.optim.Adam(net.parameters())

In [12]:
# The main function helps with training, validation, and testing processes.
# Each chunck of conditional loop is responsible for reading in the data (in iterator format), 
# making predictions, as well as keeping track of loss and accuracy
def main(validation, evaluation=False):
    total_loss = 0
    total_accuracy = 0
    if validation == False and evaluation == False:
        net.train()
        for i in trainIter:
            X = i.text
            y = i.udtags
            optimizer.zero_grad()
            pred = net(X)
            pred = pred.view(-1, pred.shape[-1])
            y = y.view(-1)
            loss = criterion(pred,y)
            #when calculating accuracy, one should not discard unknown tokens
            accuracy = pred.argmax(dim = 1, keepdim = True)[(y != 0).nonzero()].squeeze(1).eq(y[(y != 0).nonzero()]).sum() / y[(y != 0).nonzero()].shape[0]
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_accuracy += accuracy.item()
        training_loss = total_loss / len(trainIter)
        training_accuracy = total_accuracy / len(trainIter)
        return training_loss, training_accuracy
    elif validation == True and evaluation == False:
        net.eval()
        with torch.no_grad():
            for i in valIter:
                X = i.text
                y = i.udtags
                pred = net(X)
                pred = pred.view(-1, pred.shape[-1])
                y = y.view(-1)
                loss = criterion(pred, y)
                accuracy = pred.argmax(dim = 1, keepdim = True)[(y != 0).nonzero()].squeeze(1).eq(y[(y != 0).nonzero()]).sum() / y[(y != 0).nonzero()].shape[0]
                total_loss += loss.item()
                total_accuracy += accuracy.item()
            validation_loss = total_loss / len(valIter)
            validation_accuracy = total_accuracy / len(valIter)
            return validation_loss, validation_accuracy
    else:
        net.eval()
        with torch.no_grad():
            for i in testIter:
                X = i.text
                y = i.udtags
                pred = net(X)
                pred = pred.view(-1, pred.shape[-1])
                y = y.view(-1)
                loss = criterion(pred, y)
                accuracy = pred.argmax(dim = 1, keepdim = True)[(y != 0).nonzero()].squeeze(1).eq(y[(y != 0).nonzero()]).sum() / y[(y != 0).nonzero()].shape[0]
                total_loss += loss.item()
                total_accuracy += accuracy.item()
            testing_loss = total_loss / len(testIter)
            testing_accuracy = total_accuracy / len(testIter)
            return testing_loss, testing_accuracy

In [13]:
#Training the model with 10 epochs on GPU
#Training accuracy keeps increasing, which is a good sign
for i in range(10):
    training_loss, training_accuracy = main(False)
    validation_loss, validation_accuracy = main(True)
    testing_loss, testing_accuracy = main(False, True)
    print(f'Epoch: {i + 1:1}')
    print(f'\t Training Loss: {training_loss:.3f} , Training Accuracy: {training_accuracy:.3f}')
    print(f'\t Validation Loss: {validation_loss:.3f} ,  Validation Accuracy: {validation_accuracy:.3f}')
    print(f'\t Testing Loss: {testing_loss:.3f} ,  Testing Accuracy: {testing_accuracy:.3f}')

Epoch: 1
	 Training Loss: 1.933 , Training Accuracy: 0.390
	 Validation Loss: 1.167 ,  Validation Accuracy: 0.643
	 Testing Loss: 1.189 ,  Testing Accuracy: 0.645
Epoch: 2
	 Training Loss: 0.748 , Training Accuracy: 0.772
	 Validation Loss: 0.710 ,  Validation Accuracy: 0.792
	 Testing Loss: 0.720 ,  Testing Accuracy: 0.791
Epoch: 3
	 Training Loss: 0.485 , Training Accuracy: 0.850
	 Validation Loss: 0.597 ,  Validation Accuracy: 0.820
	 Testing Loss: 0.596 ,  Testing Accuracy: 0.825
Epoch: 4
	 Training Loss: 0.391 , Training Accuracy: 0.879
	 Validation Loss: 0.546 ,  Validation Accuracy: 0.833
	 Testing Loss: 0.540 ,  Testing Accuracy: 0.839
Epoch: 5
	 Training Loss: 0.334 , Training Accuracy: 0.896
	 Validation Loss: 0.500 ,  Validation Accuracy: 0.846
	 Testing Loss: 0.490 ,  Testing Accuracy: 0.852
Epoch: 6
	 Training Loss: 0.298 , Training Accuracy: 0.907
	 Validation Loss: 0.479 ,  Validation Accuracy: 0.849
	 Testing Loss: 0.467 ,  Testing Accuracy: 0.854
Epoch: 7
	 Training Lo

## Testing on Synthetic Data

In [14]:
#read our synthetic data file generated in another part
import pickle
with open ('test_data', 'rb') as fp:
    test_data = pickle.load(fp)

In [15]:
test_data

[[('here', 'ADV'),
  ('claim', 'NOUN'),
  ('the', 'DET'),
  ('quest', 'NOUN'),
  ('stuff', 'NOUN'),
  ('the', 'DET'),
  ('word', 'NOUN'),
  ('dilute', 'ADJ'),
  ('doll', 'NOUN'),
  ('sooner', 'ADV')],
 [('residential', 'ADJ'),
  ("''", '.'),
  ('the', 'DET'),
  ('the', 'DET'),
  ('the', 'DET'),
  ('some', 'DET'),
  ('A', 'DET'),
  ('output', 'NOUN'),
  ('deeper', 'ADJ'),
  ('the', 'DET')],
 [('him', 'PRON'),
  ('mastiff', 'CONJ'),
  ('his', 'DET'),
  ('thoroughly', 'ADV'),
  ('Among', 'ADP'),
  ('as', 'ADP'),
  ('young', 'ADJ'),
  ('An', 'DET'),
  ('desperate', 'ADJ'),
  ('Judge', 'NOUN')],
 [('the', 'DET'),
  ('by', 'ADP'),
  ('his', 'DET'),
  ('which', 'DET'),
  ('into', 'ADP'),
  ('high', 'ADJ'),
  ('that', 'PRON'),
  ('necessary', 'ADJ'),
  ('those', 'DET'),
  ('of', 'ADP')],
 [('in', 'ADP'),
  ('the', 'DET'),
  ('therapeutic', 'ADJ'),
  ('a', 'DET'),
  ('This', 'DET'),
  ('of', 'ADP'),
  ('not', 'ADV'),
  ('nothing', 'NOUN'),
  ('as', 'ADP'),
  ('The', 'DET')],
 [('His', 'DET'),
 

In [16]:
#Remove unknown tag X
for i in test_data:
    for j in i:
        if j[1]=='X':
            i.remove((j[0],j[1]))

In [17]:
#break up the input data structure into lists of sentences and tags
test_sentences = []
for i in test_data:
    test_sentence = []
    for j in range(len(i)):
        test_sentence.append(i[j][0])
    test_sentences.append(test_sentence)

In [18]:
test_tags = []
for i in test_data:
    test_tag = []
    for j in range(len(i)):
        test_tag.append(i[j][1])
    test_tags.append(test_tag)

In [19]:
#Synthetic data is based on nltk,
#Some tags are named different to torchtext's
#Fix here
for i in test_tags:
    for j in range(len(i)):
        if i[j] == '.':
            i[j] = 'PUNCT'
        if i[j] == 'PRT':
            i[j] = 'PART'

In [20]:
# Make predictions on tags using the synthetic data
pred_tags_output=[]
for i in range(len(test_sentences)):
    sentence = [i for i in test_sentences[i]]
    # Vectorize the sentence. 0 stands for unknown tokens in torchtext
    vectorized_text = [text.vocab.stoi[t] for t in [j for j in test_sentences[i]]]
    # Turn the vectorized sentence into a tensor object
    tensor = torch.LongTensor(vectorized_text).unsqueeze(-1).to('cuda')
    pred = net(tensor)
    # Make classification for each token based on the tag with the highest probability
    pred_tags = [tags.vocab.itos[i.item()] for i in pred.argmax(-1)]
    pred_tags_output.append(pred_tags)

In [21]:
# A brief look of how the model performs before checking the metrics

test_sentences[1],pred_tags_output[1],test_tags[1]

(['residential',
  "''",
  'the',
  'the',
  'the',
  'some',
  'A',
  'output',
  'deeper',
  'the'],
 ['PROPN', 'PUNCT', 'DET', 'DET', 'DET', 'DET', 'ADJ', 'NOUN', 'VERB', 'DET'],
 ['ADJ', 'PUNCT', 'DET', 'DET', 'DET', 'DET', 'DET', 'NOUN', 'ADJ', 'DET'])

In [22]:
test_sentences[7],pred_tags_output[7],test_tags[7]

(['hold', 'E', 'its', 'a', 'deferments', '2', 'the', 'the', 'and', 'work'],
 ['NOUN', 'VERB', 'PRON', 'DET', 'NOUN', 'NUM', 'DET', 'DET', 'CCONJ', 'NOUN'],
 ['VERB', 'NOUN', 'DET', 'DET', 'DET', 'NUM', 'DET', 'DET', 'CONJ', 'NOUN'])

In [23]:
#Added classifier report to the code so that it displays precision, recall, and f1 score
import sklearn.metrics

In [24]:
pred_tags_output_flat = [i for j in pred_tags_output for i in j]
#Keep consistency, torchtext divides up CONJ into CCONJ and SCONJ for instance
#Merge them into CONJ like NLTK
for i in range(len(pred_tags_output_flat)):
    if pred_tags_output_flat[i] == "CCONJ":
        pred_tags_output_flat[i] = "CONJ"
    elif pred_tags_output_flat[i] == "SCONJ":
        pred_tags_output_flat[i] = "CONJ"
    elif pred_tags_output_flat[i] == "PROPN":
        pred_tags_output_flat[i] = "NOUN"
    elif pred_tags_output_flat[i] == "AUX":
        pred_tags_output_flat[i] = "VERB"
test_tags_flat = [i for j in test_tags for i in j]
#imbalanced labels
print(f'\t Accuracy: {sklearn.metrics.accuracy_score(test_tags_flat, pred_tags_output_flat):.3f}')
print(f'\t Precision: {sklearn.metrics.precision_score(test_tags_flat, pred_tags_output_flat, average="macro", zero_division = 0):.3f}')
print(f'\t F1 Score: {sklearn.metrics.f1_score(test_tags_flat, pred_tags_output_flat, average="macro", zero_division = 0):.3f}')
print(f'\t Recall: {sklearn.metrics.recall_score(test_tags_flat, pred_tags_output_flat, average="macro", zero_division = 0):.3f}')

	 Accuracy: 0.666
	 Precision: 0.658
	 F1 Score: 0.609
	 Recall: 0.630


In [25]:
#balanced labels
print(f'\t Accuracy: {sklearn.metrics.accuracy_score(test_tags_flat, pred_tags_output_flat):.3f}')
print(f'\t Precision: {sklearn.metrics.precision_score(test_tags_flat, pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')
print(f'\t F1 Score: {sklearn.metrics.f1_score(test_tags_flat, pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')
print(f'\t Recall: {sklearn.metrics.recall_score(test_tags_flat, pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')

	 Accuracy: 0.666
	 Precision: 0.778
	 F1 Score: 0.691
	 Recall: 0.666


In [26]:
print(sklearn.metrics.classification_report(test_tags_flat, pred_tags_output_flat))

              precision    recall  f1-score   support

         ADJ       0.81      0.53      0.64       118
         ADP       0.84      0.77      0.80        66
         ADV       0.64      0.54      0.58        52
        CONJ       0.78      0.78      0.78        40
         DET       0.98      0.65      0.78       358
        NOUN       0.43      0.73      0.54       114
         NUM       0.92      0.55      0.69        20
        PART       0.25      0.05      0.09        19
        PRON       0.27      0.72      0.39        40
       PUNCT       1.00      0.88      0.93        80
        VERB       0.35      0.73      0.47        51

    accuracy                           0.67       958
   macro avg       0.66      0.63      0.61       958
weighted avg       0.78      0.67      0.69       958



## Testing on Real World Data

In [63]:
#real world data
#Using brown from nltk library
import nltk
from nltk.corpus import brown
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\jhlda\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\jhlda\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [64]:
brown = brown.tagged_sents(tagset='universal')[0:100]

In [65]:
for i in brown:
    for j in i:
        if j[1]=='X':
            i.remove((j[0],j[1]))

In [66]:
real_test_sentences = []
for i in brown:
    real_test_sentence = []
    for j in range(len(i)):
        real_test_sentence.append(i[j][0])
    real_test_sentences.append(real_test_sentence)

In [67]:
real_test_tags = []
for i in brown:
    real_test_tag = []
    for j in range(len(i)):
        real_test_tag.append(i[j][1])
    real_test_tags.append(real_test_tag)

In [68]:

for i in real_test_tags:
    for j in range(len(i)):
        if i[j] == '.':
            i[j] = 'PUNCT'
        if i[j] == 'PRT':
            i[j] = 'PART'

In [69]:
brown

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [70]:
real_pred_tags_output=[]
for i in range(len(real_test_sentences)):
    sentence = [i for i in real_test_sentences[i]]
    vectorized_text = [text.vocab.stoi[t] for t in [j for j in real_test_sentences[i]]]
    tensor = torch.LongTensor(vectorized_text).unsqueeze(-1).to('cuda')
    pred = net(tensor)
    pred_tags = [tags.vocab.itos[i.item()] for i in pred.argmax(-1)]
    real_pred_tags_output.append(pred_tags)

In [71]:
real_pred_tags_output_flat = [i for j in real_pred_tags_output for i in j]
real_test_tags_flat = [i for j in real_test_tags for i in j]
for i in range(len(real_pred_tags_output_flat)):
    if real_pred_tags_output_flat[i] == "CCONJ":
        real_pred_tags_output_flat[i] = "CONJ"
    elif real_pred_tags_output_flat[i] == "SCONJ":
        real_pred_tags_output_flat[i] = "CONJ"
    elif real_pred_tags_output_flat[i] == "PROPN":
        real_pred_tags_output_flat[i] = "NOUN"
    elif real_pred_tags_output_flat[i] == "AUX":
        real_pred_tags_output_flat[i] = "VERB"
#imbalanced labels
print(f'\t Accuracy: {sklearn.metrics.accuracy_score(real_test_tags_flat, real_pred_tags_output_flat):.3f}')
print(f'\t Precision: {sklearn.metrics.precision_score(real_test_tags_flat, real_pred_tags_output_flat, average="macro", zero_division = 0):.3f}')
print(f'\t F1 Score: {sklearn.metrics.f1_score(real_test_tags_flat, real_pred_tags_output_flat, average="macro", zero_division = 0):.3f}')
print(f'\t Recall: {sklearn.metrics.recall_score(real_test_tags_flat, real_pred_tags_output_flat, average="macro", zero_division = 0):.3f}')

	 Accuracy: 0.855
	 Precision: 0.749
	 F1 Score: 0.728
	 Recall: 0.732


In [72]:
#balanced labels
print(f'\t Accuracy: {sklearn.metrics.accuracy_score(real_test_tags_flat, real_pred_tags_output_flat):.3f}')
print(f'\t Precision: {sklearn.metrics.precision_score(real_test_tags_flat, real_pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')
print(f'\t F1 Score: {sklearn.metrics.f1_score(real_test_tags_flat, real_pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')
print(f'\t Recall: {sklearn.metrics.recall_score(real_test_tags_flat, real_pred_tags_output_flat, average="weighted", zero_division = 0):.3f}')

	 Accuracy: 0.855
	 Precision: 0.875
	 F1 Score: 0.859
	 Recall: 0.855


In [74]:
print(sklearn.metrics.classification_report(real_test_tags_flat, real_pred_tags_output_flat, zero_division = 1))

              precision    recall  f1-score   support

         ADJ       0.65      0.81      0.72       120
         ADP       0.96      0.80      0.87       256
         ADV       0.91      0.67      0.77        58
        CONJ       0.60      0.98      0.74        47
         DET       0.96      0.76      0.85       279
        NOUN       0.84      0.92      0.88       721
         NUM       0.86      0.62      0.72        39
        PART       0.85      0.70      0.77        57
        PRON       0.47      0.75      0.58        48
       PUNCT       1.00      0.86      0.93       259
        VERB       0.89      0.92      0.91       384
           X       0.00      1.00      0.00         0

    accuracy                           0.85      2268
   macro avg       0.75      0.82      0.73      2268
weighted avg       0.88      0.85      0.86      2268

