# Drive files

In [61]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
cd /content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf

/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf


# Imports

In [63]:
import pandas as pd
import numpy as np
import torch

# Exploring the Data

In [64]:
# display original kaggle data
#Reading pandas df
data = pd.read_csv("data/ner_dataset.csv", encoding = "ISO-8859-1")

#Reading datasets from OS
train_sents = open('data/small/train/sentences.txt', 'r').readline()
train_labels = open('data/small/train/labels.txt', 'r').readline()
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head())
#Deleting from memory just in case
del(data, train_sents, train_labels)

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


## Importing the data

In [65]:
#Read data
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = [line.strip() for line in file.readlines()]
    return data

In [66]:
#Load the datasets from the google drive
train_sentences = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/train/sentences.txt')
train_labels = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/train/labels.txt')

val_sentences = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/val/sentences.txt')
val_labels = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/val/labels.txt')

test_sentences = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/test/sentences.txt')
test_labels = load_data('/content/drive/MyDrive/SequenceModelsCoursera/W1_Test3/tf/data/large/test/labels.txt')

# Encoding

In [67]:
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

class SentenceVectorizer:
    """
    Custom word-level text encoder
    """

    #Initializing needed variables
    def __init__(self, pad_token="", unk_token="[UNK]"):
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.word2idx = {pad_token: 0, unk_token: 1}
        self.idx2word = {0: pad_token, 1: unk_token}
        self.vocab = [pad_token, unk_token]


    def fit(self, sentences):
        #Converting the single string if passed to a list for further processing
        if isinstance(sentences, str):
            sentences = [sentences]

        #Populating the dictionary with our vocabulary
        word_counts = Counter(word for sentence in sentences for word in sentence.split())
        for word, _ in word_counts.items():
            if word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
                self.idx2word[len(self.idx2word)] = word
                self.vocab.append(word)


    def transform(self, sentences):
        if isinstance(sentences, str):
            sentences = [sentences]

        #Vectorizing the words by pulling the values from the dictionary, if none is found -> assign the UNK token
        vectorized = [[self.word2idx.get(word, self.word2idx[self.unk_token])
                       for word in sentence.split()]
                      for sentence in sentences]

        #Padding to the biggest sequence received
        return pad_sequence([torch.tensor(sentence) for sentence in vectorized],
                            batch_first=True,
                            padding_value=self.word2idx[self.pad_token])


def get_sentence_vectorizer(sentences):
    torch.manual_seed(33)

    # Creating the object of the Vectorizer
    sentence_vectorizer = SentenceVectorizer()

    #Building vocabulary
    sentence_vectorizer.fit(sentences)

    # Get the vocabulary
    vocab = sentence_vectorizer.vocab

    return sentence_vectorizer, vocab

In [68]:
#Creating the vectorizer object
vectorizer, vocab = get_sentence_vectorizer(train_sentences)
vocab_size = len(vocab)

In [69]:
def tokenize_sentences(sentences):
    #Tokenizing the passed sentence
    encoded_sentences = vectorizer.transform(sentences)

    return encoded_sentences

In [70]:
# Sample sentences
sentence = "Many French citizens are goin to visit Morocco for summer"

print(f"Sentence {sentence}")

# Vectorize the sentences
vectorized_sentences = tokenize_sentences(sentence)

print("\nVectorized sentences:")
print(vectorized_sentences)

print("\nShape of vectorized sentences:")
print(vectorized_sentences.shape)

Sentence Many French citizens are goin to visit Morocco for summer
Encoded sentence shape: torch.Size([1, 10])

Vectorized sentences:
tensor([[4284,  855, 1789,  182,    1,    9, 2354, 8377,  225, 6483]])

Shape of vectorized sentences:
torch.Size([1, 10])


## Encoding the sentences

In [71]:
# Vocab size
vocab_size = len(set(word for sentence in train_sentences for word in sentence.split()))

## Encoding labels

In [72]:
print(f"Sentences: {train_sentences[0]}")
print(f"Labels: {train_labels[0]}")

Sentences: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Labels: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


In [73]:
#Creating a set of all possible tags
def get_tags(labels):
    tags = set(tag for tags in labels for tag in tags.split())
    tags = list(tags)
    tags.sort()

    return tags

In [74]:
#Creating a set of tags and saving it in a variable
tags = get_tags(train_labels)
num_tags = len(tags)
print(num_tags)

17


In [75]:
#Mapping each tag to a unique integer to encode the labels later
def make_tag_map(tags):
    tag_map = {}

    for i, tag in enumerate(tags):
        tag_map[tag] = i

    return tag_map

In [76]:
# #Mapping each tag to a unique integer to encode the labels later (saving into a var)
tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


## Encoding labels (this time fr)

## New

In [79]:
import torch.nn as nn

    #Create an encoded version of tags for a sample and pad them with -1 to the size of the biggest sequence present.

def label_vectorizer(labels, tags_map):
    #Iterating over each labels sample
    total_element_ids = []

    #Checking if input is a list of strings
    if isinstance(labels, list) and all(isinstance(item, str) for item in labels):
        for label in labels:
            element_ids = []
            tokens = label.split()

            #Iterating over each token in the same sequence targets
            for token in tokens:
                element_ids.append(tags_map[token])

            # Adding the encoded ids of one sample to the main list
            total_element_ids.append(element_ids)

    #Checking if the input is a single string
    elif isinstance(labels, str):
        element_ids = []
        tokens = labels.split()

        for token in tokens:
            element_ids.append(tags_map[token])

    #Padding
    padded_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(sequence) for sequence in total_element_ids],
                                                        batch_first=True, padding_value=-1)

    return padded_ids

In [80]:
print(f"Sentence: {train_sentences[:6]}")
print(f"Vectorized labels: {label_vectorizer(train_labels[:6], tag_map)}")

Sentence: ['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "', 'They marched from the Houses of Parliament to a rally in Hyde Park .', 'Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .', "The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .", "The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country ."]
Vectorized labels: tensor([[16, 16, 16, 16, 16, 16,  2, 16, 16, 16, 16, 16,  2, 16, 16, 16, 16, 16,
          3, 16, 16, 16, 16, 16, -1, -1, -1, -1, -1, -1],
        [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

# Building the Dataset

In [81]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class CustomDataset(Dataset):
    def __init__(self, sentences, labels, transform=tokenize_sentences, label_transform=label_vectorizer, tag_map=tag_map):
        super().__init__()
        self.transform = transform
        self.label_transform = label_transform
        self.tag_map = tag_map

        #Encoding our data
        self.sentences = self.transform(sentences)
        self.labels = self.label_transform(labels, self.tag_map)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence = self.sentences[idx]
        labels = self.labels[idx]

        return sequence, labels

In [82]:
#Creating datasets
train_dataset = CustomDataset(train_sentences, train_labels)
val_dataset = CustomDataset(val_sentences, val_labels)
test_dataset = CustomDataset(test_sentences, test_labels)

#Dataloaders for batching
BATCH_SIZE = 64

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

Encoded sentence shape: torch.Size([33570, 104])
Encoded sentence shape: torch.Size([7194, 73])
Encoded sentence shape: torch.Size([7194, 70])


In [83]:
data, labels = next(iter(train_loader))

print(f"Train Data shape: {data.size()}")
print(f"Train Labels shape: {labels.size()}")

Train Data shape: torch.Size([64, 104])
Train Labels shape: torch.Size([64, 104])


In [84]:
data, labels = next(iter(val_loader))

print(f"Validation Data shape: {data.size()}")
print(f"Validation Labels shape: {labels.size()}")

Validation Data shape: torch.Size([64, 73])
Validation Labels shape: torch.Size([64, 73])


In [85]:
data, labels = next(iter(test_loader))

print(f"Test Data shape: {data.size()}")
print(f"Test Labels shape: {labels.size()}")

Test Data shape: torch.Size([64, 70])
Test Labels shape: torch.Size([64, 70])


In [86]:
# Exploring information about the training data
print(f'The number of outputs is {len(tag_map)}')

# The number of vocabulary tokens (including <PAD>)
g_vocab_size = vocab_size
print(f"Num of vocabulary words in the training set: {g_vocab_size}")
print('The training size is', len(train_dataset))
print('The validation size is', len(val_dataset))
print('An example of the first sentence is\n\t', next(iter(train_loader))[0])
print('An example of its corresponding label is\n\t', next(iter(train_loader))[1])

The number of outputs is 17
Num of vocabulary words in the training set: 29845
The training size is 33570
The validation size is 7194
An example of the first sentence is
	 tensor([[   63,  6461,  3717,  ...,     0,     0,     0],
        [ 6504,  1491,    95,  ...,     0,     0,     0],
        [   63,   554,   818,  ...,     0,     0,     0],
        ...,
        [ 2913,    18,    25,  ...,     0,     0,     0],
        [16155, 16156,   117,  ...,     0,     0,     0],
        [13211, 10169,  4943,  ...,     0,     0,     0]])
An example of its corresponding label is
	 tensor([[16, 16, 16,  ..., -1, -1, -1],
        [ 6, 14, 16,  ..., -1, -1, -1],
        [16, 16, 16,  ..., -1, -1, -1],
        ...,
        [16, 16, 16,  ..., -1, -1, -1],
        [16, 16, 16,  ..., -1, -1, -1],
        [16, 16, 16,  ..., -1, -1, -1]])


# Building a NN

In [87]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [88]:
class NER(nn.Module):
    def __init__(self, vocab_size = vocab_size, num_tags=num_tags, embedding_dim=50):
        super().__init__()

        #Defining a neural net
        self.embedding = nn.Embedding(vocab_size+2, embedding_dim, padding_idx=0)
        self.LSTM = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim, num_tags)
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        #Defining NN's forward prop
        #print(f"just passed x shape: {x.size()}\n")
        x = self.embedding(x)
        #print(f"embedded x shape: {x.size()}\n")
        x, _ = self.LSTM(x)
        #print(f"post LSTM x shape: {x.size()}\n")
        x = self.fc(x)
        #print(f"post fc x shape: {x.size()}\n")
        outputs = self.activation(x)
        #print(f"post activated outputs shape: {outputs.size()}\n")
        outputs = outputs.transpose(1, 2)  # New shape: [64, 17, 154]

        return outputs.to(device)

In [89]:
#Creating a model
model = NER(vocab_size, len(tag_map)).to(device)

## Masked loss

In [90]:
#Defining a loss function (ignoreing padding indices -1)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [91]:
#Testing the loss function
true_labels = torch.tensor([0,1,2,0])
predicted_logits = torch.tensor([[-2.3,-0.51,-1.20] , [-1.61,-0.36,-2.30], [-2.30, -0.69,-0.92], [-0.92,-0.92,-1.61]])
print(loss_fn(predicted_logits, true_labels))

tensor(1.1243)


## Masked accuracy

In [92]:
def masked_accuracy(y_pred, y_true):
    """
    Defining a masked loss function that will account for padded values and will cancel them out
    """
    y_true = y_true.to(dtype=torch.float).to(device)

    #Creating a mask to detect padded values and cancel them out
    mask = torch.eq(y_true, -1)
    mask = 1 - mask.to(dtype=torch.float).to(device)

    #Extracting actual labels from logits
    y_pred_class = y_pred.to(dtype=torch.float).to(device)
    y_pred_class = torch.argmax(y_pred_class, dim=-2)

    #Checking the # of correctly predicted values
    matches_true_pred = torch.eq(y_pred_class, y_true)
    matches_true_pred = matches_true_pred.to(dtype=torch.float).to(device)

    #Canceling out the padded values (mask)
    matches_true_pred *= mask

    #Calculating accuracy
    masked_acc = torch.sum(matches_true_pred) / torch.sum(mask)

    return masked_acc


In [93]:
true_labels = torch.tensor([0,1,2,0])
predicted_logits = torch.tensor([[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]])
predicted_logits = torch.transpose(predicted_logits, 0, 1)
print(masked_accuracy(predicted_logits, true_labels))

tensor(0.5000)


## Optimizer & Loss

In [94]:
optimizer = torch.optim.Adam(params = model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
metric = masked_accuracy

# Training

In [95]:
for epoch in range(2):
    running_loss = 0.0

    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        #print(f"labels size: {labels.size()}")

        #Zero the parameter gradients
        optimizer.zero_grad()

        #Forward prop
        outputs = model(inputs)
        #print(f"Outputs shape: {outputs.size()}")
        loss = criterion(outputs, labels)
        accuracy = metric(outputs, labels)
        loss.backward()
        optimizer.step()

        #Print statistics
        running_loss += loss.item()

        if i % 50 == 49:
            print(f'[{epoch + 1}, {i + 1}] masked loss: {running_loss / 2000:.4f}, masked accuracy: {accuracy:.4f}')
            running_loss = 0.0

print("Finished training")

[1, 50] masked loss: 0.0222, masked accuracy: 0.8497
[1, 100] masked loss: 0.0115, masked accuracy: 0.9255
[1, 150] masked loss: 0.0083, masked accuracy: 0.9355
[1, 200] masked loss: 0.0065, masked accuracy: 0.9179
[1, 250] masked loss: 0.0057, masked accuracy: 0.9359
[1, 300] masked loss: 0.0053, masked accuracy: 0.9492
[1, 350] masked loss: 0.0048, masked accuracy: 0.9518
[1, 400] masked loss: 0.0044, masked accuracy: 0.9417
[1, 450] masked loss: 0.0042, masked accuracy: 0.9469
[1, 500] masked loss: 0.0041, masked accuracy: 0.9546
[2, 50] masked loss: 0.0033, masked accuracy: 0.9605
[2, 100] masked loss: 0.0033, masked accuracy: 0.9531
[2, 150] masked loss: 0.0031, masked accuracy: 0.9713
[2, 200] masked loss: 0.0032, masked accuracy: 0.9570
[2, 250] masked loss: 0.0031, masked accuracy: 0.9622
[2, 300] masked loss: 0.0031, masked accuracy: 0.9634
[2, 350] masked loss: 0.0030, masked accuracy: 0.9717
[2, 400] masked loss: 0.0032, masked accuracy: 0.9616
[2, 450] masked loss: 0.0030, 

## Validation

In [96]:
running_val_loss = 0.0
model.eval()

with torch.no_grad():
    for i, (inputs, labels) in enumerate(val_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        accuracy = metric(outputs, labels)

        running_val_loss += loss.item()

        if i % 20 == 19:
            print(f"[{i + 1}] masked loss: {running_val_loss:.4f}, masked val accuracy: {accuracy:.4f}")
            running_val_loss = 0.0

[20] masked loss: 4.1505, masked val accuracy: 0.9441
[40] masked loss: 3.9000, masked val accuracy: 0.9572
[60] masked loss: 3.6106, masked val accuracy: 0.9541
[80] masked loss: 3.9606, masked val accuracy: 0.9481
[100] masked loss: 3.4380, masked val accuracy: 0.9635


# Testing on our own sentence

In [97]:
def tag_map_reverse(tag_map):
    """
    Mapping indices to Named Entity tags
    """
    tag_map_reverse = {}

    for tag, idx in tag_map.items():
        tag_map_reverse[idx] = tag

    return tag_map_reverse


idx_to_tags = tag_map_reverse(tag_map)

In [100]:
def predict(sentence, model, sentence_vectorizer, tag_map):
    """
    Predicting NER tags on a custom function
    """
    vectorized_sentence = sentence_vectorizer(sentence)
    print(vectorized_sentence.size())

    outputs = model(vectorized_sentence.to(device))
    print(outputs.size())

    tag_ids = torch.argmax(outputs, axis=1)
    tag_ids = torch.squeeze(tag_ids, dim=0)

    tag_predictions = []

    for id in tag_ids:
        tag_predictions.append(tag_map[id.item()])

    return tag_predictions

In [101]:
sentence = "Peter Parker , the White House director of trade and manufacturing policy of U.S , said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall , though he said it wouldn ’t necessarily come"

predictions = predict(sentence, model, tokenize_sentences, idx_to_tags)

print(f"Sentence: {sentence}")
print(f"Predicted labels: {predictions}")

for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x,y)

Encoded sentence shape: torch.Size([1, 52])
torch.Size([1, 52])
torch.Size([1, 17, 52])
Sentence: Peter Parker , the White House director of trade and manufacturing policy of U.S , said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall , though he said it wouldn ’t necessarily come
Predicted labels: ['B-per', 'I-per', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Peter B-per
Parker I-per
White B-org
House I-org
U.S B-org
Sunday B-tim
morning I-tim
White B-org
House I-org
