In [92]:
!nvidia-smi

Tue Nov 28 01:48:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    51W / 400W |   1067MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [93]:
!pip install --upgrade  textblob gensim pytorch-nlp swifter




In [126]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools
import sys
from textblob import TextBlob, Word
import numpy as np
import random
import re
import swifter
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import os
import pandas as pd
import gensim
import warnings
import nltk

max_length = 100
# Hyperparameters
embedding_dim = 100  # embedding dimension
hidden_dim = 100  # LSTM hidden dimensions
num_layers = 1  # number of LSTM layers
batch_size = 64  # batch size
num_epochs = 10  # number of epochs to train
lr = 0.001  # learning rate


def set_seeds_and_trace():
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  random.seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [127]:
%%writefile get_data.sh
if [ ! -f ner_dataset.csv ]; then
  wget -O ner_dataset.csv https://www.dropbox.com/s/mbfv0x988mdj89h/ner_dataset.csv?dl=0
fi


Overwriting get_data.sh


In [128]:
!bash get_data.sh

In [129]:
data= pd.read_csv("./ner_dataset.csv",encoding="latin1")
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [130]:
print("Unique Words in corpus:",data['Word'].nunique())
print("Unique Tag in corpus:",data['Tag'].nunique())

Unique Words in corpus: 35178
Unique Tag in corpus: 17


In [131]:
words = list(set(data['Word'].values))
words.append("ENDPAD")
num_words = len(words)
tags = list(set(data['Tag'].values))
num_tags = len(tags)

In [132]:
class SentenceGetter(object):
  def __init__(self,data):
    self.n_sent = 1 #counter
    self.data = data
    agg_func = lambda s:[(w,p,t) for w,p,t in zip(s['Word'].tolist(),s['POS'].tolist(),s['Tag'].tolist())]
    self.grouped = self.data.groupby("Sentence #").apply(agg_func)
    self.sentences = [s for s in self.grouped]



getter = SentenceGetter(data)
sentences = getter.sentences

In [133]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [134]:
word2idx =  {w : i+1 for i,w in enumerate(words)}
tag2idx  =  {t : i for i,t in enumerate(tags)}

In [135]:
tokenized_sentences = [[word2idx[w[0]] for w in s]for s in sentences]

tokenized_sentences[0]

[29596,
 25251,
 21031,
 16530,
 34066,
 417,
 28589,
 23689,
 15978,
 16447,
 31340,
 32557,
 14364,
 4705,
 24470,
 16447,
 18025,
 25251,
 26566,
 3620,
 9597,
 1579,
 13191,
 19103]

In [136]:
maximum_length = max([len(x) for x in tokenized_sentences])
maximum_length

104

In [137]:
pre_X = [F.pad(torch.tensor(x), (0, maximum_length-len(x)), "constant", 0) for x in tokenized_sentences]
X = torch.stack(pre_X)
X.shape

torch.Size([47959, 104])

In [138]:
tokenizer_entities = [[tag2idx[w[2]] for w in s]for s in sentences]
tokenizer_entities[0]

[16,
 16,
 16,
 16,
 16,
 16,
 7,
 16,
 16,
 16,
 16,
 16,
 7,
 16,
 16,
 16,
 16,
 16,
 9,
 16,
 16,
 16,
 16,
 16]

In [139]:
maximum_tag_length = max([len(y) for y in tokenizer_entities])
maximum_tag_length

104

In [140]:
F.pad(torch.tensor(tokenizer_entities[0]), (0, maximum_tag_length-len(tokenizer_entities[0])), "constant", tag2idx["O"])

tensor([16, 16, 16, 16, 16, 16,  7, 16, 16, 16, 16, 16,  7, 16, 16, 16, 16, 16,
         9, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16])

In [141]:
pre_y = [F.pad(torch.tensor(tokenizer_entities[i]), (0, maximum_tag_length-len(tokenizer_entities[i])), "constant", tag2idx["O"]) for i in range(X.shape[0])]
y = torch.stack(pre_y)
y.shape

torch.Size([47959, 104])

In [142]:
y_categorical = torch.stack([F.one_hot(y[0], num_classes=num_tags) for i in range(y.shape[0])])
y_categorical.shape

torch.Size([47959, 104, 17])

In [143]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y_categorical, test_size=0.1, random_state=42)

In [144]:
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)

In [145]:
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True, drop_last=True)


In [208]:
# Define a BiLSTM model for NER
class BiLSTMForNER(nn.Module):
    def __init__(self, num_words, num_tags, embedding_dim, hidden_dim=100, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Embedding layer that converts input words to embeddings
        self.word_embeddings = nn.Embedding(num_words, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states with dimensionality hidden_dim.
        # It will be bidirectional, meaning one LSTM for the forward pass and one for the backward pass.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=True, batch_first=True)

        # The linear layer maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, num_tags)

    def forward(self, sentence, hidden):
        # Get the embeddings for the sentence
        embeds = self.word_embeddings(sentence)
        # Pass the embeddings through the LSTM; lstm_out shape is (len(sentence), batch_size, hidden_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Pass the LSTM output through the linear layer to get the tag space
        tag_space = self.hidden2tag(lstm_out)
        # Convert the tag space to tag scores, which are log probabilities of the tags
        tag_scores = torch.log_softmax(tag_space, dim=1)
        return tag_scores, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(2*self.num_layers, batch_size, self.hidden_dim // 2).zero_().to(device),
            weight.new(2*self.num_layers, batch_size, self.hidden_dim // 2).zero_().to(device)
        )
        return hidden

In [209]:
model = BiLSTMForNER(num_words, num_tags, embedding_dim, hidden_dim, num_layers).to(device)


In [210]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [211]:
def kl_divergence_loss(outputs, targets, tagset_size):
    """
    Calculate the KL Divergence loss for the outputs with respect to the targets
    """
    # Flatten outputs and targets to compute the distribution loss across all batches and classes
    outputs = outputs.view(-1, tagset_size)
    targets = targets.view(-1, tagset_size)
    # Compute KL Divergence
    kl_loss = F.kl_div(F.log_softmax(outputs, dim=1), F.softmax(targets.to(float), dim=1), reduction='batchmean')
    return kl_loss

In [224]:
def train_model(model, train_dataloader, optimizer, num_tags, learning_rate, epochs):
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        hidden = model.init_hidden(batch_size)
        total_loss = 0
        for sentences, tag_indices in train_dataloader:
            sentences = sentences.to(device)
            tag_indices = tag_indices.to(device)
            # Clear the gradients before each instance
            optimizer.zero_grad()
            # Forward pass
            outputs, hidden = model(sentences, hidden)
            loss = kl_divergence_loss(outputs, tag_indices, num_tags)
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            for hidden_state in hidden:
              hidden_state.detach_()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_dataloader)}")


In [226]:
train_model(model, train_dl, optimizer, num_tags, learning_rate=lr, epochs=num_epochs)

Epoch 1/10, Loss: 0.0024895705788056696
Epoch 2/10, Loss: 0.001997616343538338
Epoch 3/10, Loss: 0.001617987942079435
Epoch 4/10, Loss: 0.001173353546994676
Epoch 5/10, Loss: 0.0007177031897923659
Epoch 6/10, Loss: 0.0007289458718095877
Epoch 7/10, Loss: 0.0005730376620197423
Epoch 8/10, Loss: 0.0005998020456972916
Epoch 9/10, Loss: 0.0005338901493125186
Epoch 10/10, Loss: 0.0005248048226053383


In [279]:
ix2tag = {ix: tag for tag, ix in tag2idx.items()}

In [287]:
def argmax(iterable):
    return max(enumerate(iterable), key=lambda x: x[1])[0]

In [302]:
from sklearn.metrics import classification_report
def evaluate_model(model, test_dataloader, ix_to_tag):
    model.eval()  # Set the model to evaluation mode
    true_tags = []
    pred_tags = []
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for sentence, tags in test_dataloader:
            sentence = sentence.to(device)
            tags = tags.to(device)
            # Forward pass
            tag_scores, hidden = model(sentence, hidden)
            # Get the predicted tags
            _, max_indices = torch.max(F.softmax(tag_scores, dim=1), dim=-1, keepdim=True)
            one_hot = torch.zeros_like(F.softmax(tag_scores, dim=1))
            one_hot.scatter_(-1, max_indices, 1)
            # Update lists of true tags and predicted tags
            true_tags.extend(tags.tolist())
            pred_tags.extend(one_hot.tolist())
    # Convert index sequences to tag name sequences


    true_tag_names = [ix_to_tag[argmax(ix)] for ix in true_tags]
    pred_tag_names = [ix_to_tag[argmax(ix)] if argmax(ix) < 17 else ix_to_tag[16] for ix in pred_tags ]

    # Calculate and print the classification report
    print(classification_report(true_tag_names, pred_tag_names))

In [303]:
evaluate_model(model, test_dl, ix2tag)

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         0
       B-eve       1.00      0.97      0.98      4736
       B-geo       0.00      0.00      0.00         0
       B-gpe       0.00      0.00      0.00         0
       B-nat       0.00      0.00      0.00         0
       B-org       0.00      0.00      0.00         0
       B-per       0.00      0.00      0.00         0
       B-tim       0.00      0.00      0.00         0
       I-art       0.00      0.00      0.00         0
       I-eve       0.00      0.00      0.00         0
       I-geo       0.00      0.00      0.00         0
       I-gpe       0.00      0.00      0.00         0
       I-nat       0.00      0.00      0.00         0
       I-org       0.00      0.00      0.00         0
       I-per       0.00      0.00      0.00         0
       I-tim       0.00      0.00      0.00         0
           O       0.00      0.00      0.00         0

    accuracy              