<a href="https://colab.research.google.com/github/jdasam/aat3020-2023/blob/main/notebooks/2_Named_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition
- For a given word and its context window, estimate whether the given word is part of an organization name or not

# 1. Download dataset
- CoNLL2003 

In [1]:
!wget -nc https://data.deepai.org/conll2003.zip # Download dataset; -nc skips the download when conll2003.zip already exists
!unzip -o conll2003.zip # Unzip dataset zip; -o overwrites earlier extractions instead of prompting on re-run

--2023-03-28 06:07:10--  https://data.deepai.org/conll2003.zip
Resolving data.deepai.org (data.deepai.org)... 156.146.56.169, 2400:52e0:1500::977:1
Connecting to data.deepai.org (data.deepai.org)|156.146.56.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/zip]
Saving to: ‘conll2003.zip’


2023-03-28 06:07:10 (74.4 MB/s) - ‘conll2003.zip’ saved [982975/982975]

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


## 2. Preprocess Dataset

In [3]:
# Read the whole training file in one go, then break it into individual lines.
# Each non-empty line is one annotated token; empty lines separate sentences.
with open("train.txt") as f:
  string = f.read()
dataset = string.split('\n')

# Preview the first 70 lines.
dataset[:70]

['-DOCSTART- -X- -X- O',
 '',
 'EU NNP B-NP B-ORG',
 'rejects VBZ B-VP O',
 'German JJ B-NP B-MISC',
 'call NN I-NP O',
 'to TO B-VP O',
 'boycott VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 '. . O O',
 '',
 'Peter NNP B-NP B-PER',
 'Blackburn NNP I-NP I-PER',
 '',
 'BRUSSELS NNP B-NP B-LOC',
 '1996-08-22 CD I-NP O',
 '',
 'The DT B-NP O',
 'European NNP I-NP B-ORG',
 'Commission NNP I-NP I-ORG',
 'said VBD B-VP O',
 'on IN B-PP O',
 'Thursday NNP B-NP O',
 'it PRP B-NP O',
 'disagreed VBD B-VP O',
 'with IN B-PP O',
 'German JJ B-NP B-MISC',
 'advice NN I-NP O',
 'to TO B-PP O',
 'consumers NNS B-NP O',
 'to TO B-VP O',
 'shun VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 'until IN B-SBAR O',
 'scientists NNS B-NP O',
 'determine VBP B-VP O',
 'whether IN B-SBAR O',
 'mad JJ B-NP O',
 'cow NN I-NP O',
 'disease NN I-NP O',
 'can MD B-VP O',
 'be VB I-VP O',
 'transmitted VBN I-VP O',
 'to TO B-PP O',
 'sheep NN B-NP O',
 '. . O O',
 '',
 'Germany NNP B-NP B

In [4]:
from itertools import groupby

# Empty lines act as sentence separators: group consecutive lines by
# "is this line empty?" and keep only the non-empty runs, one per sentence.
dataset_in_sentence = []
for is_separator, lines in groupby(dataset, key=lambda line: line == ""):
  if not is_separator:
    dataset_in_sentence.append(list(lines))
dataset_in_sentence[:5]

[['-DOCSTART- -X- -X- O'],
 ['EU NNP B-NP B-ORG',
  'rejects VBZ B-VP O',
  'German JJ B-NP B-MISC',
  'call NN I-NP O',
  'to TO B-VP O',
  'boycott VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  '. . O O'],
 ['Peter NNP B-NP B-PER', 'Blackburn NNP I-NP I-PER'],
 ['BRUSSELS NNP B-NP B-LOC', '1996-08-22 CD I-NP O'],
 ['The DT B-NP O',
  'European NNP I-NP B-ORG',
  'Commission NNP I-NP I-ORG',
  'said VBD B-VP O',
  'on IN B-PP O',
  'Thursday NNP B-NP O',
  'it PRP B-NP O',
  'disagreed VBD B-VP O',
  'with IN B-PP O',
  'German JJ B-NP B-MISC',
  'advice NN I-NP O',
  'to TO B-PP O',
  'consumers NNS B-NP O',
  'to TO B-VP O',
  'shun VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  'until IN B-SBAR O',
  'scientists NNS B-NP O',
  'determine VBP B-VP O',
  'whether IN B-SBAR O',
  'mad JJ B-NP O',
  'cow NN I-NP O',
  'disease NN I-NP O',
  'can MD B-VP O',
  'be VB I-VP O',
  'transmitted VBN I-VP O',
  'to TO B-PP O',
  'sheep NN B-NP O',
  '. . O O']]

In [5]:
# Keep only sentences with more than five tokens; this discards the
# one-line -DOCSTART- markers and other very short fragments.
filtered_dataset = list(filter(lambda tokens: len(tokens) > 5, dataset_in_sentence))
len(filtered_dataset)

10625

In [6]:
window_len = 2  # number of context words taken on each side of the center word
sentence = filtered_dataset[0]

# The surface word is the first space-separated column of every line;
# extract it once so the loop can slice plain words.
tokens = [line.split(' ')[0] for line in sentence]

for i, word in enumerate(sentence):
  center_word = tokens[i]
  label = word.split(' ')[-1]  # the NER tag is the last column
  print(center_word, label)
  is_organization = label in ('B-ORG', 'I-ORG')

  # Neighbours on either side; the slices are clipped at the sentence boundaries.
  prev_words = tokens[max(i - window_len, 0):i]
  next_words = tokens[i + 1:i + window_len + 1]

  # Pad so that each side always holds exactly window_len entries
  # (multiplying the pad list by zero adds nothing when a side is already full).
  prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words
  next_words = next_words + ['<pad>'] * (window_len - len(next_words))

  concatenated_words = prev_words + [center_word] + next_words
  print(concatenated_words)


EU B-ORG
['<pad>', '<pad>', 'EU', 'rejects', 'German']
rejects O
['<pad>', 'EU', 'rejects', 'German', 'call']
German B-MISC
['EU', 'rejects', 'German', 'call', 'to']
call O
['rejects', 'German', 'call', 'to', 'boycott']
to O
['German', 'call', 'to', 'boycott', 'British']
boycott O
['call', 'to', 'boycott', 'British', 'lamb']
British B-MISC
['to', 'boycott', 'British', 'lamb', '.']
lamb O
['boycott', 'British', 'lamb', '.', '<pad>']
. O
['British', 'lamb', '.', '<pad>', '<pad>']


In [7]:
import gensim.downloader

# Load the pre-trained "glove-wiki-gigaword-300" word vectors via gensim's downloader.
# The returned object is used below as a lookup table:
# `word in wrd2vec`, `wrd2vec[word]`, `len(wrd2vec)` and `wrd2vec.vector_size`.
wrd2vec = gensim.downloader.load("glove-wiki-gigaword-300")



In [8]:
len(wrd2vec)

400000

In [9]:
import torch
import numpy as np
vec_dim = 300  # length of one word vector

# Look up every word of the window in lower case; words that are not in the
# wrd2vec vocabulary (such as <pad>) are represented by an all-zero vector.
corresp_vectors = [
  wrd2vec[token.lower()] if token.lower() in wrd2vec else np.zeros(vec_dim)
  for token in concatenated_words
]

# Join the per-word vectors end to end into one flat float tensor.
cat_vector = torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float)
cat_vector.shape

# torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float).dtype

torch.Size([1500])

In [10]:
cat_vector

tensor([ 0.4436, -0.2418,  0.2366,  ...,  0.0000,  0.0000,  0.0000])

In [12]:
# pair of data sample (input) and the label (desired output)
cat_vector, is_organization

(tensor([ 0.4436, -0.2418,  0.2366,  ...,  0.0000,  0.0000,  0.0000]), False)

# Design Model

In [14]:
import torch.nn as nn

class OrgClassifier(nn.Module):
  """Two-layer perceptron that maps a window vector to a probability.

  Args:
    input_dim: length of the input vector (last dimension of ``x``).
    hidden_size: number of units in the hidden layer.
  """

  def __init__(self, input_dim=1500, hidden_size=32):
    super().__init__()
    self.layer1 = nn.Linear(input_dim, hidden_size)
    self.layer2 = nn.Linear(hidden_size, 1)

  def forward(self, x):
    """Return sigmoid(layer2(relu(layer1(x)))); the last dimension of the result is 1."""
    activated = self.layer1(x).relu()
    logit = self.layer2(activated)
    return torch.sigmoid(logit)

# Sanity check: push the single window vector built above through a freshly
# initialised (untrained) model and look at the input/output shapes.
model = OrgClassifier()
out = model(cat_vector)
print(cat_vector.shape, out.shape, out)

torch.Size([1500]) torch.Size([1]) tensor([0.4609], grad_fn=<SigmoidBackward0>)


In [15]:
# Replay the forward pass one operation at a time to inspect the intermediate values.
hidden = model.layer1(cat_vector)  # output of the first linear layer, before the activation
print(hidden)
print(hidden.shape)
hidden = torch.relu(hidden) # You have to put non-linear operation between layers
print(hidden)
out = model.layer2(hidden)  # output of the second linear layer; no sigmoid is applied here
print(out)
print(out.shape)

tensor([-0.0178, -0.2095, -0.3372,  0.1973,  0.0235, -0.1307,  0.1096, -0.3513,
         0.0741, -0.1546, -0.1529, -0.1720,  0.0741,  0.2228,  0.3113, -0.0898,
        -0.1379,  0.2494,  0.1218, -0.0782,  0.0012,  0.0158, -0.0357, -0.1224,
        -0.1205, -0.0797,  0.1083, -0.0118,  0.2305,  0.1643, -0.0163,  0.1856],
       grad_fn=<AddBackward0>)
torch.Size([32])
tensor([0.0000, 0.0000, 0.0000, 0.1973, 0.0235, 0.0000, 0.1096, 0.0000, 0.0741,
        0.0000, 0.0000, 0.0000, 0.0741, 0.2228, 0.3113, 0.0000, 0.0000, 0.2494,
        0.1218, 0.0000, 0.0012, 0.0158, 0.0000, 0.0000, 0.0000, 0.0000, 0.1083,
        0.0000, 0.2305, 0.1643, 0.0000, 0.1856], grad_fn=<ReluBackward0>)
tensor([-0.1568], grad_fn=<AddBackward0>)
torch.Size([1])


In [35]:
# How nn.Linear works
# Reference value: the layer's own output for the ReLU-activated hidden vector.
out = model.layer2(hidden)

# Let's get the same value by matrix multiplication
# model.layer2.weight, model.layer2.bias
for param in model.layer2.named_parameters():  # yields (name, parameter) pairs;
  # a_module.parameters() yields the parameters alone
  print(param)

print(hidden.shape, model.layer2.weight.shape)
# Add a leading dimension so the hidden vector becomes a one-row matrix.
hidden_mat = hidden.unsqueeze(0)
print(hidden_mat, hidden_mat.shape)
# Multiply by the transposed weight matrix to get the weighted sum.
weighted_sum = torch.mm(hidden_mat, model.layer2.weight.T ) # torch.mm is much more strict than torch.matmul

# The same weighted sum, accumulated element by element with plain Python floats.
weighted_sum_forloop = 0
for x, w in zip(hidden, model.layer2.weight[0]):
  # print(x.item(), w.item())
  weighted_input = x.item() * w.item()
  weighted_sum_forloop += weighted_input

print(weighted_sum_forloop, weighted_sum)

# Adding the bias to the weighted sum should reproduce `out` from the top of the cell.
print(model.layer2.bias)
final_output = weighted_sum + model.layer2.bias
print(final_output, out)

('weight', Parameter containing:
tensor([[-0.1497,  0.1098,  0.0792,  0.1644, -0.0683, -0.0363,  0.0626,  0.1616,
          0.0830,  0.0584,  0.0949, -0.0562,  0.1452, -0.0965, -0.1577,  0.1127,
         -0.1460,  0.0594,  0.1732,  0.1479,  0.1643, -0.0771,  0.1436, -0.0760,
         -0.1402, -0.1011,  0.0470,  0.0520, -0.0498,  0.1345, -0.0660, -0.0856]],
       requires_grad=True))
('bias', Parameter containing:
tensor([-0.1756], requires_grad=True))
torch.Size([32]) torch.Size([1, 32])
tensor([[0.0000, 0.0000, 0.0000, 0.1973, 0.0235, 0.0000, 0.1096, 0.0000, 0.0741,
         0.0000, 0.0000, 0.0000, 0.0741, 0.2228, 0.3113, 0.0000, 0.0000, 0.2494,
         0.1218, 0.0000, 0.0012, 0.0158, 0.0000, 0.0000, 0.0000, 0.0000, 0.1083,
         0.0000, 0.2305, 0.1643, 0.0000, 0.1856]],
       grad_fn=<UnsqueezeBackward0>) torch.Size([1, 32])
0.01875207416171904 tensor([[0.0188]], grad_fn=<MmBackward0>)
Parameter containing:
tensor([-0.1756], requires_grad=True)
tensor([[-0.1568]], grad_fn=<AddB

In [None]:
# An activation can be called as a torch function or as a tensor method.
relu_hidden = torch.relu(hidden) 
relu_hidden = hidden.relu()

# Element-wise comparison of the two ways of calling sigmoid.
torch.sigmoid(out) == out.sigmoid()

tensor([True])

# Make Dataset Class

In [56]:
class Dataset:
  """Sliding-window samples for binary "is this word part of an organization?" tagging.

  Reads an annotated text file (one token per line, empty line between
  sentences, word in the first space-separated column and NER tag in the
  last), keeps the sentences longer than five tokens, and turns every token
  into one sample: the word together with ``window_len`` neighbours on each
  side, labelled True when its tag is B-ORG or I-ORG.

  Args:
    txt_fn: path of the annotated text file to read.
    wrd2vec: word-vector lookup supporting ``word in wrd2vec``,
      ``wrd2vec[word]`` and a ``vector_size`` attribute.
    window_len: number of context words on each side of the center word.
  """

  def __init__(self, txt_fn, wrd2vec, window_len=2):
    # Set the lookup attributes first: the windowing below reads self.window_len.
    self.wrd2vec = wrd2vec
    self.vec_size = wrd2vec.vector_size  # length of a single word vector
    self.window_len = window_len

    dataset = self.read_text_data(txt_fn)
    dataset_in_sentence = self.group_by_sentence(dataset)
    # Sentences of five tokens or fewer are discarded.
    self.data_in_sentence = [sentence for sentence in dataset_in_sentence if len(sentence) > 5]

    # For every sentence, make (windowed_words, label) pairs.
    total_windowed_words = []
    for sentence in self.data_in_sentence:
      total_windowed_words.extend(self.get_windowed_words_from_sentence(sentence))
    self.data = total_windowed_words

  def read_text_data(self, txt_fn):
    """Return the lines of the file at ``txt_fn`` (split on newline characters)."""
    with open(txt_fn) as f:
      return f.read().split('\n')

  def group_by_sentence(self, dataset):
    """Split a flat list of lines into sentences, using empty lines as separators."""
    return [list(group) for is_blank, group in groupby(dataset, lambda line: line == "") if not is_blank]

  def get_windowed_words_from_sentence(self, sentence):
    """Build one (window, label) pair per token of ``sentence``.

    Each window is a list of ``2 * window_len + 1`` words centred on the
    token, padded with '<pad>' where the window runs past either end of the
    sentence. The label is True when the token's tag is B-ORG or I-ORG.
    """
    window_len = self.window_len
    words = [line.split(' ')[0] for line in sentence]

    result = []
    for i, line in enumerate(sentence):
      label = line.split(' ')[-1]
      is_organization = label in ('B-ORG', 'I-ORG')

      prev_words = words[max(i - window_len, 0):i]
      next_words = words[i + 1:i + window_len + 1]

      # Pad whichever side was cut short by a sentence boundary.
      prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words
      next_words = next_words + ['<pad>'] * (window_len - len(next_words))

      result.append((prev_words + [words[i]] + next_words, is_organization))
    return result

  def __len__(self): # number of independent data samples
    return len(self.data)

  def __getitem__(self, idx):
    """Return (window vector, label) for sample ``idx``; called by ``dataset[idx]``."""
    cat_words, label = self.data[idx]
    return self.convert_windowed_words_to_vector(cat_words), label

  def convert_windowed_words_to_vector(self, cat_words):
    """Concatenate the word vectors of ``cat_words`` into one flat float tensor.

    cat_words: list of strings,
      e.g. ['<pad>', '<pad>', 'EU', 'rejects', 'German']
    Words are looked up in lower case; words missing from the vocabulary
    (such as '<pad>') contribute a zero vector of length ``self.vec_size``.
    """
    corresp_vectors = []
    for word in cat_words:
      key = word.lower()
      if key in self.wrd2vec: # the word exists in the wrd2vec vocab
        vec = self.wrd2vec[key]
      else: # no matching word in the wrd2vec vocab, such as <pad>
        vec = np.zeros(self.vec_size)
      corresp_vectors.append(vec)
    return torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float)

dataset = Dataset("train.txt", wrd2vec)

In [55]:
len(dataset.data), dataset.data[0]

(192587, (['<pad>', '<pad>', 'EU', 'rejects', 'German'], True))

In [57]:
len(dataset), dataset[0]

(192587,
 (tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.1950,  0.2041,  0.3530]), True))

In [59]:
dataset[100]

(tensor([-0.2295,  0.3255, -0.0927,  ..., -0.3422, -0.0224,  0.1368]), False)

In [52]:
wrd2vec.vector_size

300

In [38]:
# Our method can make training samples from a given sentence in data_in_sentence
dataset.get_windowed_words_from_sentence(dataset.data_in_sentence[100])

[(['<pad>', '<pad>', 'Israel', "'s", 'Channel'], False),
 (['<pad>', 'Israel', "'s", 'Channel', 'Two'], False),
 (['Israel', "'s", 'Channel', 'Two', 'television'], True),
 (["'s", 'Channel', 'Two', 'television', 'said'], True),
 (['Channel', 'Two', 'television', 'said', 'Damascus'], False),
 (['Two', 'television', 'said', 'Damascus', 'had'], False),
 (['television', 'said', 'Damascus', 'had', 'sent'], False),
 (['said', 'Damascus', 'had', 'sent', 'a'], False),
 (['Damascus', 'had', 'sent', 'a', '"'], False),
 (['had', 'sent', 'a', '"', 'calming'], False),
 (['sent', 'a', '"', 'calming', 'signal'], False),
 (['a', '"', 'calming', 'signal', '"'], False),
 (['"', 'calming', 'signal', '"', 'to'], False),
 (['calming', 'signal', '"', 'to', 'Israel'], False),
 (['signal', '"', 'to', 'Israel', '.'], False),
 (['"', 'to', 'Israel', '.', '<pad>'], False),
 (['to', 'Israel', '.', '<pad>', '<pad>'], False)]

In [61]:
# use data loader

from torch.utils.data import DataLoader

# Wrap the dataset so it is served in shuffled mini-batches of 16 samples;
# drop_last=True discards a final batch that would hold fewer than 16.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, drop_last=True)

# Print only the first batch to see its structure; `batch` stays bound for the cells below.
for batch in dataloader:
  print(batch)
  break

[tensor([[-0.2456,  0.0680,  0.1825,  ..., -0.2329, -0.1223,  0.3550],
        [-0.0702,  0.3243,  0.0081,  ..., -0.2550,  0.0078, -0.6203],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.2004, -0.0822, -0.0626],
        ...,
        [-0.6533,  0.2088,  0.0180,  ..., -0.3988, -0.8299,  0.1757],
        [-0.0268,  1.0621, -0.0129,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ..., -1.2129,  0.7766,  0.1933]]), tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])]


In [62]:
input_tensors = batch[0]
labels = batch[1]

input_tensors.shape, labels.shape

(torch.Size([16, 1500]), torch.Size([16]))

In [64]:
# Check that our model can compute the input batch
pred = model(input_tensors)
pred.shape, pred

(torch.Size([16, 1]), tensor([[0.4473],
         [0.4213],
         [0.4807],
         [0.4716],
         [0.4324],
         [0.4557],
         [0.4618],
         [0.4389],
         [0.4836],
         [0.4591],
         [0.4559],
         [0.4601],
         [0.4466],
         [0.4591],
         [0.4678],
         [0.4732]], grad_fn=<SigmoidBackward0>))

In [71]:
labels.dtype, labels, labels.float()

(torch.bool,
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [75]:
print(pred)
print(pred.squeeze())

tensor([[0.4473],
        [0.4213],
        [0.4807],
        [0.4716],
        [0.4324],
        [0.4557],
        [0.4618],
        [0.4389],
        [0.4836],
        [0.4591],
        [0.4559],
        [0.4601],
        [0.4466],
        [0.4591],
        [0.4678],
        [0.4732]], grad_fn=<SigmoidBackward0>)
tensor([0.4473, 0.4213, 0.4807, 0.4716, 0.4324, 0.4557, 0.4618, 0.4389, 0.4836,
        0.4591, 0.4559, 0.4601, 0.4466, 0.4591, 0.4678, 0.4732],
       grad_fn=<SqueezeBackward0>)


In [78]:
# Calculate loss
print(pred.shape, labels.shape)

def get_binary_cross_entropy_loss(pred, label):
  """Element-wise binary cross-entropy between probabilities and 0/1 targets.

  Args:
    pred: tensor of predicted probabilities in [0, 1].
    label: float tensor of targets (0. or 1.), broadcastable with ``pred``.

  Returns:
    Tensor of per-element losses (no reduction is applied).
  """
  # log(0) is -inf, and 0 * inf is nan, so a saturated prediction would poison
  # the mean loss. Flooring each log term at -100 (the same floor
  # torch.nn.BCELoss uses) keeps the loss finite and leaves every
  # non-saturated value unchanged.
  log_pred = torch.log(pred).clamp(min=-100)
  log_not_pred = torch.log(1 - pred).clamp(min=-100)
  return label * (-log_pred) + (1 - label) * (-log_not_pred)

loss = get_binary_cross_entropy_loss(pred.squeeze(), labels.float())
loss = loss.mean() # take mean
loss

torch.Size([16, 1]) torch.Size([16])


tensor(0.6115, grad_fn=<MeanBackward0>)

In [80]:
# Check how gradient looks like before the backpropagation
print(model.layer1.weight.grad) # print None

None


In [81]:
# backpropagate the loss 
loss.backward()

In [82]:
# Check how gradient looks like after the backpropagation
print(model.layer1.weight.grad)

tensor([[ 7.3340e-03, -1.1900e-02, -9.1739e-04,  ...,  1.2517e-02,
         -2.4642e-03,  1.0083e-03],
        [-1.5112e-03,  1.8738e-03,  1.3736e-03,  ..., -1.9040e-03,
         -2.5207e-05,  1.3174e-03],
        [-3.2698e-03,  2.6332e-03, -7.1477e-05,  ..., -8.9265e-03,
          1.2005e-03,  1.9822e-03],
        ...,
        [-6.9544e-03,  1.2590e-02, -1.8057e-03,  ..., -1.2442e-02,
          3.1392e-04, -3.0374e-03],
        [ 3.0124e-03, -5.6204e-03,  1.3385e-03,  ...,  7.1443e-03,
          8.1332e-04,  2.6787e-04],
        [-1.9728e-04, -3.6963e-04, -9.7506e-06,  ...,  3.8729e-03,
          1.2328e-04, -1.8352e-04]])


In [84]:
model.layer1.weight.grad.shape, model.layer1.weight.shape

(torch.Size([32, 1500]), torch.Size([32, 1500]))

In [85]:
# manual update: one plain gradient-descent step on layer1's weights.
# torch.no_grad() keeps the in-place change out of the autograd graph without
# going through `.data`, which hides the modification from autograd entirely.
learning_rate = 0.001
with torch.no_grad():
  model.layer1.weight -= learning_rate * model.layer1.weight.grad

SyntaxError: ignored

In [None]:
# use an optimizer from torch.optim (e.g. torch.optim.SGD) instead of the manual update