# Named Entity Recognition
- For a given word and its context window, estimate whether the given word is location or not

# 1. Download dataset
- CoNLL2003 

In [1]:
!wget https://data.deepai.org/conll2003.zip # Download dataset
!unzip conll2003.zip # Unzip dataset zip

--2023-03-23 06:30:19--  https://data.deepai.org/conll2003.zip
Resolving data.deepai.org (data.deepai.org)... 185.93.1.250, 2400:52e0:1a00::1068:1
Connecting to data.deepai.org (data.deepai.org)|185.93.1.250|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/zip]
Saving to: ‘conll2003.zip’


2023-03-23 06:30:20 (4.17 MB/s) - ‘conll2003.zip’ saved [982975/982975]

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


## 2. Preprocess Dataset

In [3]:
with open("train.txt") as f:
  string = ''.join(f.readlines())
dataset = string.split('\n')

dataset[:70]

['-DOCSTART- -X- -X- O',
 '',
 'EU NNP B-NP B-ORG',
 'rejects VBZ B-VP O',
 'German JJ B-NP B-MISC',
 'call NN I-NP O',
 'to TO B-VP O',
 'boycott VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 '. . O O',
 '',
 'Peter NNP B-NP B-PER',
 'Blackburn NNP I-NP I-PER',
 '',
 'BRUSSELS NNP B-NP B-LOC',
 '1996-08-22 CD I-NP O',
 '',
 'The DT B-NP O',
 'European NNP I-NP B-ORG',
 'Commission NNP I-NP I-ORG',
 'said VBD B-VP O',
 'on IN B-PP O',
 'Thursday NNP B-NP O',
 'it PRP B-NP O',
 'disagreed VBD B-VP O',
 'with IN B-PP O',
 'German JJ B-NP B-MISC',
 'advice NN I-NP O',
 'to TO B-PP O',
 'consumers NNS B-NP O',
 'to TO B-VP O',
 'shun VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 'until IN B-SBAR O',
 'scientists NNS B-NP O',
 'determine VBP B-VP O',
 'whether IN B-SBAR O',
 'mad JJ B-NP O',
 'cow NN I-NP O',
 'disease NN I-NP O',
 'can MD B-VP O',
 'be VB I-VP O',
 'transmitted VBN I-VP O',
 'to TO B-PP O',
 'sheep NN B-NP O',
 '. . O O',
 '',
 'Germany NNP B-NP B

In [4]:
from itertools import groupby

dataset_in_sentence = [list(group) for k, group in groupby(dataset, lambda x: x == "") if not k]
dataset_in_sentence[:5]

[['-DOCSTART- -X- -X- O'],
 ['EU NNP B-NP B-ORG',
  'rejects VBZ B-VP O',
  'German JJ B-NP B-MISC',
  'call NN I-NP O',
  'to TO B-VP O',
  'boycott VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  '. . O O'],
 ['Peter NNP B-NP B-PER', 'Blackburn NNP I-NP I-PER'],
 ['BRUSSELS NNP B-NP B-LOC', '1996-08-22 CD I-NP O'],
 ['The DT B-NP O',
  'European NNP I-NP B-ORG',
  'Commission NNP I-NP I-ORG',
  'said VBD B-VP O',
  'on IN B-PP O',
  'Thursday NNP B-NP O',
  'it PRP B-NP O',
  'disagreed VBD B-VP O',
  'with IN B-PP O',
  'German JJ B-NP B-MISC',
  'advice NN I-NP O',
  'to TO B-PP O',
  'consumers NNS B-NP O',
  'to TO B-VP O',
  'shun VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  'until IN B-SBAR O',
  'scientists NNS B-NP O',
  'determine VBP B-VP O',
  'whether IN B-SBAR O',
  'mad JJ B-NP O',
  'cow NN I-NP O',
  'disease NN I-NP O',
  'can MD B-VP O',
  'be VB I-VP O',
  'transmitted VBN I-VP O',
  'to TO B-PP O',
  'sheep NN B-NP O',
  '. . O O']]

In [6]:
# [len(sentence) for sentence in dataset_in_sentence]
filtered_dataset = [sentence for sentence in dataset_in_sentence if len(sentence) > 5]
len(filtered_dataset)

10625

In [7]:
window_len = 2
sentence = filtered_dataset[0]

for i, word in enumerate(sentence):
  # print(word)
  splitted_word = word.split(' ')
  # print(splitted_word)
  center_word = splitted_word[0]
  label = splitted_word[-1]
  print(center_word, label)
  is_organization = label in ['B-ORG', 'I-ORG']
  # print(is_organization)
  
  prev_index = max(i - window_len, 0)
  prev_words = sentence[prev_index:i]
  prev_words = [word_str.split(' ')[0] for word_str in prev_words]

  # print(prev_words)

  next_index = i + window_len + 1
  next_words = sentence[i+1:next_index]
  # next_words = [sentence[next_index] ]
  next_words = [word_str.split(' ')[0] for word_str in next_words]

  # We have to add padding, if number of prev words or next words are shorter than expected
  if len(prev_words) != window_len:
    prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words

  if len(next_words) != window_len:
    next_words = next_words + ['<pad>'] * (window_len - len(next_words))

  concatenated_words = prev_words + [center_word] + next_words
  print(concatenated_words)


EU B-ORG
['<pad>', '<pad>', 'EU', 'rejects', 'German']
rejects O
['<pad>', 'EU', 'rejects', 'German', 'call']
German B-MISC
['EU', 'rejects', 'German', 'call', 'to']
call O
['rejects', 'German', 'call', 'to', 'boycott']
to O
['German', 'call', 'to', 'boycott', 'British']
boycott O
['call', 'to', 'boycott', 'British', 'lamb']
British B-MISC
['to', 'boycott', 'British', 'lamb', '.']
lamb O
['boycott', 'British', 'lamb', '.', '<pad>']
. O
['British', 'lamb', '.', '<pad>', '<pad>']


In [2]:
import gensim.downloader

wrd2vec = gensim.downloader.load("glove-wiki-gigaword-300")



In [12]:
len(wrd2vec)

400000

In [31]:
import torch
import numpy as np
vec_dim = 300

corresp_vectors = []
for word in concatenated_words:
  if word.lower() in wrd2vec:
    vec = wrd2vec[word.lower()]
    # vec = torch.tensor(vec)
  else:
    # vec = torch.zeros(vec_dim)
    vec = np.zeros(vec_dim)
  corresp_vectors.append(vec)

# cat_vector = torch.cat(corresp_vectors)
cat_vector = torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float).dtype

cat_vector.shape

# torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float).dtype

In [32]:
corresp_vectors[0].dtype

dtype('float32')

In [27]:
cat_vector.dtype

torch.float32

In [21]:
cat_vector, is_organization

(tensor([ 0.4436, -0.2418,  0.2366,  ...,  0.0000,  0.0000,  0.0000],
        dtype=torch.float64), False)

# Design Model

In [41]:
import torch.nn as nn

class OrgClassifier(nn.Module):
  def __init__(self, input_dim=1500, hidden_size=32):
    super().__init__()
    self.layer1 = nn.Linear(in_features=input_dim, out_features=hidden_size)
    self.layer2 = nn.Linear(in_features=hidden_size, out_features=1)
  
  def forward(self, x):
    hidden = self.layer1(x)
    hidden = torch.relu(hidden)
    out = self.layer2(hidden)
    return out.sigmoid()

model = OrgClassifier()
model(cat_vector)

tensor([0.4865], grad_fn=<SigmoidBackward0>)

In [38]:
hidden = model.layer1(cat_vector)
print(hidden)
print(hidden.shape)
hidden = torch.relu(hidden)
print(hidden)
out = model.layer2(hidden)
print(out)
print(out.shape)

tensor([-0.1375, -0.1089, -0.1297,  0.0949,  0.2792, -0.0644, -0.1344,  0.1632,
         0.2122,  0.0221, -0.1565,  0.1963, -0.2480,  0.0403, -0.0452, -0.2661,
        -0.1092, -0.2058, -0.4242,  0.0146,  0.3180,  0.0561, -0.1044,  0.0235,
        -0.0238,  0.1866, -0.0081, -0.1687, -0.0784, -0.1129,  0.0550,  0.0794],
       grad_fn=<AddBackward0>)
torch.Size([32])
tensor([0.0000, 0.0000, 0.0000, 0.0949, 0.2792, 0.0000, 0.0000, 0.1632, 0.2122,
        0.0221, 0.0000, 0.1963, 0.0000, 0.0403, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0146, 0.3180, 0.0561, 0.0000, 0.0235, 0.0000, 0.1866, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0550, 0.0794], grad_fn=<ReluBackward0>)
torch.Size([32])
tensor([-0.1466], grad_fn=<AddBackward0>)
torch.Size([1])


In [40]:
relu_hidden = torch.relu(hidden) 
relu_hidden = hidden.relu()

torch.sigmoid(out) == out.sigmoid()

tensor([True])

# Make Dataset Class

In [43]:
class Dataset:
  def __init__(self, txt_fn):
    dataset = self.read_text_data(txt_fn)
    dataset_in_sentence = self.group_by_sentence(dataset)
    filtered_dataset = [sentence for sentence in dataset_in_sentence if len(sentence) > 5]
    self.data_in_sentence = filtered_dataset
  
  def read_text_data(self, txt_fn):
    with open("train.txt") as f:
      string = ''.join(f.readlines())
    dataset = string.split('\n')
    return dataset
  
  def group_by_sentence(self, dataset):
    dataset_in_sentence = [list(group) for k, group in groupby(dataset, lambda x: x == "") if not k]
    return dataset_in_sentence

  def get_windowed_words_from_sentence(self, sentence):
    result = []
    for i, word in enumerate(sentence):
      splitted_word = word.split(' ')
      center_word = splitted_word[0]
      label = splitted_word[-1]
      is_organization = label in ['B-ORG', 'I-ORG']
      
      prev_index = max(i - window_len, 0)
      prev_words = sentence[prev_index:i]
      prev_words = [word_str.split(' ')[0] for word_str in prev_words]


      next_index = i + window_len + 1
      next_words = sentence[i+1:next_index]
      next_words = [word_str.split(' ')[0] for word_str in next_words]

      # We have to add padding, if number of prev words or next words are shorter than expected
      if len(prev_words) != window_len:
        prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words

      if len(next_words) != window_len:
        next_words = next_words + ['<pad>'] * (window_len - len(next_words))

      concatenated_words = prev_words + [center_word] + next_words
      result.append( (concatenated_words, is_organization))
    return result

dataset = Dataset("train.txt")

In [46]:
dataset.get_windowed_words_from_sentence(dataset.data_in_sentence[100])

[(['<pad>', '<pad>', 'Israel', "'s", 'Channel'], False),
 (['<pad>', 'Israel', "'s", 'Channel', 'Two'], False),
 (['Israel', "'s", 'Channel', 'Two', 'television'], True),
 (["'s", 'Channel', 'Two', 'television', 'said'], True),
 (['Channel', 'Two', 'television', 'said', 'Damascus'], False),
 (['Two', 'television', 'said', 'Damascus', 'had'], False),
 (['television', 'said', 'Damascus', 'had', 'sent'], False),
 (['said', 'Damascus', 'had', 'sent', 'a'], False),
 (['Damascus', 'had', 'sent', 'a', '"'], False),
 (['had', 'sent', 'a', '"', 'calming'], False),
 (['sent', 'a', '"', 'calming', 'signal'], False),
 (['a', '"', 'calming', 'signal', '"'], False),
 (['"', 'calming', 'signal', '"', 'to'], False),
 (['calming', 'signal', '"', 'to', 'Israel'], False),
 (['signal', '"', 'to', 'Israel', '.'], False),
 (['"', 'to', 'Israel', '.', '<pad>'], False),
 (['to', 'Israel', '.', '<pad>', '<pad>'], False)]