<a href="https://colab.research.google.com/github/jdasam/aat3020/blob/main/notebooks/2_named_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition
- For a given word and its context window, estimate whether the given word is part of an organization name or not

## 1. Download dataset
- CoNLL2003

In [1]:
!wget https://data.deepai.org/conll2003.zip # Download dataset
!unzip conll2003.zip # Unzip dataset zip

--2024-04-02 06:43:02--  https://data.deepai.org/conll2003.zip
Resolving data.deepai.org (data.deepai.org)... 143.244.50.87, 2400:52e0:1a01::993:1
Connecting to data.deepai.org (data.deepai.org)|143.244.50.87|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/zip]
Saving to: ‘conll2003.zip’


2024-04-02 06:43:03 (5.17 MB/s) - ‘conll2003.zip’ saved [982975/982975]

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


## 2. Preprocess Dataset

In [2]:
# Read the whole training file in a single call, then cut it into raw lines.
# Each non-empty line is "word POS chunk NER"; empty lines separate sentences.
with open("train.txt") as f:
  string = f.read()
dataset = string.split('\n')

dataset[:70]

['-DOCSTART- -X- -X- O',
 '',
 'EU NNP B-NP B-ORG',
 'rejects VBZ B-VP O',
 'German JJ B-NP B-MISC',
 'call NN I-NP O',
 'to TO B-VP O',
 'boycott VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 '. . O O',
 '',
 'Peter NNP B-NP B-PER',
 'Blackburn NNP I-NP I-PER',
 '',
 'BRUSSELS NNP B-NP B-LOC',
 '1996-08-22 CD I-NP O',
 '',
 'The DT B-NP O',
 'European NNP I-NP B-ORG',
 'Commission NNP I-NP I-ORG',
 'said VBD B-VP O',
 'on IN B-PP O',
 'Thursday NNP B-NP O',
 'it PRP B-NP O',
 'disagreed VBD B-VP O',
 'with IN B-PP O',
 'German JJ B-NP B-MISC',
 'advice NN I-NP O',
 'to TO B-PP O',
 'consumers NNS B-NP O',
 'to TO B-VP O',
 'shun VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 'until IN B-SBAR O',
 'scientists NNS B-NP O',
 'determine VBP B-VP O',
 'whether IN B-SBAR O',
 'mad JJ B-NP O',
 'cow NN I-NP O',
 'disease NN I-NP O',
 'can MD B-VP O',
 'be VB I-VP O',
 'transmitted VBN I-VP O',
 'to TO B-PP O',
 'sheep NN B-NP O',
 '. . O O',
 '',
 'Germany NNP B-NP B

In [3]:
from itertools import groupby

# Empty strings mark sentence boundaries: group consecutive lines by "is this line empty?"
# and keep only the runs of non-empty lines, one list per sentence.
dataset_in_sentence = [
  list(lines)
  for is_blank, lines in groupby(dataset, key=lambda line: line == "")
  if not is_blank
]
dataset_in_sentence[:5]

[['-DOCSTART- -X- -X- O'],
 ['EU NNP B-NP B-ORG',
  'rejects VBZ B-VP O',
  'German JJ B-NP B-MISC',
  'call NN I-NP O',
  'to TO B-VP O',
  'boycott VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  '. . O O'],
 ['Peter NNP B-NP B-PER', 'Blackburn NNP I-NP I-PER'],
 ['BRUSSELS NNP B-NP B-LOC', '1996-08-22 CD I-NP O'],
 ['The DT B-NP O',
  'European NNP I-NP B-ORG',
  'Commission NNP I-NP I-ORG',
  'said VBD B-VP O',
  'on IN B-PP O',
  'Thursday NNP B-NP O',
  'it PRP B-NP O',
  'disagreed VBD B-VP O',
  'with IN B-PP O',
  'German JJ B-NP B-MISC',
  'advice NN I-NP O',
  'to TO B-PP O',
  'consumers NNS B-NP O',
  'to TO B-VP O',
  'shun VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  'until IN B-SBAR O',
  'scientists NNS B-NP O',
  'determine VBP B-VP O',
  'whether IN B-SBAR O',
  'mad JJ B-NP O',
  'cow NN I-NP O',
  'disease NN I-NP O',
  'can MD B-VP O',
  'be VB I-VP O',
  'transmitted VBN I-VP O',
  'to TO B-PP O',
  'sheep NN B-NP O',
  '. . O O']]

In [4]:
# Keep only sentences with more than five tokens; this also discards the
# one-line '-DOCSTART-' document markers.
filtered_dataset = list(filter(lambda sentence: len(sentence) > 5, dataset_in_sentence))
len(filtered_dataset)

10625

In [5]:
# Inspect the sentence at index 1000 of the filtered dataset as a sanity check.
filtered_dataset[1000]

['" " O O',
 'I PRP B-NP O',
 'think VBP B-VP O',
 'this DT B-NP O',
 'is VBZ B-VP O',
 'a DT B-NP O',
 'bad JJ I-NP O',
 'beginning NN I-NP O',
 '. . O O']

In [9]:
# Step-by-step walkthrough of the windowing logic on a single sentence.
window_len = 2  # number of context words taken on each side of the center word
sentence = filtered_dataset[0]

for i, word in enumerate(sentence):
  print(f'word is {word}')
  splitted_word = word.split(' ')
  print(f'splitted_word is {splitted_word}')
  # First field is the token itself, last field is its NER tag
  center_word, label = splitted_word[0], splitted_word[-1]
  print(center_word, label)
  is_organization = label in ('B-ORG', 'I-ORG')
  print(f"is organization: {is_organization}")

  # Left context: up to window_len tokens before position i (slice start clipped at zero)
  prev_words = [word_str.split(' ')[0] for word_str in sentence[max(i - window_len, 0):i]]

  # Right context: up to window_len tokens after position i (slicing past the end is safe)
  next_words = [word_str.split(' ')[0] for word_str in sentence[i + 1:i + window_len + 1]]

  # Pad both sides up to window_len; multiplying by zero adds nothing when already full
  prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words
  next_words = next_words + ['<pad>'] * (window_len - len(next_words))

  concatenated_words = prev_words + [center_word] + next_words
  print(concatenated_words)

word is EU NNP B-NP B-ORG
splitted_word is ['EU', 'NNP', 'B-NP', 'B-ORG']
EU B-ORG
is organization: True
['<pad>', '<pad>', 'EU', 'rejects', 'German']
word is rejects VBZ B-VP O
splitted_word is ['rejects', 'VBZ', 'B-VP', 'O']
rejects O
is organization: False
['<pad>', 'EU', 'rejects', 'German', 'call']
word is German JJ B-NP B-MISC
splitted_word is ['German', 'JJ', 'B-NP', 'B-MISC']
German B-MISC
is organization: False
['EU', 'rejects', 'German', 'call', 'to']
word is call NN I-NP O
splitted_word is ['call', 'NN', 'I-NP', 'O']
call O
is organization: False
['rejects', 'German', 'call', 'to', 'boycott']
word is to TO B-VP O
splitted_word is ['to', 'TO', 'B-VP', 'O']
to O
is organization: False
['German', 'call', 'to', 'boycott', 'British']
word is boycott VB I-VP O
splitted_word is ['boycott', 'VB', 'I-VP', 'O']
boycott O
is organization: False
['call', 'to', 'boycott', 'British', 'lamb']
word is British JJ B-NP B-MISC
splitted_word is ['British', 'JJ', 'B-NP', 'B-MISC']
British B-MISC

In [12]:
def make_window_words_and_label_from_sentence(sentence, window_len=2, target_labels=('B-ORG', 'I-ORG')):
  """Turn one CoNLL sentence into per-token context windows with a binary label.

  Args:
    sentence: list of CoNLL line strings, one per token. Fields are separated by
      single spaces; the first field is the word and the last is the NER tag.
    window_len: number of context words kept on each side of the center word.
      Defaults to 2, the value this function used to read from a notebook-level
      global, so the function no longer depends on hidden kernel state.
    target_labels: collection of NER tags that count as the positive class.
      Defaults to the organization tags.

  Returns:
    A list with one ``(window_words, is_target)`` tuple per token, where
    ``window_words`` is a list of ``2 * window_len + 1`` strings (padded with
    ``'<pad>'`` at sentence boundaries) and ``is_target`` is a bool.

  Raises:
    ValueError: if ``window_len`` is negative.
  """
  if window_len < 0:
    raise ValueError(f"window_len must be non-negative, got {window_len}")

  # Split every line once up front instead of re-splitting neighbors for each window
  fields_per_token = [word_str.split(' ') for word_str in sentence]
  words = [fields[0] for fields in fields_per_token]
  labels = [fields[-1] for fields in fields_per_token]

  # Padding the whole sentence once makes every window a plain fixed-size slice
  padding = ['<pad>'] * window_len
  padded_words = padding + words + padding
  window_size = 2 * window_len + 1

  total_output = []
  for i, label in enumerate(labels):
    # Token i sits at padded index i + window_len, so the slice starting at i is centered on it
    concatenated_words = padded_words[i:i + window_size]
    total_output.append((concatenated_words, label in target_labels))
  return total_output

# Run the windowing function on the example sentence selected earlier (`sentence`).
make_window_words_and_label_from_sentence(sentence)


[(['<pad>', '<pad>', 'EU', 'rejects', 'German'], True),
 (['<pad>', 'EU', 'rejects', 'German', 'call'], False),
 (['EU', 'rejects', 'German', 'call', 'to'], False),
 (['rejects', 'German', 'call', 'to', 'boycott'], False),
 (['German', 'call', 'to', 'boycott', 'British'], False),
 (['call', 'to', 'boycott', 'British', 'lamb'], False),
 (['to', 'boycott', 'British', 'lamb', '.'], False),
 (['boycott', 'British', 'lamb', '.', '<pad>'], False),
 (['British', 'lamb', '.', '<pad>', '<pad>'], False)]

In [13]:
# Window every filtered sentence and flatten the per-sentence results into a single
# list of (window_words, label) examples.
entire_dataset = [
  windowed_word
  for sentence in filtered_dataset
  for windowed_word in make_window_words_and_label_from_sentence(sentence)
]

In [14]:
# Total number of (window_words, label) examples across all filtered sentences.
len(entire_dataset)

192587

In [15]:
# Inspect a single (window_words, label) example.
entire_dataset[10000]

(['eight', 'in', 'a', 'row', ','], False)

In [10]:
import gensim.downloader

# Load the pretrained "glove-wiki-gigaword-300" word vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and can be slow — confirm.
wrd2vec = gensim.downloader.load("glove-wiki-gigaword-300")



In [26]:
# Number of keys (vocabulary size) in the loaded word-vector model.
len(wrd2vec)

400000

In [35]:
import numpy as np
# Take the first example as a concrete input for the vectorizer defined below.
data_example = entire_dataset[0]
# Each example is a (list of window words, bool label) pair.
word_list, label = data_example

# Convert the list of word strings into a single concatenated vector
# (the bare expression below is not displayed because it is not the last line of the cell)
word_list

def get_flattened_vector(word_list:list, wrd2vec):
  """Concatenate the embeddings of ``word_list`` into one flat vector.

  Args:
    word_list: list of word strings (e.g. one context window). Words are
      lower-cased before lookup.
    wrd2vec: embedding lookup supporting ``word in wrd2vec`` and
      ``wrd2vec[word]``. If it exposes a ``vector_size`` attribute (as gensim
      KeyedVectors do) that width is used for out-of-vocabulary words;
      otherwise the width falls back to 300.

  Returns:
    1-D numpy array of length ``len(word_list) * dim``. Words missing from
    ``wrd2vec`` (including ``'<pad>'`` when it is not in the vocabulary)
    contribute a block of zeros. An empty ``word_list`` yields an empty array.
  """
  # Use the model's own width rather than a hard-coded 300 so other embeddings work too
  dim = getattr(wrd2vec, 'vector_size', 300)
  flattened_vec = []
  for word in word_list:
    word = word.lower()
    if word in wrd2vec:
      vec = wrd2vec[word]
    else:
      # float32 zeros: float64 zeros would silently upcast the whole concatenation
      # whenever a single word is out of vocabulary
      vec = np.zeros(dim, dtype=np.float32)
    flattened_vec.append(vec)
  if not flattened_vec:
    # np.concatenate raises on an empty list; return an empty vector instead
    return np.zeros(0, dtype=np.float32)
  return np.concatenate(flattened_vec)

# Check the flattened length: one embedding per word in the window, concatenated.
get_flattened_vector(word_list, wrd2vec).shape

(1500,)

In [38]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
  """Two-layer feed-forward classifier: Linear -> ReLU -> Linear (no bias) -> Sigmoid.

  Args:
    input_size: length of each input feature vector.
    hidden_size: number of hidden units.
    output_size: number of sigmoid outputs per example (1 for binary classification).
  """

  def __init__(self, input_size, hidden_size, output_size):
    super().__init__() # initialize nn.Module first
    self.layer1 = nn.Linear(input_size, hidden_size)
    self.relu = nn.ReLU()
    self.layer2 = nn.Linear(hidden_size, output_size, bias=False)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid outputs for ``x``.

    Args:
      x: float tensor whose last dimension has size ``input_size``; either a
        single vector or a batch of vectors.

    Returns:
      Tensor of sigmoid outputs with the trailing dimension removed when
      ``output_size`` is 1: a 0-dim tensor for a single vector, shape
      ``(batch,)`` for a batch.
    """
    h = self.relu(self.layer1(x))
    s = self.layer2(h)
    out = self.sigmoid(s)
    # Squeeze only the last (output) dimension. A bare squeeze() would also drop the
    # batch dimension when the batch holds a single example, which breaks any
    # comparison against a (1,)-shaped label tensor.
    return out.squeeze(-1)

# Smoke test: push one example through an untrained model to check that the shapes line up.
input_vec = get_flattened_vector(word_list, wrd2vec)
# Derive input_size from the vector itself so it cannot drift from window size * embedding width
model = Classifier(input_size=input_vec.shape[0], hidden_size=47, output_size=1)
# torch.as_tensor with an explicit dtype replaces the legacy torch.Tensor(...) constructor
model(torch.as_tensor(input_vec, dtype=torch.float32))

tensor(0.5294, grad_fn=<SqueezeBackward0>)