# Part 1 Sequence Tagging: NER


## Q1.1 Word Embedding

### Q1.1.1 Download Dependency

In [1]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     ---------------------------------------- 0.0/43.6 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.6 kB 320.0 kB/s eta 0:00:01
     -------------------------------------- 43.6/43.6 kB 709.9 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16185 sha256=441067483e9e85f71c52b2bcfc4597a45fbb754df43aad7f41e18e005e30fa36
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\bc\92\f0\243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Note: you may need to restart the kernel to use updated


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import csv
import os
import pandas as pd
import random
import numpy as np
import torch
import torch.nn as nn
import tqdm
from seqeval.metrics import f1_score
from torch.utils.data import Dataset, DataLoader,TensorDataset

import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger


In [3]:
# Locations of the three input splits (train / development / test) and of the
# folder that receives the CSV files generated from them.
TRAIN_PATH  = 'Data/eng.train'
DEVELOPMENT_PATH = 'Data/eng.testa'
TEST_PATH = 'Data/eng.testb'
OUTPUT_DIR = 'Data/output'

In [4]:
#!pip install --upgrade gensim

### Q1.1.2 Download the pretrained word2vec embeddings 

In [5]:
import gensim.downloader

# Download the embeddings "word2vec-google-news-300".
# NOTE(review): despite its name, `glove_vectors` holds the word2vec model
# requested here, not GloVe vectors; the name is kept because later cells use it.
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

### Q1.1.3 Query the vector of any word

In [6]:
# Query the vector of any word by specifying the word as the key
#glove_vectors['beautiful']
glove_vectors['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

### Q1.1 Finding most similar words

#### a) student

In [7]:
word = "student"

# Handle out-of-vocabulary words first so the lookup below cannot fail on them
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # topn=1 keeps only the single most similar word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

KeyboardInterrupt: 

#### b) Apple

In [None]:
word = "Apple"

# Handle out-of-vocabulary words first so the lookup below cannot fail on them
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # topn=1 keeps only the single most similar word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

Words most similar to 'Apple':
Apple_AAPL: 0.7456986308097839


#### c) apple

In [None]:
word = "apple"

# Handle out-of-vocabulary words first so the lookup below cannot fail on them
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # topn=1 keeps only the single most similar word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

Words most similar to 'apple':
apples: 0.720359742641449


### Q1.2 a) Size of training, development and test files

#### Processing files and converting to csv

In [None]:
def convert_to_csv(filepath,name):
    """Convert a whitespace-separated token file into a CSV inside OUTPUT_DIR.

    Every non-blank input line becomes one row ``sentence_number, word, tag``,
    where ``word`` is the first whitespace-separated field of the line and
    ``tag`` is the last one. Every blank input line increments the sentence
    counter (which starts at 1) and is written as an empty CSV row.

    Args:
        filepath: Path of the input file to read.
        name: File name of the CSV to create inside ``OUTPUT_DIR``; an existing
            file with that name is overwritten.
    """
    # Create the output folder if needed so the first run does not fail on it
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    OUTPUT_PATH = os.path.join(OUTPUT_DIR,name)
    HEADERS = ["sentence_number",'word','tag']
    sentence_number = 1

    # Both files are opened exactly once; the rows written are the same as when
    # the CSV was reopened in append mode for every single input line.
    with open(filepath, 'r') as file, open(OUTPUT_PATH,'w',newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(HEADERS)

        # Read the file line by line
        for line in file:
            if line.strip() == '':
                # A blank line marks a sentence boundary
                sentence_number += 1
                csv_writer.writerow([])
            else:
                # Write sentence_number, word and its tag to csv
                words = line.split()
                csv_writer.writerow([sentence_number, words[0], words[-1]])

# Convert the train, development and test files to CSV files inside OUTPUT_DIR
convert_to_csv(TRAIN_PATH,'train.csv')
convert_to_csv(DEVELOPMENT_PATH,'development.csv')
convert_to_csv(TEST_PATH,'test.csv')

            

#### Training file

In [8]:
# Reading train.csv using pandas
train_df = pd.read_csv(os.path.join(OUTPUT_DIR,'train.csv'))

# Printing size, number of words and tags:
# - size is the sentence_number stored in the last row,
# - number of words counts the distinct values of the word column,
# - tags lists the distinct tag values in sorted order.
print(f"Size of train file: {train_df['sentence_number'].iloc[-1]}")
print(f"number of words : {len(train_df['word'].unique())}")
print(f"tags : {sorted(train_df['tag'].unique())}")

Size of train file: 14987
number of words : 23623
tags : ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


#### Development File

In [9]:
# Reading development.csv using pandas
development_df = pd.read_csv(os.path.join(OUTPUT_DIR,'development.csv'))

# Printing size, number of words and tags:
# - size is the sentence_number stored in the last row,
# - number of words counts the distinct values of the word column,
# - tags lists the distinct tag values in sorted order.
# The label names the development split (it previously said "train").
print(f"Size of development file: {development_df['sentence_number'].iloc[-1]}")
print(f"number of words : {len(development_df['word'].unique())}")
print(f"tags : {sorted(development_df['tag'].unique())}")


Size of train file: 3466
number of words : 9966
tags : ['B-MISC', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


#### Test File

In [10]:
# Reading test.csv using pandas
test_df = pd.read_csv(os.path.join(OUTPUT_DIR,'test.csv'))

# Printing size, number of words and tags:
# - size is the sentence_number stored in the last row,
# - number of words counts the distinct values of the word column,
# - tags lists the distinct tag values in sorted order.
# The label names the test split (it previously said "train").
print(f"Size of test file: {test_df['sentence_number'].iloc[-1]}")
print(f"number of words : {len(test_df['word'].unique())}")
print(f"tags : {sorted(test_df['tag'].unique())}")


Size of train file: 3684
number of words : 9489
tags : ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


##### Q1b) Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word. Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [109]:
def q1b(df):
    """Return a random sentence that contains at least two multi-word named entities.

    Tags are grouped into entities as follows: an entity starts at a ``B-`` tag,
    at an ``I-`` tag that follows ``O`` (or starts the sentence), or at an ``I-``
    tag whose type differs from the type of the previous token. It continues
    over the following ``I-`` tags of the same type. An entity counts as
    multi-word when it covers two or more tokens.

    Args:
        df: DataFrame with at least the columns ``sentence_number`` and ``tag``.

    Returns:
        The rows of the chosen sentence, as produced by ``reset_index()`` (the
        original row labels are kept in an ``index`` column).

    Raises:
        ValueError: If no sentence in ``df`` has two or more multi-word entities.
    """
    def count_multiword_entities(tags):
        # Number of entities in `tags` that cover at least two tokens
        count = 0
        open_type = None   # type of the entity being read, None outside an entity
        span_len = 0       # number of tokens seen so far in that entity
        for tag in tags:
            # Split on the first '-' only: 'I-PER' -> ('I', '-', 'PER'), 'O' -> ('O', '', '')
            prefix, _, entity_type = str(tag).partition('-')
            if not entity_type:
                # 'O' closes whatever entity was open
                open_type, span_len = None, 0
            elif prefix == 'I' and entity_type == open_type:
                span_len += 1
                # Count the entity once, at the moment it becomes multi-word
                if span_len == 2:
                    count += 1
            else:
                # A B- tag, or an I- tag of another type, opens a new entity
                open_type, span_len = entity_type, 1
        return count

    # Visit the sentence numbers that actually occur, each at most once and in
    # random order, so the search always terminates and never draws a number
    # that has no rows.
    sentence_numbers = list(df['sentence_number'].unique())
    for number in random.sample(sentence_numbers, len(sentence_numbers)):
        # Filter df by sentence_number
        rand_df = df[df['sentence_number'] == number].reset_index()

        # Return the first sentence with two or more multi-word entities
        if count_multiword_entities(rand_df['tag']) >= 2:
            return rand_df

    raise ValueError("no sentence contains at least two multi-word named entities")
                
                

# Draw a random training sentence that has at least two multi-word named entities
q1b_df = q1b(train_df)

In [110]:
def process(df):
    """Print a sentence in three forms: its words, its tags, and word/tag pairs.

    Args:
        df: DataFrame with the columns ``word`` and ``tag``, one row per token.

    Every item is followed by a single space, so each printed line ends with a
    trailing space. Nothing is returned.
    """
    words = [str(token) for token in df['word']]
    tags = [str(label) for label in df['tag']]

    # Build each line in one pass instead of growing strings token by token
    sentence = ''.join(f"{token} " for token in words)
    tag = ''.join(f"{label} " for label in tags)
    sen_and_tag = ''.join(f"{token}/{label} " for token, label in zip(words, tags))

    print(f"sentence: {sentence}")
    print(f"tag: {tag}")
    print(f"sentence with tag: {sen_and_tag}")

# Print the sampled sentence, its tags, and the word/tag pairs
process(q1b_df)

sentence: South African provincial side Boland said on Thursday they had signed Leicestershire fast bowler David Millns on a one year contract . 
tag: I-MISC I-MISC O O I-ORG O O O O O O I-ORG O O I-PER I-PER O O O O O O 
sentence with tag: South/I-MISC African/I-MISC provincial/O side/O Boland/I-ORG said/O on/O Thursday/O they/O had/O signed/O Leicestershire/I-ORG fast/O bowler/O David/I-PER Millns/I-PER on/O a/O one/O year/O contract/O ./O 


In [133]:
def check(df):
    """Count the named entities in ``df`` that span more than one word.

    Tags are grouped into entities as follows: an entity starts at a ``B-`` tag,
    at an ``I-`` tag that follows ``O`` (or starts the sequence), or at an
    ``I-`` tag whose type differs from the type of the previous token. It
    continues over the following ``I-`` tags of the same type.

    Args:
        df: DataFrame with a ``tag`` column, one row per token.

    Returns:
        The number of entities that cover two or more tokens.
    """
    # Keeps track of number of multi-word entities
    count = 0

    open_type = None   # type of the entity being read, None outside an entity
    span_len = 0       # number of tokens seen so far in that entity

    for tag in df['tag']:
        # Split on the first '-' only: 'I-PER' -> ('I', '-', 'PER'), 'O' -> ('O', '', '')
        prefix, _, entity_type = str(tag).partition('-')

        if not entity_type:
            # 'O' closes whatever entity was open
            open_type, span_len = None, 0
        elif prefix == 'I' and entity_type == open_type:
            span_len += 1
            # Count the entity once, at the moment it becomes multi-word
            if span_len == 2:
                count += 1
        else:
            # A B- tag, or an I- tag of another type, opens a new entity, so
            # adjacent entities are counted separately even without an 'O'
            # between them
            open_type, span_len = entity_type, 1

    return count
  
            
            


In [137]:
# Inspect training sentence 679: print its multi-word entity count, then the sentence itself
mask = train_df['sentence_number'] == 679
df = train_df[mask].reset_index()
print(check(df))
process(df)

2
sentence: Australian Davis Cup captain John Newcombe on Thursday signalled his possible resignation if his team loses an away tie against Croatia next month . 
tag: I-MISC B-MISC I-MISC O I-PER I-PER O O O O O O O O O O O O O O I-LOC O O O 
sentence with tag: Australian/I-MISC Davis/B-MISC Cup/I-MISC captain/O John/I-PER Newcombe/I-PER on/O Thursday/O signalled/O his/O possible/O resignation/O if/O his/O team/O loses/O an/O away/O tie/O against/O Croatia/I-LOC next/O month/O ./O 


In [124]:
def list_named_entities(df):
    """Collect the named entities of a tagged sentence.

    Tokens are grouped into entities as follows: an entity starts at a ``B-``
    tag, at an ``I-`` tag that follows ``O`` (or starts the sentence), or at an
    ``I-`` tag whose type differs from the type of the previous token. It
    continues over the following ``I-`` tags of the same type. An entity that
    reaches the end of the sentence is included as well.

    Args:
        df: DataFrame with the columns ``word`` and ``tag``, one row per token.

    Returns:
        A tuple ``(l, d)``. ``l`` lists the entity texts in order of appearance;
        each text is its words joined by spaces plus one trailing space. ``d``
        maps each entity text to the tag of the entity's last token; if the same
        text occurs more than once, the last occurrence wins.
    """
    l =[]
    t = []
    current_words = []   # words of the entity being read
    current_type = None  # its type, None outside an entity
    current_tag = None   # tag of the most recent token added to it

    for word, tag in zip(df['word'], df['tag']):
        tag = str(tag)
        # Split on the first '-' only: 'I-PER' -> ('I', '-', 'PER'), 'O' -> ('O', '', '')
        prefix, _, entity_type = tag.partition('-')
        continues = bool(entity_type) and prefix == 'I' and entity_type == current_type

        # Anything that does not extend the open entity closes it first
        if not continues and current_words:
            l.append(' '.join(current_words) + ' ')
            t.append(current_tag)
            current_words = []

        if entity_type:
            current_words.append(str(word))
            current_type = entity_type
            current_tag = tag
        else:
            current_type = None

    # Flush an entity that runs up to the last token of the sentence
    if current_words:
        l.append(' '.join(current_words) + ' ')
        t.append(current_tag)

    d = {k: v for k, v in zip(l,t)}
    return l,d

# List the named entities of the sampled sentence, then the entity -> tag mapping
l,d= list_named_entities(q1b_df)
print(f"list of name entities: {l}")
print(f"list of name entities with tags {d}")


list of name entities: ['South African ', 'Boland ', 'Leicestershire ', 'David Millns ']
list of name entities with tags {'South African ': 'I-MISC', 'Boland ': 'I-ORG', 'Leicestershire ': 'I-ORG', 'David Millns ': 'I-PER'}


### 1.3) Model