# Part 1 Sequence Tagging: NER


## Q1.1 Word Embedding

### Q1.1.1 Download Dependency

In [20]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting scikit-learn>=0.21.3 (from seqeval)
  Obtaining dependency information for scikit-learn>=0.21.3 from https://files.pythonhosted.org/packages/db/0d/1f6d2cd52c886707b00ddb7ed2504cbf10903a60a7bebcd71f0f77d53505/scikit_learn-1.3.1-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.3.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.1.1 (from scikit-learn>=0.21.3->seqeval)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn>=0.21.3->seqeval)
  O

In [21]:
import csv
import os
import pandas as pd
import random
import numpy as np
import torch
import torch.nn as nn
import tqdm
from seqeval.metrics import f1_score
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger


In [6]:
# Input files for the three dataset splits; each one is passed to
# convert_to_csv further down to produce train/development/test CSVs.
TRAIN_PATH  = 'Data/eng.train'
DEVELOPMENT_PATH = 'Data/eng.testa'
TEST_PATH = 'Data/eng.testb'
# Directory the generated CSV files are written to and later read back from.
OUTPUT_DIR = 'Data/output'

In [5]:
#!pip install --upgrade gensim

### Q1.1.2 Download the pretrained word2vec embeddings 

In [7]:
import gensim.downloader

# Download (or load from gensim's local cache) the pretrained
# "word2vec-google-news-300" embeddings.
# NOTE(review): despite its name, `glove_vectors` holds word2vec vectors, not
# GloVe; the name is kept because the cells below reference it.
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

### Q1.1.3 Query the vector of any word

In [7]:
# Query the vector of any word by specifying the word as the key
#glove_vectors['beautiful']
glove_vectors['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

### Q1.1 Finding most similar words

#### a) student

In [8]:
word = "student"

# Handle the out-of-vocabulary case first so the lookup below is always valid
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # most_similar ranks vocabulary entries by cosine similarity;
    # topn=1 keeps only the single closest word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

Words most similar to 'student':
students: 0.7294867038726807


#### b) Apple

In [9]:
word = "Apple"

# Handle the out-of-vocabulary case first so the lookup below is always valid
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # most_similar ranks vocabulary entries by cosine similarity;
    # topn=1 keeps only the single closest word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

Words most similar to 'Apple':
Apple_AAPL: 0.7456986308097839


#### c) apple

In [10]:
word = "apple"

# Handle the out-of-vocabulary case first so the lookup below is always valid
if word not in glove_vectors.key_to_index:
    print(f"'{word}' is not in the vocabulary.")
else:
    # most_similar ranks vocabulary entries by cosine similarity;
    # topn=1 keeps only the single closest word
    similar_words = glove_vectors.most_similar(word, topn=1)
    print(f"Words most similar to '{word}':")

    # Each result is a (word, similarity score) pair
    for similar_word, similarity_score in similar_words:
        print(f"{similar_word}: {similarity_score}")

Words most similar to 'apple':
apples: 0.720359742641449


### Q1.2 a) Size of training, development and test files

#### Processing files and converting to csv

In [11]:
def convert_to_csv(filepath,name):
    """Convert a one-token-per-line tagged file into a CSV under OUTPUT_DIR.

    Every non-blank input line is split on whitespace; its first field is
    written as ``word`` and its last field as ``tag``, together with a running
    ``sentence_number`` that starts at 1 and is incremented on every blank
    line. Each blank input line is also written to the CSV as an empty row.

    Args:
        filepath: Path of the input file to read.
        name: File name of the CSV to create inside ``OUTPUT_DIR``.

    Returns:
        None. The CSV file is the only result.
    """
    OUTPUT_PATH = os.path.join(OUTPUT_DIR,name)
    HEADERS = ["sentence_number",'word','tag']

    # Create the destination directory on demand so that a fresh checkout
    # (where the output directory does not exist yet) does not fail.
    output_parent = os.path.dirname(OUTPUT_PATH)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    sentence_number = 1
    # Open both files exactly once instead of reopening the CSV for every row.
    with open(filepath, 'r') as file, open(OUTPUT_PATH,'w',newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(HEADERS)
        for line in file:
            words = line.split()
            if not words:
                # Blank line: a new sentence starts; keep the empty separator row.
                # NOTE(review): every blank line bumps the counter, so marker
                # lines such as `-DOCSTART-` (if present in the input) are
                # numbered as sentences of their own - confirm this is intended.
                sentence_number+=1
                csv_writer.writerow([])
            else:
                # First field is the token, last field is its tag.
                csv_writer.writerow([sentence_number, words[0], words[-1]])

# Convert every dataset split to CSV, in the order train -> development -> test
for source_path, csv_name in (
    (TRAIN_PATH, 'train.csv'),
    (DEVELOPMENT_PATH, 'development.csv'),
    (TEST_PATH, 'test.csv'),
):
    convert_to_csv(source_path, csv_name)

            

#### Training file

In [8]:
# Load the converted training split back from disk
train_df = pd.read_csv(os.path.join(OUTPUT_DIR, 'train.csv'))

# Report the last sentence number, the number of distinct words
# and the sorted set of tags, one item per line
print(
    f"Size of train file: {train_df['sentence_number'].iloc[-1]}",
    f"number of words : {len(train_df['word'].unique())}",
    f"tags : {sorted(train_df['tag'].unique())}",
    sep="\n",
)

Size of train file: 14987
number of words : 23623
tags : ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


#### Development File

In [13]:
# Reading development.csv using Pandas
development_df = pd.read_csv(os.path.join(OUTPUT_DIR,'development.csv'))

# Printing size (last sentence number), number of distinct words and tags.
# The size label names the development split; it used to say "train file",
# a copy-paste slip from the training cell.
print(f"Size of development file: {development_df['sentence_number'].iloc[-1]}")
print(f"number of words : {len(development_df['word'].unique())}")
print(f"tags : {sorted(development_df['tag'].unique())}")


Size of train file: 3466
number of words : 9966
tags : ['B-MISC', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


#### Test File

In [14]:
# Reading test.csv using pandas
test_df = pd.read_csv(os.path.join(OUTPUT_DIR,'test.csv'))


# Printing size (last sentence number), number of distinct words and tags.
# The size label names the test split; it used to say "train file",
# a copy-paste slip from the training cell.
print(f"Size of test file: {test_df['sentence_number'].iloc[-1]}")
print(f"number of words : {len(test_df['word'].unique())}")
print(f"tags : {sorted(test_df['tag'].unique())}")


Size of train file: 3684
number of words : 9489
tags : ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


#####  Q1b) Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word. Explain how to form complete named entities from the labels of the individual words, and list all the named entities in this sentence.

In [15]:
def _count_multiword_entities(tags):
    """Count the entities that span more than one token in a tag sequence.

    A token continues the current entity only when it is not tagged 'O', does
    not carry a 'B-' prefix, and has the same type suffix as the previous
    token; anything else closes the current entity.
    """
    multiword = 0
    span_length = 0
    previous_type = None
    for tag in tags:
        tag = str(tag)
        entity_type = None if tag == 'O' else tag[2:]
        continues = (
            entity_type is not None
            and span_length > 0
            and not tag.startswith('B-')
            and entity_type == previous_type
        )
        if continues:
            span_length += 1
            # Count each entity once, at the moment it becomes multi-word.
            if span_length == 2:
                multiword += 1
        else:
            span_length = 0 if entity_type is None else 1
        previous_type = entity_type
    return multiword


def q1b(df):
    """Return a random sentence containing at least two multi-word entities.

    Args:
        df: DataFrame with 'sentence_number' and 'tag' columns, one row per
            token.

    Returns:
        The rows of the chosen sentence as a new DataFrame produced by
        ``reset_index()`` (so the original row labels are kept in an 'index'
        column).

    Raises:
        ValueError: If no sentence in ``df`` has two or more multi-word
            entities (the previous implementation looped forever here).
    """
    # Draw only from sentence numbers that actually occur, in random order,
    # so every candidate is visited at most once and the search terminates.
    candidates = list(df['sentence_number'].unique())
    random.shuffle(candidates)

    for number in candidates:
        sentence_df = df[df['sentence_number'] == number].reset_index()
        if _count_multiword_entities(sentence_df['tag']) >= 2:
            return sentence_df

    raise ValueError("no sentence contains at least two multi-word named entities")
                
                


# Sample one qualifying sentence from the training data; the choice is random,
# so re-running this cell can select a different sentence.
q1b_df = q1b(train_df)

In [16]:
def process(df):
    """Print a sentence, its tag sequence, and the word/tag pairing.

    Reads the 'word' and 'tag' columns of ``df`` row by row and prints three
    lines: the words, the tags, and ``word/tag`` pairs. Every item is followed
    by a single space, so each printed line ends with a trailing space.
    """
    words = [str(word) for word in df['word']]
    tags = [str(label) for label in df['tag']]

    sentence = ''.join(f"{word} " for word in words)
    tag = ''.join(f"{label} " for label in tags)
    sen_and_tag = ''.join(f"{word}/{label} " for word, label in zip(words, tags))

    print(f"sentence: {sentence}")
    print(f"tag: {tag}")
    print(f"sentence with tag: {sen_and_tag}")

# Show the sampled sentence as plain words, as tags, and as word/tag pairs.
process(q1b_df)

sentence: In Home Health Inc said on Thursday it will appeal to the U.S. Federal District Court in Minneapolis a decision by the Health Care Financing Administration ( HCFA ) that denied reimbursement of certain costs under Medicaid . 
tag: I-ORG I-ORG I-ORG I-ORG O O O O O O O O I-ORG I-ORG I-ORG I-ORG O I-LOC O O O O I-ORG I-ORG I-ORG I-ORG O I-ORG O O O O O O O O I-MISC O 
sentence with tag: In/I-ORG Home/I-ORG Health/I-ORG Inc/I-ORG said/O on/O Thursday/O it/O will/O appeal/O to/O the/O U.S./I-ORG Federal/I-ORG District/I-ORG Court/I-ORG in/O Minneapolis/I-LOC a/O decision/O by/O the/O Health/I-ORG Care/I-ORG Financing/I-ORG Administration/I-ORG (/O HCFA/I-ORG )/O that/O denied/O reimbursement/O of/O certain/O costs/O under/O Medicaid/I-MISC ./O 


In [22]:
def list_named_entities(df):
    """Group the tagged tokens of one sentence into complete named entities.

    Consecutive tokens belong to the same entity while they are not tagged
    'O', share the same type suffix, and the token does not carry a 'B-'
    prefix. An entity is closed by an 'O' token, a change of type, a 'B-'
    token, or the end of the sentence.

    Args:
        df: DataFrame with 'word' and 'tag' columns, one row per token.

    Returns:
        A tuple ``(entities, entity_tags)``: ``entities`` is the list of
        entity strings in order of appearance (each word is followed by one
        space, so every string ends with a trailing space, as before);
        ``entity_tags`` maps each entity string to the tag of its last token.
        Because it is a dict, repeated entity strings collapse into one key.
    """
    entities = []
    last_tags = []
    current_words = []
    previous_tag = 'O'

    for word, tag in zip(df['word'], df['tag']):
        word, tag = str(word), str(tag)
        # Compare type suffixes ("ORG" in "I-ORG") to detect a type change.
        closes_current = (
            tag == 'O'
            or tag.startswith('B-')
            or tag[2:] != previous_tag[2:]
        )
        if current_words and closes_current:
            entities.append(''.join(f"{w} " for w in current_words))
            last_tags.append(previous_tag)
            current_words = []
        if tag != 'O':
            current_words.append(word)
        previous_tag = tag

    # Flush an entity that runs up to the final token of the sentence.
    if current_words:
        entities.append(''.join(f"{w} " for w in current_words))
        last_tags.append(previous_tag)

    return entities, dict(zip(entities, last_tags))

# `l` is the list of entity strings found in the sampled sentence and `d`
# maps each entity string to a tag, as returned by list_named_entities.
l,d= list_named_entities(q1b_df)
print(f"list of name entities: {l}")
print(f"list of name entities with tags {d}")


list of name entities: ['In Home Health Inc ', 'U.S. Federal District Court ', 'Minneapolis ', 'Health Care Financing Administration ', 'HCFA ', 'Medicaid ']
list of name entities with tags {'In Home Health Inc ': 'I-ORG', 'U.S. Federal District Court ': 'I-ORG', 'Minneapolis ': 'I-LOC', 'Health Care Financing Administration ': 'I-ORG', 'HCFA ': 'I-ORG', 'Medicaid ': 'I-MISC'}


### 1.3) Model

In [14]:
# Collect the distinct values of the training 'word' column and display how
# many there are (the cell's last expression is shown as its output).
z=list(train_df['word'].unique())
len(z)

23623

In [None]:


# class Model(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, num_labels, pretrained_embeddings):
#         super(Model, self).__init__()
#         self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)  # Freeze the embeddings
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.linear = nn.Linear(2 * hidden_dim, num_labels)  # Multiply by 2 for bidirectional LSTM
#         self.softmax = nn.Softmax(dim=2)
        
#     def forward(self, x):
#         x = x.long()
#         x = self.embedding(x)
#         lstm_out, _ = self.lstm(x)
#         logits = self.linear(lstm_out)
#         output = self.softmax(logits)
        
#         return output
    
#     # Define hyperparameters
# ner_labels = ['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'I-PER', 'O']
# embedding_dim = 300
# hidden_dim = 64 # Change to see if there are any improvements
# num_labels = len(ner_labels)
# pretrained_embeddings = torch.tensor(glove_vectors.vectors)

# # Initialize the NER model
# model = Model(embedding_dim, hidden_dim, num_labels, pretrained_embeddings)
# # # not updating the weights during training
# # model.embedding.weight.requires_grad = False
# # Define your loss function and optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_fn = nn.CrossEntropyLoss()

In [None]:
# VOCAB_SIZE = 23623
# EMBEDDING_DIM = 300
# VOCAB_SIZE = 23623
# EMBEDDING_DIM = 300
# from gensim.models import KeyedVectors

# # Load the pre-trained GloVe vectors (e.g., 100-dimensional vectors)
# glove_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False)

# glove_vectors

# # Define your vocabulary and vocabulary size

# vocab = list(train_df['word'].unique()) # Replace with your actual vocabulary
# vocab_size = len(VOCAB_SIZE)

# # Initialize an embedding matrix with zeros
# embedding_matrix = np.zeros((vocab_size, 300))  # 100 is the dimension of GloVe vectors

# # Fill the embedding matrix with GloVe vectors for words in your vocabulary
# for word, i in word_index.items():
#     if word in glove_model:
#         embedding_matrix[i] = glove_model[word]

# # Now, 'embedding_matrix' contains GloVe vectors f

# # Load pre-trained word embeddings (e.g., Word2Vec, GloVe)
# # Replace 'embedding_matrix' with your actual embedding matrix
# # Ensure 'embedding_matrix' has shape (vocab_size, embedding_dim)
# # Set trainable=False to freeze the embeddings during training
# pretrained_embedding_layer = Embedding(input_dim=VOCAB_SIZE,
#                                        output_dim=EMBEDDING_DIM,
#                                        weights=[embedding_matrix],
#                                        input_length=1,
#                                        trainable=False)

# # Define the LSTM model
# model = Sequential()

# # Add the pre-trained embedding layer
# model.add(pretrained_embedding_layer)

# # Add an LSTM layer to capture the context of each word
# model.add(LSTM(units=100, return_sequences=False))

# # Add a Dense layer with softmax activation for word classification
# model.add(Dense(units=VOCAB_SIZE, activation='softmax')

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Print a summary of the model architecture
# model.summary()

   




