In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Y4S1/CZ4045 Natural Language Processing/Project/')

Mounted at /content/drive


# Part 1.1 Word Embedding
Based on word2vec embeddings you have downloaded, use cosine similarity to find the most similar word to each of these words: (a) “student”; (b) “Apple”; (c) “apple”. Report the most similar word and its cosine similarity.


In [2]:
import gensim.downloader as api

In [3]:
from gensim.models import KeyedVectors
model = KeyedVectors.load('./data/word2vec_vectors.kv')

In [4]:
print("The most similar word to student is :",(model.most_similar("student", topn=1))[0][0], "\nand its cosine similarity is:", (model.most_similar("student", topn=1))[0][1])
print("\nThe most similar word to Apple is:",(model.most_similar("Apple", topn=1))[0][0], "\nand its cosine similarity is:", (model.most_similar("Apple", topn=1))[0][1])
print("\nThe most similar word to apple is:",(model.most_similar("apple", topn=1))[0][0], "\nand its cosine similarity is:", (model.most_similar("apple", topn=1))[0][1])

The most similar word to student is : students 
and its cosine similarity is: 0.7294867038726807

The most similar word to Apple is: Apple_AAPL 
and its cosine similarity is: 0.7456986308097839

The most similar word to apple is: apples 
and its cosine similarity is: 0.720359742641449


# Part 1.2 Data

In [5]:
def preprocess_data(file_path):
    with open(file_path, "r") as file:
        lines = file.readlines()

    sentences = []
    sentence_words = []
    sentence_labels = []
    IO_labels_list = []

    for line in lines:
        tokens = line.strip().split()
        if tokens: # if the line is not empty
            word = tokens[0]
            if word == '-DOCSTART-':
                continue
            label = tokens[-1]
            if 'B-' in label:
                label = label.replace('B-', 'I-', 1)
            if label not in IO_labels_list:
                IO_labels_list.append(label)
            sentence_words.append(word)
            sentence_labels.append(label)
        else:
            if sentence_words and sentence_labels:
                sentences.append((list(sentence_words), list(sentence_labels)))
                sentence_words.clear()
                sentence_labels.clear()

    print(f"The tagging labels list for {file_path[7:]} are : {IO_labels_list}")
    return sentences

train_sentences = preprocess_data('./data/eng.train')
development_sentences = preprocess_data('./data/eng.testa')
test_sentences = preprocess_data('./data/eng.testb')

The tagging labels list for eng.train are : ['I-ORG', 'O', 'I-MISC', 'I-PER', 'I-LOC']
The tagging labels list for eng.testa are : ['O', 'I-ORG', 'I-LOC', 'I-MISC', 'I-PER']
The tagging labels list for eng.testb are : ['O', 'I-LOC', 'I-PER', 'I-MISC', 'I-ORG']


(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003. Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose.

In [6]:
print(f"Number of sentences for train data is : {len(train_sentences)}")
print(f"Number of sentences for development data is : {len(development_sentences)}")
print(f"Number of sentences for test data is : {len(test_sentences)}")

Number of sentences for train data is : 14041
Number of sentences for development data is : 3250
Number of sentences for test data is : 3453


For the tagging scheme, we have decided to choose the IO scheme.
The complete set of all possible word labels are:
1. I-PER
2. I-ORG
3. I-MISC
4. I-LOC
8. O

(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word. Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [9]:
import random
def has_multi_word_entities(sentence, labels):
    current_entity = []
    multi_word_entities_count = 0

    for word, label in zip(sentence, labels):
        if label.startswith('I-'):
            current_entity.append(word)
        elif current_entity:
            if len(current_entity) > 1:
                multi_word_entities_count += 1
            current_entity = []

    if current_entity and len(current_entity) > 1:
        multi_word_entities_count += 1

    return multi_word_entities_count >= 2

def sentences_with_criteria(dataset):
    result = []

    for sentence, labels in dataset:
        if has_multi_word_entities(sentence, labels):
            result.append((sentence, labels))

    return result

# Extract sentences and labels that meet the criteria
filtered_train_sentences = sentences_with_criteria(train_sentences)

random_sentence_idx =  random.randint(0, len(filtered_train_sentences)-1)


print("The sentence choosen is: ")
for word in filtered_train_sentences[random_sentence_idx][0]:
    print(word, end=" ")

The sentence choosen is: 
" I do n't normally do this but can you please sign , " he said thrusting an ornate white book in front of Americans Harrison Dillard ( 1948 ) , Lindy Remigino ( 1952 ) , Jim Hines ( 1968 ) , Trinidad 's Hasely Crawford ( 1976 ) and Britain 's Allan Wells ( 1980 ) . 

**In order to form complete named entities from the label for each word**

1. We start with the first word in the sequence.
2. If we encounter a word with an I- label, it's part of a named entity. Collect the words with consecutive I- labels of the same type to form multi-word entities.
4. If we encounter a word with an O label, or an I- label, or an I- label that is of a different type (e.g. 'I-MISC', 'I-PER') , it indicates the end of the current entity and possibly the start of a new one.
5. We continue this process until we've traversed the entire sequence.

In [10]:
def extract_entities_io(words, labels):

    entities = []
    current_entity_words = []
    current_entity_type = None

    for word, label in zip(words, labels):
        if label.startswith("I-"):
            label_type = label.split("-")[1]

            # If we have a current entity and the label type has changed, or an unexpected label is encountered
            if current_entity_words and (not current_entity_type or current_entity_type != label_type):
                entities.append((current_entity_type, " ".join(current_entity_words)))
                current_entity_words = []

            # Update the current entity type and add the word to the current entity
            current_entity_type = label_type
            current_entity_words.append(word)

        else:  # For 'O' labels or any other unexpected label
            if current_entity_words:
                entities.append((current_entity_type, " ".join(current_entity_words)))
                current_entity_words = []
                current_entity_type = None

    # Handle any remaining entity at the end of the sequence
    if current_entity_words:
        entities.append((current_entity_type, " ".join(current_entity_words)))

    return entities

named_entities = extract_entities_io(filtered_train_sentences[random_sentence_idx][0],filtered_train_sentences[random_sentence_idx][1])
print("Named entities of this sentence are: ")
for idx, entities in enumerate(named_entities, start=1):
    print(str(idx) + ". " + entities[1] + ", TAG = " + entities[0])

Named entities of this sentence are: 
1. Americans, TAG = MISC
2. Harrison Dillard, TAG = PER
3. Lindy Remigino, TAG = PER
4. Jim Hines, TAG = PER
5. Trinidad, TAG = LOC
6. Hasely Crawford, TAG = PER
7. Britain, TAG = LOC
8. Allan Wells, TAG = PER


# Part 1.3 Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
import numpy as np

In [None]:
def prepare_data(train_sentences):
    data = []

    for sentence, tags in train_sentences:
        for word, tag in zip(sentence, tags):
            data.append((word, tag))

    return data

train_data = prepare_data(train_sentences)
development_data = prepare_data(development_sentences)
test_data = prepare_data(test_sentences)
print(train_sentences[0])
print(train_data[0:9])

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O'])
[('EU', 'I-ORG'), ('rejects', 'O'), ('German', 'I-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'I-MISC'), ('lamb', 'O'), ('.', 'O')]


In [None]:
word_vectors = KeyedVectors.load("./data/word2vec_vectors.kv", mmap='r')

In [None]:
# Create word and tag mappings
words = list(set(word for word, tag in train_data))
tags = list(set(tag for word, tag in train_data))

# Creating the mappings
word2idx = {word: i + 1 for i, word in enumerate(words)}  # +1 to leave 0 for padding
tag2idx = {tag: i for i, tag in enumerate(tags)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

In [None]:
# print(idx2tag)

In [None]:
# Create embedding matrix
embedding_dim = word_vectors.vector_size
embedding_matrix = np.zeros((len(word2idx) + 1, embedding_dim))  # +1 for padding token

for word, i in word2idx.items():
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # Word not in pretrained embeddings; initialize randomly
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))

# Convert data to integer sequences
def encode_sentences_flattened(data, word2idx, tag2idx):
    X_data = [word2idx.get(word, 0) for word, tag in data]  # 0 for OOV words
    Y_data = [tag2idx[tag] for word, tag in data]
    return X_data, Y_data

X_train, Y_train = encode_sentences_flattened(train_data, word2idx, tag2idx)
X_dev, Y_dev = encode_sentences_flattened(development_data, word2idx, tag2idx)
X_test, Y_test = encode_sentences_flattened(test_data, word2idx, tag2idx)

X_train[:5], Y_train[:5]  # Displaying the first 5 encoded words and their tags for verification

([6947, 12257, 22426, 9997, 15139], [4, 0, 2, 0, 0])

In [None]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

# Create instances of the custom dataset
train_dataset = NERDataset(X_train, Y_train)
dev_dataset = NERDataset(X_dev, Y_dev)
test_dataset = NERDataset(X_test, Y_test)
# Create data loaders
BATCH_SIZE = 128  # Define your desired batch size
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
class NERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix, dropout_rate = 0.5):
        super(NERModel, self).__init__()

        # Embedding layer with pretrained embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False  # Freeze the pretrained embeddings

        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

        # Dense layer
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)  # times 2 because of bidirectionality

        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        # Apply dropout and then pass through fc to get refined representation
        dropped = self.dropout(lstm_out)
        dense_out = self.fc(dropped)

        # Pass through fc2 to get final tag predictions
        tag_space = self.fc2(dense_out)
        return tag_space

# Hyperparameters
EMBEDDING_DIM = embedding_matrix.shape[1]
HIDDEN_DIM = 256
OUTPUT_DIM = len(tag2idx)

# Create an instance of the NER model
model = NERModel(len(word2idx) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, embedding_matrix)

# Loss function and optimizer
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# X_train_tensor = torch.tensor(X_train, dtype=torch.long)
# Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)

# epochs = 50
# for epoch in range(epochs):
#     model.train()

#     optimizer.zero_grad()

#     predictions = model(X_train_tensor)
#     loss = criterion(predictions.view(-1, OUTPUT_DIM), Y_train_tensor.view(-1))

#     loss.backward()
#     optimizer.step()

#     print(f"Epoch: {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# ------------------------------------------------------------------------------
# epochs = 50
# for epoch in range(epochs):
#     model.train()
#     total_loss = 0

#     for batch_X, batch_Y in train_loader:
#         optimizer.zero_grad()
#         predictions = model(batch_X)
#         loss = criterion(predictions.view(-1, OUTPUT_DIM), batch_Y.view(-1))
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     avg_loss = total_loss / len(train_loader)
#     print(f"Epoch: {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
# ------------------------------------------------------------------------------
from sklearn.metrics import f1_score
import time
def compute_f1(model, loader):
    model.eval()
    all_preds = []
    all_true = []

    with torch.no_grad():
        for batch_X, batch_Y in loader:
            predictions = model(batch_X)
            _, predicted_tags = torch.max(predictions, 1)  # get the index of the max probability
            all_preds.extend(predicted_tags.view(-1).cpu().numpy())
            all_true.extend(batch_Y.view(-1).cpu().numpy())

    return f1_score(all_true, all_preds, average='macro')  # you can use 'micro' or 'weighted' based on your preference

def evaluate_model(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_X, batch_Y in data_loader:
            predictions = model(batch_X)
            loss = criterion(predictions.view(-1, OUTPUT_DIM), batch_Y.view(-1))
            total_loss += loss.item()
    return total_loss / len(data_loader)

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize the learning rate scheduler (adjusts learning rate based on validation loss)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, verbose=True)

epochs = 100  # Increased as early stopping might stop it earlier
best_val_loss = float('inf')
epochs_without_improvement = 0
MAX_EPOCHS_WITHOUT_IMPROVEMENT = 10  # Number of epochs to wait before stopping
best_val_f1 = 0
start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_Y in train_loader:
        optimizer.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions.view(-1, OUTPUT_DIM), batch_Y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Evaluate on validation set
    val_loss = evaluate_model(model, dev_loader, criterion)
    val_f1 = compute_f1(model, dev_loader)

    print(f"Epoch: {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation F1: {val_f1:.4f}")


    # Save the model with the best validation loss
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pt')
        epochs_without_improvement = 0  # reset the count
    else:
        epochs_without_improvement += 1

    # Adjust learning rate based on validation f1
    scheduler.step(val_f1)

    # Early stopping
    if epochs_without_improvement >= MAX_EPOCHS_WITHOUT_IMPROVEMENT:
        print("Early stopping due to no improvement in validation f1.")
        break
end_time = time.time()
run_time = end_time - start_time
print(f"run time is : {run_time:.2f} seconds")

Epoch: 1/100, Train Loss: 0.2391, Validation Loss: 0.2434, Validation F1: 0.7645
Epoch: 2/100, Train Loss: 0.1695, Validation Loss: 0.2256, Validation F1: 0.7834
Epoch: 3/100, Train Loss: 0.1409, Validation Loss: 0.2134, Validation F1: 0.7863
Epoch: 4/100, Train Loss: 0.1221, Validation Loss: 0.1963, Validation F1: 0.8016
Epoch: 5/100, Train Loss: 0.1104, Validation Loss: 0.1908, Validation F1: 0.8017
Epoch: 6/100, Train Loss: 0.1043, Validation Loss: 0.1901, Validation F1: 0.8038
Epoch: 7/100, Train Loss: 0.0987, Validation Loss: 0.1828, Validation F1: 0.8075
Epoch 00007: reducing learning rate of group 0 to 5.0000e-04.
Epoch: 8/100, Train Loss: 0.0897, Validation Loss: 0.1825, Validation F1: 0.8112
Epoch: 9/100, Train Loss: 0.0867, Validation Loss: 0.1769, Validation F1: 0.8098
Epoch: 10/100, Train Loss: 0.0844, Validation Loss: 0.1759, Validation F1: 0.8109
Epoch: 11/100, Train Loss: 0.0822, Validation Loss: 0.1780, Validation F1: 0.8112
Epoch: 12/100, Train Loss: 0.0802, Validation

In [None]:
# print(predictions)

In [None]:
# Step 1: Get the indices of the maximum scores
# _, predicted_indices = torch.max(predictions, dim=1)

# Step 2: Convert these indices to tag strings
# predicted_tags = [idx2tag[idx.item()] for idx in predicted_indices]

# print(predicted_tags)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
def evaluate(model, X_data, Y_data):
    # Set model to evaluation mode
    model.eval()

    # Predict tags
    with torch.no_grad():
        predictions = model(torch.tensor(X_data, dtype=torch.long))
        _, predicted_tags = torch.max(predictions, dim=1)

    # Flatten both predicted and true tags for evaluation
    predicted_tags = predicted_tags.view(-1).numpy()
    true_tags = torch.tensor(Y_data, dtype=torch.long).view(-1).numpy()

    # Filter out padding tokens (if you have used padding)
    non_padding_indices = np.where(true_tags != tag2idx["O"])[0]  # Assuming "O" is the padding tag

    predicted_tags = predicted_tags[non_padding_indices]
    true_tags = true_tags[non_padding_indices]

    precision, recall, f1, _ = precision_recall_fscore_support(true_tags, predicted_tags, average='macro')

    print(f"overall f1 score is : {f1:.2f}")
    # Print classification report
    print(classification_report(true_tags, predicted_tags, target_names=tags))


In [None]:
saved_state_dict = torch.load('best_model.pt')
model.load_state_dict(saved_state_dict)
evaluate(model, X_dev, Y_dev)

overall f1 score is : 0.63
              precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
       I-PER       0.96      0.66      0.78      3149
      I-MISC       0.92      0.71      0.80      1268
       I-LOC       0.88      0.77      0.82      2094
       I-ORG       0.84      0.66      0.74      2092

    accuracy                           0.69      8603
   macro avg       0.72      0.56      0.63      8603
weighted avg       0.91      0.69      0.78      8603



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
evaluate(model, X_test, Y_test)

overall f1 score is : 0.57
              precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
       I-PER       0.96      0.44      0.60      2773
      I-MISC       0.85      0.68      0.76       918
       I-LOC       0.83      0.74      0.78      1925
       I-ORG       0.85      0.58      0.69      2496

    accuracy                           0.58      8112
   macro avg       0.70      0.49      0.57      8112
weighted avg       0.88      0.58      0.69      8112



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
evaluate(model, X_train, Y_train)

overall f1 score is : 0.72
              precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
       I-PER       0.96      0.98      0.97     11128
      I-MISC       0.93      0.85      0.89      4593
       I-LOC       0.90      0.87      0.88      8297
       I-ORG       0.88      0.85      0.86     10025

    accuracy                           0.89     34043
   macro avg       0.73      0.71      0.72     34043
weighted avg       0.92      0.89      0.91     34043



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
