In [1]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/.DS_Store          
  inflating: __MACOSX/data/._.DS_Store  
  inflating: data/test               
  inflating: __MACOSX/data/._test    
  inflating: data/train              
  inflating: __MACOSX/data/._train   
  inflating: data/dev                
  inflating: __MACOSX/data/._dev     


In [2]:
# Find all sentences from the training data

from collections import defaultdict

def get_sentences_and_states(file_path, vocab = None):
  sentences = []
  states = []
  states_set = set()
  if vocab == None:
    vocab = set()
  with open(file_path, "r") as file:
    lines = file.readlines()
    # singular sentence
    sentence = []
    # singular tag of format (prev tag, curr tag)
    state_array = []
    for line in lines:
      # New sentence
      if len(line.strip()) == 0 and len(sentence) > 0:
        sentences.append([_ for _ in sentence])
        states.append([_ for _ in state_array])
        # print(sentences, states)
        sentence = []
        state_array = []
        continue
      word_split = line.strip().split(" ")
      # print(word_split)
      word = word_split[1] if len(word_split) >= 2 else None
      curr_state = word_split[2] if len(word_split) >= 3 else None
      if curr_state not in states_set:
        states_set.add(curr_state)
      if word not in vocab:
        vocab.add(word)
      # print(word, curr_state)
      sentence.append(word)
      state_array.append(curr_state)
    sentences.append([_ for _ in sentence])
    states.append([_ for _ in state_array])
    return sentences, states, states_set, vocab


In [3]:
def decode_sentences_and_tokens(sentences, tokens, true_tags):
  decoded_sents = []
  decoded_tokens = []
  true_tokens = []
  for sentence, token_arr, true_token_arr in zip(sentences, tokens, true_tags):
    sentence_decoded = [idx2word[idx] for idx in sentence]
    token_decoded = [idx2state[idx] for idx in token_arr]
    true_tags_decoded = [idx2state[idx] for idx in true_token_arr]
    decoded_sents.extend([sentence_decoded])
    decoded_tokens.extend([token_decoded])
    true_tokens.extend([true_tags_decoded])
  return decoded_sents, decoded_tokens, true_tokens

In [4]:
def detach(all_sentences, decoded_tags, gt):
  all_sentences = [torch.squeeze(sentence, 0) for sentence in all_sentences]
  all_sentences = [sentence.cpu().detach().numpy() for sentence in all_sentences]
  out_states = [tokens.cpu().detach().numpy() for tokens in decoded_tags]
  gt_states = [tokens.cpu().detach().numpy() for tokens in gt]
  return all_sentences, out_states, gt_states

In [89]:
def write_inference_to_file(decoded_tags, true_tags, test_sentences, filename, eval = True):
    with open(filename, "w") as file:
        for sentence, decoded_sentence_tags, gt_tags in zip(test_sentences, decoded_tags, true_tags):
            for i, (word, pred, gt) in enumerate(zip(sentence, decoded_sentence_tags, gt_tags), 1):
                if eval:
                  line = f'{i} {word} {gt} {pred}'
                else:
                  line = f'{i} {word} {pred}'
                file.write(line + '\n')
            file.write('\n')


In [6]:
sentences, states, all_states, vocab = get_sentences_and_states("./data/train")
dev_sentences, dev_states, _, vocab = get_sentences_and_states("./data/dev", vocab)
test_sentences, test_states, _, vocab = get_sentences_and_states("./data/test", vocab)

In [73]:
# No tags provided for test data
for sublist in test_states:
    # Iterate through each element in the sublist
    for i in range(len(sublist)):
        # Set the element to '<PAD>'
        sublist[i] = '<PAD>'

In [None]:
word2idx = {word: idx for idx, word in enumerate(vocab, 1)}
state2idx = {state: idx for idx, state in enumerate(all_states, 1)}
idx2word = {idx: word for idx, word in enumerate(vocab, 1)}
idx2state = {idx: state for idx, state in enumerate(all_states, 1)}

In [None]:
word2idx.update({"<PAD>": 0})
state2idx.update({"<PAD>": 0})
idx2word.update({0: "<PAD>"})
idx2state.update({0: "<PAD>"})

In [7]:
def create_char_vocab(sentences):
    """
    Creates a character-level vocabulary from a list of sentences.

    Args:
        sentences (list): A list of sentences.

    Returns:
        tuple: A tuple containing two dictionaries:
            char2idx (dict): A dictionary mapping characters to their integer indices.
            idx2char (dict): A dictionary mapping integer indices to their characters.
    """
    vocab = set()
    for sentence in sentences:
        for word in sentence:
            for char in word:
              vocab.add(char)
    vocab = sorted(vocab)
    char2idx = {char: i for i, char in enumerate(vocab)}
    idx2char = {i: char for i, char in enumerate(vocab)}
    return char2idx, idx2char

In [8]:
import json
def save_dicts_to_json(word2idx, state2idx, idx2word, idx2state):
    # Save word2idx as JSON
    with open('word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # Save state2idx as JSON
    with open('state2idx.json', 'w') as f:
        json.dump(state2idx, f)

    # Save idx2word as JSON
    with open('idx2word.json', 'w') as f:
        json.dump(idx2word, f)

    # Save idx2state as JSON
    with open('idx2state.json', 'w') as f:
        json.dump(idx2state, f)

In [9]:
def save_char_dicts_to_json(char2idx, idx2char):
  with open('char2idx.json', 'w') as f:
    json.dump(char2idx, f)
  with open('idx2char.json', 'w') as f:
    json.dump(idx2char, f)

In [10]:
import json
def load_chars_from_json():
  with open('char2idx.json', 'r') as f:
    char2idx = json.load(f)
  with open('idx2char.json', 'r') as f:
    idx2char = json.load(f)
  return char2idx, idx2char

In [11]:
import json
def load_dicts_from_json():
    # Load word2idx from JSON
    with open('word2idx.json', 'r') as f:
        word2idx = json.load(f)

    # Load state2idx from JSON
    with open('state2idx.json', 'r') as f:
        state2idx = json.load(f)

    # Load idx2word from JSON
    with open('idx2word.json', 'r') as f:
        idx2word = json.load(f)
        idx2word = {int(k): v for k, v in idx2word.items()}

    # Load idx2state from JSON
    with open('idx2state.json', 'r') as f:
        idx2state = json.load(f)
        idx2state = {int(k): v for k, v in idx2state.items()}

    return word2idx, state2idx, idx2word, idx2state

In [None]:
save_dicts_to_json(word2idx, state2idx, idx2word, idx2state)

In [69]:
word2idx, state2idx, idx2word, idx2state = load_dicts_from_json()

In [None]:
char2idx, idx2char = create_char_vocab(sentences)

In [None]:
save_char_dicts_to_json(char2idx, idx2char)

In [12]:
from collections import Counter
import torch
def label_weights(data, state2idx):
    """
    Assigns weights to labels in a dataset based on their inverse frequency of appearance
    in the entire label set.

    Parameters:
    data List[List[Str]]: A list of lists of labels for the dataset.

    Returns:
    dict: A dictionary of label weights.
    """
    # Count the number of occurrences of each label in the dataset.
    data = sum(data, [])
    label_counts = Counter(data)

    # Compute the total number of labels in the dataset.
    total_labels = sum(label_counts.values())

    # Compute the inverse frequency of appearance for each label.
    label_weights = {label: total_labels / count for label, count in label_counts.items()}

    # Normalize the weights so that they sum to 1.
    total_weights = sum(label_weights.values())
    label_weights = {label: weight / total_weights for label, weight in label_weights.items()}
    
    weight_tensor = torch.zeros(len(state2idx))
    for label, weight in label_weights.items():
        weight_tensor[state2idx[label]] = weight

    return weight_tensor

In [16]:
label_weights = label_weights(states, state2idx)

## Loading Data

In [17]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
from typing import List, Tuple

class SentencesDataset(Dataset):
  def __init__(self, sentences: List[str], states: List[List[str]], all_states: List[str], vocab: List[str], state2idx, word2idx, idx2word):
    self.sentences = sentences
    self.states = states
    self.all_states = all_states
    self.vocab = vocab
    self.state2idx = state2idx
    self.word2idx = word2idx
    self.idx2word = idx2word

  def __len__(self):
    return len(self.sentences)
  
  def __getitem__(self, idx):
    sentence = self.sentences[idx]
    encoded_sentence = [self.word2idx[word] for word in sentence]

    tags = self.states[idx]
    tags_encoded = []
    for tag in tags:
      one_hot = torch.zeros(len(self.all_states) + 1)
      one_hot[self.state2idx[tag]] = 1
      tags_encoded.append(one_hot)
    
    return encoded_sentence, tags_encoded

  



In [None]:
class CharactersDataset(Dataset):
  def __init__(self, sentences: List[str], states: List[List[str]], all_states: List[str], vocab: List[str], state2idx, char2idx, idx2char):
    self.sentences = sentences
    self.states = states
    self.all_states = all_states
    self.vocab = vocab
    self.state2idx = state2idx
    self.char2idx = char2idx
    self.idx2char = idx2char

  def __len__(self):
    return len(self.sentences)
  
  def _get_chars(self, sentence):
    chars = []
    for word in sentence:
      for char in word:
        chars.append(char)
    return chars
  
  def __getitem__(self, idx):
    sentence = self.sentences[idx]
    chars = self._get_chars(sentence)
    encoded_sentence = [self.char2idx[char] for char in chars]

    tags = self.states[idx]
    tags_encoded = []
    for tag in tags:
      one_hot = torch.zeros(len(self.all_states) + 1)
      one_hot[self.state2idx[tag]] = 1
      tags_encoded.append(one_hot)
    
    return encoded_sentence, tags_encoded


In [19]:
def collate_fn(batch):
    # Get the sequences and labels from the batch
    sequences, labels = zip(*batch)

    # Find the maximum sequence length in the batch
    max_len = max([len(seq) for seq in sequences])

    # Pad the sequences and labels to the maximum length
    padded_sequences = pad_sequence([torch.tensor(seq) for seq in sequences], batch_first=True, padding_value=0)
    
    for label in labels:
      while len(label) < max_len:
          zeros = torch.zeros(10)
          zeros[0] = 1
          label.append(zeros)
    labels = torch.stack([torch.stack(lst) for lst in labels])
    return padded_sequences, labels

In [20]:
train_dataset = SentencesDataset(sentences, states, all_states, vocab, state2idx = state2idx, word2idx = word2idx, idx2word = idx2word)

In [None]:
train_dataset_chars = CharactersDataset(sentences, states, all_states, vocab, state2idx = state2idx, idx2char = idx2char, char2idx = char2idx)

In [21]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=lambda batch: collate_fn(batch))

In [None]:
train_dataloader_chars =  DataLoader(train_dataset_chars, batch_size=2, shuffle=False, collate_fn=lambda batch: collate_fn(batch))

In [22]:
dev_dataset = SentencesDataset(dev_sentences, dev_states, all_states, vocab, state2idx = state2idx, word2idx = word2idx, idx2word = idx2word)

In [23]:
dev_dataloader = DataLoader(dev_dataset, batch_size=1, shuffle=False, collate_fn=lambda batch: collate_fn(batch))

In [74]:
test_dataset = SentencesDataset(test_sentences, test_states, all_states, vocab, state2idx = state2idx, word2idx = word2idx, idx2word = idx2word)

In [52]:
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=lambda batch: collate_fn(batch))

## Trainer Class and Inference Method

In [24]:
import math
import gc
import os
import subprocess

class Trainer:
    def __init__(self, model, training_data, device, lr=1e-3, epochs=200, betas=(0.9, 0.999), warmup_epochs=1, optim_type = 'SGD', validation_file = None, validation_data = None, label_weights = None):
        self.hyperparams = {
            'lr': lr,
            'epochs': epochs,
            'betas': betas,
            'warmup_epochs': warmup_epochs
        }
        self.td = training_data
        self.device = device
        self.label_weights = label_weights.to(device)
        self.loss_fn = nn.CrossEntropyLoss(weight=self.label_weights, ignore_index=0)
        self.model = model
        self.optim_type = optim_type
        
        if self.optim_type == 'Adam':
          self.optim = torch.optim.Adam(self.model.parameters(), lr=lr, betas=self.hyperparams['betas'])
        
        elif self.optim_type == 'SGD':
          self.optim = torch.optim.SGD(self.model.parameters(), lr=lr)
        
        
          
        self.validation_file = validation_file
        self.validation_data = validation_data

    def load_optim_state_dict(self, checkpoint):
      self.optim.load_state_dict(checkpoint['optimizer_state_dict'])

    def get_lr(self, epoch):
        if epoch < self.hyperparams['warmup_epochs']:
            return self.hyperparams['lr'] * epoch / self.hyperparams['warmup_epochs']
        else:
            return self.hyperparams['lr']
  

    def calc_accuracy(self, output, y):
        pred = torch.argmax(output, dim=1)
        y = torch.argmax(y, dim=1)
        return (pred == y).sum().item() / len(y)

    def calc_accuracy_test(self, output, y):
        pred = torch.argmax(output, dim=1)
        return (pred == y).sum().item() / len(y)

    def train_epoch(self):
        running_loss = 0.0
        running_acc = 0.0

        for sentence, label in self.td:
            self.optim.zero_grad()
            sentence = sentence.long().to(self.device)
            label = label.to(self.device)
            output = self.model(sentence)

            # reshape output and label to (batch_size*seq_len, # of classes)
            output = output.view(-1, output.size(-1))
            label = label.view(-1, label.size(-1))

            # compute cross-entropy loss
            loss = self.loss_fn(output, torch.argmax(label, dim=1))

            loss.backward()
            self.optim.step()

            running_loss += loss.item()
            running_acc += self.calc_accuracy(output, label)

            del sentence, label, output

        train_loss = running_loss / len(self.td)
        training_acc = running_acc / len(self.td)
        return train_loss, training_acc
    
    @torch.no_grad()
    def validate_epoch_with_script(self):
        if self.validation_file == None or self.validation_data == None:
          pass
        else:
          all_sentences = []
          test_output = []
          true_res = []
          for x, y in self.validation_data:
            x = x.long().to(device)
            y = y.to(device).float()
            all_sentences.append(x)
            output = self.model(x)
            output = output.view(-1, output.size(-1))
            y = y.view(-1, y.size(-1))
            pred = torch.argmax(output, dim=1)
            y = torch.argmax(y, dim=1)
            test_output.append(pred)
            true_res.append(y)
            del x, y, output
          decoded_sentences, decoded_tags_out, decoded_tags_true = detach(all_sentences, test_output, true_res)
          decoded_sentences_blstm2, decoded_tags_out_blstm2, decoded_tags_true_blstm2 = decode_sentences_and_tokens(decoded_sentences, decoded_tags_out, decoded_tags_true)
          write_inference_to_file(decoded_tags_out_blstm2, decoded_tags_true_blstm2, decoded_sentences_blstm2, "blstm2_dev1.out")
          batcmd = f"perl {self.validation_file} < blstm2_dev1.out"
          result = subprocess.check_output(batcmd, shell=True)
          print(result)


    def fit(self):
        train_losses,train_accs = [], []
        min_vl = float("-inf")

        for epoch in range(self.hyperparams['epochs']):
            print(f"------EPOCH {epoch+1}/{self.hyperparams['epochs']}------")

            self.model.train()

            lr = self.get_lr(epoch)


            train_loss, train_acc = self.train_epoch()
            train_losses.append(train_loss)
            train_accs.append(train_acc)

            print(f"Training LOSS: {train_loss} | ACCURACY: {train_acc} | LR: {lr}")

            if epoch % 3 == 0:
              self.validate_epoch_with_script()
              torch.save({'epoch': epoch + 1,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optim.state_dict(),
                        'loss': train_loss}, 
                        'cmps544_hw4_bstml_model_simple_checkpoint.pth')
              torch.save(self.model, "b_lstm_model_simple.pt")
            cmd = "cp cmps544_hw4_bstml_model_simple_checkpoint.pth /content/drive/MyDrive/Colab/"
            cmd2 = "cp b_lstm_model_simple.pt /content/drive/MyDrive/Colab/"
            result = subprocess.check_output(cmd, shell=True)
            result = subprocess.check_output(cmd2, shell=True)


            # CLEANUP
            gc.collect()
            torch.cuda.empty_cache()

        return (train_losses, train_accs)


In [123]:
import numpy as np
from sklearn.metrics import confusion_matrix

@torch.no_grad()
def infer(model, test_dataloader, trainer = None):

    running_loss = 0
    running_acc = 0
    test_output = []
    true_res = []
    incorrect_examples = []
    incorrect_labels = []
    incorrect_pred = []
    all_x = []

    for x, y in test_dataloader:

        x = x.long().to(device)
        y = y.to(device).float()
        all_x.append(x)
        output = model(x)
        output = output.view(-1, output.size(-1))
        y = y.view(-1, y.size(-1))
        pred = torch.argmax(output, dim=1)
        y = torch.argmax(y, dim=1)
        test_output.append(pred)
        true_res.append(y)
        if trainer:
          running_acc += trainer.calc_accuracy_test(output, y)
          loss = trainer.loss_fn(output, y)
          running_loss += loss.item()
        del x, y, output

    test_loss = running_loss / len(test_dataloader)
    test_acc = running_acc / len(test_dataloader)

    # calculate F1 score, precision, and recall
    test_output_cf = torch.cat(test_output, dim=0).cpu().numpy()
    true_res_cf = torch.cat(true_res, dim=0).cpu().numpy()

    conf_mat = confusion_matrix(true_res_cf, test_output_cf)
    true_positives = np.diag(conf_mat)
    false_positives = np.sum(conf_mat, axis=0) - true_positives
    false_negatives = np.sum(conf_mat, axis=1) - true_positives

    precision = np.sum(true_positives) / (np.sum(true_positives) + np.sum(false_positives))
    recall = np.sum(true_positives) / (np.sum(true_positives) + np.sum(false_negatives))
    f1 = 2 * precision * recall / (precision + recall)

    return test_loss, test_acc, test_output, true_res, f1, precision, recall, all_x


In [26]:
import torch.nn as nn

In [27]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Task 1: Simple Bidirectional LSTM

In [28]:
class SimpleBidirectionalLSTM(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, n_layers, device, dropout_pct, num_classes):
        super(SimpleBidirectionalLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.device = device

        self.embedding_layer = nn.Embedding(num_embeddings, embedding_dim)
        self.dropout_pct = dropout_pct

        self.bilstm = nn.LSTM(input_size=embedding_dim,
                              hidden_size=hidden_dim,
                              num_layers=n_layers,
                              batch_first=True,
                              bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

        # ELU activation function
        self.elu = nn.ELU()

        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_pct)

        # Classifier layer
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        # print(text)
        # print(text.shape)
        # Embedding layer
        embedded = self.embedding_layer(text)

        # BiLSTM layer
        bilstm_output, _ = self.bilstm(embedded)

        # Apply dropout layer
        dropped = self.dropout(bilstm_output)

        # Apply linear layer
        linear_output = self.linear(dropped)

        # Apply ELU activation function
        elu_output = self.elu(linear_output)

        # Apply classifier layer to every time step
        output = self.classifier(elu_output)
        #print(output.shape)

        return output


In [29]:
b_lstm_model = SimpleBidirectionalLSTM(len(word2idx.keys()), 100, 256, 1, device, 0.33, len(state2idx.keys())).to(device)
print(b_lstm_model)

SimpleBidirectionalLSTM(
  (embedding_layer): Embedding(30291, 100)
  (bilstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=256, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=256, out_features=10, bias=True)
)


In [30]:
b_lstm_model.load_state_dict(torch.load("/content/cmps544_hw4_bstml_model_simple_checkpoint.pth")["model_state_dict"])
b_lstm_model.to(device)

SimpleBidirectionalLSTM(
  (embedding_layer): Embedding(30291, 100)
  (bilstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=256, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=256, out_features=10, bias=True)
)

In [31]:
trainer_blstm = Trainer(b_lstm_model, train_dataloader, device, epochs=200, warmup_epochs=1, validation_data = dev_dataloader, validation_file = "conll03eval", lr = 1e-1, label_weights=label_weights)

In [None]:
(train_losses, train_accs) = trainer_blstm.fit()

------EPOCH 1/200------
Training LOSS: 0.00291117375813826 | ACCURACY: 0.40638496740024765 | LR: 0.0
b'processed 51578 tokens with 5942 phrases; found: 5927 phrases; correct: 4271.\naccuracy:  94.26%; precision:  72.06%; recall:  71.88%; FB1:  71.97\n              LOC: precision:  85.79%; recall:  78.55%; FB1:  82.01  1682\n             MISC: precision:  72.76%; recall:  74.73%; FB1:  73.73  947\n              ORG: precision:  62.40%; recall:  68.68%; FB1:  65.39  1476\n              PER: precision:  66.85%; recall:  66.12%; FB1:  66.48  1822\n'
------EPOCH 2/200------
Training LOSS: 0.0026215796066134988 | ACCURACY: 0.40649013539541673 | LR: 0.1
------EPOCH 3/200------
Training LOSS: 0.002785542371687754 | ACCURACY: 0.40641794898813927 | LR: 0.1
------EPOCH 4/200------
Training LOSS: 0.002573491708530303 | ACCURACY: 0.4065614227579458 | LR: 0.1
b'processed 51578 tokens with 5942 phrases; found: 5948 phrases; correct: 4306.\naccuracy:  94.30%; precision:  72.39%; recall:  72.47%; FB1: 

In [None]:
torch.save(b_lstm_model.state_dict(), 'blstm1.pth')

In [38]:
b_lstm_model.eval()

SimpleBidirectionalLSTM(
  (embedding_layer): Embedding(30291, 100)
  (bilstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=256, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=256, out_features=10, bias=True)
)

In [45]:
torch.save(b_lstm_model, "blstm1.pt")

In [128]:
dev_loss, dev_acc, dev_output, dev_true_res, dev_f1, dev_precision, dev_recall, all_sentences = infer(model=b_lstm_model, test_dataloader=dev_dataloader)

In [125]:
test1_loss, test1_acc, test1_output, test1_true_res, test1_f1, test1_precision, test1_recall, test_sentences = infer(model=b_lstm_model, test_dataloader=test_dataloader)

  f1 = 2 * precision * recall / (precision + recall)


In [129]:
print(f'F1 Score on Dev set: {dev_f1}')
print(f'Precision on Dev set: {dev_precision}')
print(f'Recall on Dev set: {dev_recall}')

F1 Score on Dev set: 0.9518011555314281
Precision on Dev set: 0.9518011555314281
Recall on Dev set: 0.9518011555314281


In [41]:
decoded_sentences, decoded_tags_out, decoded_tags_true = detach(all_sentences, dev_output, dev_true_res)

In [42]:
decoded_sentences, decoded_tags_out, decoded_tags_true = decode_sentences_and_tokens(decoded_sentences, decoded_tags_out, decoded_tags_true)

In [82]:
decoded_sentences_t, decoded_tags_out_t, decoded_tags_true_t = detach(test_sentences, test1_output, test1_true_res)

In [83]:
decoded_sentences_t, decoded_tags_out_t, decoded_tags_true_t = decode_sentences_and_tokens(decoded_sentences_t, decoded_tags_out_t, decoded_tags_true_t)

In [90]:
write_inference_to_file(decoded_tags_out, decoded_tags_true, decoded_sentences, "dev1.out", eval = False)

In [91]:
write_inference_to_file(decoded_tags_out_t, decoded_tags_true_t, decoded_sentences_t, "test1.out", eval = False)

In [44]:
!perl conll03eval < dev1.out

processed 51578 tokens with 5942 phrases; found: 5567 phrases; correct: 4425.
accuracy:  95.18%; precision:  79.49%; recall:  74.47%; FB1:  76.90
              LOC: precision:  89.78%; recall:  81.33%; FB1:  85.35  1664
             MISC: precision:  83.20%; recall:  76.79%; FB1:  79.86  851
              ORG: precision:  71.55%; recall:  70.69%; FB1:  71.12  1325
              PER: precision:  73.83%; recall:  69.22%; FB1:  71.45  1727


## Task 2: BLSTM with GloVE Embeddings

In [92]:
EMBEDDING_DIM = 101

In [93]:
import gzip
import numpy as np

def load_glove_embeddings(embedding_file_path):
    embeddings_index = {}
    with gzip.open(embedding_file_path, 'rt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings('/content/drive/MyDrive/Colab/glove.6B.100d.gz')


In [94]:
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))    
    np.random.seed(123)  # Set a fixed seed for reproducibility
    mean = 0
    stddev = 1
    unk_vec = np.random.normal(mean, stddev, size=(100,))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is None:
          embedding_vector = unk_vec        
        casing_encoding = np.array([int(word[0].isupper())])
        combined_vector = np.hstack((embedding_vector, casing_encoding))
        embedding_matrix[i] = combined_vector
    return embedding_matrix



In [95]:
embedding_mat = create_embedding_matrix(word2idx, glove_embeddings, EMBEDDING_DIM)

In [97]:
class GloVEBidirectionalLSTM(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, n_layers, device, dropout_pct, num_classes, embedding_mat):
        super(GloVEBidirectionalLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.device = device
      

        # Create embedding layer
        self.embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_mat), freeze=True)
        self.dropout_pct = dropout_pct

        self.bilstm = nn.LSTM(input_size=embedding_dim,
                              hidden_size=hidden_dim,
                              num_layers=n_layers,
                              batch_first=True,
                              bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

        # ELU activation function
        self.elu = nn.ELU()

        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_pct)

        # Classifier layer
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        # print(text)
        # print(text.shape)
        # Embedding layer
        embedded = self.embedding_layer(text)

        # BiLSTM layer
        bilstm_output, _ = self.bilstm(embedded)

        # Apply dropout layer
        dropped = self.dropout(bilstm_output)

        # Apply linear layer
        linear_output = self.linear(dropped)

        # Apply ELU activation function
        elu_output = self.elu(linear_output)

        # Apply classifier layer to every time step
        output = self.classifier(elu_output)
        #print(output.shape)

        return output


In [98]:
b_lstm_model_glove = GloVEBidirectionalLSTM(len(word2idx.keys()), EMBEDDING_DIM, 256, 1, device, 0.33, len(state2idx.keys()), embedding_mat).to(device)

In [99]:
b_lstm_model_glove.load_state_dict(torch.load("/content/cmps544_hw4_bstml_model_checkpoint.pth")["model_state_dict"])

<All keys matched successfully>

In [103]:
trainer_blstm_2 = Trainer(b_lstm_model_glove, train_dataloader, device, epochs=100, warmup_epochs=20, validation_data = dev_dataloader, validation_file = "conll03eval", lr = 1e-3, optim_type = 'SGD', label_weights=label_weights)

In [None]:
(train_losses, train_accs) = trainer_blstm_2.fit()

------EPOCH 1/100------
Training LOSS: 0.027388364530295473 | ACCURACY: 0.9917700161406517 | LR: 0.0
b'processed 51578 tokens with 5942 phrases; found: 6101 phrases; correct: 5248.\naccuracy:  97.91%; precision:  86.02%; recall:  88.32%; FB1:  87.15\n              LOC: precision:  89.09%; recall:  93.30%; FB1:  91.15  1924\n             MISC: precision:  76.70%; recall:  80.69%; FB1:  78.65  970\n              ORG: precision:  79.67%; recall:  78.90%; FB1:  79.28  1328\n              PER: precision:  92.18%; recall:  94.03%; FB1:  93.09  1879\n'
------EPOCH 2/100------
Training LOSS: 0.027832139971334085 | ACCURACY: 0.9914834325154747 | LR: 5e-05
------EPOCH 3/100------
Training LOSS: 0.02816464930293323 | ACCURACY: 0.991720224221421 | LR: 0.0001
------EPOCH 4/100------
Training LOSS: 0.02814234041544249 | ACCURACY: 0.9912678122019191 | LR: 0.00015000000000000001
b'processed 51578 tokens with 5942 phrases; found: 6141 phrases; correct: 5276.\naccuracy:  97.98%; precision:  85.91%; reca

KeyboardInterrupt: ignored

In [108]:
b_lstm_model_glove.eval()

GloVEBidirectionalLSTM(
  (embedding_layer): Embedding(30292, 101)
  (bilstm): LSTM(101, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=256, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (classifier): Linear(in_features=256, out_features=10, bias=True)
)

In [113]:
torch.save(b_lstm_model_glove, "blstm2.pt")

In [109]:
dev_loss_blstm2, dev_acc_blstm2, dev_output_blstm2, dev_true_res_blstm2, dev_f1_blstm2, dev_precision_blstm2, dev_recall_blstm2, all_sentences_blstm2 = infer(trainer_blstm_2, b_lstm_model_glove, dev_dataloader)

In [110]:
decoded_sentences_blstm2, decoded_tags_out_blstm2, decoded_tags_true_blstm2 = detach(all_sentences_blstm2, dev_output_blstm2, dev_true_res_blstm2)
decoded_sentences_blstm2, decoded_tags_out_blstm2, decoded_tags_true_blstm2 = decode_sentences_and_tokens(decoded_sentences_blstm2, decoded_tags_out_blstm2, decoded_tags_true_blstm2)

In [116]:
test_loss_blstm2, test_acc_blstm2, test_output_blstm2, test_true_res_blstm2, test_f1_blstm2, test_precision_blstm2, test_recall_blstm2, test_sentences_blstm2 = infer(trainer_blstm_2, b_lstm_model_glove, test_dataloader)

  f1 = 2 * precision * recall / (precision + recall)


In [117]:
decoded_sentences_blstm2_t, decoded_tags_out_blstm2_t, decoded_tags_true_blstm2_t = detach(test_sentences_blstm2, test_output_blstm2, test_true_res_blstm2)
decoded_sentences_blstm2_t, decoded_tags_out_blstm2_t, decoded_tags_true_blstm2_t = decode_sentences_and_tokens(decoded_sentences_blstm2_t, decoded_tags_out_blstm2_t, decoded_tags_true_blstm2_t)

In [114]:
write_inference_to_file(decoded_tags_out_blstm2, decoded_tags_true_blstm2, decoded_sentences_blstm2, "dev2.out", eval = False)

In [119]:
write_inference_to_file(decoded_tags_out_blstm2_t, decoded_tags_true_blstm2_t, decoded_sentences_blstm2_t, "test2.out", eval = False)

In [112]:
!perl conll03eval < blstm2_dev1.out

processed 51578 tokens with 5942 phrases; found: 6044 phrases; correct: 5349.
accuracy:  98.21%; precision:  88.50%; recall:  90.02%; FB1:  89.25
              LOC: precision:  92.06%; recall:  93.36%; FB1:  92.70  1863
             MISC: precision:  79.41%; recall:  84.06%; FB1:  81.66  976
              ORG: precision:  83.36%; recall:  82.18%; FB1:  82.76  1322
              PER: precision:  93.31%; recall:  95.39%; FB1:  94.34  1883


## Task 3: BLSTM - CNN

In [None]:
class GloVEBidirectionalLSTMCNN(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, n_layers, device, dropout_pct, num_classes, embedding_mat):
        super(GloVEBidirectionalLSTMCNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.device = device
      

        # Create embedding layer
        self.embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_mat), freeze=True)
        self.dropout_pct = dropout_pct

        self.bilstm = nn.LSTM(input_size=embedding_dim,
                              hidden_size=hidden_dim,
                              num_layers=n_layers,
                              batch_first=True,
                              bidirectional=True)

        # Linear layer
        self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

        # ELU activation function
        self.elu = nn.ELU()

        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_pct)

        # Classifier layer
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        # print(text)
        # print(text.shape)
        # Embedding layer
        embedded = self.embedding_layer(text)

        # BiLSTM layer
        bilstm_output, _ = self.bilstm(embedded)

        # Apply dropout layer
        dropped = self.dropout(bilstm_output)

        # Apply linear layer
        linear_output = self.linear(dropped)

        # Apply ELU activation function
        elu_output = self.elu(linear_output)

        # Apply classifier layer to every time step
        output = self.classifier(elu_output)
        #print(output.shape)

        return output


In [None]:
class CharCNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_filters, filter_sizes, output_dim):
        super(CharCNN, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # Convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        
        # Fully-connected layer
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        
        # Activation function
        self.activation = nn.ReLU()
        
    def forward(self, x):
        # Convert input to embeddings
        x = self.embedding(x)
        
        # Transpose tensor for Conv1d input
        x = x.permute(0, 2, 1)
        
        # Apply convolutional filters and activation
        conv_outputs = []
        for conv in self.convs:
            conv_output = conv(x)
            conv_output = self.activation(conv_output)
            conv_output = nn.functional.max_pool1d(conv_output, conv_output.shape[2])
            conv_outputs.append(conv_output.squeeze())
        
        # Concatenate convolutional outputs
        x = torch.cat(conv_outputs, dim=1)
        
        # Apply fully-connected layer
        x = self.fc(x)
        
        return x