In [1]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, AutoModelForQuestionAnswering, AutoTokenizer
import os
import re
import string
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import import_ipynb
import extract_transform_evaluate_ds as docstrideFn
from torch.optim.lr_scheduler import ExponentialLR

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(self, dataloader, optimizer, scheduler):
    """Run one training epoch and return ``(mean_accuracy, mean_loss)``.

    Despite its name, ``self`` is just the first positional argument: the
    model object being trained. This function requires it to provide
    ``train()``, a ``device`` attribute, ``find_focal_loss(...)`` and to be
    callable with ``input_ids``/``attention_mask`` keyword arguments,
    returning a ``(start_res, end_res)`` pair.

    Args:
        self: Model to train (see above).
        dataloader: Iterable of batches. Each batch is indexed with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'start_positions'`` and
            ``'end_positions'``; every value must support ``.to(device)``.
        optimizer: Optimizer; ``zero_grad()`` and ``step()`` are called once
            per batch.
        scheduler: Learning-rate scheduler; ``step()`` is called once, after
            all batches have been processed.

    Returns:
        tuple[float, float]: The mean of all per-batch start and end
        accuracies (two entries per batch), and the mean per-batch loss.

    Raises:
        ValueError: If ``dataloader`` yields no batches. Previously this
            surfaced as a ``ZeroDivisionError`` from the final averaging,
            after the scheduler had already been stepped.
    """
    self.train()
    accuracy_arr, loss_arr = [], []
    for batch in tqdm(dataloader, desc='Training Started!'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        start_pos = batch['start_positions'].to(self.device)
        end_pos = batch['end_positions'].to(self.device)

        start_res, end_res = self(input_ids=input_ids, attention_mask=attention_mask)
        loss = self.find_focal_loss(start_res, end_res, start_pos, end_pos)
        loss_arr.append(loss.item())
        loss.backward()
        optimizer.step()

        # The prediction is the argmax over dim 1; accuracy is the fraction of
        # examples in the batch whose predicted index equals the target.
        # Start accuracy is recorded first, then end accuracy.
        for scores, target in ((start_res, start_pos), (end_res, end_pos)):
            predicted = torch.argmax(scores, dim=1)
            accuracy_arr.append(((predicted == target).sum() / len(predicted)).item())

    if not loss_arr:
        # Fail loudly, and before touching the scheduler, instead of dividing
        # by zero in the averages below.
        raise ValueError("train_model received an empty dataloader; there is nothing to train on.")

    # The learning rate is decayed once per epoch, not once per batch.
    scheduler.step()
    return sum(accuracy_arr) / len(accuracy_arr), sum(loss_arr) / len(loss_arr)

def plot_metrics(epochs, train_losses, train_accuracies, wer_scores, f1_scores):
    """Plot four per-epoch metrics as line charts in a 2x2 grid and show them.

    Args:
        epochs: X-axis values shared by all four panels.
        train_losses: Values for the 'Training Loss' panel.
        train_accuracies: Values for the 'Training Accuracy' panel.
        wer_scores: Values for the 'Word Error Rate (WER)' panel.
        f1_scores: Values for the 'F1 Score' panel.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # (title, y-axis label, values, line colour) for each panel, in grid order.
    # The y-label is spelled out explicitly: deriving it from the second word
    # of the title labelled the WER panel "Error".
    panels = [
        ('Training Loss', 'Loss', train_losses, 'blue'),
        ('Training Accuracy', 'Accuracy', train_accuracies, 'green'),
        ('Word Error Rate (WER)', 'WER', wer_scores, 'red'),
        ('F1 Score', 'Score', f1_scores, 'brown'),
    ]

    plt.figure(figsize=(12, 8))

    for position, (title, y_label, values, color) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(epochs, values, marker='o', linestyle='-', color=color)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.show()

In [2]:
# Noise V1: evaluate on the WER44 test file. A saved checkpoint is reused when
# one exists; otherwise the model is fine-tuned first and then saved.
train_data = '../spoken_train-v1.1.json'
test_data = '../spoken_test-v1.1_WER44.json'
MODEL_PATH = "bert-large-uncased-whole-word-masking-finetuned-squad"
MODEL_SAVE_PATH = "qa_strong_model"
WEIGHTS_PATH = os.path.join(MODEL_SAVE_PATH, 'model_weights.pt')
# Single source of truth for the epoch count: drives both the training loop
# and the x-axis of the metric plots.
NUM_EPOCHS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# 512 is passed through to the project helper unchanged
# (presumably the maximum sequence length - TODO confirm).
train_encodings = docstrideFn.collect_and_find_positions(train_data, tokenizer, 512)
test_encodings = docstrideFn.collect_and_find_positions(test_data, tokenizer, 512)

train_set = docstrideFn.SpokenSquad(train_encodings)
test_set = docstrideFn.SpokenSquad(test_encodings)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(test_set, batch_size=1)

bert_large_uncased_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
qa_model = docstrideFn.QAModel(bert_large_uncased_model, device)
optimizer = AdamW(qa_model.parameters(), lr=2e-5, weight_decay=2e-2)

print("Current working directory:", os.getcwd())
print("MODEL_SAVE_PATH:", MODEL_SAVE_PATH)
print("Directory exists:", os.path.isdir(MODEL_SAVE_PATH))
scheduler = ExponentialLR(optimizer, gamma=0.9)
# Guard on the weights file itself rather than on the directory, so an empty
# or partially written save directory leads to retraining instead of a
# FileNotFoundError on load.
if os.path.isfile(WEIGHTS_PATH):
    # Just Evaluate
    # map_location lets a checkpoint written during a GPU run load on a
    # CPU-only machine (and vice versa).
    qa_model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
    qa_model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
    print(f"{MODEL_SAVE_PATH} Model loaded!")
    avg_f1_score, wer_score = qa_model.evaluate_model(test_loader, tokenizer)
    print(f"WER Score: {wer_score}")
    print(f"F1 Score: {avg_f1_score}")
else:
    # Train and Evaluate
    wer_scores, accuracies, losses, f1_scores = [], [], [], []
    for epoch in range(1, NUM_EPOCHS + 1):
        print(f'Epoch - {epoch}')
        accuracy, loss = train_model(qa_model, train_loader, optimizer, scheduler)
        accuracies.append(accuracy)
        losses.append(loss)
        print(f"Train Accuracy: {accuracy} and Train Loss: {loss}")

        avg_f1_score, wer_score = qa_model.evaluate_model(test_loader, tokenizer)
        f1_scores.append(avg_f1_score)
        wer_scores.append(wer_score)
        print(f"F1 Score: {avg_f1_score} and WER Score: {wer_score}")

    # exist_ok: the directory may already exist (without weights) when this
    # branch is reached.
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    torch.save(qa_model.state_dict(), WEIGHTS_PATH)
    tokenizer.save_pretrained(MODEL_SAVE_PATH)
    plot_metrics(range(1, NUM_EPOCHS + 1), losses, accuracies, wer_scores, f1_scores)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The

Current working directory: /home/lmadipa/HW3/Strong
MODEL_SAVE_PATH: qa_strong_model
Directory exists: True
qa_strong_model Model loaded!


Evaluating Model!: 100%|██████████| 17841/17841 [03:53<00:00, 76.32it/s]


WER Score: 1.8800556801484805
F1 Score: 0.4884262998399399


In [3]:
# Noise V2: evaluate on the WER54 test file. A saved checkpoint is reused when
# one exists; otherwise the model is fine-tuned first and then saved.
train_data = '../spoken_train-v1.1.json'
test_data = '../spoken_test-v1.1_WER54.json'
MODEL_PATH = "bert-large-uncased-whole-word-masking-finetuned-squad"
MODEL_SAVE_PATH = "qa_strong_model"
WEIGHTS_PATH = os.path.join(MODEL_SAVE_PATH, 'model_weights.pt')
# Single source of truth for the epoch count: drives both the training loop
# and the x-axis of the metric plots.
NUM_EPOCHS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# 512 is passed through to the project helper unchanged
# (presumably the maximum sequence length - TODO confirm).
train_encodings = docstrideFn.collect_and_find_positions(train_data, tokenizer, 512)
test_encodings = docstrideFn.collect_and_find_positions(test_data, tokenizer, 512)

train_set = docstrideFn.SpokenSquad(train_encodings)
test_set = docstrideFn.SpokenSquad(test_encodings)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(test_set, batch_size=1)

bert_large_uncased_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
qa_model = docstrideFn.QAModel(bert_large_uncased_model, device)
optimizer = AdamW(qa_model.parameters(), lr=2e-5, weight_decay=2e-2)

scheduler = ExponentialLR(optimizer, gamma=0.9)
# Guard on the weights file itself rather than on the directory, so an empty
# or partially written save directory leads to retraining instead of a
# FileNotFoundError on load.
if os.path.isfile(WEIGHTS_PATH):
    # Just Evaluate
    # map_location lets a checkpoint written during a GPU run load on a
    # CPU-only machine (and vice versa).
    qa_model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
    qa_model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
    print(f"{MODEL_SAVE_PATH} Model loaded!")
    avg_f1_score, wer_score = qa_model.evaluate_model(test_loader, tokenizer)
    print(f"WER Score: {wer_score}")
    print(f"F1 Score: {avg_f1_score}")
else:
    # Train and Evaluate
    wer_scores, accuracies, losses, f1_scores = [], [], [], []
    for epoch in range(1, NUM_EPOCHS + 1):
        print(f'Epoch - {epoch}')
        accuracy, loss = train_model(qa_model, train_loader, optimizer, scheduler)
        accuracies.append(accuracy)
        losses.append(loss)
        print(f"Train Accuracy: {accuracy} and Train Loss: {loss}")

        avg_f1_score, wer_score = qa_model.evaluate_model(test_loader, tokenizer)
        f1_scores.append(avg_f1_score)
        wer_scores.append(wer_score)
        print(f"F1 Score: {avg_f1_score} and WER Score: {wer_score}")

    # exist_ok: the directory may already exist (without weights) when this
    # branch is reached.
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    torch.save(qa_model.state_dict(), WEIGHTS_PATH)
    tokenizer.save_pretrained(MODEL_SAVE_PATH)
    plot_metrics(range(1, NUM_EPOCHS + 1), losses, accuracies, wer_scores, f1_scores)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  qa_model.load_state_dict(torch.load(os.path.join(MODEL_SAVE_PATH, 'model_weights.pt')))


qa_strong_model Model loaded!


Evaluating Model!: 100%|██████████| 17841/17841 [03:53<00:00, 76.35it/s]


WER Score: 2.716631736995214
F1 Score: 0.40775785935154174
