#Setup

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#Process Dataset

In [None]:
import json
def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/dataset/spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/dataset/spoken_test-v1.1.json')
val_contexts44, val_questions44, val_answers44 = read_squad('/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/dataset/spoken_test-v1.1_WER44.json')
val_contexts54, val_questions54, val_answers54 = read_squad('/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/dataset/spoken_test-v1.1_WER54.json')

In [None]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2, 3, 4]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.0 (from python-Levenshtein)
  Downloading Levenshtein-0.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.25.0->python-Levenshtein)
  Downloading rapidfuzz-3.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.0 python-Levenshtein-0.25.0 rapid

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            best_match, score = process.extractOne(gold_text, [context[start_idx-n:end_idx+n] for n in range(-4, 5)], scorer=fuzz.partial_ratio)

            if score > 40:
                match_start_idx = context.find(best_match, start_idx - 4)
                if match_start_idx != -1:
                    answer['answer_start'] = match_start_idx
                    answer['answer_end'] = match_start_idx + len(best_match)

add_end_idx(val_answers44, val_contexts44)
add_end_idx(val_answers54, val_contexts54)



In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
val_encodings44 = tokenizer(val_contexts44, val_questions44, truncation=True, padding=True)
val_encodings54 = tokenizer(val_contexts54, val_questions54, truncation=True, padding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

            shift = 1
            while end_positions[-1] is None:
                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
                shift += 1

        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if any(v is None for v in item.values()):
            item = {key: torch.tensor(val if val is not None else 0) for key, val in item.items()}
        else:
            item = {key: torch.tensor(val) for key, val in item.items()}
        return item


    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
val_dataset44 = SquadDataset(val_encodings44)
val_dataset54 = SquadDataset(val_encodings54)

#Load/Train Huggingface model

In [None]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from accelerate import Accelerator
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=2e-6)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

num_training_steps = len(train_loader) * 30
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=0, num_training_steps=num_training_steps
)
accelerator = Accelerator()
model, optimizer, training_dataloader, scheduler = accelerator.prepare(model, optim, train_loader, scheduler)
for epoch in range(30):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        accelerator.backward(loss)
        optim.step()
        scheduler.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(), lr=optim.param_groups[0]['lr'])

Epoch 0: 100%|██████████| 2320/2320 [28:47<00:00,  1.34it/s, loss=1.94, lr=1.93e-6]
Epoch 1: 100%|██████████| 2320/2320 [28:50<00:00,  1.34it/s, loss=1.33, lr=1.87e-6]
Epoch 2: 100%|██████████| 2320/2320 [28:50<00:00,  1.34it/s, loss=1.13, lr=1.8e-6]


In [None]:
model.save_pretrained("/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom")
tokenizer.save_pretrained("/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom")

('/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom/tokenizer_config.json',
 '/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom/special_tokens_map.json',
 '/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom/vocab.txt',
 '/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/distilbert-custom/added_tokens.json')

#Load Pre-Trained Model

In [None]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/custom")
model = DistilBertForQuestionAnswering.from_pretrained("/content/drive/MyDrive/CPSC 8430 Deep Learning/HW3/models/custom")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#Evaluate Model

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm


*Run for standard test evaluation*

In [13]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)
acc = []

loop = tqdm(val_loader)
answers = []
references = []
for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = ' '.join(all_tokens[start_pred[i] : end_pred[i]+1])
            ref = ' '.join(all_tokens[start_true[i] : end_true[i]+1])
            ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
            answer = tokenizer.decode(ans_ids)
            answers.append(answer)
            references.append(ref)

 19%|█▊        | 186/993 [39:00<2:49:16, 12.59s/it]


KeyboardInterrupt: 

*Run for WER44 test evaluation*

In [None]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)
acc = []

loop = tqdm(val_loader)
answers = []
references = []
for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = ' '.join(all_tokens[start_pred[i] : end_pred[i]+1])
            ref = ' '.join(all_tokens[start_true[i] : end_true[i]+1])
            ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
            answer = tokenizer.decode(ans_ids)
            answers.append(answer)
            references.append(ref)

*Run for WER54 test evaluation*

In [None]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)
acc = []

loop = tqdm(val_loader)
answers = []
references = []
for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = ' '.join(all_tokens[start_pred[i] : end_pred[i]+1])
            ref = ' '.join(all_tokens[start_true[i] : end_true[i]+1])
            ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
            answer = tokenizer.decode(ans_ids)
            answers.append(answer)
            references.append(ref)

*Run for score calculation*

In [None]:
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    if len(scores_for_ground_truths)==0: return 0
    return max(scores_for_ground_truths)

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
          f1_score, prediction, [ground_truths])

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'f1': f1}

In [None]:
evaluate(references,answers)

{'f1': 11.305952767864614}