In [1]:
import collections
import json
import os

import datasets
import evaluate
import numpy as np
import torch
from accelerate import Accelerator, notebook_launcher
from huggingface_hub import Repository, get_full_repo_name, notebook_login
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModel, default_data_collator, get_scheduler

In [2]:
# Raw input files, one per split. The WER44 / WER54 suffixes presumably denote
# noisier transcriptions of the same test set (TODO confirm against the data source).
train, test, test_44, test_54 = (
    'spoken_train-v1.1.json',
    'spoken_test-v1.1.json',
    'spoken_test-v1.1_WER44.json',
    'spoken_test-v1.1_WER54.json',
)

def preprocess(file):
    """Flatten a nested SQuAD-style JSON file into one record per question.

    The input is expected to look like
    ``{"data": [{"title", "paragraphs": [{"context", "qas": [{"id", "question",
    "answers": [{"answer_start", "text"}]}]}]}]}``.
    Each question becomes a flat dict with the keys ``id``, ``context``,
    ``title``, ``question`` and ``answers`` (``{"answer_start": [...],
    "text": [...]}``). The records are written as ``{"data": [...]}`` to a file
    named ``out_<original name>`` in the same directory as ``file``.

    Args:
        file: Path of the JSON file to flatten.

    Returns:
        Path of the flattened JSON file that was written.
    """
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = []
    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            raw_context = paragraph['context']
            context = raw_context.strip()
            # Stripping removes leading whitespace, which shifts every character
            # offset. Answer starts are shifted by the same amount so they keep
            # pointing at the answer text inside the stripped context.
            shift = len(raw_context) - len(raw_context.lstrip())
            for qa in paragraph['qas']:
                records.append({
                    'id': qa['id'],
                    'context': context,
                    'title': title.strip(),
                    'question': qa['question'].strip(),
                    'answers': {
                        'answer_start': [max(0, a['answer_start'] - shift) for a in qa['answers']],
                        'text': [a['text'] for a in qa['answers']],
                    },
                })

    # Prefix the file name only, so inputs given with a directory component
    # ("data/x.json") produce "data/out_x.json" rather than "out_data/x.json".
    directory, filename = os.path.split(file)
    output = os.path.join(directory, 'out_' + filename)
    with open(output, 'w', encoding='utf-8') as f:
        json.dump({'data': records}, f)
    return output

# Flatten every raw file; each name is rebound to the path of its flattened copy.
train, test, test_44, test_54 = (
    preprocess(path) for path in (train, test, test_44, test_54)
)

# Load the flattened files as named splits; the records sit under the top-level "data" key.
dataset = datasets.load_dataset(
    'json',
    data_files={
        'train': train,
        'test': test,
        'test_44': test_44,
        'test_54': test_54,
    },
    field='data',
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating test_44 split: 0 examples [00:00, ? examples/s]

Generating test_54 split: 0 examples [00:00, ? examples/s]

In [3]:
# Model and tokenizer are loaded from the same checkpoint name so they cannot drift apart.
checkpoint = "rein5/bert-base-uncased-finetuned-spoken-squad"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
def fnc(l):
    """Tokenize a batch of training examples and attach answer token labels.

    Args:
        l: Batch dict with the columns ``question``, ``context`` and ``answers``
            (one ``{"answer_start": [...], "text": [...]}`` dict per example).

    Returns:
        The tokenizer output, without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, plus ``start_positions`` and
        ``end_positions``: one token index pair per produced feature. A feature
        whose context window does not fully contain the first answer is
        labelled ``(0, 0)``.
    """
    questions = [text.strip() for text in l['question']]
    encoded = tokenizer(
        questions,
        l['context'],
        max_length=384,
        stride=64,
        truncation='only_second',
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )
    offset_maps = encoded.pop('offset_mapping')
    feature_to_example = encoded.pop('overflow_to_sample_mapping')
    answers = l['answers']

    start_positions, end_positions = [], []
    for feat_idx, offsets in enumerate(offset_maps):
        # Only the first annotated answer of the source example is used.
        answer = answers[feature_to_example[feat_idx]]
        answer_begin = answer['answer_start'][0]
        answer_finish = answer_begin + len(answer["text"][0])

        # Locate the token range whose sequence id is 1 (the second input, i.e. the context).
        seq_ids = encoded.sequence_ids(feat_idx)
        ctx_first = 0
        while seq_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while seq_ids[ctx_last] == 1:
            ctx_last += 1
        ctx_last -= 1

        answer_inside = (
            offsets[ctx_first][0] <= answer_begin
            and offsets[ctx_last][1] >= answer_finish
        )
        if not answer_inside:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        tok = ctx_first
        while tok <= ctx_last and offsets[tok][0] <= answer_begin:
            tok += 1
        start_positions.append(tok - 1)

        # Walk backward to the first token ending at or after the answer end.
        tok = ctx_last
        while tok >= ctx_first and offsets[tok][1] >= answer_finish:
            tok -= 1
        end_positions.append(tok + 1)

    encoded['start_positions'] = start_positions
    encoded['end_positions'] = end_positions
    return encoded

# Run fnc over the train split in batches; the original columns are removed so only
# the values returned by fnc (tokenizer outputs plus start/end positions) remain.
train_data = dataset['train'].map(fnc, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

In [5]:
def fnc1(l):
    """Tokenize a batch of evaluation examples and keep what post-processing needs.

    Args:
        l: Batch dict with the columns ``id``, ``question`` and ``context``.

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping``) with two
        changes: ``example_id`` holds, per feature, the id of the example it
        was cut from, and every ``offset_mapping`` entry whose token does not
        have sequence id 1 is replaced by ``None``.
    """
    questions = [text.strip() for text in l['question']]
    encoded = tokenizer(
        questions,
        l['context'],
        max_length=384,
        stride=64,
        truncation='only_second',
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )
    feature_to_example = encoded.pop('overflow_to_sample_mapping')

    example_ids = []
    for feat_idx in range(len(encoded['input_ids'])):
        example_ids.append(l["id"][feature_to_example[feat_idx]])

        # Keep character offsets only for tokens of the second input (the context),
        # so later code can tell context tokens apart from everything else.
        seq_ids = encoded.sequence_ids(feat_idx)
        masked_offsets = []
        for pos, span in enumerate(encoded['offset_mapping'][feat_idx]):
            masked_offsets.append(span if seq_ids[pos] == 1 else None)
        encoded["offset_mapping"][feat_idx] = masked_offsets

    encoded['example_id'] = example_ids
    return encoded

# Build evaluation features for the three test splits with fnc1; the original
# columns are dropped so only fnc1's outputs remain.
val_data, test_44_data, test_54_data = (
    dataset[split].map(fnc1, batched=True, remove_columns=dataset[split].column_names)
    for split in ('test', 'test_44', 'test_54')
)

Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

In [6]:
# Metric object loaded under the name "squad"; fnc2 calls W.compute(predictions=..., references=...)
# and the final report reads the 'exact_match' and 'f1' entries of its result.
W = evaluate.load("squad")
def fnc2(x, y, f, l, n_best=20, max_answer_length=30):
    """Turn start/end logits into text predictions and score them with ``W``.

    For every example, all features cut from it are searched for the
    (start token, end token) pair with the highest summed logit, restricted to
    the ``n_best`` highest-scoring starts and ends of each feature, to tokens
    that have a character offset (entries that are ``None`` are skipped) and to
    spans of at most ``max_answer_length`` tokens. The predicted text is the
    slice of the example's context between the chosen character offsets; an
    example with no valid span gets the empty string.

    Args:
        x: Start logits, indexable by feature position (one row per feature).
        y: End logits, same layout as ``x``.
        f: Features; each item provides ``example_id`` and ``offset_mapping``.
        l: Examples; each item provides ``id``, ``context`` and ``answers``.
        n_best: How many top start and top end indices to consider per feature.
        max_answer_length: Maximum span length in tokens.

    Returns:
        Whatever ``W.compute(predictions=..., references=...)`` returns.
    """
    # Group feature positions by the example they were cut from.
    features_by_example = collections.defaultdict(list)
    for feat_idx, feature in enumerate(f):
        features_by_example[feature["example_id"]].append(feat_idx)

    predictions = []
    for example in tqdm(l):
        example_id = example["id"]
        context = example["context"]
        candidates = []
        for feat_idx in features_by_example[example_id]:
            start_logits = x[feat_idx]
            end_logits = y[feat_idx]
            offsets = f[feat_idx]["offset_mapping"]
            # Indices of the n_best largest logits, best first.
            top_starts = np.argsort(start_logits)[-1:-n_best - 1:-1].tolist()
            top_ends = np.argsort(end_logits)[-1:-n_best - 1:-1].tolist()
            for start_idx in top_starts:
                if offsets[start_idx] is None:
                    # A start outside the context can never yield a valid span.
                    continue
                for end_idx in top_ends:
                    if offsets[end_idx] is None:
                        continue
                    if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                        continue
                    candidates.append({
                        "text": context[offsets[start_idx][0]:offsets[end_idx][1]],
                        "logit_score": start_logits[start_idx] + end_logits[end_idx],
                    })
        if candidates:
            best = max(candidates, key=lambda candidate: candidate["logit_score"])
            predictions.append({"id": example_id, "prediction_text": best["text"]})
        else:
            predictions.append({"id": example_id, "prediction_text": ""})

    references = [{"id": example["id"], "answers": example["answers"]} for example in l]
    return W.compute(predictions=predictions, references=references)

In [7]:
# Training features are handed to the model as torch tensors.
train_data.set_format("torch")

# example_id and offset_mapping are dropped from the copies that go through the
# DataLoaders; the full feature sets (val_data, ...) are kept for post-processing.
val_, WER44_, WER54_ = (
    features.remove_columns(["example_id", "offset_mapping"])
    for features in (val_data, test_44_data, test_54_data)
)
val_.set_format("torch")
WER44_.set_format("torch")
WER54_.set_format("torch")

# Only the training loader shuffles; all loaders use batches of 8.
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=default_data_collator)
test_loader, WER44_loader, WER54_loader = (
    DataLoader(split, batch_size=8, collate_fn=default_data_collator)
    for split in (val_, WER44_, WER54_)
)

In [8]:
def eval_(model, dataloader, dataset, dataset_, acc=None):
    """Run the model over ``dataloader`` and score its predictions with ``fnc2``.

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``start_logits`` and ``end_logits``.
        dataloader: Iterable of batches (dicts of keyword arguments for the model).
        dataset: Features matching the batches; its length caps the number of
            logit rows kept, and it is forwarded to ``fnc2``.
        dataset_: Examples forwarded to ``fnc2`` as the reference set.
        acc: Accelerator to gather logits with. When falsy, a new fp16
            ``Accelerator`` is created and used to prepare ``model`` and
            ``dataloader``; when given, both are used as passed in.

    Returns:
        The value returned by ``fnc2``.
    """
    if not acc:
        acc = Accelerator(mixed_precision='fp16')
        model, dataloader = acc.prepare(model, dataloader)
    model.eval()

    start_chunks, end_chunks = [], []
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        start_chunks.append(acc.gather(outputs.start_logits).cpu().numpy())
        end_chunks.append(acc.gather(outputs.end_logits).cpu().numpy())

    # Keep at most one logit row per feature; any extra gathered rows are dropped.
    n_features = len(dataset)
    start_logits = np.concatenate(start_chunks)[:n_features]
    end_logits = np.concatenate(end_chunks)[:n_features]
    return fnc2(start_logits, end_logits, dataset, dataset_)

def train_(model=model, train_loader=train_loader, test_loader=test_loader, epochs=1):
    """Fine-tune ``model`` and evaluate and save it after every epoch.

    Note that the default arguments are bound to the notebook-level ``model``,
    ``train_loader`` and ``test_loader`` at the moment this cell is executed.
    Evaluation always scores against ``val_data`` / ``dataset['test']``, so a
    custom ``test_loader`` must iterate over those same features.

    Args:
        model: Model to train; called as ``model(**batch)`` and expected to
            return an output with a ``loss`` attribute.
        train_loader: DataLoader yielding training batches.
        test_loader: DataLoader yielding evaluation batches.
        epochs: Number of passes over ``train_loader``.

    Side effects:
        Saves the unwrapped model to the directory
        ``bert-base-uncased-finetuned-spoken-squad`` after each epoch.
    """
    acc = Accelerator(mixed_precision='fp16')
    # torch.optim.AdamW is reachable through the existing `import torch`;
    # the bare name `AdamW` was never imported in this notebook.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    model, optimizer, train_dataloader, eval_dataloader = acc.prepare(model, optimizer, train_loader, test_loader)

    # The step count must be computed after prepare(): the prepared loader is the
    # one iterated below, and it is not defined before this point.
    steps = epochs * len(train_dataloader)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=steps)

    pr = tqdm(range(steps))
    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            acc.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            pr.update(1)

        acc.print("Eval...")
        # Use the prepared evaluation loader, and the 'test' split as reference:
        # the loaded dataset has no 'validation' split.
        m__ = eval_(model, eval_dataloader, val_data, dataset['test'], acc)
        acc.print(f"epoch {epoch}:", m__)

        acc.wait_for_everyone()
        model_ = acc.unwrap_model(model)
        model_.save_pretrained("bert-base-uncased-finetuned-spoken-squad", save_function=acc.save)
    pr.close()

# Score the loaded checkpoint on each test split. train_ is not called in the
# visible cells, so these numbers describe the checkpoint as downloaded.
test__ = eval_(model, test_loader, val_data, dataset['test'])
WER44__ = eval_(model, WER44_loader, test_44_data, dataset['test_44'])
WER54__ = eval_(model, WER54_loader, test_54_data, dataset['test_54'])

print("\n**************************************** RESULTS ****************************************")
print('Test Set  - Exact match: {:.2f}, F1 score: {:.2f}'.format(test__['exact_match'], test__['f1']))
print('WER44 Set - Exact match: {:.2f}, F1 score: {:.2f}'.format(WER44__['exact_match'], WER44__['f1']))
# This line reports the WER54 scores; it was previously mislabelled "WER44 Set".
print('WER54 Set - Exact match: {:.2f}, F1 score: {:.2f}'.format(WER54__['exact_match'], WER54__['f1']))

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
100%|██████████| 678/678 [01:27<00:00,  7.77it/s]
100%|██████████| 5351/5351 [00:07<00:00, 758.17it/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
100%|██████████| 679/679 [01:25<00:00,  7.93it/s]
100%|██████████| 5351/5351 [00:06<00:00, 778.57it/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
100%|██████████| 679/679 [01:25<00:00,  7.92it/s]
100%|██████████| 5351/5351 [00:07<00:00, 752.48it/s]



**************************************** RESULTS ****************************************
Test Set  - Exact match: 62.08, F1 score: 72.70
WER44 Set - Exact match: 39.06, F1 score: 53.93
WER54 Set - Exact match: 27.85, F1 score: 41.57
