In [1]:
import numpy as np
np.__version__

'1.21.3'

In [1]:
from datasets import load_dataset
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, pipeline
import json
from torch.nn.utils.rnn import pad_sequence



class SquadDS(torch.utils.data.Dataset):

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


def unpack_dataset():
    train_ds = load_dataset('squad', split='train')
    test_ds = load_dataset('squad', split='validation')
    df1 = pd.DataFrame(train_ds)
    df2 = pd.DataFrame(test_ds)

    faulty_df1 = [3275, 3276, 11308, 49094, 50763, 57586, 57587, 57588, 57589, 57590, 60024, 60027, 66282]

    faulty_df2 = [4145, 4146, 4264, 4269, 4282, 4283, 4851, 4852, 4853]

    df1 = df1.drop(faulty_df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.drop(faulty_df2)
    df2 = df2.reset_index(drop=True)

    train_questions = df1.question.tolist()
    test_questions = df2.question.tolist()

    train_context = df1.context.tolist()
    test_context = df2.context.tolist()

    train_answers = df1.answers.tolist()
    test_answers = df2.answers.tolist()

    return train_questions, train_context, train_answers, test_questions, test_context, test_answers


def add_answer_end(answers, context):
    for answer, context in zip(answers, context):
        if answer['answer_start'] is None:
            answer['answer_end'] = None
        else:
            if len(answer['text']) > 1:
                temp_end = []
                temp_start = []
                for ans, start in zip(answer['text'], answer['answer_start']):
                    gold = ans
                    end_idx = start + len(gold)
                    if context[start:end_idx] == gold:
                        temp_start.append(start)
                        temp_end.append(end_idx)

                    elif context[start - 1:end_idx - 1] == gold:
                        temp_start.append(start - 1)
                        temp_end.append(end_idx - 1)

                    elif context[start - 2:end_idx - 2] == gold:
                        temp_start.append(start - 2)
                        temp_end.append(end_idx - 2)

                answer['answer_end'] = temp_end
                answer['answer_start'] = temp_start

            else:
                gold = answer['text'][0]
                start_idx = answer['answer_start']
                end_idx = start_idx[0] + len(gold)
                if context[start_idx[0]:end_idx] == gold:
                    answer['answer_end'] = [end_idx]
                elif context[start_idx[0] - 1:end_idx - 1] == gold:
                    answer['answer_start'] = [start_idx[0] - 1]
                    answer['answer_end'] = [end_idx - 1]  # When the gold label is off by one character
                elif context[start_idx[0] - 2:end_idx - 2] == gold:
                    answer['answer_start'] = [start_idx[0] - 2]
                    answer['answer_end'] = [end_idx - 2]  # When the gold label is off by two characters


def define_token_position(encoding, answers):
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-distilled-squad')
    start_positions = []
    end_positions = []

    for idx in range(len(answers)):
        if len(answers[idx]['answer_start']) > 1:
            temp_st = []
            temp_en = []
            for ans_st in answers[idx]['answer_start']:
                if ans_st is None:
                    temp_st.append(tokenizer.model_max_length - 1)
                else:
                    temp_st.append(encoding.char_to_token(idx, ans_st))
            for ans_en in answers[idx]['answer_end']:
                if ans_en is None:
                    temp_en.append(tokenizer.model_max_length - 1)
                else:
                    temp_en.append(encoding.char_to_token(idx, ans_en - 1))
            start_positions.append(temp_st)
            end_positions.append(temp_en)
            # answers[idx]['answer_start'] = temp_st
            # answers[idx]['answer_end'] = temp_en
        else:

            if answers[idx]['answer_start'] is None:
                # answers[idx]['answer_start'] = tokenizer.model_max_length - 1
                st = tokenizer.model_max_length - 1
            else:
                # answers[idx]['answer_start'] = encoding.char_to_token(idx, answers[idx]['answer_start'][0])
                st = encoding.char_to_token(idx, answers[idx]['answer_start'][0])
            if answers[idx]['answer_end'] is None:
                # answers[idx]['answer_end'] = tokenizer.model_max_length - 1
                en = tokenizer.model_max_length - 1
            else:
                # answers[idx]['answer_end'] = encoding.char_to_token(idx, answers[idx]['answer_end'][0] - 1)
                en = encoding.char_to_token(idx, answers[idx]['answer_end'][0] - 1)
            # if None, the answer passage has been truncated due to words > 512 so setting last position as 511
            start_positions.append(st)
            end_positions.append(en)

    encoding.update({'start_positions': start_positions, 'end_positions': end_positions})


def save_metrics_json(loss_epoch,
                      acc_epoch,
                      f1_epoch,
                      test_acc,
                      avg_test_f1,
                      file_path):
    metrics = {}
    metrics["train"] = {}
    metrics["test"] = {}

    for i in range(len(loss_epoch)):
        metrics["train"][f"epoch{i + 1}"] = {}
        metrics["train"][f"epoch{i + 1}"]["loss"] = loss_epoch[i]
        metrics["train"][f"epoch{i + 1}"]["accuracy"] = acc_epoch[i]
        metrics["train"][f"epoch{i + 1}"]["f1_score"] = f1_epoch[i]
        metrics["test"]["accuracy"] = test_acc
        metrics["test"]["f1_score"] = avg_test_f1
    with open(file_path, 'w') as json_file:
        json.dump(metrics, json_file, indent=4)


def load_model(path):
    """
    Load and return a RobertaForQuestionAnswering model with parameters from the file provided.

    Parameters
    ----------
    path : path to the saved model parameters.

    Returns
    -------
    model : a RobertaForQuestionAnswering model with the parameters from the file provided.

    """
    state_dict = torch.load(path, map_location=torch.device('cuda:2'))

    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", state_dict=state_dict)
    model.eval()
    return model


def f(questions, start):
    outs = []
    for q in questions:
        question, context = q.split("[SEP]")
        d = pmodel.tokenizer(question, context)
        out = pmodel.model.forward(**{k: torch.tensor(d[k]).reshape(1, -1) for k in d})
        logits = out.start_logits if start else out.end_logits
        outs.append(logits.reshape(-1).detach().numpy())
    return outs


#model = load_model('distillbert_done')
# define two predictions, one that outputs the logits for the range start,
#pmodel = pipeline(task='question-answering', model=model,
#                  tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-distilled-squad'))

def f_start(questions):
    return f(questions, True)


def f_end(questions):
    return f(questions, False)


def out_names(inputs):
    question, context = inputs.split("[SEP]")
    d = pmodel.tokenizer(question, context)
    return [pmodel.tokenizer.decode([id]) for id in d["input_ids"]]

#get's the output of true and false items for shap visualizations
def shap_test(tt):
    device = 'cuda:3'  # if torch.cuda.is_available()# else 'cpu'

    # print('Cached:   ', round(torch.cuda.memory_reserved(torch.cuda.current_device)/1024**3,1), 'GB')
    # print('Allocated:', round(torch.cuda.memory_allocated(torch.cuda.current_device)/1024**3,1), 'GB')

    torch.cuda.empty_cache()  # Free up GPU memory

    true = []
    false = []
    model.to(device)
    device = 'cuda:3' if torch.cuda.is_available() else 'cpu'
    input_ids = tt['input_ids'].to(device)
    attention_mask = tt['attention_mask'].to(device)
    start_positions = tt['start_positions'].to(device)
    end_positions = tt['end_positions'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.start_logits
    pred = torch.argmax(logits, dim=1)

    # print(start_positions.shape[0])
    if start_positions.numel() > 1:
        for j in range(start_positions.shape[0]):
            if pred == start_positions[j]:
                return True

        else:
            return False
    else:
        return False



In [2]:
import torch
import tqdm

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm

PATH = 'distillbert_done'


def train(model, train_dataset):
    device = 'cuda:2' if torch.cuda.is_available() else 'cpu'
    save_path = 'model_checkpoint_epoch{}.pt'
    save_interval = 1

    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_epoch = []
    acc_epoch = []
    f1_epoch = []
    model.to(device)

    for epoch in range(6):

        print('Cached:   ', round(torch.cuda.memory_reserved(torch.cuda.current_device) / 1024 ** 3, 1), 'GB')
        print('Allocated:', round(torch.cuda.memory_allocated(torch.cuda.current_device) / 1024 ** 3, 1), 'GB')

        total_loss = 0
        total_correct = 0
        total_pred = 0
        true_labels = []
        predicted_labels = []
        model.train()

        for batch in tqdm(train_dataloader):
            torch.cuda.empty_cache()  # Free up GPU memory

            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions,
                            end_positions=end_positions)

            logits = outputs.start_logits
            pred = torch.argmax(logits, dim=1)
            total_correct += torch.sum(pred == start_positions).item()
            total_pred += len(start_positions)

            loss = outputs[0]
            total_loss += loss.item()

            true_labels.extend(start_positions.tolist())
            predicted_labels.extend(pred.tolist())
            avg_train_f1 = f1_score(true_labels, predicted_labels, average='macro')

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)
        avg_train_acc = total_correct / total_pred
        acc_epoch.append(avg_train_acc)
        loss_epoch.append(avg_train_loss)
        f1_epoch.append(avg_train_f1)
        print("average training loss: {0:.2f}".format(avg_train_loss))
        print("average training accuracy: {0:.2f}".format(avg_train_acc))
        print("average training f1_score: {0:.2f}".format(avg_train_f1))

        if (epoch + 1) % save_interval == 0:
            torch.save(model.state_dict(), save_path.format(epoch + 1))

    torch.save(model.state_dict(), PATH)

    return model, loss_epoch, f1_epoch, acc_epoch


def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    start_positions = [item['start_positions'] for item in batch]
    end_positions = [item['end_positions'] for item in batch]

    input_ids_padded = pad_sequence([x.unsqueeze(0) if x.ndim == 0 else x for x in input_ids], batch_first=True,
                                    padding_value=0)
    attention_mask_padded = pad_sequence([x.unsqueeze(0) if x.ndim == 0 else x for x in attention_mask],
                                         batch_first=True, padding_value=0)
    start_positions_padded = pad_sequence([x.unsqueeze(0) if x.ndim == 0 else x for x in start_positions],
                                          batch_first=True, padding_value=-1)
    end_positions_padded = pad_sequence([x.unsqueeze(0) if x.ndim == 0 else x for x in end_positions], batch_first=True,
                                        padding_value=-1)

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask_padded,
        'start_positions': start_positions_padded,
        'end_positions': end_positions_padded
    }


def test(model, test_dataset):
    device = 'cuda:2' if torch.cuda.is_available() else 'cpu'
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
    total_correct = 0
    total_pred = 0
    predicted_labels = []
    true_labels = []
    model.eval()
    model.to(device)

    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.start_logits
        pred = torch.argmax(logits, dim=1)
        for i in range(len(pred)):
            for j in range(len(start_positions[i])):
                if pred[i] == start_positions[i][j]:
                    total_correct += 1
                    break

        total_pred += len(start_positions)

        true_labels.extend(start_positions[:, 2].tolist())
        predicted_labels.extend(pred.tolist())
        avg_test_f1 = f1_score(true_labels, predicted_labels, average='macro')

    test_acc = total_correct / total_pred
    print("average testing accuracy: {0:.2f}".format(test_acc))
    print("average testing f1_score: {0:.2f}".format(avg_test_f1))

    return avg_test_f1, test_acc


In [3]:
torch.cuda.empty_cache()  # Free up GPU memory

model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-distilled-squad')

train_questions, train_context, train_answers, test_questions, test_context, test_answers = unpack_dataset()
add_answer_end(train_answers, train_context)
add_answer_end(test_answers, test_context)

train_encodings = tokenizer(train_context, train_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_context, test_questions, truncation=True, padding=True)

define_token_position(train_encodings, train_answers)
define_token_position(test_encodings, test_answers)

# -----------#continue training#------------
#saved_state_dict = torch.load('distillbert_done')
#model.load_state_dict(saved_state_dict)
# -----------###################------------

train_dataset = SquadDS(train_encodings)
test_dataset = SquadDS(test_encodings)
model, loss_epoch, f1_epoch, acc_epoch = train(model, train_dataset)
avg_test_f1, test_acc = test(model, test_dataset)
save_metrics_json(loss_epoch, acc_epoch, f1_epoch, test_acc, avg_test_f1, 'metrics_distillbert_uncased.json')


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to

Cached:    0.0 GB
Allocated: 0.0 GB


 40%|████      | 4380/10949 [26:26<42:17,  2.59it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 29%|██▊       | 3138/10949 [18:36<48:01,  2.71it/s]  ]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 10949/10949 [1:12:34<00:00,  2.51it/s]


average training loss: 0.98
average training accuracy: 0.69
average training f1_score: 0.59
Cached:    0.0 GB
Allocated: 0.0 GB


100%|██████████| 10949/10949 [1:12:39<00:00,  2.51it/s]


average training loss: 0.73
average training accuracy: 0.75
average training f1_score: 0.67
Cached:    0.0 GB
Allocated: 0.0 GB


100%|██████████| 10949/10949 [1:12:35<00:00,  2.51it/s]


average training loss: 0.57
average training accuracy: 0.80
average training f1_score: 0.72
Cached:    0.0 GB
Allocated: 0.0 GB


 47%|████▋     | 5142/10949 [31:26<41:22,  2.34it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 10949/10949 [1:12:34<00:00,  2.51it/s]


average training loss: 0.46
average training accuracy: 0.84
average training f1_score: 0.78
Cached:    0.0 GB
Allocated: 0.0 GB


 50%|█████     | 5503/10949 [33:56<36:01,  2.52it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 98%|█████████▊| 10710/10949 [1:10:50<01:47,  2.22it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
###test###


model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-distilled-squad')
#--------/Load the model/--------#
state_dict = torch.load('distillbert_done.pt')#, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
#--------/-------------/--------#

_, _, _, test_questions, test_context, test_answers = unpack_dataset()
add_answer_end(test_answers, test_context)
test_encodings = tokenizer(test_context, test_questions, truncation=True, padding=True)
define_token_position(test_encodings, test_answers)
test_dataset = SquadDS(test_encodings)
avg_test_f1, test_acc = test(model,test_dataset)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

KeyboardInterrupt: 

In [3]:
from torch.nn.utils.rnn import pad_sequence

saved_state_dict = torch.load('distillbert_done')
model.load_state_dict(saved_state_dict)
avg_test_f1, test_acc = test(model,test_dataset)



NameError: name 'model' is not defined

In [7]:
save_metrics_json(loss_epoch, acc_epoch, f1_epoch, test_acc, avg_test_f1, 'metrics1.json')


In [None]:
from transformers import DistilBertTokenizerFast,pipeline
import numpy as np
import shap
import torch

np.bool_ = bool

def load_model(path):
    """
    Load and return a RobertaForQuestionAnswering model with parameters from the file provided.

    Parameters
    ----------
    path : path to the saved model parameters.

    Returns
    -------
    model : a RobertaForQuestionAnswering model with the parameters from the file provided.

    """
    state_dict = torch.load(path, map_location=torch.device('cpu'))

    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", state_dict = state_dict)
    model.eval()
    return model
# load the model

#model = load_model('saved_model_complete')
#tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')# define two predictions, one that outputs the logits for the range start,

#pmodel = pipeline(task='question-answering', model=model, tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad'))
# and the other for the range end
def f(questions, start):
    outs = []
    for q in questions:
        question, context = q.split("[SEP]")
        d = pmodel.tokenizer(question, context)
        out = pmodel.model.forward(**{k: torch.tensor(d[k]).reshape(1, -1) for k in d})
        logits = out.start_logits if start else out.end_logits
        outs.append(logits.reshape(-1).detach().numpy())
    return outs
def f_start(questions):
    return f(questions, True)
def f_end(questions):
    return f(questions, False)

# attach a dynamic output_names property to the models so we can plot the tokens at each output position
def out_names(inputs):
    question, context = inputs.split("[SEP]")
    d = pmodel.tokenizer(question, context)
    return [pmodel.tokenizer.decode([id]) for id in d["input_ids"]]
f_start.output_names = out_names
f_end.output_names = out_names