**Installing necessary packages**

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q simpletransformers datasets sacrebleu evaluate torch accelerate tqdm

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8

**Importing necessary libraries**

In [2]:
import numpy as np
import pandas as pd
import re, torch, collections, evaluate, datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler, pipeline

**Reading the dataset**

In [3]:
# Load the question/context/answer table from the mounted Drive folder.
df = pd.read_csv ('/content/drive/MyDrive/qa/qa_dataset.csv')
# Show the first row as a quick sanity check of the columns.
df.head(1)

Unnamed: 0,question,context,answer
0,When was the Vat formally opened?,"The Vatican Apostolic Library (), more commonl...",It was formally established in 1475


**Checking for missing values**

In [4]:
# Count the missing values in every column.
df.isnull().sum()

question    0
context     0
answer      0
dtype: int64

**No missing values**

**Looking for and removing duplicates**

In [5]:
# Report the frame's shape before and after dropping fully duplicated rows
# (the first occurrence of each duplicated row is the one that is kept).
print(df.shape)
df = df.drop_duplicates(keep='first')
print(df.shape)

(29157, 3)
(29147, 3)


**There were 10 duplicate rows**

**Resetting index**

In [6]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [7]:
# Preview the first four rows after de-duplication.
df.head(4)

Unnamed: 0,question,context,answer
0,When was the Vat formally opened?,"The Vatican Apostolic Library (), more commonl...",It was formally established in 1475
1,what is the library for?,"The Vatican Apostolic Library (), more commonl...",research
2,for what subjects?,"The Vatican Apostolic Library (), more commonl...","history, and law"
3,and?,"The Vatican Apostolic Library (), more commonl...","philosophy, science and theology"


**Creating a unique `id` column for each example**

In [8]:
# Give every row a unique integer id. The range is derived from the frame's
# current length rather than hard-coded, so the cell keeps working when the
# number of rows left after de-duplication changes.
df['id'] = np.arange(len(df))

**Defining a function to find the answer start indices**

In [9]:
def find_answer_start(context, answer):
    """Return the character offset of the first literal occurrence of `answer` in `context`.

    The match is exact (case-sensitive, no pattern syntax). Returns -1 when
    `answer` does not occur in `context`.
    """
    return context.find(answer)

**Applying the function to each row in the dataframe**

In [10]:
# Locate each answer inside its own context; -1 marks answers that do not occur verbatim.
df['answer_start'] = [
    find_answer_start(context_text, answer_text)
    for context_text, answer_text in zip(df['context'], df['answer'])
]

**Checking the entries for which the answer was not found in the context**

In [11]:
# Rows whose answer text does not appear verbatim in the context.
a = df.loc[df['answer_start'].eq(-1)]
print(a.shape)

(9748, 5)


**Removing the entries for which the answer was not found in the context**

In [12]:
# Keep only the rows whose answer occurs in the context. The explicit .copy()
# makes the result an independent frame, so the columns assigned to it later
# are written to this frame itself rather than to a slice of the original
# (which pandas flags with SettingWithCopyWarning).
df = df[df['answer_start'] != -1].copy()

**Converting the 'answer' and 'answer_start' columns to the list format required by the model and saving them in separate columns**

In [13]:
# Wrap every answer text and every start offset in a single-element list.
answer_lists = [[answer_text] for answer_text in df['answer'].tolist()]
answer_start_lists = [[start_offset] for start_offset in df['answer_start'].tolist()]

# Store the wrapped values next to the original columns.
df['answer_new'] = answer_lists
df['answer_start_new'] = answer_start_lists

In [14]:
# Preview the frame with the new list-valued columns.
df.head(3)

Unnamed: 0,question,context,answer,id,answer_start,answer_new,answer_start_new
1,what is the library for?,"The Vatican Apostolic Library (), more commonl...",research,1,478,[research],[478]
3,and?,"The Vatican Apostolic Library (), more commonl...","philosophy, science and theology",3,513,"[philosophy, science and theology]",[513]
6,how many?,"The Vatican Apostolic Library (), more commonl...",five,6,1116,[five],[1116]


**Dropping the previously present 'answer' and 'answer_start' columns**

In [15]:
# The scalar columns are superseded by their list-valued counterparts.
df = df.drop(columns=['answer', 'answer_start'])

In [16]:
# Confirm that only the list-valued answer columns remain.
df.head(1)

Unnamed: 0,question,context,id,answer_new,answer_start_new
1,what is the library for?,"The Vatican Apostolic Library (), more commonl...",1,[research],[478]


**Renaming columns**

In [17]:
# Give the list-valued columns their final names.
df = df.rename(
    {'answer_new': 'answer', 'answer_start_new': 'answer_start'},
    axis='columns',
)
df.head(1)

Unnamed: 0,question,context,id,answer,answer_start
1,what is the library for?,"The Vatican Apostolic Library (), more commonl...",1,[research],[478]


**Making train, test and validation splits**

In [18]:
# Hold out 20% of the rows for testing ...
train_old, test = train_test_split(df, random_state=6, test_size=0.2)

# ... then hold out 20% of what is left for validation.
train, validation = train_test_split(train_old, random_state=6, test_size=0.2)

**Checking the shapes of train, test and validation sets**

In [19]:
# Print the (rows, columns) shape of each split on one line.
print(*(split.shape for split in (train, validation, test)))

(12415, 5) (3104, 5) (3880, 5)


**Converting the individual datasets to Arrow format for some preprocessing**

In [20]:
# Convert every pandas split into a Hugging Face Dataset.
train, test, validation = (
    Dataset.from_pandas(frame) for frame in (train, test, validation)
)

train

Dataset({
    features: ['question', 'context', 'id', 'answer', 'answer_start', '__index_level_0__'],
    num_rows: 12415
})

**Removing the newly made index column**

In [21]:
# Drop the '__index_level_0__' column from every split.
train, test, validation = (
    split.remove_columns(['__index_level_0__'])
    for split in (train, test, validation)
)

train

Dataset({
    features: ['question', 'context', 'id', 'answer', 'answer_start'],
    num_rows: 12415
})

**Converting the answers to appropriate dictionary format**

In [22]:
def _with_answers_dict(example):
    """Return the example's id/context/question plus a nested `answers` dict.

    The `answers` dict holds the example's `answer` list under `text` and its
    `answer_start` list under `answer_start`.
    """
    return {
        'id': example['id'],
        'context': example['context'],
        'question': example['question'],
        'answers': {
            'text': example['answer'],
            'answer_start': example['answer_start'],
        },
    }


train = train.map(_with_answers_dict)

validation = validation.map(_with_answers_dict)

test = test.map(_with_answers_dict)

Map:   0%|          | 0/12415 [00:00<?, ? examples/s]

Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

Map:   0%|          | 0/3880 [00:00<?, ? examples/s]

**Removing 'answer' and 'answer_start' columns since we no longer need them**

In [23]:
# The flat columns are now duplicated inside 'answers', so drop them from every split.
train, test, validation = (
    split.remove_columns(['answer', 'answer_start'])
    for split in (train, test, validation)
)

train

Dataset({
    features: ['question', 'context', 'id', 'answers'],
    num_rows: 12415
})

**Combining the 3 datasets (in the Arrow format) into a single dataset**

In [24]:
# Bundle the three splits under their conventional names.
ds = DatasetDict(
    {
        'train': train,
        'test': test,
        'validation': validation,
    }
)

ds

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'id', 'answers'],
        num_rows: 12415
    })
    test: Dataset({
        features: ['question', 'context', 'id', 'answers'],
        num_rows: 3880
    })
    validation: Dataset({
        features: ['question', 'context', 'id', 'answers'],
        num_rows: 3104
    })
})

**Initiating the model and tokenizer**

In [25]:
# Base checkpoint that is fine-tuned below; the tokenizer is loaded from the same checkpoint.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

**Inserting special tokens in the context and question columns of train dataset**

In [26]:
# Take the first training example and encode its question/context pair.
context, question = (ds["train"][0][field] for field in ("context", "question"))

inputs = tokenizer(question, context)
# Decoding the ids shows the special tokens the tokenizer inserted.
tokenizer.decode(inputs["input_ids"])

Token indices sequence length is longer than the specified maximum sequence length for this model (824 > 512). Running this sequence through the model will result in indexing errors


'[CLS] grow up where? [SEP] Below are reviews for three books and two book series. Each has been read and loved by students across the country. The Outsiders This book, first published in 1967. has become a classic for teens across the nation. It focuses on Ponyboy, who has been labeled all his life as a greaser. The greaser\'s opposing group is the " socs ". kids who have lots of money and can break any rules without getting in trouble. As the novel develops, S. E. Hinton allows the reader to see exactly how these labels affect teens in both the greaser and the soc group. If you\'ve ever watched the movie The Outsiders, this story may sound familiar, as the movie was based on the book. The Outsiders gives teens a look into life in the 50\'s and 60\'s, offering timeless lessons that still apply to today\'s youth. Out of the Dust Any student interested in the Great Depression and the Dust Bowl should read Out of the Dust by Karen Hesse. Hesse is able to capture the mood and spirit of th

**Limiting the input to 100 tokens, using a sliding window with a stride of 50 tokens and returning the overflowing tokens**

In [27]:
# Re-encode the same pair, this time splitting the long context into several features.
inputs = tokenizer(
    question,
    context,
    max_length=100,  # cap on the number of tokens per feature
    truncation="only_second",  # truncate only the context, never the question
    stride=50,  # tokens shared by consecutive windows
    return_overflowing_tokens=True,  # keep the truncated remainder as extra features
)

# Decode every feature to see how the windows overlap.
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))

[CLS] grow up where? [SEP] Below are reviews for three books and two book series. Each has been read and loved by students across the country. The Outsiders This book, first published in 1967. has become a classic for teens across the nation. It focuses on Ponyboy, who has been labeled all his life as a greaser. The greaser's opposing group is the " socs ". kids who have lots of money and can break any rules without getting in trouble [SEP]
[CLS] grow up where? [SEP]. It focuses on Ponyboy, who has been labeled all his life as a greaser. The greaser's opposing group is the " socs ". kids who have lots of money and can break any rules without getting in trouble. As the novel develops, S. E. Hinton allows the reader to see exactly how these labels affect teens in both the greaser and the soc group. If you've ever watched the movie [SEP]
[CLS] grow up where? [SEP] break any rules without getting in trouble. As the novel develops, S. E. Hinton allows the reader to see exactly how these lab

**Returning the offset mapping so that each token can be mapped back to its character span in the context**

In [28]:
# Same windowed encoding as before, additionally requesting the offset mapping.
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
# List the fields the tokenizer returned.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

**Creating a function for preprocessing training dataset**

In [29]:
# Token cap per feature; passed to the tokenizer as max_length by the preprocessing functions.
max_length = 384
# Passed to the tokenizer as stride (tokens shared by consecutive context windows).
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of examples and label each feature with answer token positions.

    Reads the module-level `tokenizer`, `max_length` and `stride`.

    Args:
        examples: batch with "question", "context" and "answers" entries, where
            each answer holds "text" and "answer_start" lists (only the first
            element of each list is used).

    Returns:
        The tokenizer output, without "offset_mapping" and
        "overflow_to_sample_mapping", extended with "start_positions" and
        "end_positions" (one entry per feature). Features whose context window
        does not fully contain the answer are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    # Long contexts are split into several overlapping features; only the
    # context (second sequence) is ever truncated.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Both mappings are only needed for labelling, so they are removed from the output.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Feature i was cut from example sample_map[i].
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span of the answer inside the full context.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Walk forward past every token starting at or before start_char;
            # the token just before the stop point is the answer's first token.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward past every token ending at or after end_char;
            # the token just after the stop point is the answer's last token.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

**Applying the function to training dataset**

In [30]:
# Tokenize and label the training split in batches. The original columns are
# removed because one example can yield several features.
train_dataset = ds["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=ds["train"].column_names,
)
# Compare the number of examples with the number of features produced.
len(ds["train"]), len(train_dataset)

Map:   0%|          | 0/12415 [00:00<?, ? examples/s]

(12415, 17451)

**Creating a function for preprocessing validation dataset**

In [31]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of examples for evaluation.

    Reads the module-level `tokenizer`, `max_length` and `stride`.

    Returns the tokenizer output (without "overflow_to_sample_mapping") where
    every "offset_mapping" entry that does not belong to the context is
    replaced by None, plus an "example_id" list giving, for each feature, the
    id of the example it was cut from.
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_sample = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    feature_count = len(inputs["input_ids"])
    for feature_pos in range(feature_count):
        # Remember which example this feature came from.
        source_idx = feature_to_sample[feature_pos]
        example_ids.append(examples["id"][source_idx])

        # Keep offsets only for context tokens (sequence id 1); blank out the rest.
        seq_ids = inputs.sequence_ids(feature_pos)
        masked_offsets = []
        for token_pos, span in enumerate(inputs["offset_mapping"][feature_pos]):
            masked_offsets.append(span if seq_ids[token_pos] == 1 else None)
        inputs["offset_mapping"][feature_pos] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs

**Applying the function to validation dataset**

In [32]:
# Tokenize the validation split in batches; the original columns are removed
# because one example can yield several features.
validation_dataset = ds["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds["validation"].column_names,
)
# Compare the number of examples with the number of features produced.
len(ds["validation"]), len(validation_dataset)

Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

(3104, 4326)

**Using a default model for the QA pipeline to generate some predictions on a small part of the validation set**

In [33]:
# Work on the first 100 validation examples only.
small_eval_set = ds["validation"].select(range(100))
# Checkpoint of an already fine-tuned QA model, used here to try out the post-processing.
trained_checkpoint = "distilbert-base-cased-distilled-squad"

# NOTE: this rebinds the global `tokenizer`, which preprocess_validation_examples
# reads, so the map below tokenizes with this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=ds["validation"].column_names,)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

**Initializing the tokenizer again**

In [34]:
# Point the global `tokenizer` back at the tokenizer of `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

**Removing the columns of validation dataset that are not expected by the model**

In [35]:
# Keep only the columns that are fed to the model and expose them as torch tensors.
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = {
    column: eval_set_for_model[column].to(device)
    for column in eval_set_for_model.column_names
}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)
trained_model = trained_model.to(device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = trained_model(**batch)

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

**Converting the predictions to numpy arrays**

In [36]:
# Move both logit tensors to the CPU and convert them to numpy arrays.
start_logits, end_logits = (
    logits.cpu().numpy() for logits in (outputs.start_logits, outputs.end_logits)
)

**Mapping each example in small validation dataset to the corresponding features in validation dataset**

In [37]:
# Group feature indices by the id of the example they were cut from.
# A defaultdict is used so that looking up an id without features yields an empty list.
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

**Picking the best answer**

In [38]:
# Number of top-scoring start/end positions considered per feature.
n_best = 20
# Maximum answer length, in tokens.
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    # Collect candidate spans from every feature cut from this example.
    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        # Fetch only this feature's row (as compute_metrics does) instead of
        # materialising the whole offset_mapping column for every feature.
        offsets = eval_set[feature_index]["offset_mapping"]

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # Select the answer with the best score. When no candidate span survived the
    # filters, fall back to an empty prediction (as compute_metrics does)
    # instead of letting max() raise ValueError on an empty list.
    if len(answers) > 0:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append(
            {"id": example_id, "prediction_text": best_answer["text"]}
        )
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

**Loading the evaluation metric**

In [39]:
# Load the "squad" metric; compute_metrics reads this module-level object.
metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [40]:
# Reference answers for the small evaluation set, keyed by example id.
theoretical_answers = [
    dict(id=ex["id"], answers=ex["answers"])
    for ex in small_eval_set
]

**Defining the function to compute metrics**

In [41]:
def compute_metrics(start_logits, end_logits, features, examples):
    """Turn per-feature logits into one predicted answer per example and score them.

    Reads the module-level `n_best`, `max_answer_length` and `metric`.

    Args:
        start_logits: per-feature start logits, indexable by feature position.
        end_logits: per-feature end logits, indexable by feature position.
        features: tokenized features providing "example_id" and "offset_mapping".
        examples: original examples providing "id", "context" and "answers".

    Returns:
        The result of `metric.compute` on the predictions and the reference answers.
    """
    # Group feature positions by the example they were cut from.
    features_by_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        features_by_example[feature["example_id"]].append(feature_pos)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Gather candidate spans from every feature of this example.
        for feature_pos in features_by_example[example_id]:
            start_scores = start_logits[feature_pos]
            end_scores = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            # Indices of the n_best highest logits, best first.
            top_starts = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                for end_index in top_ends:
                    # Both ends of the span must lie inside the context.
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # The span must be non-empty and at most max_answer_length tokens long.
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_scores[start_index] + end_scores[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string when there is none.
        if candidates:
            best_candidate = max(candidates, key=lambda cand: cand["logit_score"])
            prediction_text = best_candidate["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

**Converting train and validation datasets to torch format and preparing train and validation dataloader**

In [42]:
# Expose the training features as torch tensors.
train_dataset.set_format("torch")
# Drop the columns that are not fed to the model before building the evaluation loader.
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=15,
)
# Evaluation keeps the dataset order and feeds one feature at a time.
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=1)

**Initializing the model and adam optimizer**

In [43]:
# Load the checkpoint with a question-answering head and optimize all of its parameters with AdamW.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Setting configurations for accelerator**

In [44]:
# Hand the model, optimizer and both dataloaders to accelerate; the prepared
# objects replace the originals under the same names.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

**Setting training configurations**

In [45]:
num_train_epochs = 3
# One optimizer update per batch, so updates per epoch equals the number of batches.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule without warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,)

**Defining model name and output directory**

In [46]:
# NOTE(review): model_name is not referenced by any cell visible in this notebook — confirm it is still needed.
model_name = "qa_model"
# Drive folder that receives the fine-tuned model and tokenizer.
output_dir = "/content/drive/MyDrive/qa/model2/"

**Model training**

In [47]:
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Save the model and the tokenizer to output_dir at the end of every epoch
    # (each epoch writes to the same directory).
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/3492 [00:00<?, ?it/s]

In [48]:
# Context of one held-out test example, used below to try the fine-tuned model by hand.
test['context'][141]

'Can you write down a Chinese word being read to you? Sometimes it can be difficult when you find the word is not included in most dictionaries. Lu Jialei, 14, from Hangzhou Foreign Language School, won the CCTV Chinese Character Spelling Contest on Oct 18th, 2013. About 160 people competed. The host read a word to the contestant, explained its meaning and gave sample sentences. The contestant was required to write down the word. Winning the contest was a "surprise" to Lu. "I was not the smartest one," she said. "But I paid attention to details." To prepare for the contest, she and her teammates studied the Modern Chinese Dictionary for 10 days. There are more than 56,000 entries including characters, words and phrases in it. She also had a secret weapon. She studied how Chinese characters were formed. "When others paid attention to the plot of a story, she looked at how authors use words and sentences to express themselves," said Su Yunsheng, Lu\'s Chinese teacher. Su is happy to see 

In [49]:
# Question of the same test example.
test['question'][141]

'How many people entered?'

In [50]:
# Reference answer text of the same test example.
test['answers'][141]['text']

['About 160']

**Taking custom inputs from user and finding answers by the trained model**

In [51]:
# NOTE(review): this rebinds `model_checkpoint` (previously the base checkpoint
# name) to the fine-tuned model directory, so re-running earlier cells that read
# it would load the fine-tuned files instead. The path repeats the `output_dir` literal.
model_checkpoint = "/content/drive/MyDrive/qa/model2/"
# Build a question-answering pipeline from the saved directory.
question_answerer = pipeline("question-answering", model=model_checkpoint)

# Read a context and a question interactively and print the predicted answer text.
context = input ("Enter the context: ")
question = input ('Enter the question: ')
answer = question_answerer(question=question, context=context)['answer']
print (answer)

Enter the context: Can you write down a Chinese word being read to you? Sometimes it can be difficult when you find the word is not included in most dictionaries. Lu Jialei, 14, from Hangzhou Foreign Language School, won the CCTV Chinese Character Spelling Contest on Oct 18th, 2013. About 160 people competed. The host read a word to the contestant, explained its meaning and gave sample sentences. The contestant was required to write down the word. Winning the contest was a "surprise" to Lu. "I was not the smartest one," she said. "But I paid attention to details." To prepare for the contest, she and her teammates studied the Modern Chinese Dictionary for 10 days. There are more than 56,000 entries including characters, words and phrases in it. She also had a secret weapon. She studied how Chinese characters were formed. "When others paid attention to the plot of a story, she looked at how authors use words and sentences to express themselves," said Su Yunsheng, Lu's Chinese teacher. Su