In [15]:
import pandas as pd
import torch
from transformers import BertTokenizer, AutoTokenizer, BertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from google.colab import files

#Loading the Data

In [8]:
trainData = pd.read_json("/content/drive/MyDrive/CoQA Dataset/coqa-train-v1.0.json")
trainData

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."
...,...,...
7194,1,"{'source': 'gutenberg', 'id': '34j10vatjfyw0ao..."
7195,1,"{'source': 'cnn', 'id': '3vj40nv2qinjocrcy7k4z..."
7196,1,"{'source': 'race', 'id': '3rjsc4xj10uw0to3vq0v..."
7197,1,"{'source': 'wikipedia', 'id': '3gs6s824sqxty8v..."


In [9]:
#data cleaing
del trainData['version']

#required columns
cols = ['context', 'question', 'answer']

#list of lists to create our dataframe
comp_list = []
for index, row in trainData.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        temp_list.append(row["data"]["story"])
        temp_list.append(row["data"]["questions"][i]["input_text"])
        temp_list.append(row["data"]["answers"][i]["input_text"])
        comp_list.append(temp_list)

train = pd.DataFrame(comp_list, columns=cols)
train

Unnamed: 0,context,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project
...,...,...,...
108642,(CNN) -- Cristiano Ronaldo provided the perfec...,Who was a sub?,Xabi Alonso
108643,(CNN) -- Cristiano Ronaldo provided the perfec...,Was it his first game this year?,Yes
108644,(CNN) -- Cristiano Ronaldo provided the perfec...,What position did the team reach?,third
108645,(CNN) -- Cristiano Ronaldo provided the perfec...,Who was ahead of them?,Barca.


In [10]:
#function to handle the data and tokenize
class CoQADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        context = str(self.data.loc[index, 'context'])
        question = str(self.data.loc[index, 'question'])
        answer = str(self.data.loc[index, 'answer'])

        # Tokenize the input context and question
        inputs = self.tokenizer(question, context, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

        # Get start and end positions of the answer span
        encoding = self.tokenizer.encode_plus(question, context, return_offsets_mapping=True, max_length=self.max_length, padding='max_length', truncation=True)
        answer_ids = tokenizer.encode(answer)[1:]  # Exclude [CLS] token
        start_positions = end_positions = None

        # Find answer span in input_ids
        for i in range(len(encoding['input_ids']) - len(answer_ids) + 1):
            if encoding['input_ids'][i:i+len(answer_ids)] == answer_ids:
                start_positions = i
                end_positions = i + len(answer_ids) - 1
                break

        # If answer not found, set start and end positions to special tokens [CLS]
        if start_positions is None:
            start_positions = end_positions = 0  # Index of [CLS] token

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_positions),
            'end_positions': torch.tensor(end_positions)
        }

#BERT Model

In [11]:
#Defind model name and initilize the tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

#maximum sequence length
max_length = 512

#preprocessing the data
coqa_dataset = CoQADataset(train.iloc[:100], tokenizer, max_length)

#defining the dataloader
batch_size = 4
train_dataloader = DataLoader(coqa_dataset, batch_size=batch_size, shuffle=True)

#defining the BERT model
model = BertForQuestionAnswering.from_pretrained(model_name)

#initilizing the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader))

#training the model
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        start_positions = batch['start_positions']
        end_positions = batch['end_positions']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(train_dataloader)}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 25/25 [10:41<00:00, 25.67s/it]


Epoch 1, Average Loss: 2.4427449798583982


Epoch 2: 100%|██████████| 25/25 [10:23<00:00, 24.93s/it]


Epoch 2, Average Loss: 0.7286084032058716


Epoch 3: 100%|██████████| 25/25 [10:16<00:00, 24.68s/it]

Epoch 3, Average Loss: 0.731701307296753





In [12]:
#function to predict answer given the context and question
def predict_answer(context, question):
    #tokennize the input
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")

    #prediction
    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    tokenized_input = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    answer_tokens = tokenized_input[start_index : end_index + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    return answer

# Take input from user
context = input("Enter the context: ")
question = input("Enter the question: ")

# Predict the answer
predicted_answer = predict_answer(context, question)

# Print the predicted answer
if predicted_answer.strip() == "[CLS]":
    print("Model could not find an answer to the question in the given context.")
else:
    print("Predicted Answer:", predicted_answer)

Enter the context: Amazon Drops 11 New Movies on Prime Video for Summer Streaming
Enter the question: Number of new movies
Model could not find an answer to the question in the given context.
