<a href="https://colab.research.google.com/github/kushal-h/Missing-informations-in-News-article/blob/main/Training_model/QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `transformers` package; the import cell below pulls the BERT tokenizer and QA model from it.
!pip install transformers

In [None]:
import pandas as pd
import io
from transformers import BertTokenizerFast
import json
from pathlib import Path
from transformers import BertForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [None]:
# Colab-specific helper used to bring the dataset files into the runtime.
from google.colab import files
 
 
# The recorded run of this cell saved train.json and test.json, which later cells read by name.
uploaded = files.upload()

Saving test.json to test.json
Saving train.json to train.json


**TRAINING THE MODEL**

In [None]:
def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    The file is expected to hold ``{'data': [{'paragraphs': [{'context': ...,
    'qas': [{'question': ..., 'answers': [...]}]}]}]}``. One row is produced
    per answer, so a question with several answers repeats its context and
    question text, and a question with no answers contributes nothing.

    Args:
        path: Location of the JSON file (string or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists. The
        answer entries are the objects parsed from the file, not copies.
    """
    with open(Path(path), 'rb') as handle:
        squad = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                question_text = qa_entry['question']
                gold_answers = list(qa_entry['answers'])
                # Repeat the context/question once per answer so the three lists stay aligned.
                contexts += [passage_text] * len(gold_answers)
                questions += [question_text] * len(gold_answers)
                answers += gold_answers

    return contexts, questions, answers

# Flatten both uploaded files into parallel (context, question, answer) lists.
# test.json is used as the validation split for the rest of the notebook.
train_contexts, train_questions, train_answers = read_squad('train.json')
val_contexts, val_questions, val_answers = read_squad('test.json')

def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character index to each answer, in place.

    Each answer dict must provide ``'text'`` and ``'answer_start'``. The
    annotated start is checked against the context; because annotations are
    sometimes off by a character or two, offsets 1 and 2 characters earlier are
    tried as well, and the nearest matching offset wins. On a match both
    ``'answer_start'`` and ``'answer_end'`` are written so that
    ``context[answer_start:answer_end] == text``.

    If no offset matches, ``'answer_start'`` is left untouched and
    ``'answer_end'`` is set to ``answer_start + len(text)`` so that every
    answer always carries the key (downstream code indexes it unconditionally).

    Args:
        answers: List of answer dicts; mutated in place.
        contexts: List of context strings, aligned with ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        matched_shift = None
        for shift in (0, 1, 2):
            # Skip shifts that would make the slice start negative: Python would
            # wrap around and could "match" text at the far end of the context.
            if shift > start_idx:
                continue
            if context[start_idx - shift:end_idx - shift] == gold_text:
                # Stop at the nearest match so a later shift cannot overwrite it.
                matched_shift = shift
                break

        if matched_shift is None:
            # No alignment found: keep the annotated span length so the key exists.
            answer['answer_end'] = end_idx
        else:
            # The annotation is off by `matched_shift` characters (0 means exact).
            answer['answer_start'] = start_idx - matched_shift
            answer['answer_end'] = end_idx - matched_shift

# Attach 'answer_end' to the answer dicts of both splits (the lists are mutated in place).
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)


# The encodings produced by this tokenizer expose char_to_token, which
# add_token_positions uses to turn character offsets into token indices.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Encode each example as a (context, question) pair, context first, with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans into token-level start/end labels.

    For every example ``i`` the answer's ``'answer_start'`` and ``'answer_end'``
    character offsets are mapped to token indices with
    ``encodings.char_to_token`` and stored on ``encodings`` under
    ``'start_positions'`` and ``'end_positions'``.

    ``'answer_end'`` is treated as exclusive (as written by ``add_end_idx``), so
    the end token is looked up from the last answer character,
    ``answer_end - 1``. Looking up ``answer_end`` itself can resolve to the
    token that follows the answer, e.g. trailing punctuation.

    Args:
        encodings: Tokenizer output providing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; mutated in place.
        answers: Answer dicts aligned with the encoded examples, each holding
            ``'answer_start'`` and ``'answer_end'``.
        max_length: Label assigned to both positions when the answer start has
            no token (e.g. it was truncated away). Defaults to the module-level
            ``tokenizer.model_max_length``.

    Returns:
        None.
    """
    if max_length is None:
        # Resolved lazily so callers that pass max_length do not need the global tokenizer.
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['answer_start'])
        if start_token is None:
            # The answer start is not in the encoded window: mark the whole span
            # with the out-of-range label instead of pairing it with an
            # unrelated end token.
            start_positions.append(max_length)
            end_positions.append(max_length)
            continue

        # Walk back from the last answer character until a character that maps
        # to a token is found. The walk is bounded by answer_start, which is
        # known to map to a token, so it always terminates.
        end_token = None
        char_idx = answer['answer_end'] - 1
        while end_token is None and char_idx >= answer['answer_start']:
            end_token = encodings.char_to_token(i, char_idx)
            char_idx -= 1
        if end_token is None:
            # Empty span (answer_end <= answer_start): collapse onto the start token.
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add token-level start/end labels to both splits (the encodings are mutated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
# Print a per-field size summary instead of the full encodings: dumping every
# token id of every example floods the cell output and hides the narrative.
print({key: len(values) for key, values in val_encodings.items()})

import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset wrapper exposing tokenizer encodings one example at a time.

    ``encodings`` must offer ``items()`` yielding ``(field, per-example
    sequence)`` pairs and an ``input_ids`` attribute whose length is the number
    of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One row of input ids per example.
        input_rows = self.encodings.input_ids
        return len(input_rows)

    def __getitem__(self, idx):
        # Build a tensor per field for the requested example.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap both encoded splits so a DataLoader can index them example by example.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


# Start from the pretrained 'bert-base-uncased' checkpoint; the recorded load
# warning in this notebook reports that some QA weights are newly initialized.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')



# pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model (also re-applied at the start of every epoch below)
model.train()
# AdamW optimizer over all model parameters; only the learning rate is set
# explicitly here, every other hyperparameter is left at the library default
optim = AdamW(model.parameters(), lr=5e-5)

# data loader for the training split: batches of 16, reshuffled each epoch
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# fixed schedule of 100 passes over the training data
for epoch in range(100):
    # set model to train mode
    model.train()
    # wrap the loader in tqdm for a per-epoch progress bar
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # reset the gradients accumulated during the previous step
        optim.zero_grad()
        # move the tensors needed for this step onto the training device
        # NOTE(review): only input_ids and attention_mask are fed to the model;
        # if the tokenizer also emits token_type_ids for the (context, question)
        # pair they are unused here -- confirm this is intended (the evaluation
        # cell calls the model the same way).
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the gold positions makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # the loss is the first element of the outputs
        loss = outputs[0]
        # backpropagate: compute gradients of the loss for the trainable parameters
        loss.backward()
        # update parameters
        optim.step()
        # show the current epoch and batch loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# Persist the fine-tuned model and the tokenizer side by side so the
# evaluation cell can reload the model from `model_path`.
model_path = 'bert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

{'input_ids': [[101, 2383, 25292, 19357, 2140, 6638, 2030, 1523, 5023, 6638, 1524, 2003, 3383, 2028, 1997, 1996, 2087, 2691, 11785, 2111, 2227, 13367, 1012, 2429, 2000, 2740, 8519, 1010, 2009, 2003, 3303, 3952, 2011, 2048, 5876, 1024, 15330, 2152, 1011, 10250, 10050, 2063, 8381, 1998, 3768, 1997, 3558, 4023, 1012, 7059, 9712, 1010, 7378, 28268, 2873, 1998, 8543, 1997, 1996, 27467, 2850, 8738, 19428, 2008, 25292, 19357, 2140, 6638, 2064, 7461, 2256, 4230, 2011, 5155, 12141, 1998, 20752, 2029, 2064, 2022, 4795, 1010, 1998, 2004, 1037, 2765, 2173, 2149, 1523, 2012, 3020, 3891, 2005, 3809, 2740, 3314, 1012, 1524, 2007, 2008, 1999, 2568, 1010, 2017, 2467, 2215, 2000, 2191, 2469, 2008, 2017, 1521, 2128, 2206, 1037, 2092, 1011, 12042, 8738, 2007, 2307, 28268, 3643, 1998, 9992, 10250, 10050, 2063, 9742, 12278, 1012, 2021, 2054, 2785, 1997, 2833, 3599, 5260, 2000, 25292, 19357, 2140, 6638, 1029, 2057, 2584, 2041, 2000, 2740, 8519, 1517, 2164, 13012, 9153, 2190, 1010, 5601, 1010, 16428, 1010, 25

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

('bert-custom/tokenizer_config.json',
 'bert-custom/special_tokens_map.json',
 'bert-custom/vocab.txt',
 'bert-custom/added_tokens.json',
 'bert-custom/tokenizer.json')

**TESTING THE MODEL**

In [None]:
# Reload the fine-tuned weights saved under `model_path` and move them to the active device.
model = BertForQuestionAnswering.from_pretrained(model_path)
model.to(device)
# switch model out of training mode
model.eval()

# Validation batches are read in dataset order (no shuffling).
val_loader = DataLoader(val_dataset, batch_size=16)

# Tally correct start/end predictions over the whole validation set. Counting
# per example (rather than averaging per-batch ratios) keeps the score unbiased
# when the last batch is smaller than the others.
num_correct = 0
num_scored = 0

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # most likely start/end token index for every example in the batch
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # start and end positions are scored independently
        num_correct += (start_pred == start_true).sum().item()
        num_correct += (end_pred == end_true).sum().item()
        num_scored += start_true.numel() + end_true.numel()

# fraction of start/end positions predicted exactly; NaN if the loader was empty
acc = num_correct / num_scored if num_scored else float('nan')
print(acc)

# Show true vs. predicted positions for the final batch as a qualitative sample.
print("T/F\tstart\tend\n")
if num_scored:
    for i in range(len(start_true)):
        print(f"true\t{start_true[i]}\t{end_true[i]}\n"
              f"pred\t{start_pred[i]}\t{end_pred[i]}\n")


100%|██████████| 1/1 [00:00<00:00,  2.02it/s]

0.1071428619325161
T/F	start	end

true	235	236
pred	235	159

true	40	45
pred	289	76

true	121	137
pred	83	93

true	275	289
pred	44	46

true	101	123
pred	118	6

true	188	191
pred	147	152

true	239	246
pred	21	70

true	87	96
pred	404	131

true	140	146
pred	105	325

true	49	53
pred	49	20

true	90	94
pred	10	10

true	261	276
pred	382	397

true	119	120
pred	13	15

true	11	18
pred	27	18




