In [None]:
!pip install torch



In [None]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
device = torch.device("cpu")  # inference runs on the CPU; pass "cuda" instead to use a GPU


In [None]:
import os

# Show the kernel's current working directory.
print(os.getcwd())

/home/jovyan/workspace


In [None]:
# Location of the SQuAD v2.0 validation file
path = Path('/home/jovyan/workspace/squad/dev-v2.0.json')

# Parse the whole JSON document
with path.open('rb') as fh:
    squad_dict = json.load(fh)

# Three parallel lists with one entry per gold answer
texts, queries, answers = [], [], []

for article in squad_dict['data']:
    for paragraph in article['paragraphs']:
        passage_text = paragraph['context']
        for qa in paragraph['qas']:
            query = qa['question']
            gold_answers = qa['answers']
            repeats = len(gold_answers)
            # Context and question are repeated once per gold answer;
            # a question with an empty answer list contributes no rows.
            texts.extend([passage_text] * repeats)
            queries.extend([query] * repeats)
            answers.extend(gold_answers)

val_texts, val_queries, val_answers = texts, queries, answers
len(val_texts)

20302

In [None]:
# Add an exclusive 'answer_end' character offset to every answer dict (in place).
# The annotated 'answer_start' can be one or two characters too far right, so the
# annotated position is tried first, then the positions shifted left by 1 and 2.
unaligned = 0

for answer, text in zip(val_answers, val_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    answer_len = len(real_answer)

    for shift in (0, 1, 2):
        candidate_start = start_idx - shift
        # Skip negative starts: a negative slice index would wrap around to the
        # end of the context instead of meaning "before the beginning".
        if candidate_start >= 0 and text[candidate_start:candidate_start + answer_len] == real_answer:
            answer['answer_start'] = candidate_start
            answer['answer_end'] = candidate_start + answer_len
            break
    else:
        # No alignment found. Keep the annotated start and derive the end from it
        # so that later cells reading answer['answer_end'] do not hit a KeyError.
        answer['answer_end'] = start_idx + answer_len
        unaligned += 1

# Only report when something is wrong, so a clean dataset produces no output.
if unaligned:
    print(f"{unaligned} answers could not be aligned with their context")

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer

# Tokenizer published under the 'deepset/roberta-base-squad2' name
tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')

# One question/context pair to try the tokenizer on
question = "Why is model conversion important?"
context = "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."

# Encode the pair as PyTorch tensors (truncating if needed) ...
encoded = tokenizer(question, context, truncation=True, return_tensors='pt')

# ... and place every tensor on the selected device, keyed by field name
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Load the fine-tuned question-answering model from a local directory
# (only the model is loaded here; the tokenizer comes from an earlier cell).
model_path = "/home/jovyan/V1_model"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
# Move the model to the selected device. As the last expression of the cell,
# this also makes the notebook display the model's layer summary.
model.to(device)



RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [None]:
# Tokenize every (context, question) pair -- the context is passed first --
# with truncation and padding enabled.
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers, max_length=None):
    """Attach start/end token labels for every answer to ``encodings``.

    The character offsets stored in each answer are mapped to token indices
    with ``encodings.char_to_token`` and written back through
    ``encodings.update`` under ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(row, char_index)``
            and ``update(dict)``. Row ``i`` must belong to ``answers[i]``.
        answers: Sequence of dicts holding integer ``'answer_start'`` and
            ``'answer_end'`` character offsets, where ``'answer_end'`` is
            exclusive (one past the last answer character).
        max_length: Token index stored when an offset cannot be mapped
            (``char_to_token`` returned ``None``). Defaults to
            ``tokenizer.model_max_length`` of the notebook-level tokenizer.

    Returns:
        None. ``encodings`` is updated in place and the number of answers
        whose end could not be mapped is printed.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    unmapped_ends = 0

    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['answer_start'])

        # 'answer_end' is exclusive, so the last answer character sits at
        # answer_end - 1. Looking up that character yields the token that
        # contains the end of the answer. Looking up answer_end itself would
        # return the *following* token whenever the answer is immediately
        # followed by another character such as punctuation.
        last_char = max(answer['answer_end'] - 1, 0)
        end_token = encodings.char_to_token(i, last_char)

        # None means the character could not be mapped to a token
        # (the original code treats this as a truncated answer passage).
        if start_token is None:
            start_token = max_length
        if end_token is None:
            unmapped_ends += 1
            end_token = max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    print(unmapped_ends)

    # Update the data in dictionary
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


# Adds 'start_positions' / 'end_positions' to val_encodings in place and prints
# how many answers had an end position that could not be mapped to a token.
add_token_positions(val_encodings, val_answers)

16


In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings.

    ``encodings`` is a mapping from field name to a per-example sequence
    (tokenizer output or a plain dict). It must contain ``'input_ids'``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` field."""
        # Key lookup matches the mapping-style access used in __getitem__ and,
        # unlike attribute access, also works when encodings is a plain dict.
        return len(self.encodings['input_ids'])

# Wrap the validation encodings so they can be indexed one example at a time.
val_dataset = SquadDataset(val_encodings)

In [None]:
# Validation data is iterated in dataset order: shuffling adds nothing to an
# evaluation pass and, with no seed set, would make the batch order differ
# from run to run.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# Evaluation setup. No optimizer is actually created in this cell.
# NOTE(review): `optim`, `GradScaler` and `autocast` are not used by any cell shown here.
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast

epochs = 1  # NOTE(review): not read by any cell shown here

model.to(device)  # make sure the model sits on the selected device
# Wrap the model and tokenizer in a question-answering pipeline.
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline



# Define the question answering pipeline
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Collect one prediction per (context, question, gold answer) row.
pred = []  # predicted answer strings
ans = []   # gold answer strings, aligned with `pred`

# The loading cell emits one row per gold answer, so a question with several
# gold answers repeats the same question/context pair. Cache the prediction
# per pair so the model runs only once for each distinct pair.
prediction_cache = {}

total = 0
for total, (x, y, z) in enumerate(zip(val_texts, val_queries, val_answers), start=1):
    cache_key = (y, x)
    if cache_key not in prediction_cache:
        # Raw strings go straight into the pipeline; no manual tensor or
        # device handling is needed here.
        prediction_cache[cache_key] = nlp(question=y, context=x)['answer']

    pred.append(prediction_cache[cache_key])  # prediction
    ans.append(z['text'])                     # reference

    # Progress report: number of rows still to process, every 1000 rows.
    if total % 1000 == 0:
        print(len(val_texts) - total)


# Score the predictions.
accuracy = 0     # rows where prediction and reference overlap as substrings
exact_match = 0  # rows where prediction and reference are identical

for prediction, answer in zip(pred, ans):
    if prediction == answer:
        exact_match += 1
        accuracy += 1
    elif prediction and answer and (prediction in answer or answer in prediction):
        # Both strings must be non-empty: '' is a substring of every string,
        # so an empty prediction would otherwise always count as a hit.
        accuracy += 1

# Convert counts to ratios; an empty evaluation set yields 0.0 instead of
# raising ZeroDivisionError.
n_scored = len(pred)
accuracy = accuracy / n_scored if n_scored else 0.0
exact_match = exact_match / n_scored if n_scored else 0.0

print("Accuracy:", accuracy)
print("Exact Match:", exact_match)

19302
18302
17302
16302
15302
14302
13302
12302
11302
10302
9302
8302
7302
6302
5302
4302
3302
2302
1302
302
Accuracy: 0.8867106688996158
Exact Match: 0.5978721308245493


In [None]:
# Recompute the metrics from the stored predictions.
# The counters must be reset first: after the evaluation cell, `accuracy` and
# `exact_match` already hold ratios, and adding counts on top of them skews
# the result.
accuracy = 0     # rows where prediction and reference overlap as substrings
exact_match = 0  # rows where prediction and reference are identical

for prediction, answer in zip(pred, ans):
    if prediction == answer:
        exact_match += 1
        accuracy += 1
    elif prediction and answer and (prediction in answer or answer in prediction):
        # Both strings must be non-empty: '' is a substring of every string,
        # so an empty prediction would otherwise always count as a hit.
        accuracy += 1

# Convert counts to ratios; an empty evaluation set yields 0.0 instead of
# raising ZeroDivisionError.
n_scored = len(pred)
accuracy = accuracy / n_scored if n_scored else 0.0
exact_match = exact_match / n_scored if n_scored else 0.0

print("Accuracy:", accuracy)
print("Exact Match:", exact_match)

Accuracy: 0.8867543449250763
Exact Match: 0.5979015797522818
