# COMP-8730_Project Evaluation
> ## Student Information
> * Name: Jiajie Yang

In [None]:
!pip install transformers


In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
import transformers
import pandas as pd
import numpy as np
import json
import random

#import pandas as pd

> ## Sampling
>> We pseudo-randomly select 500 samples from the JSON file.

In [None]:
# Load data
df_train = pd.read_json('train_data.json')

# Draw the 500-row subset that the output file name promises (the previous
# n=1 produced a single row). The fixed random_state keeps the selection
# reproducible between runs. pandas raises ValueError if the file holds
# fewer than 500 rows.
df_train_subset = df_train.sample(n=500, random_state=42)

# Save the subset to a new JSON file
df_train_subset.to_json('train_data_subset_500.json')
###


In [None]:
# Source file: /content/BioASQ-train-list-7b-full-annotated.json

# input location and number of paragraphs to draw
input_file = '/content/BioASQ-train-list-7b-full-annotated.json'
num_s = 500

# read the full annotated dataset
with open(input_file, 'r') as f:
    data = json.load(f)

# draw num_s paragraphs (without replacement) from the first data entry
sampled_data = random.sample(data['data'][0]['paragraphs'], num_s)

# Rebuild the dataset: every sampled paragraph becomes its own entry under
# 'data', and each question keeps only its id, question and answers fields.
output_data = {
    'version': data['version'],
    'data': [
        {
            'paragraphs': [
                {
                    'context': paragraph['context'],
                    'qas': [
                        {
                            'id': qa['id'],
                            'question': qa['question'],
                            'answers': qa['answers'],
                        }
                        for qa in paragraph['qas']
                    ],
                }
            ]
        }
        for paragraph in sampled_data
    ],
}

# write the sampled dataset, pretty-printed
output_file = 'BioASQ-train-list-7b-sampled.json'
with open(output_file, 'w') as f:
    json.dump(output_data, f, indent=4)


In [None]:
# Load the train/test tables and turn them into NumPy arrays of token ids,
# attention masks and labels.
# NOTE(review): `path` is not assigned in any cell visible above this one, so
# this line raises NameError as written -- confirm which file was intended
# (the full 'train_data.json' or the saved subset).
df_train = pd.read_json(path)
df_test = pd.read_json('test_data.json')
# Tokenize the 'text' column of each table with truncation and padding enabled.
# NOTE(review): assumes both tables have 'text' and 'labels' columns -- verify
# against the JSON files.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(df_train['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(df_test['text']), truncation=True, padding=True)

# Convert the encodings and labels to NumPy arrays (model inputs, not a model).
# NOTE(review): none of these arrays is read by the later cells visible here.
train_inputs = np.array(train_encodings['input_ids'])
train_masks = np.array(train_encodings['attention_mask'])
train_labels = np.array(df_train['labels'])
test_inputs = np.array(test_encodings['input_ids'])
test_masks = np.array(test_encodings['attention_mask'])
test_labels = np.array(df_test['labels'])
###


> ## Producing results
>> We apply the model on the sampled JSON file

In [None]:
# load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# BERT's maximum sequence length; every example is padded/truncated to it
MAX_LENGTH = 512


def _answer_token_span(question, context, answer_start, answer_text):
    """Convert a character-level answer span into token indices.

    The model's start/end targets are positions in the token sequence
    ``[CLS] question [SEP] context [SEP]``, not character offsets into the
    context. The answer's first token therefore sits after the question
    tokens, the two special tokens, and the tokens of the context text that
    precedes the answer.

    Args:
        question: question string.
        context: context string that contains the answer.
        answer_start: character offset of the answer inside ``context``.
        answer_text: answer string.

    Returns:
        ``(start, end)`` inclusive token indices, or ``None`` when the answer
        tokenizes to nothing or lies in the part of the context that
        truncation to ``MAX_LENGTH`` removes.

    Note:
        The mapping is approximate when ``answer_start`` falls inside a word,
        because the prefix is tokenized on its own.
    """
    answer_len = len(tokenizer.tokenize(answer_text))
    if answer_len == 0:
        return None
    context_offset = len(tokenizer.tokenize(question)) + 2  # [CLS] + first [SEP]
    start = context_offset + len(tokenizer.tokenize(context[:answer_start]))
    end = start + answer_len - 1
    # index MAX_LENGTH - 1 holds the closing [SEP] once the context is truncated
    if end > MAX_LENGTH - 2:
        return None
    return start, end


# create training dataset (only the first listed answer of each question is used)
training_data = []
for paragraph in sampled_data:
    context = paragraph['context']
    for qa in paragraph['qas']:
        if not qa['answers']:
            continue  # no answer to supervise on
        first_answer = qa['answers'][0]
        training_data.append({
            'context': context,
            'question': qa['question'],
            'answer_text': first_answer['text'],
            'answer_start': first_answer['answer_start'],
        })

# tokenization
input_ids = []
attention_masks = []
token_type_ids = []
start_positions = []
end_positions = []
for example in training_data:
    span = _answer_token_span(
        example['question'],
        example['context'],
        example['answer_start'],
        example['answer_text'])
    if span is None:
        continue  # answer unusable or lost to truncation
    encoded_dict = tokenizer.encode_plus(
        example['question'],
        example['context'],
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation='only_second',  # shorten the context, never the question
        return_tensors='pt',
        return_attention_mask=True,
        return_token_type_ids=True)
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])
    start_positions.append(span[0])
    end_positions.append(span[1])

if not input_ids:
    raise ValueError('no usable training examples were produced from sampled_data')

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
token_type_ids = torch.cat(token_type_ids, dim=0)
start_positions = torch.tensor(start_positions, dtype=torch.long)
end_positions = torch.tensor(end_positions, dtype=torch.long)

# create optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# fine-tuning: one optimizer step per example (batch size 1)
avg_loss_history = []  # one entry per epoch, read by the plotting cell
for epoch in range(num_epochs):
    model.train()
    losses = []
    examples = zip(input_ids, attention_masks, token_type_ids,
                   start_positions, end_positions)
    for input_id, attention_mask, token_type_id, start_pos, end_pos in examples:
        outputs = model(
            input_ids=input_id.unsqueeze(0).to(device),
            attention_mask=attention_mask.unsqueeze(0).to(device),
            token_type_ids=token_type_id.unsqueeze(0).to(device),
            start_positions=start_pos.unsqueeze(0).to(device),
            end_positions=end_pos.unsqueeze(0).to(device))
        loss = outputs.loss
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    avg_loss = sum(losses) / len(losses)
    avg_loss_history.append(avg_loss)
    print(avg_loss)

# save the model together with its tokenizer so both can be reloaded from 'model_1'
model.save_pretrained('model_1')
tokenizer.save_pretrained('model_1')


> ## Testing
>> We apply the fine-tuned model to a test question

In [None]:
question = "List symptoms of the IFAP syndrome."
context = "Ichthyosis follicularis with atrichia and photophobia (IFAP) syndrome in two unrelated female patients. The IFAP syndrome is characterized by the congenital onset of ichthyosis follicularis, absence of hair, and photophobia. A limited number of patients with the disorder have been described, and X-linked recessive inheritance has been proposed. Two unrelated female patients with a complete IFAP syndrome are reported. Both patients show a diffuse distribution of the disorder without linear arrangement. Because the suggested X-linked recessive pattern of inheritance is unlikely in these patients, a different way of transmission or, alternatively, genetic heterogeneity of the disorder has to be considered."

# Encode the question/context pair; only the context is truncated so the
# input can never exceed BERT's 512-token limit.
input_dict = tokenizer.encode_plus(
    question,
    context,
    truncation='only_second',
    max_length=512,
    return_tensors='pt')
input_ids = input_dict['input_ids'].to(device)
token_type_ids = input_dict['token_type_ids'].to(device)
attention_mask = input_dict['attention_mask'].to(device)

# The model is still in training mode after fine-tuning; switch to eval mode
# so dropout is disabled, and skip gradient tracking for inference.
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Most likely start and end token; the answer is the inclusive span between
# them (empty when the predicted end precedes the predicted start).
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)
answer_tokens = input_ids[0][start_index:end_index+1]
answer_tokens = tokenizer.convert_ids_to_tokens(answer_tokens)
answer = tokenizer.convert_tokens_to_string(answer_tokens)

print('Question:', question)
print('Answer:', answer)


> ## Evaluation
>> We will first evaluate our model above and then compare with other simpler models and naive models

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics import accuracy_score, f1_score, average_precision_score
import matplotlib.pyplot as plt

# choose the device up front so the reloaded model can be placed on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# reload model-1 and its tokenizer from the saved directory
model = AutoModelForQuestionAnswering.from_pretrained('/content/model_1')
tokenizer = AutoTokenizer.from_pretrained('/content/model_1')
model = model.to(device)

# read the evaluation data
# NOTE(review): this file name differs from the
# 'BioASQ-train-list-7b-sampled.json' written by the sampling cell -- confirm
# which file is intended.
with open('/content/BioASQ-train-list-7b-full-annotated-sampled.json', 'r') as f:
    data = json.loads(f.read())

# evaluation
def evaluate_model_1(data):
    """Score the module-level QA ``model`` on every question/document pair.

    For each query in ``data['queries']`` and each of its documents, the
    document title and abstract are joined into a context, the model predicts
    an answer span, and the decoded span is compared with the document's
    ``exactAnswer``.

    Args:
        data: mapping with a ``'queries'`` list. Each query has a
            ``'question'`` and a ``'documents'`` list whose items provide
            ``'title'``, ``'abstractText'`` and ``'exactAnswer'``.

    Returns:
        Tuple ``(accuracy, f1, map_score)`` computed over all pairs.
    """
    true_answers = []
    predicted_answers = []
    # Inference only: make sure dropout is off and no autograd graph is built.
    model.eval()
    with torch.no_grad():
        for query in data['queries']:
            question = query['question']
            for document in query['documents']:
                context = document['title'] + ' ' + document['abstractText']
                true_answer = document['exactAnswer']
                # Truncate only the context so long abstracts cannot exceed
                # BERT's 512-token limit.
                input_dict = tokenizer.encode_plus(
                    question,
                    context,
                    truncation='only_second',
                    max_length=512,
                    return_tensors='pt')
                input_ids = input_dict['input_ids'].to(device)
                token_type_ids = input_dict['token_type_ids'].to(device)
                attention_mask = input_dict['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                # most likely start/end token; inclusive span between them
                start_index = torch.argmax(outputs.start_logits)
                end_index = torch.argmax(outputs.end_logits)
                answer_tokens = input_ids[0][start_index:end_index+1]
                answer_tokens = tokenizer.convert_ids_to_tokens(answer_tokens)
                predicted_answer = tokenizer.convert_tokens_to_string(answer_tokens)
                true_answers.append(true_answer)
                predicted_answers.append(predicted_answer)
    # Answers are compared as whole strings, i.e. exact match.
    accuracy = accuracy_score(true_answers, predicted_answers)
    f1 = f1_score(true_answers, predicted_answers, average='weighted')
    # NOTE(review): average_precision_score is documented for binary relevance
    # labels and numeric scores; passing answer strings is expected to raise.
    # Decide how MAP should be defined for this task before relying on it.
    map_score = average_precision_score(true_answers, predicted_answers)
    return accuracy, f1, map_score

# run the evaluation and report each metric on its own line
accuracy, f1, map_score = evaluate_model_1(data)
for metric_label, metric_value in (
        ('Accuracy:', accuracy),
        ('F1-score:', f1),
        ('MAP:', map_score)):
    print(metric_label, metric_value)


def _plot_history(axis, history, label):
    """Draw one per-epoch curve on `axis`; `label` is both y-label and title."""
    epochs = range(1, len(history) + 1)
    axis.plot(epochs, history)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(label)
    axis.set_title(label)


# 2x2 grid of per-epoch curves, flattened so panels can be indexed 0..3
fig, axes_grid = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
accf_s = axes_grid.ravel()

# NOTE(review): avg_loss_history is only created by the fine-tuning cell if it
# records one; accuracy_history, f1_score_history and map_score_history are
# not defined in the cells visible here -- confirm where they come from.
_plot_history(accf_s[0], avg_loss_history, 'Training Loss')
_plot_history(accf_s[1], accuracy_history, 'Accuracy')
_plot_history(accf_s[2], f1_score_history, 'F1-score')
_plot_history(accf_s[3], map_score_history, 'MAP score')

fig.tight_layout()
plt.show()

