In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

## Read CSV data

In [2]:
# Load the CSV and keep every column after the first (selected by position),
# then show the resulting frame as the cell output.
data = pd.read_csv('data.csv').iloc[:, 1:]
data

Unnamed: 0,Header,Paragraph
0,Change in glycaemic control with structured di...,"\nBMC Health Services Research\nvolume 23, Art..."
1,Background,"In high-resource settings, structured diabetes..."
2,Aim,"To compare, structured diabetes self-managemen..."
3,Design,Single-blind randomised parallel comparator co...
4,Results,Recruitment: 22nd until 29th January 2021.We r...
5,Conclusion,"In low-resource settings, diabetes self-manage..."
6,Study design and approval,"A multicentre, parallel-group, single-blind ra..."
7,Ethical approval,Ethical approval was provided by the Ghana Hea...
8,Study participants and study setting,Eligibility criteria included aged 18 years or...
9,Randomisation and masking,Participants were randomly assigned either to ...


## Question answering using BERT QA fine-tuned on SQuAD

In [3]:
# The QA model and its tokenizer must come from the same checkpoint,
# so the checkpoint name is spelled once and shared by both loaders.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [4]:
def question_answer(question, text):
    """Print the answer span the QA model extracts from `text` for `question`.

    Uses the notebook-level `tokenizer` and `model` objects.

    Parameters
    ----------
    question : str
        The question to ask.
    text : str
        The passage to search for the answer. If the question plus passage
        exceed the model's maximum sequence length, the passage is truncated,
        so an answer located near the end of a very long passage can be missed.

    Returns
    -------
    None
        The predicted answer (or a fallback message when no valid span is
        predicted) is printed, not returned.
    """
    no_answer = "Unable to find the answer to your question."

    # Tokenize question and text as a pair. The tokenizer also builds the
    # segment (token type) IDs: 0 for the question part, 1 for the text part.
    # The model cannot embed positions past config.max_position_embeddings, so
    # trim the passage (never the question) instead of crashing on long input.
    encoding = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )

    # String version of the token IDs, used to rebuild the answer text.
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(**encoding)

    # Most likely start and end positions of the answer span.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    if answer_end >= answer_start:
        # Join the span's tokens with spaces, then glue "##" word-piece
        # continuations back onto the preceding token.
        answer = " ".join(tokens[answer_start:answer_end + 1]).replace(" ##", "")
    else:
        # End predicted before start: there is no valid span. Previously
        # `answer` was left unassigned here, which raised UnboundLocalError.
        answer = no_answer

    # A span that starts on [CLS] means the model found no answer.
    if answer.startswith("[CLS]"):
        answer = no_answer

    print("\nPredicted answer:\n{}".format(answer.capitalize()))

In [5]:
# Use the paragraph stored under row label 14 as the passage for the demo below.
text = data['Paragraph'].loc[14]
text

'At endline, HbA1c decreased within both groups:-0·9% in the intervention group and -0·3% in the control group. Although this decrease was greater in the intervention group than in the control group, the difference between groups was not significant (Table 2). The primary outcome failed to reach significance. There was insufficient evidence that the intervention had an effect on HbA1c (Supplementary Figs.\xa01 and 2).'

In [6]:
# Read a question from the user and print the model's answer for the selected passage.
question = input("Please enter your question: ")
question_answer(question=question, text=text)

Please enter your question: In which group did HbA1c levels decrease more?

Predicted answer:
Intervention group


## Compiled function: ask a question about every paragraph

In [None]:
def _predict_answer(question, text):
    """Return the answer span the QA model extracts from `text` for `question`.

    Uses the notebook-level `tokenizer` and `model` objects. Returns a
    fallback message when the model predicts no valid span.
    """
    no_answer = "Unable to find the answer to your question."

    # Tokenize question and text as a pair. The tokenizer also builds the
    # segment (token type) IDs: 0 for the question part, 1 for the text part.
    # The model cannot embed positions past config.max_position_embeddings, so
    # trim the passage (never the question) instead of crashing on long input.
    encoding = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )

    # String version of the token IDs, used to rebuild the answer text.
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(**encoding)

    # Most likely start and end positions of the answer span.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    # End predicted before start: there is no valid span to rebuild.
    if answer_end < answer_start:
        return no_answer

    # Join the span's tokens with spaces, then glue "##" word-piece
    # continuations back onto the preceding token.
    answer = " ".join(tokens[answer_start:answer_end + 1]).replace(" ##", "")

    # A span that starts on [CLS] means the model found no answer.
    if answer.startswith("[CLS]"):
        return no_answer
    return answer


def question_answer(data):
    """Interactively ask one question per paragraph in `data` and print each answer.

    For every value in the 'Paragraph' column, in row order, the paragraph is
    printed, the user is prompted for a question, and the model's predicted
    answer is printed.

    NOTE(review): this definition reuses the name of the earlier two-argument
    `question_answer(question, text)`; once this cell runs, that earlier
    function is shadowed and cells calling it with two arguments will fail.
    Consider renaming one of them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Paragraph' column holding the passages.

    Returns
    -------
    None
        All results are printed.
    """
    # Iterate the column values directly so the loop does not depend on the
    # frame having a 0..n-1 index, and no loop variable is shared with the
    # answer-reconstruction logic.
    for text in data['Paragraph']:
        print("-------------Text------------------")
        print(text)
        print("-----------------------------------")
        question = input("Please enter the question")

        # Each row gets a freshly computed answer, so a failed prediction can
        # no longer reuse the previous row's answer.
        answer = _predict_answer(question, text)

        print("\nPredicted answer:\n{}\n".format(answer.capitalize()))