In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [3]:
# Download the CoQA v1.0 training split from its Stanford-hosted URL and preview it.
coqa_url = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
coqa = pd.read_json(coqa_url)
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."




In [4]:
import altair as alt
from google.colab import autoviz
# Fetch a DataFrame from google.colab's autoviz registry by its string id.
# NOTE(review): presumably this is the frame displayed by an earlier cell and the cell was
# auto-generated by Colab — confirm; it needs the google.colab package to run.
df_8375871047186156634 = autoviz.get_registered_df('df_8375871047186156634')

def value_plot(df, y, sort_ascending=False, width=100, height=50):
  """Draw a small Altair line chart of column ``y`` against row position.

  Args:
    df: DataFrame holding the column to plot.
    y: Name of the column plotted on the vertical axis; also used as the chart title.
    sort_ascending: When true, sort the rows by ``y`` (ascending) before plotting.
    width: Chart width passed to ``properties``.
    height: Chart height passed to ``properties``.

  Returns:
    The configured ``alt.Chart``.
  """
  if sort_ascending:
    df = df.sort_values(y).reset_index(drop=True)
  # reset_index() materialises the row position as an 'index' column for the x axis.
  # The vertical channel must be described with alt.Y (the original used alt.X here).
  return (alt.Chart(df.reset_index()).mark_line()
          .encode(x=alt.X('index', title=''), y=alt.Y(y, title='value'))
          .properties(width=width, height=height, title=y))

# Plot the 'index' column of the registered frame with the default size and ordering.
chart = value_plot(df_8375871047186156634, 'index')
chart

In [5]:
# Remove the "version" column by rebinding to a new frame instead of deleting in place:
# with errors="ignore" the cell can be re-run without raising KeyError once the column is gone.
coqa = coqa.drop(columns=["version"], errors="ignore")

In [7]:
# Column layout of the flattened question/answer table.
cols = ["text", "question", "answer"]

# One [story, question, answer] row for every question of every entry in coqa["data"];
# the answer is taken from the same position as its question.
comp_list = [
    [
        entry["story"],
        entry["questions"][turn]["input_text"],
        entry["answers"][turn]["input_text"],
    ]
    for entry in coqa["data"]
    for turn in range(len(entry["questions"]))
]

new_df = pd.DataFrame(comp_list, columns=cols)
# Save the table to CSV (without the index column) for later loading.
new_df.to_csv("CoQA_data.csv", index=False)

In [8]:
# Load the flattened question/answer table from CoQA_data.csv and preview the first rows.
data = pd.read_csv("CoQA_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [9]:
# Row count of the flattened table, i.e. one row per question/answer pair.
print("Number of question and answers: ", data.shape[0])

Number of question and answers:  108647


In [10]:
# The model weights and the tokenizer vocabulary are loaded from the same checkpoint name.
checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [11]:
# Draw one random row label and pull out that row's question and passage.
random_num = np.random.randint(0, len(data))
question = data.loc[random_num, "question"]
text = data.loc[random_num, "text"]

In [12]:
# Encode the question and the passage together as one sequence pair.
# The row is drawn at random and some passages are long: anything beyond the model's
# position-embedding limit would make the forward pass fail, so trim the passage
# (never the question) to fit.
input_ids = tokenizer.encode(
    question,
    text,
    truncation="only_second",
    max_length=model.config.max_position_embeddings,
)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 353 tokens.


In [13]:
# Map the ids back to their word-piece strings and list each piece next to its id.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
# The loop variable is deliberately not named `id`: at notebook top level that would
# shadow the built-in id() for every cell executed afterwards.
for token, token_id in zip(tokens, input_ids):
  print('{:8}{:8,}'.format(token, token_id))

[CLS]        101
how        2,129
did        2,106
del        3,972
##ancy    11,656
feel       2,514
afterward   9,707
?          1,029
[SEP]        102
(          1,006
cnn       13,229
)          1,007
-          1,011
-          1,011
a          1,037
florida    3,516
judge      3,648
'          1,005
s          1,055
ruling     6,996
wednesday   9,317
will       2,097
allow      3,499
a          1,037
foreign    3,097
-          1,011
born       2,141
high       2,152
school     2,082
basketball   3,455
player     2,447
who        2,040
was        2,001
ruled      5,451
ineligible  22,023
and        1,998
his        2,010
team       2,136
to         2,000
compete    5,566
in         1,999
the        1,996
playoffs   7,555
,          1,010
even       2,130
though     2,295
they       2,027
could      2,071
ultimately   4,821
be         2,022
stripped  10,040
of         1,997
any        2,151
title      2,516
they       2,027
win        2,663
.          1,012
miami      5,631
-     

In [14]:
# Position of the first [SEP] token, which closes the question segment.
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)

# Segment A (question) covers positions 0..sep_idx inclusive, hence sep_idx + 1 tokens.
num_seg_a = 1 + sep_idx
print("Number of tokens in segment A: ", num_seg_a)

# Every remaining token belongs to segment B (text).
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)

# Segment id per token: 0 up to and including the first [SEP], 1 afterwards.
segment_ids = [0 if pos <= sep_idx else 1 for pos in range(len(input_ids))]
# Every input token must have exactly one segment id.
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  344


In [15]:
# Feed the token ids together with the segment ids that separate question from text.
# This is inference only, so run under no_grad to avoid building an autograd graph.
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

In [16]:
# Positions of the tokens with the highest start and end scores.
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)

if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    # No valid span: make that explicit so the final print cannot hit an undefined
    # `answer` (NameError) or silently reuse a stale value from an earlier run.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
How did delancy feel afterward?

Answer:
Very happy and relieved.


In [18]:
    def question_answer(question, text):
        """Print the answer span the model extracts from ``text`` for ``question``.

        Uses the notebook-level ``tokenizer`` and ``model``. The answer is the run of
        tokens between the highest-scoring start and end positions, with "##"
        word-piece continuations glued back onto the preceding piece.

        Args:
            question: The question string.
            text: The passage to search for the answer.

        Returns:
            None. The predicted answer, or a fallback message when no usable span is
            found, is printed.
        """
        #tokenize question and text as a pair; trim the text (never the question) so the
        #pair fits the model's position limit instead of failing in the forward pass
        input_ids = tokenizer.encode(
            question,
            text,
            truncation="only_second",
            max_length=model.config.max_position_embeddings,
        )

        #string version of tokenized ids
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        #segment IDs: 0 up to and including the first [SEP] (question), 1 for the rest (text)
        sep_idx = input_ids.index(tokenizer.sep_token_id)
        num_seg_a = sep_idx + 1
        num_seg_b = len(input_ids) - num_seg_a
        segment_ids = [0]*num_seg_a + [1]*num_seg_b
        assert len(segment_ids) == len(input_ids)

        #model output using input_ids and segment_ids; inference only, so skip autograd
        with torch.no_grad():
            output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

        #reconstructing the answer
        answer_start = int(torch.argmax(output.start_logits))
        answer_end = int(torch.argmax(output.end_logits))

        not_found = "Unable to find the answer to your question."
        if answer_end >= answer_start:
            answer = tokens[answer_start]
            for piece in tokens[answer_start + 1:answer_end + 1]:
                if piece.startswith("##"):
                    answer += piece[2:]
                else:
                    answer += " " + piece
            #a span that begins at [CLS] is treated as "no answer"
            if answer.startswith("[CLS]"):
                answer = not_found
        else:
            #end before start is not a valid span; previously `answer` stayed unbound here
            answer = not_found

        print("\nPredicted answer:\n{}".format(answer.capitalize()))

In [24]:
# Hand-written sample passage used to try out question_answer().
text = """ The appellant having been convicted under Section 80 of
the Karnataka Police Act, 1963 (for short, ‘the 1963 Act’) has filed the
present appeal.
Notice in the appeal was issued on 27.02.2023
limited to the extent of consideration as to whether the appellant can
be granted benefit of probation.

The brief facts of the case are that FIR dated 16.8.2007 was
registered against 24 accused persons including the appellant under
sections 79 and 80 of the 1963 Act as they were found to be indulging
in gambling.
The charge sheet was filed and the Trial Court vide order
dated 21.8.2007 convicted them under Section 79 & 80 of the 1963 Act
and sentenced them to undergo imprisonment for a period of one year
each under both the provisions along with a fine of ₹ 600/- after the
accused had pleaded guilty. At that stage, the accused filed affidavits before the Trial Court undertaking that they will not commit such
offence in future and taking note of that, the Trial Court sentenced the
accused to imprisonment till the rising of the Court."""

# Question to ask about the passage above.
question = "How many accused were involved in this case"

In [27]:
# question_answer() prints the prediction itself and has no return value, so call it
# directly; wrapping the call in print() only adds a stray "None" line to the output.
question_answer(question, text)


Predicted answer:
24
None


In [29]:
    # Interactive session: read one passage, then answer questions about it until the
    # user declines to ask another.
    text = input("Please enter your text: \n")
    question = input("\nPlease enter your question: \n")
    while True:
        question_answer(question, text)

        # Keep prompting until the reply starts with Y or N. The reply is stripped and
        # upper-cased first, so lowercase answers are accepted and an empty reply just
        # re-prompts instead of raising IndexError on response[0].
        while True:
            response = input("\nDo you want to ask another question based on this text (Y/N)? ")
            response = response.strip().upper()
            if response.startswith(("Y", "N")):
                break

        if response.startswith("N"):
            print("\nBye!")
            break

        question = input("\nPlease enter your question: \n")

Please enter your text: 
The appellant having been convicted under Section 80 of the Karnataka Police Act, 1963 (for short, ‘the 1963 Act’) has filed the present appeal. Notice in the appeal was issued on 27.02.2023 limited to the extent of consideration as to whether the appellant can be granted benefit of probation.  The brief facts of the case are that FIR dated 16.8.2007 was registered against 24 accused persons including the appellant under sections 79 and 80 of the 1963 Act as they were found to be indulging in gambling. The charge sheet was filed and the Trial Court vide order dated 21.8.2007 convicted them under Section 79 & 80 of the 1963 Act and sentenced them to undergo imprisonment for a period of one year each under both the provisions along with a fine of ₹ 600/- after the accused had pleaded guilty. At that stage, the accused filed affidavits before the Trial Court undertaking that they will not commit such offence in future and taking note of that, the Trial Court sente