In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project folder stored on Drive can be reached by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive: the relative paths used later
# (squad_data/..., requirements.txt, model/...) are resolved against it.
%cd /content/drive/MyDrive/qna_package_nlp_final/

/content/drive/MyDrive/qna_package_nlp_final


# Downloading squad_v2 Data

Method 1

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the installer log out of the notebook output.
%pip install -q datasets

In [None]:
# Download (or load from cache) the SQuAD v2 splits from the Hugging Face hub.
# `datasets.load_metric` is intentionally not imported: it is deprecated in
# favour of the separate `evaluate` library and is no longer available in recent
# `datasets` releases, and no metric object is used anywhere in this notebook.
from datasets import load_dataset

raw_datasets = load_dataset("squad_v2")



  0%|          | 0/2 [00:00<?, ?it/s]

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
# The QnA model needs its data files in the JSON layout below: a top-level
# "data" list holding one record per question.
#
# {
#   "data": [
#     {
#       "title": "some text",
#       "context": "some text",
#       "question": "some question",
#       "answers": {
#         "text": [
#           "some text"
#         ],
#         "answer_start": [
#           45
#         ]
#       },
#       "id": "0"
#     },
#     ...
#   ]
# }

### Saving train data in json format

In [None]:
import pandas as pd
import json

In [None]:
%%time
# Materialise the train split as a list of row dicts. Iterating the split
# directly avoids doing one indexed lookup per row.
train_rows = list(raw_datasets['train'])

# A DataFrame built from a list already has a clean 0..n-1 index, so no
# reset_index step is required.
train_df = pd.DataFrame(train_rows)

# Round-trip through pandas' JSON encoder to obtain plain JSON-serialisable
# records, then wrap them under the top-level "data" key expected by QnA.
# Each stage gets its own name instead of re-binding `train_df` to a str,
# a list and finally a dict.
train_records = json.loads(train_df.to_json(orient="records"))
train_payload = {'data': train_records}

with open('squad_data/train_data_squadv2.json', 'w', encoding='utf-8') as f:
    json.dump(train_payload, f, ensure_ascii=False, indent=4)

CPU times: user 15.8 s, sys: 237 ms, total: 16 s
Wall time: 16 s


### Saving validation data in json format

In [None]:
# Build the validation DataFrame in a single pass over the split. Iterating the
# split yields the same row dicts as indexing it row by row, and the resulting
# frame already carries a clean 0..n-1 index, so no reset_index is needed.
val_df = pd.DataFrame(list(raw_datasets['validation']))

In [None]:
# Peek at the first two validation rows to check the available columns.
val_df.head(2)

Unnamed: 0,id,title,context,question,answers
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'Franc..."
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries', 'in the 1..."


Taking the first 500 rows as test data and the remaining rows as validation data.

In [None]:
# Hold-out test split: the first 500 validation rows, serialised in the
# {"data": [record, ...]} layout.
test_rows_json = val_df.iloc[:500].to_json(orient="records")
test_df = {'data': json.loads(test_rows_json)}

with open('squad_data/test_data_squadv2.json', mode='w', encoding='utf-8') as out_file:
    json.dump(test_df, out_file, ensure_ascii=False, indent=4)

In [None]:
# Validation split: every row that is not part of the 500-row test split.
# Slices are end-exclusive, so the test split `val_df[:500]` covers rows 0-499
# and this split has to start at 500; starting at 501 would silently drop
# row 500 from both files.
val_records = json.loads(val_df[500:].to_json(orient="records"))
val_df_final = {'data': val_records}

with open('squad_data/validation_data_squadv2.json', 'w', encoding='utf-8') as f:
    json.dump(val_df_final, f, ensure_ascii=False, indent=4)

# Question - Answering System

In [None]:
# Install the QnA package's dependencies. %pip (rather than !pip) targets the
# environment of the running kernel; -q keeps the installer log out of the output.
%pip install -q -r requirements.txt

In [None]:
from question_answer import QnA



## Initialize the class

Created a package QnA for fine tuning and prediction

In [None]:
# Create the QnA helper (imported from question_answer); its train() and
# predict() methods are used in the cells below.
qna=QnA()

## Fine tuning

Use Colab with GPU accelerators to accelerate the training process

This cell trains the given model on the given dataset and performs the evaluation.

* The pre-trained model from huggingface which needs to be fine tuned on the dataset.

* The user should provide the training file path and, optionally, a validation file path, in JSON/CSV format.

* The user should also provide the output path where all the model results will be saved.

* If the model name is not specified in the function call, the default model (**deepset/tinyroberta-squad2**) is used.

For more information, the user can refer to the following link:

https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering

In [None]:
# Fine-tune deepset/roberta-base-squad2 on the given train/validation files,
# passing 'model' as the output path.
# NOTE(review): the *_500_* files are not written by the export cells in this
# notebook - confirm they already exist in squad_data/.
finetuned_model = qna.train(
    train_data_path='squad_data/train_data_500_squadv2.json',
    output_path='model',
    model_name='deepset/roberta-base-squad2',
    valn_path='squad_data/validation_data_500_squadv2.json',
)

## Prediction using single context

We can use any pre-trained/fine-tuned model from Hugging Face to run inference on the test input.

The user should provide:

* query/question
* context

* If the model name is not specified, the default model (**deepset/tinyroberta-squad2**) is used.

We can pass any kwargs to the predict function (e.g. doc_stride, max_answer_length, ...).

In [None]:
# Sample passage and question for the single-context prediction demo.
# Adjacent string literals inside the parentheses are concatenated by Python.
context = (
    "The US has passed the peak on new coronavirus cases, "
    "President Donald Trump said and predicted that some states would reopen this month. "
    "The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world."
)

question = "What was President Donald Trump's prediction?"

In [None]:
# Answer the sample question with an explicitly chosen model.
# Further keyword arguments can be supplied in the same call, e.g.:
# qna.predict(question=question,context=context,doc_stride=128,max_answer_length=20,learning_rate=3e-5,n_best_size=20)
qna.predict(
    question=question,
    context=context,
    model_name='deepset/roberta-base-squad2',
)

Executed
Model Name: deepset/roberta-base-squad2


[{'question': "What was President Donald Trump's prediction?",
  'context': 'The US has passed the peak on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month. The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world.',
  'predicted_answer': 'some states would reopen this month',
  'model_name': 'deepset/roberta-base-squad2',
  'score': 0.5812742710113525,
  'start': 100,
  'end': 135}]

## Prediction using test data

In [None]:
# Run prediction over the exported test file, passing model/test_prediction
# as the output path.
qna.predict(
    test_path="squad_data/test_data_squadv2.json",
    output_path="model/test_prediction",
    model_name='deepset/roberta-base-squad2',
)