# Installing and Importing Libraries

In [None]:
# Install the Hugging Face `transformers` package (the recorded run resolved it to 4.26.0).
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [None]:
# Install faiss-cpu for the similarity search (the recorded run installed 1.7.3).
# `transformers` is listed again here; the recorded output shows only faiss-cpu being collected.
!pip install transformers faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [None]:
import csv
import requests
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from pprint import pprint
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import torch
import numpy as np
import faiss

# Load the dataset

In [None]:
# load training dataset
def load_data():
    """Download the training CSV and return its rows, header row included.

    Prints a short summary (example counts and a few sample rows) as a side
    effect.

    Returns:
        list[list[str]]: every CSV row as a list of strings. Row 0 is the
        column header; downstream code (`load_theme_wise_data`) skips it.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    import io  # local import: only needed to hand the CSV text to csv.reader

    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(CSV_URL, timeout=60)
        # Fail loudly instead of parsing an HTTP error page as if it were CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    # Feed csv.reader a text stream opened with newline='' rather than
    # str.splitlines(): splitlines() drops line breaks that occur inside
    # quoted fields (shifting character offsets into the paragraph text) and
    # also splits on extra Unicode separators.
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = list(cr)

    # Row 0 is the header, not an example, so keep it out of the statistics.
    examples = train_data[1:]
    print(f"Number of examples = {len(examples)}")
    # Column 4 is the Answer_possible flag, stored as the text 'True'/'False'.
    noans = sum(1 for x in examples if x[4] == 'False')
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in [0, 1000, 1300]:
        # Skip preview rows that do not exist when the file is shorter.
        if i >= len(train_data):
            continue
        print(' | '.join(train_data[i][:2]), ' | ', train_data[i][2][:20] + '...', ' | ', ' | '.join(train_data[i][3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the CSV rows by theme.

    Args:
        train_data: rows as returned by `load_data`; row 0 (the header) is
            skipped. Each remaining row is read as
            [id, theme, paragraph, question, answer fields...].

    Returns:
        dict mapping theme -> {'para': [...], 'ques': [...], 'ans': [...]}.
        'para' holds each distinct paragraph of the theme once, in first-seen
        order. 'ques' and 'ans' are aligned row by row; every 'ans' entry is
        [Para_Number, Answer_possible, Answer_text, Answer_start], where
        Para_Number is the paragraph's position in that theme's 'para' list.

    Also prints the number of themes found.
    """
    theme_wise_data = {}
    # theme -> {paragraph text: position in that theme's 'para' list}.
    # Kept next to the lists so that membership tests and position lookups are
    # O(1) instead of a linear scan (`in` / `.index`) for every row.
    para_positions = {}
    for x in train_data[1:]:
        theme, para, ques = x[1], x[2], x[3]
        if theme not in theme_wise_data:
            theme_wise_data[theme] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme] = {}
        bucket = theme_wise_data[theme]
        positions = para_positions[theme]
        if para not in positions:
            positions[para] = len(bucket['para'])
            bucket['para'].append(para)
        bucket['ques'].append(ques)
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        bucket['ans'].append([positions[para]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
# TF Hub handle of the sentence encoder (per the URL: universal-sentence-encoder-large, version 5).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
# Load the module once; later cells call `model` on lists of question strings
# and convert the result with `.numpy()`.
model = hub.load(module_url)

In [None]:
# faiss constant selecting the inner-product metric.
# NOTE(review): `metric` is not referenced in the cells visible here, and
# get_k_nearest_neighbours builds an IndexFlatL2 — confirm whether this is still needed.
metric = faiss.METRIC_INNER_PRODUCT

In [None]:
# Download the CSV rows, then regroup them per theme into paragraph / question / answer lists.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
# Distance cut-offs swept by the evaluation loop: `search` counts a hit only
# when the nearest neighbour's distance is below the current threshold.
# NOTE(review): the index is a faiss IndexFlatL2, whose reported distances are
# squared L2 per the faiss documentation — confirm the cut-offs are meant on that scale.
thresholds = [0.1, 0.2, 0.3, 0.4]

In [None]:
# Question lists of the four evaluation themes: one copy per theme, plus the
# concatenation that the search index is built from.
td_ques1, td_ques2, td_ques3, td_ques4 = [
    list(theme_wise_data[theme]['ques'])
    for theme in (
        'YouTube',
        'Adolescence',
        'National_Archives_and_Records_Administration',
        'Hindu_philosophy',
    )
]
td_ques = td_ques1 + td_ques2 + td_ques3 + td_ques4

# Matching answer records, in the same theme order as the questions.
td_ans1, td_ans2, td_ans3, td_ans4 = [
    list(theme_wise_data[theme]['ans'])
    for theme in (
        'YouTube',
        'Adolescence',
        'National_Archives_and_Records_Administration',
        'Hindu_philosophy',
    )
]
# Element 2 of an answer record is the Answer_text string (e.g. "['Richard Taruskin']");
# the [2:-2] slice drops its first two and last two characters.
td_ans = [record[2][2:-2] for record in td_ans1 + td_ans2 + td_ans3 + td_ans4]

# [question, answer] pairs, and the same pairs keyed by their position.
td = [[question, answer] for question, answer in zip(td_ques, td_ans)]
new_td = {position: [pair[0], pair[1]] for position, pair in enumerate(td)}

In [None]:
# Display the questions collected for the 'Adolescence' theme.
td_ques2

['A surge in hormone production triggers a number of physical changes during what stage of life?',
 'Which part of the body releases testosterone in males?',
 'Which part of the body relesases estrogen in females?',
 'What is the transitional period between childhood and adulthood viewed as?',
 'What transitions occur during puberty in addition to living circumstances?',
 "How do a person's heart and lungs change during puberty?",
 'Increased size and capacity of the heart and lungs result in what changes to the body?',
 'Which sex tends to have more red blood cells than the other?',
 "Which body system are a person's lungs a major proponent of?",
 "Is a person's brain fully developed by the time they reach puberty?",
 'The brain reaches what percentage of its adult size by the time a person is six years old?',
 'The biggest changes in the brain during puberty occur in the parts of the cortex that process what kinds of information?',
 'What parts of the brain continue to become more co

In [None]:
# Download the evaluation sheet from Google Drive by file id; in the recorded
# run gdown saved it as 'Question Generation - Sheet1 (1).csv', which is the
# file read_csv opens below.
!gdown 1li7y86DCuZKuYfYtN6fddPUtXfSfW_Z5
df1 = pd.read_csv('Question Generation - Sheet1 (1).csv')

Downloading...
From: https://drive.google.com/uc?id=1li7y86DCuZKuYfYtN6fddPUtXfSfW_Z5
To: /content/Question Generation - Sheet1 (1).csv
  0% 0.00/41.6k [00:00<?, ?B/s]100% 41.6k/41.6k [00:00<00:00, 47.6MB/s]


In [None]:
# Number of evaluation rows per theme.
df1['Theme'].value_counts()

YouTube                                         68
National_Archives_and_Records_Administration    67
Hindu_philosophy                                65
Adolescence                                     64
Name: Theme, dtype: int64

In [None]:
# Queries to search with ('Similar Question') and the original question each one
# is expected to retrieve ('Question'); `accuracy` pairs the two row by row.
sim_query = df1['Similar Question']
actual_ques_large = df1['Question']

# Creating Embeddings and Index for FAISS Similarity Search

In [None]:
# Embed every indexed question with the sentence encoder and keep the result
# as a NumPy array owned by this notebook (np.array makes a copy).
encoded_data = np.array(model(td_ques).numpy())

In [None]:
def fetch_question_info(dataframe_idx):
    """Build a small metadata record for one search hit.

    Args:
        dataframe_idx: a two-element sequence. Element 0 is the key looked up
            in the notebook-level `new_td` mapping; element 1 is the score to
            report for that hit.

    Returns:
        dict with 'question' (first element of the `new_td` entry) and
        'score' (element 1 of `dataframe_idx`, passed through unchanged).
    """
    key, hit_score = dataframe_idx[0], dataframe_idx[1]
    entry = new_td[key]
    return {'question': entry[0], 'score': hit_score}

In [None]:
import numpy as np

def get_k_nearest_neighbours(query_embed, given_query_embed, k):
    """Find the k nearest rows of `query_embed` for each row of `given_query_embed`.

    Despite its name, `query_embed` is the collection that gets indexed, and
    `given_query_embed` holds the vectors that are searched for.

    Args:
        query_embed: 2-D array-like (n_items, dim) of vectors to index.
        given_query_embed: 2-D array-like (n_queries, dim) of vectors to look up.
        k: number of neighbours to return per query.

    Returns:
        The `(distances, indices)` pair returned by `IndexFlatL2.search`,
        one row per query.

    Note:
        A fresh flat L2 index is built on every call.
    """
    # faiss only accepts C-contiguous float32 matrices. Coerce here so float64
    # arrays, Fortran-ordered arrays and plain nested lists work instead of
    # failing inside faiss.
    base = np.ascontiguousarray(query_embed, dtype=np.float32)
    queries = np.ascontiguousarray(given_query_embed, dtype=np.float32)
    index = faiss.IndexFlatL2(base.shape[1])
    index.add(base)
    return index.search(queries, k)

# Accuracy Calculation

In [None]:
def search(sim_query, top_k, model, actual_ques_large):
    """Score one query: 1 if its nearest indexed question is the expected one, else 0.

    Args:
        sim_query: list holding the query text(s); only the first query's
            result is scored.
        top_k: number of neighbours to request from the index. Only the
            nearest neighbour is scored, so any value >= 1 gives the same result.
        model: callable that maps a list of strings to an object exposing
            `.numpy()` (the sentence encoder).
        actual_ques_large: list whose first element is the question text the
            query is expected to retrieve.

    Returns:
        int: 1 when the nearest neighbour's distance is below the notebook-level
        `threshold` AND its question text equals `actual_ques_large[0]`; 0 otherwise.

    Reads the notebook-level names `encoded_data`, `td_ques` and `threshold`.
    """
    query_vector = model(sim_query).numpy()
    # Honour the caller's top_k (it used to be ignored in favour of a hard-coded 7).
    # At least one neighbour is always requested because the scoring below reads
    # the nearest one.
    k = max(1, int(top_k))
    D, I = get_k_nearest_neighbours(encoded_data, query_vector, k)
    nearest_distance, nearest_position = D[0][0], I[0][0]
    if nearest_distance < threshold and td_ques[nearest_position] == actual_ques_large[0]:
        return 1
    return 0

In [None]:
def accuracy(sim_query, model, ac_ques_large):
    """Print the percentage of queries whose nearest match is the expected question.

    Args:
        sim_query: iterable of query strings (e.g. a pandas Series).
        model: sentence encoder, forwarded to `search`.
        ac_ques_large: iterable of expected question texts, aligned with
            `sim_query` item by item.

    Prints one line with the current notebook-level `threshold` and the
    accuracy in percent. Returns nothing.
    """
    total = len(sim_query)
    hits = 0
    # Pair the two inputs by position. This uses the `ac_ques_large` argument
    # (the notebook-level `actual_ques_large` used to be read instead) and does
    # not depend on the Series having a default 0..n-1 index.
    for query, expected in zip(sim_query, ac_ques_large):
        hits += search([query], 1, model, [expected])

    # No queries means nothing matched; report 0.0 rather than dividing by zero.
    Accuracy = hits / total * 100 if total else 0.0
    print('For a threshold = '+ str(threshold) + ', the accuracy comes at ' + str(Accuracy)+' %')

In [None]:
# Sweep the distance cut-offs. The loop variable `threshold` is a notebook-level
# name that search() and accuracy() read as a global, so it must keep this exact name.
for threshold in thresholds:
  accuracy(sim_query, model, actual_ques_large)

For a threshold = 0.1, the accuracy comes at 18.939393939393938 %
For a threshold = 0.2, the accuracy comes at 42.42424242424242 %
For a threshold = 0.3, the accuracy comes at 64.39393939393939 %
For a threshold = 0.4, the accuracy comes at 79.92424242424242 %
