# Installing and Importing Libraries


In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import csv
import requests
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from pprint import pprint
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import torch
import numpy as np
import faiss

# Load the dataset

In [None]:
# load training dataset
def load_data():
    """Download the training CSV, print a short summary and return its rows.

    The file is fetched over HTTP from a fixed Google Drive URL, decoded as
    UTF-8 and parsed with ``csv.reader``.

    Returns:
        list[list[str]]: every parsed row, with the column-name row kept at
        index 0 (``load_theme_wise_data`` skips it itself).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(CSV_URL)
        # Fail loudly instead of parsing an error page as if it were CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        train_data = list(cr)

    # Row 0 holds the column names, so it must not be counted as an example
    # (previously it inflated both the total and the "answerable" count).
    examples = train_data[1:]
    print(f"Number of examples = {len(examples)}")
    noans = sum(1 for x in examples if x[4] == 'False')
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in [0, 1000, 1300]:
        # Skip sample positions that a shorter file does not have.
        if i >= len(train_data):
            continue
        row = train_data[i]
        print(' | '.join(row[:2]), ' | ', row[2][:20] + '...', ' | ', ' | '.join(row[3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the parsed CSV rows by theme.

    Args:
        train_data: list of rows (lists of strings); the first row is skipped
            as the header. Column 1 is read as the theme, column 2 as the
            paragraph, column 3 as the question and columns 4+ as the answer
            fields.

    Returns:
        dict: ``theme -> {'para': [...], 'ques': [...], 'ans': [...]}`` where
        ``para`` holds each distinct paragraph once (in first-seen order),
        ``ques`` holds every question, and ``ans[i]`` is
        ``[paragraph position in 'para'] + answer fields`` for ``ques[i]``.
    """
    theme_wise_data = {}
    # theme -> {paragraph text -> position in that theme's 'para' list}.
    # Replaces the linear `in` / `.index()` scans over the paragraph list,
    # which made every row cost O(number of paragraphs in the theme).
    para_positions = {}
    for x in train_data[1:]:
        theme_name, paragraph, question = x[1], x[2], x[3]
        if theme_name not in theme_wise_data:
            theme_wise_data[theme_name] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme_name] = {}
        entry = theme_wise_data[theme_name]
        positions = para_positions[theme_name]
        if paragraph not in positions:
            positions[paragraph] = len(entry['para'])
            entry['para'].append(paragraph)
        entry['ques'].append(question)
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        entry['ans'].append([positions[paragraph]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
# Download the raw CSV rows (header row included), then regroup them per theme.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
# Theme whose questions are embedded, indexed and evaluated in the cells below.
theme = 'Adolescence'

In [None]:
# Distance cut-offs swept by the final evaluation loop: search() only accepts
# a retrieval whose best distance is strictly below the current cut-off.
thresholds = [0.1, 0.2, 0.3, 0.4]

In [None]:
# Questions of the selected theme, copied in dataset order.
t_ques = [question for question in theme_wise_data[theme]['ques']]
# Answer_text arrives as the text of a list (e.g. "['Richard Taruskin']");
# dropping two characters at each end strips the brackets and quotes.
t_ans = [record[2][2:-2] for record in theme_wise_data[theme]['ans']]
# [question, answer] pairs, and the same pairs keyed by their position.
t = [[question, answer] for question, answer in zip(t_ques, t_ans)]
new_t = {position: [pair[0], pair[1]] for position, pair in enumerate(t)}

In [None]:
# Fetch the question sheet from Google Drive by file id; it is saved in the
# working directory under the name read by the pd.read_csv cell below.
!gdown 1rly8PEMoyq8CBGIwcPxeFoezgW7TQ7fS

Downloading...
From: https://drive.google.com/uc?id=1rly8PEMoyq8CBGIwcPxeFoezgW7TQ7fS
To: /content/Question Generation - Sheet1 (2).csv
  0% 0.00/41.6k [00:00<?, ?B/s]100% 41.6k/41.6k [00:00<00:00, 45.1MB/s]


In [None]:
# Question sheet; the cells below read its 'Theme', 'Question' and
# 'Similar Question' columns.
df = pd.read_csv('Question Generation - Sheet1 (2).csv')

In [None]:
# Paraphrased questions for the selected theme. The index is reset to 0..n-1
# because the evaluation code reads this as query[i] with i in
# range(len(query)); on a filtered Series that is a *label* lookup and only
# worked when the theme's rows happened to be the first rows of the sheet.
query = df.loc[df['Theme'] == theme, 'Similar Question'].reset_index(drop=True)

In [None]:
# Original questions for the selected theme, row-aligned with the paraphrases.
# The index is reset to 0..n-1 because later cells read this as
# actual_ques[j] with a positional j; on a filtered Series that is a *label*
# lookup and only worked when the theme's rows were the first rows of the sheet.
actual_ques = df.loc[df['Theme'] == theme, 'Question'].reset_index(drop=True)

In [None]:
# qid[k] must be the dataset position of the k-th sheet question, because
# accuracy() pairs qid[i] with query[i]. The previous nested loop appended
# positions in *dataset* order, read actual_ques[j] by label, and silently
# dropped sheet questions it could not find, which shifted every later entry.
question_positions = {}
for i in range(len(new_t)):
  # Keep the first position when the same question text occurs more than once.
  question_positions.setdefault(new_t[i][0], i)

missing_questions = [q for q in actual_ques if q not in question_positions]
if missing_questions:
  raise ValueError(
      f"{len(missing_questions)} question(s) from the sheet are not present in theme "
      f"{theme!r}, e.g. {missing_questions[0]!r}"
  )

# Iterating the Series yields its values in row order, independent of index labels.
qid = [question_positions[q] for q in actual_ques]

In [None]:
# NOTE(review): no cell shown here reads `metric`; get_k_nearest_neighbours
# builds a faiss.IndexFlatL2 — confirm whether inner product was intended.
metric = faiss.METRIC_INNER_PRODUCT

In [None]:
# Load the sentence encoder from TensorFlow Hub. Later cells call it as
# model(list_of_strings) and convert the result with .numpy().
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)

# Creating Embeddings and Index for FAISS Similarity Search

In [None]:
# Embed the theme's questions and keep the result as a NumPy array
# (presumably one row per entry of t_ques, in the same order — search()
# relies on that when it maps a neighbour index back to t_ques).
encoded_data = np.array(model(t_ques).numpy())

In [None]:
def fetch_question_info(dataframe_idx):
    """Build a small record describing one retrieved question.

    Args:
        dataframe_idx: indexable pair; item 0 is a key of the notebook-level
            ``new_t`` mapping and item 1 is the score to report.

    Returns:
        dict: ``{'question': <question text from new_t>, 'score': <score>}``.
    """
    question_key, score = dataframe_idx[0], dataframe_idx[1]
    return {
        'question': new_t[question_key][0],
        'score': score,
    }

In [None]:
import numpy as np

def get_k_nearest_neighbours(query_embed, given_query_embed, k):
    """Search a freshly built flat L2 index for the nearest stored vectors.

    Args:
        query_embed: 2-D array-like of vectors to store in the index, one row
            per vector (despite the name, this is the searched collection).
        given_query_embed: vector(s) to look up; a single 1-D vector is
            treated as one row.
        k: number of neighbours to return per looked-up vector.

    Returns:
        Whatever ``index.search`` returns; the caller unpacks it as
        ``(distances, indices)`` with one row per looked-up vector.
    """
    # faiss operates on C-contiguous float32 matrices; coercing here lets
    # callers pass lists or float64 arrays without tripping inside faiss.
    stored = np.ascontiguousarray(query_embed, dtype=np.float32)
    lookups = np.ascontiguousarray(np.atleast_2d(given_query_embed), dtype=np.float32)
    index = faiss.IndexFlatL2(stored.shape[1])
    index.add(stored)
    return index.search(lookups, k)

# Accuracy by Context ID

In [None]:
def search(query, top_k, model, actual_ques, i, theme, qid, threshold=None):
    """Score one query: does its nearest indexed question share the expected paragraph?

    Reads the notebook-level ``encoded_data``, ``t_ques`` and
    ``theme_wise_data``.

    Args:
        query: input handed to ``model`` (the caller passes a one-element
            list holding the query string).
        top_k: number of neighbours requested from the index (at least 1);
            only the closest neighbour is scored.
        model: callable whose result exposes ``.numpy()`` giving the query
            embedding.
        actual_ques: unused; kept so existing calls keep working.
        i: unused; kept so existing calls keep working.
        theme: key into ``theme_wise_data``.
        qid: position, within the theme's question list, of the question the
            query is expected to retrieve.
        threshold: the best distance must be strictly below this value. When
            omitted, the notebook-level ``threshold`` variable is used, which
            is how existing callers supply it.

    Returns:
        int: 1 when the closest neighbour is within ``threshold`` and its
        paragraph number equals that of question ``qid``; otherwise 0.
    """
    if threshold is None:
        # Backward compatibility with callers that set a global `threshold`.
        threshold = globals()['threshold']

    query_vector = model(query).numpy()
    distances, neighbours = get_k_nearest_neighbours(
        encoded_data, query_vector, max(int(top_k), 1)
    )
    best_distance, best_neighbour = distances[0][0], neighbours[0][0]

    # A negative index means no neighbour was found (assumed faiss convention
    # for an empty index — TODO confirm); treat it like a miss.
    if best_neighbour < 0 or not best_distance < threshold:
        return 0

    theme_entry = theme_wise_data[theme]
    try:
        # First question of the theme with the retrieved text.
        matched = theme_entry['ques'].index(t_ques[best_neighbour])
    except ValueError:
        # Previously an unmatched text silently fell through to the theme's
        # last question; count it as a miss instead.
        return 0

    same_paragraph = theme_entry['ans'][matched][0] == theme_entry['ans'][qid][0]
    return 1 if same_paragraph else 0

In [None]:
def accuracy(query, model, ac_ques, theme, qid):
    """Print and return the percentage of queries that ``search`` scores as a hit.

    The reported threshold is the notebook-level ``threshold`` variable.

    Args:
        query: iterable of query strings (a list or a pandas Series).
        model: encoder forwarded to ``search``.
        ac_ques: iterable of reference questions, aligned with ``query``.
        theme: theme key forwarded to ``search``.
        qid: iterable of expected question positions, aligned with ``query``.

    Returns:
        float: hits / number of queries * 100.

    Raises:
        ValueError: if ``query`` is empty, or ``ac_ques`` / ``qid`` hold fewer
            entries than ``query``.
    """
    # Materialise as lists so that i below is always a *position*; indexing a
    # filtered pandas Series with an int would be a label lookup instead.
    queries = list(query)
    questions = list(ac_ques)
    expected = list(qid)
    if not queries:
        raise ValueError('accuracy() needs at least one query to evaluate')
    if len(questions) < len(queries) or len(expected) < len(queries):
        raise ValueError(
            f'got {len(queries)} queries but only {len(questions)} reference '
            f'questions and {len(expected)} expected positions'
        )

    hits = 0
    for i in range(len(queries)):
        if search([queries[i]], 1, model, [questions[i]], i, theme, expected[i]):
            hits += 1
    Accuracy = hits/len(queries)*100
    print('For a threshold = '+ str(threshold) + ', the accuracy comes at ' + str(Accuracy)+' %')
    return Accuracy

In [None]:
# The loop variable must stay a notebook-level name called `threshold`:
# search() and accuracy() read it as a global rather than as an argument.
for threshold in thresholds:
  accuracy(query, model, actual_ques, theme, qid)

For a threshold = 0.1, the accuracy comes at 10.9375 %
For a threshold = 0.2, the accuracy comes at 34.375 %
