# Installing and Importing Libraries

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [None]:
! pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp38-cp38-linux_x86_64.whl size=582769 sha256=b178a805f24fc51d5d513ddbdf097d7a5e1b7b66fbb50f9753c2c6d80b7f1a52
  Stored in directory: /root/.cache/pip/wheels/f9/93/19/30511c4a9ae6b4937455a134c34a39e13943e2c6f46fcd2ed2
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


In [None]:
import csv
import requests
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from pprint import pprint
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import torch
import numpy as np
import annoy

In [None]:
# load training dataset
def load_data(csv_url=None, timeout=60):
    """Download the training CSV and return it as a list of rows.

    The file is fetched over HTTP, parsed with ``csv.reader`` and returned
    as-is: row 0 is the header row and each following row is one example.
    A short summary (example counts and a few sample rows) is printed.

    Args:
        csv_url: Location of the CSV file. Defaults to the Google Drive
            export link this notebook was written against.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[list[str]]: the header row followed by all data rows.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import io  # local import so this cell does not depend on the import cell being edited

    if csv_url is None:
        csv_url = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(csv_url, timeout=timeout)
        # Fail loudly instead of parsing an HTML error page as CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    # Feed csv.reader a file-like object opened with newline='' so that line
    # breaks inside quoted fields are preserved (str.splitlines() drops them).
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = list(cr)

    # Row 0 is the header; it must not be counted as an example.
    examples = train_data[1:]
    print(f"Number of examples = {len(examples)}")
    noans = sum(1 for row in examples if row[4] == 'False')
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in [0, 1000, 1300]:
        if i >= len(train_data):
            # Small files simply show fewer sample rows.
            continue
        row = train_data[i]
        print(' | '.join(row[:2]), ' | ', row[2][:20] + '...', ' | ', ' | '.join(row[3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the flat CSV rows by theme.

    Args:
        train_data: Rows as returned by ``load_data``. Row 0 is the header and
            is skipped. In each data row, column 1 is the theme, column 2 the
            paragraph, column 3 the question, and columns 4 onwards the answer
            fields (Answer_possible, Answer_text, Answer_start).

    Returns:
        dict: ``theme -> {'para': [...], 'ques': [...], 'ans': [...]}`` where
        ``para`` holds each distinct paragraph once (first-seen order),
        ``ques`` holds every question, and ``ans[i]`` is
        ``[paragraph_index, *answer_fields]`` for ``ques[i]``.
    """
    theme_wise_data = {}
    # theme -> {paragraph text: its position in that theme's 'para' list}.
    # Replaces the per-row `in list` / `list.index` scans with O(1) lookups.
    para_positions = {}
    for row in train_data[1:]:
        theme, para, ques = row[1], row[2], row[3]
        if theme not in theme_wise_data:
            theme_wise_data[theme] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme] = {}
        bucket = theme_wise_data[theme]
        positions = para_positions[theme]
        if para not in positions:
            positions[para] = len(bucket['para'])
            bucket['para'].append(para)
        bucket['ques'].append(ques)
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        bucket['ans'].append([positions[para]] + row[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
# Universal Sentence Encoder (large, v5) loaded from TF Hub. `model` is later
# called directly on lists of question strings to obtain their embeddings.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)

In [None]:
# Download the raw CSV rows (row 0 is the header), then group them per theme.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
# Distance cut-offs swept in the accuracy loop: a retrieved question only counts
# when its ANNOY distance to the query is strictly below the threshold.
thresholds = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]

In [None]:
# Theme whose questions are indexed and whose paraphrased questions are evaluated.
theme = 'Adolescence'

In [None]:
!gdown 1rCktWk6rljttLjiXCupAF0wrS5oHUI8K

df = pd.read_csv('Question Generation - Sheet1.csv')
query = df[df['Theme']==theme]['Similar Question']
actual_ques = df[df['Theme']==theme]['Question']

Downloading...
From: https://drive.google.com/uc?id=1rCktWk6rljttLjiXCupAF0wrS5oHUI8K
To: /content/Question Generation - Sheet1.csv
  0% 0.00/13.0k [00:00<?, ?B/s]100% 13.0k/13.0k [00:00<00:00, 16.8MB/s]


In [None]:
# Questions of the selected theme, copied so the grouped data is left untouched.
theme_entry = theme_wise_data[theme]
t_ques = list(theme_entry['ques'])
# Each answer record is [Para_Number, Answer_possible, Answer_text, Answer_start];
# Answer_text is the string form of a list (e.g. "['Richard Taruskin']"), so
# slicing [2:-2] strips the leading "['" and the trailing "']".
t_ans = [answer_record[2][2:-2] for answer_record in theme_entry['ans']]
# [question, answer] pairs, and the same pairs keyed by their position, which
# is also the item id used when the questions are added to the ANNOY index.
t = [list(pair) for pair in zip(t_ques, t_ans)]
new_t = {position: [question, answer] for position, (question, answer) in enumerate(t)}

# Creating Embeddings and Index for ANNOY Similarity Search

In [None]:
# Embed every question of the theme in one call and keep the result as a NumPy array.
encoded_data = np.array(model(t_ques))

In [None]:
def create_index_annoy(embeddings, vector_length = 512, metric = 'angular', num_trees = 100):
  '''Builds an ANNOY index holding one item per embedding.

  Item ids are the positions of the vectors in `embeddings` (0, 1, 2, ...).

  Args:
    embeddings: sequence of vectors to index.
    vector_length: dimensionality passed to annoy.AnnoyIndex.
    metric: distance metric passed to annoy.AnnoyIndex.
    num_trees: number of trees passed to the index build step.

  Returns:
    The built annoy.AnnoyIndex.
  '''
  ann = annoy.AnnoyIndex(vector_length, metric=metric)
  for item_id, vector in enumerate(embeddings):
    ann.add_item(item_id, vector)
  ann.build(n_trees=num_trees)
  return ann

In [None]:
# Build the ANNOY index over the theme's question embeddings using the function's
# defaults (512-dimensional vectors, angular metric, 100 trees).
index =create_index_annoy(encoded_data)

# Accuracy Calculation

In [None]:
def find_similar_annoy(index, embedding, num_matches=1):
  '''Looks up the nearest indexed questions for an embedding.

  Queries the ANN index for the `num_matches` nearest items and maps each
  returned item id to its question text through the notebook-level `new_t`
  table (item id -> [question, answer]).

  Returns:
    A list of (question, distance) tuples in the order the index returned them.
  '''
  neighbour_ids, distances = index.get_nns_by_vector(
      embedding, num_matches, search_k=-1, include_distances=True)
  matched_questions = [new_t[item_id][0] for item_id in neighbour_ids]
  return list(zip(matched_questions, distances))

In [None]:
def top_k_ques(query, index, model, actual_ques, threshold=None):
    """Check whether a query retrieves its expected question closely enough.

    The query is embedded with `model`, the single nearest question is fetched
    from the ANN index via `find_similar_annoy`, and the match is accepted only
    if its distance is strictly below `threshold` and the retrieved question
    equals `actual_ques[0]`.

    Args:
        query: list holding the query string; only the first embedding is used.
        index: ANN index passed through to `find_similar_annoy`.
        model: callable that embeds `query` and returns an object with `.numpy()`.
        actual_ques: list whose first element is the expected question text.
        threshold: maximum accepted distance. When omitted, the notebook-level
            `threshold` variable is used, as the original implementation did.

    Returns:
        1 if the query can be answered from the previously answered question,
        otherwise 0.

    Raises:
        NameError: if no threshold is passed and no global `threshold` exists.
    """
    if threshold is None:
        # Backward compatibility with callers that set a global `threshold`
        # (e.g. as a loop variable) instead of passing it in.
        if 'threshold' not in globals():
            raise NameError("name 'threshold' is not defined; pass threshold=... explicitly")
        threshold = globals()['threshold']

    query_vector = model(query).numpy()
    results = find_similar_annoy(index, query_vector[0])
    if not results:
        # An empty index returns no neighbours; nothing can be matched.
        return 0

    best_question, best_distance = results[0]
    # Smaller distance means more similar, hence the "below threshold" test.
    return int(best_distance < threshold and best_question == actual_ques[0])

In [None]:
def accuracy(query, index, model, actual_ques):
    """Print and return the retrieval accuracy for the current threshold.

    Each query is paired with the expected question at the same position and
    scored with `top_k_ques`; the accuracy is the percentage of pairs scored 1.
    The threshold shown in the message is the notebook-level `threshold`
    variable, which `top_k_ques` also uses when called this way.

    Args:
        query: sequence (list or pandas Series) of query strings.
        index: ANN index passed through to `top_k_ques`.
        model: embedding model passed through to `top_k_ques`.
        actual_ques: sequence of expected questions, aligned with `query`
            by position.

    Returns:
        float: the accuracy in percent.

    Raises:
        ValueError: if `query` is empty or the two sequences differ in length.
    """
    # Materialise by position: indexing a filtered pandas Series with 0..n-1
    # is a label lookup and fails when its index does not start at 0.
    queries = list(query)
    expected = list(actual_ques)
    if len(queries) != len(expected):
        raise ValueError(
            f"query and actual_ques must have the same length "
            f"({len(queries)} != {len(expected)})"
        )
    if not queries:
        raise ValueError("query is empty; accuracy is undefined")

    hits = 0
    for similar_question, original_question in zip(queries, expected):
        hits += top_k_ques([similar_question], index, model, [original_question])

    Accuracy = hits / len(queries) * 100
    print('For a threshold = '+ str(threshold) + ', the accuracy comes at ' + str(Accuracy)+' %')
    return Accuracy

In [None]:
# Sweep the distance cut-offs. The loop variable must stay named `threshold`:
# the evaluation functions look it up as a global when it is not passed to them.
for threshold in thresholds:
  accuracy(query, index, model, actual_ques)

For a threshold = 0.3, the accuracy comes at 0.0 %
For a threshold = 0.35, the accuracy comes at 0.0 %
For a threshold = 0.4, the accuracy comes at 6.666666666666667 %
For a threshold = 0.45, the accuracy comes at 13.333333333333334 %
For a threshold = 0.5, the accuracy comes at 20.0 %
For a threshold = 0.55, the accuracy comes at 46.666666666666664 %
For a threshold = 0.6, the accuracy comes at 66.66666666666666 %
For a threshold = 0.65, the accuracy comes at 86.66666666666667 %
For a threshold = 0.7, the accuracy comes at 93.33333333333333 %
