# Installing and Importing Libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [None]:
import csv
import requests
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from pprint import pprint
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import torch
import numpy as np

# Load the dataset

In [None]:
# load training dataset
def load_data():
    """Download the training CSV, print a short summary, and return its rows.

    Returns:
        list[list[str]]: every CSV row as a list of strings. Row 0 is the
        header row; it is returned as well, so callers that only want the
        examples should skip it.

    Raises:
        requests.HTTPError: if the server answers with an HTTP error status.
        requests.Timeout: if the server stops responding for 60 seconds.
    """
    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(CSV_URL, timeout=60)
        # Fail loudly instead of parsing an HTTP error page as if it were CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter=',')
        train_data = list(cr)

    # Row 0 is the header: leave it out of the statistics, otherwise it is
    # counted as one extra (answerable) example.
    examples = train_data[1:]
    print(f"Number of examples = {len(examples)}")
    noans = sum(1 for x in examples if x[4] == 'False')
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    # Index 0 is deliberately the header row, so the sample doubles as a legend.
    for i in [0, 1000, 1300]:
        if i >= len(train_data):
            # Smaller-than-expected download: skip samples that do not exist.
            continue
        print(' | '.join(train_data[i][:2]), ' | ', train_data[i][2][:20] + '...', ' | ', ' | '.join(train_data[i][3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the raw CSV rows by theme.

    Args:
        train_data: list of rows; row 0 is skipped as the header. Each row is
            indexed as x[1]=theme, x[2]=paragraph, x[3]=question and
            x[4:]=the remaining answer columns. Rows must be lists (they are
            concatenated with a list) and paragraphs must be hashable
            (strings, as produced by csv.reader).

    Returns:
        dict mapping theme -> {'para': [...], 'ques': [...], 'ans': [...]}.
        'para' holds each distinct paragraph of the theme in first-seen order,
        'ques' holds every question, and 'ans' holds, per question,
        [index of its paragraph in 'para'] + x[4:].
    """
    theme_wise_data = {}
    # theme -> {paragraph text -> its position in that theme's 'para' list}.
    # Replaces the repeated `in` / `.index` scans over the paragraph list with
    # a constant-time lookup.
    para_positions = {}
    for x in train_data[1:]:
        theme_name, para = x[1], x[2]
        if theme_name not in theme_wise_data:
            theme_wise_data[theme_name] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme_name] = {}
        entry = theme_wise_data[theme_name]
        positions = para_positions[theme_name]
        if para not in positions:
            positions[para] = len(entry['para'])
            entry['para'].append(para)
        entry['ques'].append(x[3])
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        entry['ans'].append([positions[para]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
# TF-Hub handle of the Universal Sentence Encoder (large, version 5).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
# Load the module from module_url; the resulting object is later called on lists of strings.
model = hub.load(module_url)

In [None]:
# Download the raw CSV rows (header row included), then regroup them by theme.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
# Theme under study: used as a key into theme_wise_data and as the filter
# value for the 'Theme' column of the question sheet.
theme = "Adolescence"

In [None]:
# Download the question sheet (Google Drive file id below) into the working directory.
!gdown 1rCktWk6rljttLjiXCupAF0wrS5oHUI8K

df = pd.read_csv('Question Generation - Sheet1.csv')
# Keep only the rows of the selected theme and renumber them 0..n-1.
# Without reset_index the filtered Series keep the sheet's original row
# labels, so a lookup such as queries[0] raises KeyError for any theme whose
# rows do not start at label 0.
queries = df[df['Theme']==theme]['Similar Question'].reset_index(drop=True)
dis_sim_queries = df[df['Theme']==theme]['Dissimilar Question'].reset_index(drop=True)

Downloading...
From: https://drive.google.com/uc?id=1rCktWk6rljttLjiXCupAF0wrS5oHUI8K
To: /content/Question Generation - Sheet1.csv
  0% 0.00/13.0k [00:00<?, ?B/s]100% 13.0k/13.0k [00:00<00:00, 8.94MB/s]


In [None]:
# Questions of the selected theme (copied so the grouped data is not aliased).
t_ques = theme_wise_data[theme]['ques'].copy()
# Entry 2 of each answer record is the answer text; drop its first two and
# last two characters.
t_ans = [answer[2][2:-2] for answer in theme_wise_data[theme]['ans']]
# [question, answer] pairs, in order ...
t = [[question, answer] for question, answer in zip(t_ques, t_ans)]
# ... and the same pairs keyed by their position.
new_t = {idx: list(pair) for idx, pair in enumerate(t)}

# Create embeddings and apply Cosine Similarity

In [None]:
# Embed all questions of the theme in one call and keep the result as a NumPy array.
encoded_data = np.array(model(t_ques).numpy())

In [None]:
from numpy.linalg import norm
def cos(a, b):
  """Return the cosine similarity of `a` and `b`.

  Computed as dot(a, b) divided by the product of the two norms. No check is
  made for zero-norm inputs.
  """
  dot_product = np.dot(a, b)
  magnitude = norm(a) * norm(b)
  return dot_product / magnitude

# Check for one query

In [None]:
# One query sentence, wrapped in a list — the same form in which the list of
# theme questions is passed to the encoder.
query = ["What organ produces estrogen in females?"]

In [None]:
# Embed the query and keep the result as a NumPy array.
query_encode = np.array(model(query).numpy())

In [None]:
# Cosine similarity between the query embedding and every question embedding.
cosarr = [cos(query_encode, question_vec) for question_vec in encoded_data]

In [None]:
# Stack the per-question scores into a single float32 array.
cosarr = np.array(cosarr, dtype=np.float32)
# argsort is ascending: keep the last five indices and reverse them so the
# best match comes first.
index_cosarr = np.argsort(cosarr, axis=0)[-5:][::-1]

In [None]:
# index_cosarr holds at most five entries (fewer when the theme has fewer
# questions), so loop over what is actually there instead of assuming five.
for i in range(len(index_cosarr)):
  print(theme_wise_data[theme]['ques'][index_cosarr[i][0]])
  # 1 - cosine similarity, i.e. the cosine distance to the query.
  print(1-cosarr[index_cosarr[i]])

Which part of the body relesases estrogen in females?
[[0.14522243]]
Which part of the body releases testosterone in males?
[[0.28804702]]
What is a female's major landmark of puberty?
[[0.44473755]]
What is a secondary sex characteristic change?
[[0.45803547]]
Which male body parts enlarge and develop at the same time as the penis?
[[0.49982017]]


# Testing on self-curated dataset

In [None]:
def most_sim(query, model, theme, encoded_data):
  """Print the stored question closest to `query` and its cosine score.

  Args:
    query: input passed straight to `model` (the notebook passes a
      one-element list of strings).
    model: callable whose result exposes `.numpy()`.
    theme: key into the global `theme_wise_data`; its 'ques' list supplies
      the question text that is printed.
    encoded_data: sequence of question embeddings, in the same order as
      that 'ques' list.

  Returns:
    None; the query, the best-matching question and its score are printed.
  """
  query_vec = np.array(model(query).numpy())
  scores = [cos(query_vec, candidate) for candidate in encoded_data]
  # argsort is ascending, so the final row points at the highest score.
  ranking = np.argsort(scores, axis=0)
  print(query)
  best = ranking[-1][0]
  print('Most similar query is: ' + str(theme_wise_data[theme]['ques'][best]))
  print('And the cosine score for the query is ' + str(scores[best]))

In [None]:
# Walk both columns in step, by position. Indexing the filtered Series with
# queries[k] is a label lookup and raises KeyError whenever the theme's rows
# are not labelled 0..n-1.
for sim_query, dis_sim_query in zip(queries, dis_sim_queries):
  most_sim([sim_query], model, theme, encoded_data)
  most_sim([dis_sim_query], model, theme, encoded_data)

['When are a number of changes triggered due to increased hormone production ']
Most similar query is: A surge in hormone production triggers a number of physical changes during what stage of life?
And the cosine score for the query is [0.78290737]
['At what stage of life does a decrease in hormone production trigger a number of physical changes?']
Most similar query is: A surge in hormone production triggers a number of physical changes during what stage of life?
And the cosine score for the query is [0.8692846]
['What organ produces testosterone in males?']
Most similar query is: Which part of the body releases testosterone in males?
And the cosine score for the query is [0.8768661]
['Which organ in the male body does not release testosterone?']
Most similar query is: Which part of the body releases testosterone in males?
And the cosine score for the query is [0.8727007]
['What organ produces estrogen in females?']
Most similar query is: Which part of the body relesases estrogen in f