## Dataset Generation

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install sentence-transformers
!pip install --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [None]:
!gdown 1Z-yb752A3o7b9dqrGt24XU0sl53FVqya

Downloading...
From: https://drive.google.com/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya
To: /content/train_data.csv
100% 65.6M/65.6M [00:00<00:00, 71.5MB/s]


In [None]:
import csv
import requests

# load training dataset
def load_data(csv_url=None, session=None, example_rows=(0, 1000, 1300)):
    """Download the training CSV and return it as a list of rows.

    The returned list still contains the header row at index 0 (the rest of
    the notebook skips it with ``train_data[1:]``). Summary statistics are
    printed for the data rows only, so the header is no longer counted as an
    (answerable) example.

    Args:
        csv_url: URL to download. Defaults to the Google Drive export link.
        session: optional object exposing ``get(url, timeout=...)`` whose
            result has ``raise_for_status()`` and ``content``. When omitted a
            temporary ``requests.Session`` is opened and closed here.
        example_rows: row indices printed as a preview; indices that fall
            outside the downloaded data are skipped instead of raising.

    Returns:
        list[list[str]]: every CSV row (header first), each a list of strings.

    Raises:
        Whatever ``raise_for_status()`` raises on an HTTP error status.

    NOTE(review): a second ``load_data`` is defined later in this notebook and
    shadows this one once that cell has been executed.
    """
    import io  # local import: only csv/requests are imported by this cell

    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'
    if csv_url is None:
        csv_url = CSV_URL

    def _fetch(http):
        response = http.get(csv_url, timeout=120)
        # Fail loudly on an HTTP error instead of parsing an error page as CSV.
        response.raise_for_status()
        return response.content.decode('utf-8')

    if session is None:
        with requests.Session() as s:
            decoded_content = _fetch(s)
    else:
        decoded_content = _fetch(session)

    # Feed the csv module a stream opened with newline='' so that line breaks
    # (and other characters str.splitlines() treats as line boundaries) inside
    # quoted paragraphs are preserved; Answer_start offsets depend on them.
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = list(cr)

    examples = train_data[1:]  # row 0 is the header
    print(f"Number of examples = {len(examples)}")
    noans = sum(1 for x in examples if x[4:5] == ['False'])
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in example_rows:
        if not 0 <= i < len(train_data):
            continue
        row = train_data[i]
        print(' | '.join(row[:2]), ' | ', row[2][:20] + '...', ' | ', ' | '.join(row[3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the flat CSV rows by theme.

    Args:
        train_data: list of rows as returned by the CSV loader; row 0 is the
            header and is skipped. Each data row is indexed as
            ``[id, theme, paragraph, question, *answer_fields]``.

    Returns:
        dict mapping theme -> ``{'para': [...], 'ques': [...], 'ans': [...]}``
        where ``'para'`` holds each distinct paragraph once (first-seen
        order), ``'ques'`` holds every question, and ``'ans'[i]`` is
        ``[paragraph_index] + row[4:]`` for question ``i``.
    """
    theme_wise_data = {}
    # theme -> {paragraph text: its index in that theme's 'para' list}.
    # Kept outside the returned structure; it replaces the linear
    # `in list` / `list.index` scans with constant-time lookups.
    para_positions = {}
    for x in train_data[1:]:
        theme, para, question = x[1], x[2], x[3]
        entry = theme_wise_data.get(theme)
        if entry is None:
            entry = theme_wise_data[theme] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme] = {}
        positions = para_positions[theme]
        if para not in positions:
            positions[para] = len(entry['para'])
            entry['para'].append(para)
        entry['ques'].append(question)
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        entry['ans'].append([positions[para]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
def load_ques_by_theme(theme, theme_wise_data, answerable_only = False):
    """Return the paragraphs and question/answer lists of a single theme.

    Args:
        theme: key into ``theme_wise_data``.
        theme_wise_data: mapping theme -> {'para', 'ques', 'ans'} lists.
        answerable_only: when true, questions whose answer entry has the
            string 'False' in position 1 are left out.

    Returns:
        (paras, ques, gold_para, ans): the theme's paragraph list, the kept
        questions, the first element of each kept answer entry, and the
        remainder (``entry[1:]``) of each kept answer entry.
    """
    record = theme_wise_data[theme]
    paras = record['para']

    # Pair every kept question with its full answer entry first, then split.
    kept = [
        (question, record['ans'][pos])
        for pos, question in enumerate(record['ques'])
        if not (answerable_only and record['ans'][pos][1] == 'False')
    ]
    ques = [question for question, _ in kept]
    gold_para = [entry[0] for _, entry in kept]
    ans = [entry[1:] for _, entry in kept]

    print("Total Questions:", len(ques))
    print("Total Paragraphs:", len(paras))
    return paras, ques, gold_para, ans

In [None]:
# Fetch the raw CSV rows (header row included) and group them per theme.
# NOTE(review): later in this notebook the name `train_data` is rebound to a
# function, so this list is no longer reachable under that name afterwards.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
!pip install transformers
!pip install transformers faiss-cpu

from transformers import AutoModel, AutoTokenizer

import faiss
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import torch

metric = faiss.METRIC_INNER_PRODUCT

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from tqdm import tqdm
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def fetch_question_info(dataframe_idx, new_t):
    """Look up one search hit and package it with its score.

    Args:
        dataframe_idx: indexable pair ``(key, score)``.
        new_t: mapping (or sequence) indexed by ``key``.

    Returns:
        dict with ``'sentence'`` set to ``new_t[key]`` and ``'score'`` set to
        the score taken from ``dataframe_idx``.
    """
    key, similarity = dataframe_idx[0], dataframe_idx[1]
    return {'sentence': new_t[key], 'score': similarity}

In [None]:
def get_k_nearest_neighbours(query_embed, given_query_embed, k):
    """Search ``query_embed`` for the ``k`` nearest rows to each probe vector.

    Despite its name, ``query_embed`` is the collection that gets indexed
    (a fresh ``faiss.IndexFlatL2`` is built on every call) and
    ``given_query_embed`` holds the vectors that are looked up.

    Returns:
        The result of ``index.search`` (callers unpack it as ``D, I``).
    """
    dimension = query_embed.shape[1]
    corpus = np.array(query_embed)
    probes = np.array(given_query_embed)

    l2_index = faiss.IndexFlatL2(dimension)
    l2_index.add(corpus)
    return l2_index.search(probes, k)

In [None]:
import random

def Rand(start, end, num):
    """Return ``num`` integers drawn with ``random.randint(start, end)``.

    Both bounds are inclusive (``randint`` semantics) and values may repeat.
    """
    return [random.randint(start, end) for _ in range(num)]

In [None]:
def _sentence_at_offset(paragraph, sentences, offset):
  """Return the sentence of `paragraph` whose character span covers `offset`.

  `sentences` are the tokenizer's sentences for `paragraph`, in order. Each
  one is located in the paragraph text so the whitespace between sentences
  is accounted for. If `offset` lies beyond every sentence the last sentence
  is returned; an empty sentence list yields ''.
  """
  cursor = 0
  for sentence in sentences:
    begin = paragraph.find(sentence, cursor)
    if begin < 0:
      # Sentence not found verbatim; assume it starts where the previous one ended.
      begin = cursor
    cursor = begin + len(sentence)
    if offset < cursor:
      return sentence
  return sentences[-1] if sentences else ''


def make_dataset(theme, sample_divisor=3.5, k=7, threshold=0.8, fallback_rank=4, rng=None):
  """Build (question, answer sentence, non-answer sentence) triples for a theme.

  Reads the module-level `theme_wise_data` and `model`.

  Steps:
    1. Sentence-tokenize every paragraph of the theme and embed the sentences
       (embeddings are L2-normalized in place with faiss).
    2. Sample ``int(n_answerable / sample_divisor)`` answerable questions.
    3. For each sampled question, search the `k` nearest sentences with a flat
       L2 index. If even the farthest of them has distance <= `threshold`, the
       sentence at rank `fallback_rank` is taken; otherwise the nearest
       sentence whose distance exceeds `threshold` is taken.
    4. For each sampled question, pick the sentence of its gold paragraph that
       contains the answer start offset.

  Args:
    theme: key into `theme_wise_data`.
    sample_divisor: fraction control for the question sample (default 3.5).
    k: number of neighbours inspected per question (default 7).
    threshold: distance above which a sentence counts as "not similar".
    fallback_rank: 0-based rank used when all `k` neighbours are within
      `threshold`.
    rng: optional object with a ``sample(population, n)`` method (e.g. a
      ``random.Random(seed)``) for reproducible sampling; the global
      ``random`` module is used when omitted.

  Returns:
    (query_vec, ans_sent, non_similar_sent): three lists of equal length.
    All three are empty when nothing is sampled or the theme has no sentences.

  Changes relative to the previous implementation:
    * The gold sentence is located by its real position in the paragraph.
      Previously only sentence lengths were summed, ignoring the whitespace
      between sentences, so answers near the end of a later sentence were
      attributed to the following sentence.
    * The L2 index is built once per theme instead of once per question.
    * `k` is clamped to the number of sentences so padding ids for missing
      neighbours can never be used as list indices.
    * Unused placeholder structures and an unused inner-product index were
      removed.

  NOTE(review): the non-answer sentence is not checked against the gold
  sentence, so the two can coincide — confirm whether that is acceptable.
  """
  theme_data = theme_wise_data[theme]
  paragraphs = theme_data['para']

  # Tokenize each paragraph once; reused for the search pool and the gold lookup.
  para_sentences = [nltk.sent_tokenize(paragraph) for paragraph in paragraphs]
  sent_comb = [sentence for sentences in para_sentences for sentence in sentences]

  # Answer_start is stored as text such as '[543]' ('[]' when unanswerable).
  # NOTE(review): if several offsets are ever present only the first is used.
  answer_starts = []
  for answer in theme_data['ans']:
    numbers = re.findall(r'-?\d+', answer[3])
    answer_starts.append(int(numbers[0]) if numbers else -1)

  answerable = [i for i, start in enumerate(answer_starts) if start >= 0]
  sampler = random if rng is None else rng
  rndm = sampler.sample(answerable, int(len(answerable) / sample_divisor))
  if not rndm or not sent_comb:
    return [], [], []

  encoded_data = np.array(model.encode(sent_comb))
  faiss.normalize_L2(encoded_data)
  index = faiss.IndexFlatL2(encoded_data.shape[1])
  index.add(encoded_data)
  k_eff = min(k, len(sent_comb))

  query_vec = [theme_data['ques'][i] for i in rndm]

  non_similar_sent = []
  for question in query_vec:
    query_vector = np.array(model.encode([question]))
    D, I = index.search(query_vector, k_eff)
    distances, ids = D[0], I[0]
    if distances[-1] <= threshold:
      # Every inspected neighbour is close: take the one at the fallback rank.
      chosen = ids[min(fallback_rank, k_eff - 1)]
    else:
      # Results are sorted by distance, so this is the closest "far enough" one.
      chosen = next(idx for dist, idx in zip(distances, ids) if dist > threshold)
    non_similar_sent.append(sent_comb[chosen])

  ans_sent = []
  for i in rndm:
    para_idx = theme_data['ans'][i][0]
    ans_sent.append(
        _sentence_at_offset(paragraphs[para_idx], para_sentences[para_idx], answer_starts[i])
    )

  return query_vec, ans_sent, non_similar_sent

In [None]:
# Collect the theme (column 1) of every data row; row 0 is the CSV header.
# The upper bound follows the loaded data instead of a hard-coded row count.
# The explicit loop (and its variable `i`) is kept because a later clean-up
# cell deletes `i` by name.
themes_not_unique = []
for i in range(1, len(train_data)):
  themes_not_unique.append(train_data[i][1])

# Distinct themes. NOTE: set iteration order is not guaranteed to be stable
# between kernel sessions.
themelist = set(themes_not_unique)

In [None]:
!pip install tqdm
from tqdm import tqdm

# Accumulate the triples of every theme into three parallel lists:
# qu = questions, an = answer sentences, si = non-answer sentences.
qu, an, si = [], [], []
for theme in tqdm(themelist):
  new_queries, new_answers, new_negatives = make_dataset(theme)
  qu = qu + new_queries
  an = an + new_answers
  si = si + new_negatives

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


 36%|███▌      | 129/361 [1:23:08<1:24:34, 21.87s/it]

In [None]:
import pandas as pd
# Put the three parallel lists into one table (one row per sampled question)
# and write it to CompleteDataset.csv in the current working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as well — confirm that the extra column is wanted.
data = {'Query':qu, 'Actual Answer':an, 'Fifth Similar':si}
df = pd.DataFrame(data)
df.to_csv('CompleteDataset.csv')

In [None]:
# Drop the dataset-generation globals (presumably to free memory before the
# training part of the notebook). Raises NameError if any of them is missing,
# e.g. when this cell is run twice.
del (
    model, an, data, df, i, metric, qu, si,
    theme, theme_wise_data, themelist, themes_not_unique,
)

In [None]:
!cp CompleteDataset.csv gdrive/MyDrive/CompleteDataset.csv

## Load Data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import pandas as pd
def load_data(path='gdrive/MyDrive/CompleteDataset.csv'):
  """Read the generated triplet CSV and drop incomplete rows.

  Args:
    path: CSV file to read. The default is the copy stored on the mounted
      Google Drive, resolved relative to the current working directory.

  Returns:
    pandas.DataFrame with every row that contains a missing value removed.

  NOTE(review): this definition reuses the name of the CSV download function
  defined earlier in the notebook and replaces it once this cell runs.
  """
  data = pd.read_csv(path)
  data = data.dropna()
  return data

## Prepare for Training

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader


def train_data(data, batch_size=8, shuffle=True):
  """Wrap the triplet table in a DataLoader of ``InputExample`` objects.

  Args:
    data: DataFrame with the columns 'Query', 'Actual Answer' and
      'Fifth Similar'; each row becomes one example whose ``texts`` are
      those three values, in that order.
    batch_size: batch size handed to the DataLoader (default 8).
    shuffle: whether the DataLoader shuffles the examples (default True).

  Returns:
    torch.utils.data.DataLoader over the list of examples.
  """
  # Iterate the three columns in lock-step instead of materialising a Series
  # per row with iterrows(); the local list no longer reuses this function's
  # own name.
  examples = [
      InputExample(texts=[query, positive, negative])
      for query, positive, negative in zip(
          data['Query'], data['Actual Answer'], data['Fifth Similar']
      )
  ]
  return DataLoader(examples, shuffle=shuffle, batch_size=batch_size)

## Load Sentence Embedding Model

In [None]:
from sentence_transformers import SentenceTransformer

def load_model():
  """Instantiate and return the 'sentence-transformers/all-mpnet-base-v2' SentenceTransformer."""
  return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

## Train

In [None]:
from sentence_transformers import losses
import torch

def train(model, dataloader, params, output_path="finetuned_mpnet_triplet"):
  """Fine-tune `model` on `dataloader` with a triplet loss.

  Args:
    model: object exposing ``fit(...)`` (a SentenceTransformer in this notebook).
    dataloader: DataLoader yielding the training examples.
    params: dict that must contain 'epochs', 'scheduler', 'warmup_steps',
      'optimizer_class', 'optimizer_params', 'weight_decay', 'max_grad_norm'
      and 'use_amp'; a missing key raises KeyError.
    output_path: value forwarded as ``output_path`` to ``fit``; defaults to
      the directory name used so far.

  Every argument is passed to ``fit`` by keyword. The values are the same as
  before; naming them keeps the call correct even if the positional order of
  ``fit``'s parameters differs between library versions.
  """
  train_loss = losses.TripletLoss(model=model)
  model.fit(
      train_objectives=[(dataloader, train_loss)],
      evaluator=None,
      epochs=params['epochs'],
      steps_per_epoch=None,
      scheduler=params['scheduler'],
      warmup_steps=params['warmup_steps'],
      optimizer_class=params['optimizer_class'],
      optimizer_params=params['optimizer_params'],
      weight_decay=params['weight_decay'],
      evaluation_steps=0,
      output_path=output_path,
      save_best_model=True,
      max_grad_norm=params['max_grad_norm'],
      use_amp=params['use_amp'],
      callback=None,
      show_progress_bar=True,
      checkpoint_path=None,
      checkpoint_save_steps=500,
      checkpoint_save_total_limit=0,
  )

## Execution

In [None]:
data = load_data()
data

In [None]:
# Drop rows whose 'Actual Answer' is exactly the literal "[citation needed]".
data = data.loc[data['Actual Answer'] != "[citation needed]"]

In [None]:
data

In [None]:
train_dataloader = train_data(data)

In [None]:
model = load_model()

In [None]:
# Settings read by train(); every key below is looked up there by name.
# NOTE(review): lr=2e-07 combined with 10000 warm-up steps is a very small
# effective learning rate — confirm these values are intended.
hyperparams = {
    'epochs': 10,
    'scheduler': 'WarmupLinear',
    'warmup_steps': 10000,
    'optimizer_class': torch.optim.AdamW,
    'optimizer_params': {'lr':2e-07},
    'weight_decay': 0.01,
    'max_grad_norm': 1,
    'use_amp':False,
}

In [None]:
train(model, train_dataloader, hyperparams)

In [None]:
!mv "finetuned_mpnet_triplet" "gdrive/MyDrive/finetuned_mpnet_triplet"