In [None]:
# Environment setup: install the notebook's dependencies into the runtime.
# NOTE(review): no versions are pinned below, so a fresh run may pull newer,
# incompatible releases — consider pinning.
!pip install transformers
# PyTorch wheels are taken from the CUDA 11.7 (cu117) package index.
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
!pip install faiss-gpu
!pip install sentencepiece
!pip install --upgrade pip
# Haystack is installed from the GitHub repository with the colab and faiss-gpu extras.
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss-gpu]
!pip install langchain
!pip install wolframalpha

In [None]:
import os
import requests
import json

from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from transformers import T5Tokenizer, T5ForConditionalGeneration

import numpy as np

from tqdm.notebook import tqdm

import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def text_chunker(text, chunk_size = 500):
  """Regroup the space-separated words of ``text`` into fixed-size chunks.

  Args:
    text: String to split; words are delimited by a single space character.
    chunk_size: Maximum number of words placed in each chunk.

  Returns:
    A list of strings, each one the words of a chunk joined by single
    spaces. The last chunk holds whatever words remain.
  """
  words = text.split(' ')
  # Slicing past the end of a list simply stops at the end, so the final
  # (possibly shorter) window needs no special handling.
  return [
    " ".join(words[start:start + chunk_size])
    for start in range(0, len(words), chunk_size)
  ]

In [None]:
# Build the chunk table: one row (id, text, source) per chunk of every file
# under ./docs whose name contains '.txt'.
# sorted() makes the id assignment reproducible; os.listdir order is arbitrary.
files = sorted(os.listdir('./docs'))

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0 and, when called per row,
# copies the whole frame on every iteration.
records = []
i = 0  # running chunk id, unique across all files
for f in tqdm(files,total=len(files)):
  if '.txt' not in f:
    continue
  source_path = './docs/'+f
  # The context manager closes the file handle; undecodable bytes are dropped.
  with open(source_path,'r',errors='ignore') as handle:
    text = handle.read()
  # Flatten to a single line and collapse runs of spaces so that splitting
  # on ' ' inside text_chunker does not yield empty words.
  text = text.replace('\n',' ').strip()
  while '  ' in text:
    text = text.replace('  ',' ')
  # 400 words per chunk, keeping space for word-piece expansion and the prompt.
  chunks = text_chunker(text,400)
  for chunk in chunks:
    records.append({'id': i, 'text': chunk, 'source': source_path})
    i += 1

# Explicit columns keep the schema stable even when no chunk was produced.
dataset = pd.DataFrame(records, columns=['id', 'text', 'source'])

  0%|          | 0/230 [00:00<?, ?it/s]

In [None]:
# FAISS-backed document store using a "Flat" index; stored embeddings are
# returned together with the documents.
document_store_faiss = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True,
)

# Embedding retriever bound to the store above, using the all-mpnet-base-v2
# sentence-transformers model.
retriever_faiss = EmbeddingRetriever(
    document_store=document_store_faiss,
    embedding_model='sentence-transformers/all-mpnet-base-v2',
)

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
# Empty the store first so re-running this cell does not stack duplicates.
# delete_documents() is used because delete_all_documents() is deprecated
# (see https://github.com/deepset-ai/haystack/issues/1045).
document_store_faiss.delete_documents()

# Only the text column needs renaming (to 'content'); 'id' and 'source' keep
# their names. 'source' is read back later from each retrieved document's meta.
documents = (
    dataset[['id', 'text', 'source']]
    .rename(columns={'text': 'content'})
    .to_dict(orient='records')
)
document_store_faiss.write_documents(documents)

# Last expression of the cell: displays how many documents the store holds.
document_store_faiss.get_document_count()

                1. delete_all_documents() method is deprecated, please use delete_documents method
                For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/1045
                


Writing Documents:   0%|          | 0/293 [00:00<?, ?it/s]

293

In [None]:
# Update the embeddings of the stored documents using retriever_faiss.
document_store_faiss.update_embeddings(
    retriever=retriever_faiss,
)

Updating Embedding:   0%|          | 0/293 [00:00<?, ? docs/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
def get_results(query, retriever, n_docs = 10):
  """Retrieve documents for ``query`` and return their content and metadata.

  Args:
    query: Query passed to ``retriever.retrieve``.
    retriever: Object exposing ``retrieve(query, top_k=...)``.
    n_docs: Value forwarded as ``top_k``.

  Returns:
    A list of ``(content, meta)`` tuples, one per retrieved item, where
    ``meta`` is the ``'meta'`` entry of the item's ``to_dict()`` result.
  """
  hits = retriever.retrieve(query, top_k = n_docs)
  results = []
  for hit in hits:
    results.append((hit.content, hit.to_dict()['meta']))
  return results

In [None]:
def query(payload):
    """Serialize ``payload`` to a JSON string and return it."""
    return json.dumps(payload)

In [None]:
# Ask the hosted flan-ul2 model to answer the question from each retrieved chunk.
u_input = "How many Regional Holidays?"
input_prompt_start = "Answer the Query: "+u_input+" from the following Context. \n\n Context: "
res = get_results(u_input, retriever_faiss,1)

# Request settings do not depend on the retrieved document, so build them once.
API_URL = "https://api-inference.huggingface.co/models/google/flan-ul2"
# The token is read from the HF_TOKEN environment variable when it is set, so
# the key does not have to be pasted into the notebook; otherwise the original
# placeholder is kept.
headers = {"Authorization": "Bearer " + os.environ.get("HF_TOKEN", "<add-key-here>"),
           "Content-type":"application/json"}

# Use each retrieved (content, meta) pair itself rather than always res[0],
# so the loop stays correct when more than one document is requested.
for content, meta in res:
  input_text = input_prompt_start + content

  # wait_for_model is an Inference API option, not a generation parameter,
  # so it is sent under "options".
  data = query({"inputs": input_text,
                "parameters": {"max_length": 2},
                "options": {"wait_for_model": True}})

  response = requests.post(API_URL, headers=headers, data=data)
  # Fail with the HTTP error instead of an opaque KeyError/TypeError when the
  # API returns an error payload.
  response.raise_for_status()
  print("Answer: ",response.json()[0]['generated_text'])
  print("Source: ",content)
  print('Link: ',meta['source'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer:  as applicable at the Customer’s establishment
Source:  The associate will observe Public Holidays as applicable at the Customer’s establishment.
Link:  ./docs/international-relocation-policy-dec-2022_64.txt
