# Setup and Imports

In [None]:
!pip install langchain --upgrade
!pip install openai
!pip install faiss-gpu
!pip install tiktoken
!pip install sentence_transformers
!pip install wolframalpha
!pip install accelerate

In [None]:
import os
import pathlib
import re
import pandas as pd
import nltk
nltk.download('punkt')

from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.agents import create_pandas_dataframe_agent
from langchain.chains import RetrievalQAWithSourcesChain, LLMMathChain, LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain.chat_models import AzureChatOpenAI
from langchain import HuggingFaceHub, HuggingFacePipeline
from transformers import T5Tokenizer
from langchain.prompts import PromptTemplate
from transformers import pipeline
import torch

from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.utilities.wolfram_alpha import WolframAlphaAPIWrapper

# Credentials and endpoints are read from the environment by the client libraries.
# setdefault keeps any value that is already exported instead of overwriting it
# with an empty placeholder; fill in the blanks (or export the variables) before
# running the rest of the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")
# The API type is not a secret and is always forced to Azure for this notebook.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ.setdefault("OPENAI_API_BASE", "")
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", '')
os.environ.setdefault("WOLFRAM_ALPHA_APPID", "")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Create Docstore

In [None]:
# Directory holding the source documents to index.
repo_path = './docs/'
# os.listdir returns entries in arbitrary order; sort them so the resulting
# document list is the same on every run.
document_files = sorted(os.listdir(repo_path))

def convert_path_to_doc_url(doc, base_path='./docs/'):
  """Return the source location recorded for a document file.

  Args:
    doc: File name of the document, relative to ``base_path``.
    base_path: Prefix prepended to ``doc``. Defaults to ``'./docs/'``.

  Returns:
    ``base_path`` concatenated with ``doc``.
  """
  return base_path + doc

# Splitter configuration: break on newlines, chunk_size 256 with an overlap of 20,
# both measured with len().
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=256,
    chunk_overlap=20,
    length_function=len,
)

# Build one Document per text chunk, tagging each chunk with the file it came from.
documents = []
for text_file in document_files:
  # Match on the extension rather than a substring so that names which merely
  # contain '.txt' (e.g. 'notes.txt.bak') are not picked up.
  if not text_file.endswith('.txt'):
    continue
  # The context manager closes the handle; the explicit encoding avoids
  # depending on the platform default.
  with open(os.path.join(repo_path, text_file), "r", encoding="utf-8") as handle:
    content = handle.read()
  source_url = convert_path_to_doc_url(text_file)
  for text in text_splitter.split_text(content):
    # Surround every newline with spaces before storing the chunk.
    text = text.replace('\n', ' \n ')
    documents.append(Document(page_content=text, metadata={"source": source_url}))

In [None]:
# Display the first chunk to sanity-check its page content and source metadata.
documents[0]

Document(page_content='© 2022 Cognizant  \n Confidential or Trade Secret  \n                           Leave Policy - India                                   Page 8 of 11  \n    \n •  \n Associates Travelling on International Assignments with Payroll Transfer from / to India', metadata={'source': './docs/leave-policy_feb22_8.txt'})

In [None]:
# Embedding client; the deployment and the model share the same name here.
embeddings = OpenAIEmbeddings(
    deployment='text-embedding-ada-002',
    model="text-embedding-ada-002",
)

In [None]:
# Build the FAISS index from the chunked documents. Without this step
# `vector_store` is undefined on a fresh kernel and the save below fails.
vector_store = FAISS.from_documents(documents, embeddings)
# Persist the index so it can be reloaded from disk with FAISS.load_local.
vector_store.save_local('./Data_store/')

In [None]:
# Expose the vector store through the retriever interface (default settings).
retriever = vector_store.as_retriever()

In [None]:
# Disabled alternative: reload a previously saved FAISS index from ./Data_store/
# instead of rebuilding it. The code is wrapped in a string literal, so it does
# not execute; the cell only echoes the text.
'''
if os.path.exists('./Data_store/'):
  vector_store = FAISS.load_local(
      './Data_store/',
      embeddings
  )
else:
  print(f"Missing files. Upload index.faiss and index.pkl files to Data Store directory first")
'''

'\nif os.path.exists(\'./Data_store/\'):\n  vector_store = FAISS.load_local(\n      \'./Data_store/\',\n      embeddings\n  )\nelse:\n  print(f"Missing files. Upload index.faiss and index.pkl files to Data Store directory first")\n'

# Setup Prompt

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System instructions for the model; the template exposes a {summaries}
# placeholder for the supporting document text.
system_template="""Given the below document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

=========
{summaries}
=========
"""

# One system message carrying the instructions, followed by the user's question.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_message_prompt = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message_prompt, human_message_prompt]

prompt = ChatPromptTemplate.from_messages(messages)

In [None]:
# Smoke-test the template by rendering it with the same sample text in both slots.
sample_question = 'Who was the father of Mary Ball Washington?'
prompt.format(summaries=sample_question, question=sample_question)



# Setup LLM Model for QA


In [None]:
# Chat model used for question answering, addressed by its Azure deployment name.
llm = AzureChatOpenAI(
    deployment_name="chatgpt",
    model_name="gpt-3.5-turbo",
)

In [None]:
# Pass the custom chat prompt through to the underlying document chain.
chain_type_kwargs = {"prompt": prompt}

# Question-answering chain over the document index. It reuses the `retriever`
# created earlier rather than constructing a second one from the vector store,
# and returns the retrieved source documents alongside the answer.
QAChain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

In [None]:
# Example query; the result dict includes the answer, sources and retrieved documents.
QAChain('How Many Years are required to become FTE?')

{'question': 'How Many Years are required to become FTE?',
 'answer': '(2) years',
 'sources': '',
 'source_documents': [Document(page_content='(2) years.  \n •  \n Once the said period of two (2) years is attained on part time, Associates need to be converted  \n to FTE. If part time option is requested for, beyond two (2) years, exception approval from BU  \n Head (VP+) and India HR Head will be sought.  \n •', metadata={'source': './docs/part-time-india_2.txt'}),
  Document(page_content='To be eligible for this leave, the Associate must have spent at least two consecutive years of  \n service with the Company. The Company may determine the number of Associates who shall  \n be given this leave annually in accordance with work requirements.', metadata={'source': './docs/international-relocation-policy-dec-2022_66.txt'}),
  Document(page_content='return to their Home Country prior to completion of four (4) months, then the Associate’s  \n manager must inform Human Resources and provid