In [4]:
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.agents import ConversationalChatAgent, ZeroShotAgent, Tool, AgentExecutor
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, PyPDFDirectoryLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

import json
import os
# SECURITY: never commit an API key to a notebook. The key is taken from the
# environment when already set; otherwise it is requested interactively so it is
# never stored in the file or echoed into a cell output.
# Any key that was previously hardcoded here must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [5]:
def load_document(docs=None, directory='./docs'):
    """Load every supported file in ``directory`` and return the chunked documents.

    Supported extensions (matched case-insensitively): ``.pdf`` (PyPDFLoader),
    ``.docx`` / ``.doc`` (Docx2txtLoader) and ``.txt`` (TextLoader). Files with
    any other extension are ignored.

    Args:
        docs: Optional list of already-loaded documents. Newly loaded documents
            are appended to this same list (it is extended in place, as before),
            and everything in it is chunked. Defaults to a fresh empty list.
        directory: Folder to scan (not recursive). Defaults to ``'./docs'``,
            which yields the same file paths the function always used.

    Returns:
        The result of ``CharacterTextSplitter(chunk_size=1000,
        chunk_overlap=10).split_documents(docs)``.
    """
    if docs is None:
        docs = []

    # Suffix -> loader class, checked in order; one table replaces three
    # copy-pasted branches.
    # NOTE(review): legacy binary `.doc` files are still routed to
    # Docx2txtLoader as in the original code — confirm that loader can read them.
    loaders_by_suffix = (
        ('.pdf', PyPDFLoader),
        ('.docx', Docx2txtLoader),
        ('.doc', Docx2txtLoader),
        ('.txt', TextLoader),
    )

    base_path = directory.rstrip('/')
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the returned chunks reproducible between runs and machines.
    for file_name in sorted(os.listdir(directory)):
        lowered = file_name.lower()
        loader_cls = next(
            (cls for suffix, cls in loaders_by_suffix if lowered.endswith(suffix)),
            None,
        )
        if loader_cls is None:
            continue
        docs.extend(loader_cls(base_path + '/' + file_name).load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    return text_splitter.split_documents(docs)

In [2]:
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load every PDF found in `data/`.
# `load()` is used instead of `load_and_split()`: the latter already chunks the
# pages with a default text splitter, so the documents would be split twice
# (once with default settings, then again below with the settings we want).
loader = PyPDFDirectoryLoader('data')
documents = loader.load()

# Split the loaded documents into overlapping chunks of at most 1000 characters
# (200 characters of overlap between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents=documents)

In [31]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

from langchain.embeddings import HuggingFaceInstructEmbeddings

# Pick the device at run time: a hard-coded "cuda" makes this cell fail on
# machines without a usable GPU, so fall back to the CPU when PyTorch reports
# that CUDA is unavailable.
import torch

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [32]:
# Build a FAISS vector store from the chunks in `texts`, using
# `instructor_embeddings` as the embedding object.
db = FAISS.from_documents(texts, instructor_embeddings)
# Save the store to the local folder "db" so it can be reloaded later.
db.save_local("db")

In [33]:
from langchain.vectorstores import FAISS
# Reload the store from the local folder "db", passing `instructor_embeddings`
# as the embedding object.
new_db = FAISS.load_local("db", instructor_embeddings)
# Wrap the reloaded store as a retriever with the default `as_retriever()` settings.
retriever_IE_faiss = new_db.as_retriever()

In [36]:
# Manual check of the FAISS retriever with a sample question; the cell output
# shows the Document objects it returns.
retriever_IE_faiss.invoke('hookes law formula?')

[Document(page_content="‘The results of Experiment 1.1 produces a graph with a straight line passing through the origin as shown in Figure 1.33. \n“This shows that the extension of the spring is directly proportional t0 the force applied on the spring, \nHooke's law states that the extension of a spring. \nis directly proportional to the force applicd on the spring provided the elastc limit of the spring s i not exceeded. \n“This relationship can be written as: xxF \nFox Feke \n‘where F = applied force o F x= extension of the spring Figure 1.33 Graph o x against B k= spring constant \nF = ks the formula for Hooke'slaw. \nAnalysis of the Graph of Force Against the Extension of a Spring \nFigure 1.34 shows the graph of force against the extension of a spring. \nEIN \nFigure 1.34 Graph of F against x \nBased on the graph of F agins ., the gradien \nof the. == \nLaawrs WsChAN ME! 3 ‘Spring constant \n! i ity sIGHL Spring constant, k = Gradient of the graph of F against", metadata={'source'

In [5]:
# Load the PDF files under `data/` so that questions can be answered from them
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

# Loader for the files under `data` that match the "**/*.pdf" glob; each match
# is read with PyPDFLoader.
loader = DirectoryLoader(
    'data',
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
)
pages = loader.load()


In [8]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Pick the device at run time: a hard-coded "cuda" makes this cell fail on
# machines without a usable GPU, so fall back to the CPU when PyTorch reports
# that CUDA is unavailable.
import torch

embedding_function = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)
# Build a Chroma vector store from the loaded `pages` with that embedding object.
db = Chroma.from_documents(pages, embedding_function)

  from .autonotebook import tqdm as notebook_tqdm


load INSTRUCTOR_Transformer
max_seq_length  512


Using embedded DuckDB without persistence: data will be transient


In [26]:
# Retriever over `db` that uses the "similarity_score_threshold" search type
# with a score threshold of 0.5.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.5),
)

In [37]:
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain

# Chat model used for answering; temperature 0 is requested.
llm_src = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

# QA-with-sources chain built on that model; it is the `llm_chain` of the
# StuffDocumentsChain created further down in this cell.
qa_chain = create_qa_with_sources_chain(llm_src)

# Prompt passed as `document_prompt` to the StuffDocumentsChain below. It takes
# three variables: `page_content`, `source` and `page`.
# NOTE(review): a document prompt is presumably rendered once per retrieved
# document, which would repeat the whole instruction preamble for every document —
# confirm, and consider keeping only Content/Source/Page here and moving the
# instructions into the QA chain's own prompt.
doc_prompt = PromptTemplate(
    template="""
    You are a study helper chatbot. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
    As an AI assistant you provide answers based on the given context, ensuring accuracy and brifness. 
    You always follow these guidelines:

    -If the answer isn't available within the context, state that fact
    -Otherwise, answer to your best capability, refering to source of documents provided
    -Only use examples if explicitly requested
    -Do not introduce examples outside of the context
    -Do not answer if context is absent
    -Limit responses to three or four sentences for clarity and conciseness
    
    Content: {page_content}
    Source: {source}
    Page:{page}
    """, # the template exposes {page}; presumably every document must carry a `page` metadata value — TODO confirm
    input_variables=["page_content", "source","page"],
)

# Combine-documents step: documents are formatted with `doc_prompt` and handed
# to `qa_chain` through its `context` variable.
final_qa_chain = StuffDocumentsChain(
    llm_chain=qa_chain, 
    document_variable_name='context',
    document_prompt=doc_prompt,
)
# End-to-end question answering: retrieve with the FAISS retriever, then answer
# with `final_qa_chain`.
# NOTE: this uses `retriever_IE_faiss`; the Chroma-based `retriever` defined in
# an earlier cell is not used here.
retrieval_qa = RetrievalQA(
    retriever=retriever_IE_faiss,
    combine_documents_chain=final_qa_chain
)


In [48]:
def format_json(input_json):
    """Normalise a RetrievalQA response into query / answer / sources / page.

    The chain reports its source either as a ``"Source: <document>, Page <n>"``
    trailer inside the answer text, or as ``"<document>, Page <n>"`` entries in
    the ``sources`` list (both shapes appear in this notebook's outputs). Both
    are handled here.

    Args:
        input_json: The response as a JSON string, or the already-decoded dict
            returned by the chain. It must contain ``query`` and ``result``;
            ``result`` may itself be a JSON string or a dict with ``answer``
            and, optionally, ``sources``.

    Returns:
        ``{'query': ..., 'result': {'answer': ..., 'sources': [...]},
        'page_number': ...}`` where ``answer`` is the unmodified answer text,
        ``sources`` lists the distinct document names without their page
        suffix, and ``page_number`` is the first page found as an ``int``
        (``None`` when no page is mentioned).

    Raises:
        KeyError: if ``query``, ``result`` or ``answer`` is missing.
        json.JSONDecodeError: if a string payload is not valid JSON.
    """
    import re

    def _decode(value):
        # JSON text is parsed; already-decoded objects pass through untouched.
        if isinstance(value, (str, bytes, bytearray)):
            return json.loads(value)
        return value

    data = _decode(input_json)
    query = data['query']
    result = _decode(data['result'])
    answer = result['answer']

    # Collect every "<document>[, Page n]" reference: first the trailer that may
    # follow "Source: " in the answer (its first line only), then the entries of
    # the structured `sources` list.
    references = []
    _, marker, trailer = answer.partition("Source: ")
    if marker:
        references.extend(trailer.strip().splitlines()[:1])
    references.extend(result.get('sources') or [])

    page_pattern = re.compile(r",\s*Page\s*(\d+)")
    sources = []
    page_number = None
    for reference in references:
        match = page_pattern.search(reference)
        # The document name is whatever precedes the ", Page n" suffix. The
        # original code used the text *before* "Source: " here, i.e. the answer.
        document = (reference[:match.start()] if match else reference).strip()
        if document and document not in sources:
            sources.append(document)
        if match and page_number is None:
            page_number = int(match.group(1))

    return {
        'query': query,
        'result': {
            'answer': answer,
            'sources': sources,
        },
        'page_number': page_number,
    }

In [60]:
# Ask the retrieval QA chain a question; the trailing bare `ans` makes the
# notebook display the returned value (a dict with 'query' and 'result').
ans = retrieval_qa('what does pascals principle states?')
ans

{'query': 'what does pascals principle states?',
 'result': '{\n  "answer": "Pascal\'s principle states that the pressure applied on an enclosed fluid is transmitted uniformly in all directions in the fluid.",\n  "sources": ["data\\\\anyflip_output_10-99 (1).pdf, Page 60"]\n}'}

In [59]:
# Display the current value of `ans`.
# NOTE(review): this cell's execution count (59) is lower than the previous
# cell's (60) and its output is for a different query, so the notebook was run
# out of order — re-run top to bottom before relying on this output.
ans

{'query': 'how to conduct experiment to generate the idea that high velocity of fluids creates a region of low pressure',
 'result': '{\n  "answer": "To conduct an experiment to generate the idea that high velocity of fluids creates a region of low pressure, you can follow these steps:\\n1. Set up a Venturi tube on a retort stand.\\n2. Hold a piece of A4 paper with both hands and blow across the top surface of the paper.\\n3. Observe the movement of the paper.\\n\\nSource: data\\\\anyflip_output_10-99 (1).pdf, Page 78",\n  "sources": ["data\\\\anyflip_output_10-99 (1).pdf"]\n}'}