# LangChain: Q&A over Documents

In [None]:
import os

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv().
# The result is assigned to `_` so the call's return value is not shown as cell output.
_ = load_dotenv(find_dotenv())

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch  # Deprecated
# Non-deprecated import path:
# from langchain_community.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.llms import OpenAI

In [None]:
# CSV product catalog; the path is kept in `file` because a later cell reuses it.
file = 'l4_outdoor_clothing_catalog_1000.csv'
# Build the CSV loader (the path is passed as the first positional argument).
loader = CSVLoader(file)

In [None]:
# Install sqlalchemy: required in order to import VectorstoreIndexCreator
# %pip install --upgrade --force-reinstall sqlalchemy

In [None]:
# Import an index
from langchain.indexes import VectorstoreIndexCreator

In [None]:
# Install DocArray
# %pip install docarray

In [None]:
# Build a vector-store index over the catalog in one step.
# The embedding model is passed explicitly: recent LangChain releases no longer
# default `embedding` to OpenAI embeddings, so omitting it fails validation.
# The import is local to this cell so the cell does not depend on later ones.
from langchain_openai import OpenAIEmbeddings

index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=OpenAIEmbeddings(),
).from_loaders([loader])


In [None]:
# Natural-language question to ask against the indexed catalog.
query = "Please list all your shirt with sun protection in a table in markdown and summarize each one"

In [None]:
# Ask the question against the index.
# Recent LangChain releases require the LLM to be passed explicitly to `query`;
# a deterministic (temperature 0.0) chat model is created here so this cell
# does not depend on the `llm` defined further down.
response = index.query(query, llm=ChatOpenAI(temperature=0.0))

In [None]:
# Render the answer as Markdown in the notebook output.
display(Markdown(response))

## Step By Step

In [None]:
from langchain.document_loaders import CSVLoader

# Fresh document loader over the same catalog file for the step-by-step walkthrough.
loader = CSVLoader(file)

In [None]:
# Load the documents from the CSV loader into `docs`.
docs = loader.load()

In [None]:
# Inspect the first loaded document.
docs[0]

In [None]:
# Create the embedding client using OpenAI's embedding class from `langchain_openai`,
# constructed with its default arguments.
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [None]:
# Create embeddings using OpenAI's embedding class.
# Imported from `langchain_openai`, matching the other OpenAI imports in this notebook;
# the `langchain.embeddings` path is the deprecated location and importing from it
# would rebind `OpenAIEmbeddings` to the legacy class.
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [None]:
# Embed a single sample sentence to look at what the embedding call returns.
embed = embeddings.embed_query("Hi my name is Hassane")

In [None]:
# Length of the returned embedding.
print(len(embed))

In [None]:
# Peek at the first five entries of the embedding.
print(embed[:5])

In [None]:
# Number of entries currently in `docs`.
len(docs)

In [None]:
# Embed the loaded documents and store the vectors in an in-memory vector store,
# using the store's `from_documents` constructor.
# Only the first 750 documents are indexed: per the original note, `docs` has 1000
# elements, which require more than 150000 TPM (tokens per minute), and 150000 TPM
# is the free-trial limit for the `text-embedding-3-small` model.
docs_to_index = docs[:750]
db = DocArrayInMemorySearch.from_documents(docs_to_index, embeddings)

In [None]:
# Query used for the similarity search against the vector store.
query = "Please suggest a shirt with sunblocking"

In [None]:
# Retrieve the stored documents most similar to the query.
# NOTE(review): this rebinds `docs`, replacing the list loaded from the CSV; re-running
# the earlier `from_documents` cell afterwards would index these results instead.
docs = db.similarity_search(query)

In [None]:
# Number of documents returned by the similarity search.
len(docs)

In [None]:
# Inspect the first document returned by the similarity search.
docs[0]

In [None]:
# Create a retriever backed by the vector store `db`.
retriever = db.as_retriever()

In [None]:
# Chat model used for answering, configured with temperature 0.0.
llm = ChatOpenAI(temperature=0.0)

In [None]:
# Concatenate the text of every retrieved document into one string (no separator),
# iterating the documents directly instead of indexing through range(len(...)).
qdocs = "".join(doc.page_content for doc in docs)

In [None]:
# Send the retrieved document text followed by the question to the chat model.
# `call_as_llm` is deprecated in favour of `invoke`. `invoke` returns a message
# object rather than a string, so `.content` is taken to keep `response` a plain
# string that Markdown() can render.
prompt = (
    f"{qdocs} Question: Please list all your shirts with sun "
    "protection in a table in markdown and summarize each one."
)
response = llm.invoke(prompt).content

In [None]:
# Render the model's answer as Markdown in the notebook output.
display(Markdown(response))

In [None]:
# Build a RetrievalQA chain of type "stuff" that answers with `llm` over the
# documents supplied by `retriever`; verbose output is enabled.
qa_stuff = RetrievalQA.from_chain_type(
    llm, chain_type="stuff", retriever=retriever, verbose=True
)

In [None]:
# Question to run through the retrieval QA chain.
query =  "Please list all your shirts with sun protection in a table in markdown and summarize each one."

In [None]:
# Run the retrieval QA chain.
# `invoke` returns a dict (the input query plus the answer under "result"), whereas
# the older `run` returned the answer string directly. Keep only the answer text so
# that `response` can be rendered with Markdown().
response = qa_stuff.invoke({"query": query})["result"]

In [None]:
# Render the chain's response as Markdown in the notebook output.
display(Markdown(response))

In [None]:
# Query the index, passing the chat model explicitly.
# NOTE(review): `index` is whatever an earlier cell bound; the cell that builds an index
# with explicit embeddings appears after this one — confirm the intended run order.
response = index.query(query, llm=llm)

In [None]:
# Configure the index creator with the in-memory vector store and the explicit
# embedding client, then build the index from the CSV loader.
index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = index_creator.from_loaders([loader])