# LangChain: Q&A over Documents

Let's build a tool that would allow you to query a product catalog for items of interest.

# Setup

## Python


In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from IPython.display import display, Markdown
from langchain.vectorstores import DocArrayInMemorySearch  # vector store
from langchain.document_loaders import CSVLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import datetime
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file

In [None]:
# pip install --upgrade langchain

# Simple example

## Load Data

In [None]:
# Relative path to the product catalog CSV used throughout this notebook.
file = '../data/OutdoorClothingCatalog_1000.csv'
# Loader for that file; it is consumed below by the index creator and by
# `loader.load()`.
loader = CSVLoader(file_path=file)



- Create vector store

In [None]:
# Build a queryable index from the CSV loader, using DocArrayInMemorySearch
# as the vector store class. `from_loaders` takes a list of loaders; here
# there is only one.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

## Query and response

In [None]:
# Question for the catalog index. Adjacent string literals are joined by the
# parser, so no line-continuation backslash is needed inside the string.
query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown and summarize each one."
)

In [None]:
# Ask the index the question stored in `query`; the result is rendered as
# Markdown in the next cell.
response = index.query(query)

In [None]:
# Render the answer as Markdown so the requested table is shown formatted.
display(Markdown(response))

# Step By Step

## Basics

- Language models can only inspect a few thousand words at a time

- This is why we need embeddings and vector stores


## Loader

- We use our `loader` from before (`loader = CSVLoader(file_path=file)`)




In [None]:
# Load the catalog through the CSV loader. The sample output below shows
# Document objects whose metadata carries the source file and a row number.
docs = loader.load()

In [None]:
# Inspect the first loaded document (its repr is shown below).
docs[0]

- Document(page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", metadata={'source': '../data/OutdoorClothingCatalog_1000.csv', 'row': 0})

## Embeddings

- Our documents are so small that we don't need to chunk them first



In [None]:
# Embedding model created with no arguments; reused below for the sample
# query and for building the vector store.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed a sample sentence to see what an embedding looks like.
embed = embeddings.embed_query("Hi my name is Jan")

In [None]:
# Number of components in the embedding vector.
print(len(embed))

- 1536



In [None]:
# Peek at the first five components of the vector.
print(embed[:5])

- [-0.015501204878091812, -0.0016401495086029172, -0.01953849568963051, -0.016909271478652954, -0.021893581375479698]

## Create vector store

In [None]:
# In-memory vector store built from the loaded documents, embedded with the
# `embeddings` model created above.
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

## Query {.smaller}

In [None]:
# Free-text request used for the similarity search against the vector store.
query = "Please suggest a shirt with sunblocking"

In [None]:
# Look up the documents most similar to the query.
# NOTE: this rebinds `docs` -- from here on it holds the search hits, not the
# full list returned earlier by `loader.load()`.
docs = db.similarity_search(query)

In [None]:
# How many hits the similarity search returned.
len(docs)

- 4



In [None]:
# Show the first search hit.
docs[0]

- Document(page_content=': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.\n\nSun Protection That Won\'t Wear Off\nOur high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun\'s harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.', metadata={'source': '../data/OutdoorClothingCatalog_1000.csv', 'row': 255})

# Manual Example



In [None]:
# Chat model used for the manual example and the retrieval chain below.
llm_model = "gpt-3.5-turbo"
# temperature=0.0 is presumably chosen to keep answers as repeatable as
# possible -- confirm against the ChatOpenAI documentation.
llm = ChatOpenAI(temperature=0.0, model=llm_model)



- If we did this manually, we would combine the document texts with a join

In [None]:
# Concatenate the text of every retrieved document into a single string.
# Iterating the documents directly replaces the index-based
# `range(len(docs))` loop; the result is identical. No separator is inserted
# between documents.
qdocs = "".join(doc.page_content for doc in docs)



- And then we would provide qdocs to the llm:

In [None]:
# This may take a while
response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one.")

In [None]:
# Render the model's answer as Markdown.
display(Markdown(response))

XYZ Error

# Retrieval Chain

## Chain


In [None]:
# Expose the vector store as a retriever so the QA chain below can use it.
retriever = db.as_retriever()

In [None]:
# Question-answering chain that combines the chat model with the retriever
# over the catalog vector store.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # one prompt and one result
    retriever=retriever,
    verbose=True
)

## Query

In [None]:
# Question passed to the retrieval chain. Adjacent string literals are joined
# by the parser, so no backslash continuation is needed inside the string.
query = (
    "Please list all your shirts with sun protection in a table "
    "in markdown and summarize each one."
)

In [None]:
# Run the retrieval QA chain on the question.
response = qa_stuff.run(query)

In [None]:
# Render the chain's answer as Markdown.
display(Markdown(response))

In [None]:
# Same question through the index from the simple example, this time passing
# our ChatOpenAI instance explicitly. The result is stored but not displayed
# in this cell.
response = index.query(query, llm=llm)