# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [15]:
# ! pip install langchain
! pip install docarray

Collecting docarray
  Using cached docarray-0.40.0-py3-none-any.whl.metadata (36 kB)
Collecting orjson>=3.8.2 (from docarray)
  Using cached orjson-3.9.12-cp311-none-win_amd64.whl.metadata (50 kB)
Collecting rich>=13.1.0 (from docarray)
  Using cached rich-13.7.0-py3-none-any.whl.metadata (18 kB)
Collecting types-requests>=2.28.11.6 (from docarray)
  Using cached types_requests-2.31.0.20240125-py3-none-any.whl.metadata (1.8 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=13.1.0->docarray)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=13.1.0->docarray)
  Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Using cached docarray-0.40.0-py3-none-any.whl (270 kB)
Using cached orjson-3.9.12-cp311-none-win_amd64.whl (134 kB)
Using cached rich-13.7.0-py3-none-any.whl (240 kB)
Using cached types_requests-2.31.0.20240125-py3-none-any.whl (14 kB)
Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
Installing

In [34]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file

# Look up the Jina API key. A missing variable still raises KeyError, but with
# a message that tells the reader what to configure.
try:
    KEY = os.environ['JINA_EMBEDDING_KEY']
except KeyError:
    raise KeyError(
        "JINA_EMBEDDING_KEY is not set; add it to your .env file or "
        "export it before running this notebook."
    ) from None

In [2]:
# Account for the deprecation of the dated LLM snapshot: strictly after the
# cutoff date the generic model name is used, otherwise the "-0301" snapshot.
import datetime

current_date = datetime.date.today()
target_date = datetime.date(2024, 6, 12)  # cutoff; the switch happens the day after

llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

In [23]:
from langchain.chains import RetrievalQA
from langchain.chat_models.openai import ChatOpenAI
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.docarray import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.llms.openai import OpenAI


In [6]:
# Path of the catalog CSV (relative to the notebook's working directory) and
# the loader that reads it. `file` is reused when the loader is recreated in
# the "Step by Step" section.
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)

In [5]:
from langchain.indexes import VectorstoreIndexCreator

In [19]:
# Create the index from `loader`, using DocArrayInMemorySearch as the vector
# store class. No embedding is passed, so the creator's default is used.
index = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch).from_loaders(
    [loader]
)

In [43]:
# Question sent to the index; adjacent literals are joined into one string.
query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown and summarize each one."
)

In [46]:
# A completion-style alternative, OpenAI(temperature=0,
# model='gpt-3.5-turbo-instruct'), used to sit here commented out; the chat
# model named by `llm_model` is what actually runs.
# NOTE(review): the output saved with this cell is a pydantic ValidationError
# for DocArrayDoc (missing `text` / `metadata`). Presumably this is a
# docarray/pydantic version mismatch rather than a problem with the query;
# confirm before relying on `response`.
llm_replacement_model = ChatOpenAI(temperature=0, model_name=llm_model)
response = index.query(question=query, llm=llm_replacement_model)

ValidationError: 2 validation errors for DocArrayDoc
text
  Field required [type=missing, input_value={'embedding': [0.00335164... -0.021174797216912216]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing
metadata
  Field required [type=missing, input_value={'embedding': [0.00335164... -0.021174797216912216]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing

## Step by Step

In [26]:
from langchain.document_loaders import CSVLoader
# Recreate the CSV loader for the step-by-step walkthrough. This relies on
# `file` having been defined by the earlier loader cell.
loader = CSVLoader(file_path=file)

In [27]:
# Load the CSV through the loader; `docs` holds whatever `loader.load()`
# returns (presumably one document per CSV row — TODO confirm).
docs = loader.load()

In [36]:
# Disabled alternative, kept for reference:
#   from langchain.embeddings import OpenAIEmbeddings
#   embeddings = OpenAIEmbeddings()
from langchain.embeddings import JinaEmbeddings

# Jina embedding client, authenticated with the key read from the environment.
embeddings = JinaEmbeddings(
    model_name="jina-embeddings-v2-base-en",
    jina_api_key=KEY,
)

In [37]:
# Embed one sample sentence as a quick check of the embedding client; the
# following cell prints the vector's length and its first values.
embed = embeddings.embed_query("Hi my name is Hamza Qureshi")

In [38]:
# Show the embedding's length and its first five components, one per line.
print(len(embed), embed[:5], sep="\n")

768
[-0.602528, -0.47703412, 0.1974959, -0.08134877, -0.14021301]


In [39]:
# Build the DocArrayInMemorySearch store from the loaded documents, using the
# `embeddings` client created above.
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

In [40]:
# Rebinds `query`: replaces the earlier markdown-table request with a shorter
# question for the step-by-step retrieval.
query = "Please suggest a shirt with sunblocking"

In [None]:
# Run a similarity search for `query` against the vector store `db`.
# Note: this rebinds `docs`, which until now held the result of
# `loader.load()`. Re-running the `DocArrayInMemorySearch.from_documents`
# cell after this one would therefore receive the search results instead.
docs = db.similarity_search(query)