# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [None]:
#pip install --upgrade langchain

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
_

True

In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown

In [4]:
import pandas as pd 
from pandas import option_context
from typing import Union, Dict, List

def load_and_clean_data(file_path):
    # Load the data
    df = pd.read_csv(file_path)
    
    # Reset the index
    df.reset_index(drop=True, inplace=True)
    
    # Clean column names: remove leading/trailing spaces, replace internal spaces with underscores, and convert to lowercase
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    
    # Handle missing values (this is a simple approach where all missing values are dropped; 
    # depending on the dataset, you might want to fill them in a different way)
    df.dropna(inplace=True)
    
    # Convert object columns to category if they have fewer than 50 unique values (this can save memory)
    for col in df.select_dtypes(include='object'):
        if df[col].nunique() < 50:
            df[col] = df[col].astype('category')
    
    # Convert boolean columns to int (this can be useful for certain types of analysis)
    for col in df.select_dtypes(include='bool'):
        df[col] = df[col].astype('int')
    
    return df

df = load_and_clean_data('data/OutdoorClothingCatalog_1000.csv')

  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')


In [5]:
df.drop(columns=['unnamed:_0'], inplace=True)

In [6]:
with option_context('display.max_rows', None,'display.max_colwidth',200,'display.max_columns',90):
              display(df.head(3))

Unnamed: 0,name,description
0,Women's Campside Oxfords,"This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regu..."
1,"Recycled Waterhog Dog Mat, Chevron Weave","Protect your floors from spills and splashing with our ultradurable recycled Waterhog dog mat made right here in the USA. \n\nSpecs\nSmall - Dimensions: 18"" x 28"". \nMedium - Dimensions: 22.5"" x 3..."
2,"Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece","She'll love the bright colors, ruffles and exclusive whimsical prints of this toddler's two-piece swimsuit! Our four-way-stretch and chlorine-resistant fabric keeps its shape and resists snags. Th..."


In [7]:
file = 'data/OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file,encoding='utf-8')

In [8]:
loader 

<langchain.document_loaders.csv_loader.CSVLoader at 0x2e28d40dd00>

In [9]:
from langchain.indexes import VectorstoreIndexCreator

In [10]:
#pip install docarray

In [11]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

In [12]:
# index.

In [13]:
query ="Please list all your shirts with sun protection \
in a table in markdown and summarize each one."

In [14]:
index.query

<bound method VectorStoreIndexWrapper.query of VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch object at 0x000002E2939A3E20>)>

In [15]:
response = index.query(query)

In [16]:
display(Markdown(response))



| Name | Description |
| --- | --- |
| Men's Tropical Plaid Short-Sleeve Shirt | UPF 50+ rated, 100% polyester, wrinkle-resistant, front and back cape venting, two front bellows pockets |
| Men's Plaid Tropic Shirt, Short-Sleeve | UPF 50+ rated, 52% polyester and 48% nylon, machine washable and dryable, front and back cape venting, two front bellows pockets |
| Men's TropicVibe Shirt, Short-Sleeve | UPF 50+ rated, 71% Nylon, 29% Polyester, 100% Polyester knit mesh, wrinkle resistant, front and back cape venting, two front bellows pockets |
| Sun Shield Shirt by | UPF 50+ rated, 78% nylon, 22% Lycra Xtra Life fiber, wicks moisture, fits comfortably over swimsuit, abrasion resistant |

All four shirts provide UPF 50+ sun protection, blocking 98% of the sun's harmful rays. The Men's Tropical Plaid Short-Sleeve Shirt is made of 100% polyester and is wrinkle-resistant. The Men's Plaid Trop

In [22]:
df.shape[0]

1000

In [17]:
loader = CSVLoader(file_path=file,encoding='utf-8')

In [18]:
docs = loader.load()

In [21]:
len(docs)

1000

In [19]:
docs[0]

Document(page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", metadata={'source': 'data/OutdoorClothingCatalog_1000.csv', 'row': 0})

In [20]:
docs[1]

Document(page_content=': 1\nname: Recycled Waterhog Dog Mat, Chevron Weave\ndescription: Protect your floors from spills and splashing with our ultradurable recycled Waterhog dog mat made right here in the USA. \n\nSpecs\nSmall - Dimensions: 18" x 28". \nMedium - Dimensions: 22.5" x 34.5".\n\nWhy We Love It\nMother nature, wet shoes and muddy paws have met their match with our Recycled Waterhog mats. Ruggedly constructed from recycled plastic materials, these ultratough mats help keep dirt and water off your floors and plastic out of landfills, trails and oceans. Now, that\'s a win-win for everyone.\n\nFabric & Care\nVacuum or hose clean.\n\nConstruction\n24 oz. polyester fabric made from 94% recycled materials.\nRubber backing.\n\nAdditional Features\nFeatures an -exclusive design.\nFeatures thick and thin fibers for scraping dirt and absorbing water.\nDries quickly and resists fading, rotting, mildew and shedding.\nUse indoors or out.\nMade in the USA.\n\nHave questions? Reach out to

In [23]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [24]:
embed = embeddings.embed_query("Hi my name is Harrison")

In [25]:
print(len(embed))

1536


In [26]:
print(embed[:5])

[-0.021913960576057434, 0.006774206645786762, -0.018190348520874977, -0.039148248732089996, -0.014089343138039112]


In [27]:
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)

In [28]:
db.similarity_search

<bound method DocArrayIndex.similarity_search of <langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch object at 0x000002E293F10EB0>>

In [29]:
query = "Please suggest a shirt with sunblocking"

In [30]:
docs = db.similarity_search(query,k= 2)

In [31]:
len(docs)

2

In [32]:
docs[0]

Document(page_content=': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.\n\nSun Protection That Won\'t Wear Off\nOur high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun\'s harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.', metadata={'source': 'data/OutdoorClothingCatalog_1000.csv', 'row': 255})

In [33]:
retriever = db.as_retriever()

In [34]:
retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch object at 0x000002E293F10EB0>, search_type='similarity', search_kwargs={})

In [40]:
llm = ChatOpenAI(temperature = 0.0,model_name = "gpt-3.5-turbo-16k")


In [41]:
qdocs = "".join([docs[i].page_content for i in range(len(docs))])


In [42]:
len(docs)

2

In [43]:
qdocs 

': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.\n\nSun Protection That Won\'t Wear Off\nOur high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun\'s harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.: 374\nname: Men\'s Plaid Tropic Shirt, Short-Sleeve\ndescription: Our Ultracomfortable sun protection is rated to UPF 50+, helping you stay cool and dry. Originally designed for fishing, this lightest hot-weather shirt offers UPF 50+ c

In [None]:
f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one."

In [None]:
response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one.") 


In [None]:
response

In [None]:
display(Markdown(response))

In [None]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [None]:
query =  "Please list all your shirts with sun protection in a table \
in markdown and summarize each one."

In [None]:
response = qa_stuff.run(query)

In [None]:
display(Markdown(response))

In [None]:
response = index.query(query, llm=llm)

In [None]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
).from_loaders([loader])

In [1]:
from datetime import datetime, timedelta

start_date = datetime(2023, 7, 19)
end_date = datetime(2023, 8, 2)
delta = timedelta(days=1)

date_list = []

while start_date <= end_date:
    date_str = start_date.strftime('%Y/%m/%d')
    date_list.append(f'raw/chattest/{date_str}/')
    start_date += delta

print(date_list)

['raw/chattest/2023/07/19/', 'raw/chattest/2023/07/20/', 'raw/chattest/2023/07/21/', 'raw/chattest/2023/07/22/', 'raw/chattest/2023/07/23/', 'raw/chattest/2023/07/24/', 'raw/chattest/2023/07/25/', 'raw/chattest/2023/07/26/', 'raw/chattest/2023/07/27/', 'raw/chattest/2023/07/28/', 'raw/chattest/2023/07/29/', 'raw/chattest/2023/07/30/', 'raw/chattest/2023/07/31/', 'raw/chattest/2023/08/01/', 'raw/chattest/2023/08/02/']
