In [72]:
import os
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOllama

In [73]:
from langchain.chains import RetrievalQA    # Import RetrievalQA for creating a retrieval-based question answering chain.
from langchain.document_loaders import CSVLoader # Import CSVLoader for loading data from CSV files.
from IPython.display import display, Markdown   # Import display and Markdown from IPython.display for displaying formatted text.

In [74]:
# Catalog CSV, given as a path relative to the notebook's working directory.
file = 'OutdoorClothingCatalog_1000.csv'
# Loader for that CSV; later cells pass it to from_loaders() and call .load() on it.
loader = CSVLoader(file_path=file)

In [None]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings


In [None]:
# Embedding model used to turn text into vectors (also reused later via embed_query).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Build a queryable index from the loader's documents, using DocArrayInMemorySearch
# as the vector store class and the embedding model above.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings
).from_loaders([loader])

In [77]:
# Natural-language question to ask against the indexed catalog.
query = "Please list all your shirts with sun protection in a table in markdown and summarize each one."

In [78]:
# Chat model used to generate the answer; temperature=0.5 means the wording
# may differ between runs.
llm = ChatOllama(
    model="llama3.2",
    temperature=0.5
)
# Ask the question through the index, passing the LLM that writes the answer.
response = index.query(query, llm)

In [79]:
# Render the answer as Markdown so the table it contains is displayed formatted.
display(Markdown(response))

| Product ID | Name | Category | Gender | Price | Color | Size | Material | Waterproof |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| OC083 | Wilderness Sun Shirt | Shirts | Women | 59.99 | Light Blue | M | Nylon | No |
| OC124 | Mountain Windshirt | Shirts | Women | 79.99 | Blue | M | Nylon | No |

Note: I excluded the Mountain Sun Sleeves as it is an arm sleeve and not a full shirt, but rather an accessory to provide sun protection on specific areas of the body.

Both shirts have UPF 50+ protection to prevent sun exposure in hot conditions. The Wilderness Sun Shirt is a long-sleeve shirt, while the Mountain Windshirt is a lightweight pullover.

<div align="center">
    <img src="1.png" width="500" height="330" style="display: block; margin: 0 auto;">
    <img src="2.png" width="1500" height="400" style="display: block; margin: 0 auto; margin-top: 10px;">
</div>

接下来深入了解一下底层的实现原理。

In [80]:
# Load the CSV into a list of Document objects.
docs = loader.load()
# Inspect the first Document; the recorded output shows one CSV row per Document
# (metadata carries the source file and row number).
docs[0]

Document(metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 0}, page_content='product_id: OC001\nname: Alpine Explorer Jacket\ncategory: Jackets\ngender: Men\nprice: 249.99\ncolor: Blue\nsize: M\nmaterial: Gore-Tex\nwaterproof: Yes\ndescription: Designed for serious mountaineers, this jacket offers maximum protection against harsh weather conditions.')

In [81]:
# Embed a sample sentence to inspect what the embedding model produces.
embed = embeddings.embed_query("Hi, my name is ww.")
# Length of the embedding vector (384 in the recorded output).
print(len(embed))
# First five components of the vector.
print(embed[:5])

384
[-0.06562487781047821, -0.03318195044994354, -0.02051895298063755, 0.054820042103528976, -0.04752255231142044]


In [82]:
# Build an in-memory vector store from the loaded documents and the embedding model.
# NOTE(review): a later cell rebinds `docs` to search results, so this cell must run
# while `docs` still holds the full list from loader.load().
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

In [83]:
# Query used for the direct similarity search against the vector store.
query = "Please suggest a shirt with sunblocking."

In [84]:
# Retrieve the documents most similar to the query from the vector store.
# NOTE(review): this rebinds `docs`, which previously held the full list returned by
# loader.load(); re-running the from_documents cell after this one would index only
# these search results. Consider a distinct name if the notebook is reorganized.
docs = db.similarity_search(query)
# Show the retrieved documents.
list(docs)

[Document(metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 82}, page_content='product_id: OC083\nname: Wilderness Sun Shirt\ncategory: Shirts\ngender: Women\nprice: 59.99\ncolor: Light Blue\nsize: M\nmaterial: Nylon\nwaterproof: No\ndescription: A long-sleeve shirt with UPF 50+ protection for sun exposure in hot conditions.'),
 Document(metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 69}, page_content='product_id: OC070\nname: Mountain Sun Sleeves\ncategory: Accessories\ngender: Men\nprice: 29.99\ncolor: White\nsize: L\nmaterial: Nylon/Spandex\nwaterproof: No\ndescription: UPF 50+ arm sleeves that protect from sun exposure and are easily removed as temperatures change.'),
 Document(metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 97}, page_content='product_id: OC098\nname: Ridgeline Trail Shirt\ncategory: Shirts\ngender: Men\nprice: 59.99\ncolor: Plaid Blue\nsize: L\nmaterial: Nylon/Polyester\nwaterproof: No\ndescription: A technical button-up sh

如何利用这个来在我们自己的文档上进行问答？

检索器是一个通用接口，可以由任何接收查询并返回文档的方法支持，向量存储和嵌入是实现此目的的一种方法。

In [85]:
retriever = db.as_retriever() # Create a retriever that uses db (DocArrayInMemorySearch) as its retrieval method.

In [86]:
# Recreate the chat model with temperature=0.0 (the earlier cell used 0.5).
llm = ChatOllama(
    model="llama3.2",
    temperature=0.0
)

In [87]:
# Combine the retrieved documents into a single context string for the prompt.
# A blank line separates records so the end of one product's description does not
# run straight into the next product's "product_id:" field.
qdocs = "\n\n".join(doc.page_content for doc in docs)

In [88]:
# Send the retrieved context followed by the question to the chat model in one prompt.
# `invoke` replaces the deprecated `call_as_llm`; it returns a message object, so
# `.content` extracts the answer text for Markdown rendering.
response = llm.invoke(
    f"{qdocs} Question: Please list all your shirts with sun protection in a table in markdown and summarize each one."
).content

In [89]:
# Render the model's answer as Markdown.
display(Markdown(response))

| Product ID | Name | Category | Gender | Price | Color | Size | Material | Waterproof |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| OC083 | Wilderness Sun Shirt | Shirts | Women | 59.99 | Light Blue | M | Nylon | No |
| OC124 | Mountain Windshirt | Shirts | Women | 79.99 | Blue | M | Nylon | No |

Here's a summary of each shirt with sun protection:

* **Wilderness Sun Shirt (OC083)**: A long-sleeve shirt with UPF 50+ protection for hot conditions, providing sun exposure protection.
* **Mountain Windshirt (OC124)**: A lightweight, wind-resistant pullover with no specific sun protection mentioned, but designed for cool, breezy conditions.

这些步骤都可以被LangChain链封装

In [None]:
# Build a RetrievalQA chain that pairs the chat model with the retriever so that
# retrieval and answering happen in a single call.
qa_stuff = RetrievalQA.from_chain_type(
    # Chat model (ChatOllama) that generates the answer.
    llm=llm,
    # 'stuff' strategy: all retrieved documents are placed into a single prompt.
    chain_type='stuff',
    # Retriever that fetches the relevant documents from the vector store.
    retriever=retriever,
    # Print the chain's enter/finish trace while it runs.
    verbose=True
)

In [99]:
# Question to run through the RetrievalQA chain.
query = "Please list all hat in a table using markdown and summarize each one."

In [100]:
# Run the chain on the question and keep the answer text.
# `invoke` replaces the deprecated `Chain.run`; RetrievalQA takes the question under
# the "query" key and returns the answer under "result".
response = qa_stuff.invoke({"query": query})["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [101]:
# Render the chain's answer as Markdown.
display(Markdown(response))

| **Product ID** | **Name** | **Category** | **Gender** | **Price** | **Color** | **Material** | **Waterproof** | **Description** |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| OC075 | Trekker Trucker Hat | Accessories | Men | $29.99 | Gray/Black | Polyester/Mesh | No | Classic trucker hat with breathable mesh back for ventilation. |
| OC113 | Wilderness Wool Cap | Accessories | Women | $39.99 | Gray | Wool | No | Classic wool cap with small brim for sun protection and style. |
| OC182 | Ridgeline Hiking Hat - Wide Brim | Accessories | Unisex | $34.99 | Khaki | Nylon | No | Wide-brimmed hiking hat with UPF 50+ protection. |
| OC160 | Mountain Sun Hat - UPF 50+ | Accessories | Unisex | $39.99 | Khaki | Nylon | No | Wide-brimmed sun hat with UPF 50+ protection and moisture-wicking sweatband. |

Note: I've used markdown to format the table, but it's not a full-fledged table editor. If you need more features or customization options, please let me know!

一般情况下，可以将下面的代码作为标准流程使用：

In [105]:
# End-to-end recipe: CSV -> embeddings -> in-memory vector index -> LLM-backed query.

# 1. Load the data from the CSV file
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file)

# 2. Initialize the embedding model
# Use HuggingFace's all-MiniLM-L6-v2 model to generate text embedding vectors
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 3. Create the vector store index
# VectorstoreIndexCreator converts the documents into vectors and stores them
# DocArrayInMemorySearch stores and searches the vectors in memory
# from_loaders loads the documents from the loader and builds the index
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings
).from_loaders([loader])

# 4. Initialize the LLM
# Use the llama3.2 model served by Ollama
# temperature=0.5 sets the randomness of the generated text (0 is the most conservative, 1 the most creative)
llm = ChatOllama(
    model="llama3.2",
    temperature=0.5
)

# 5. Run the query
# index.query combines vector retrieval with LLM generation:
# it first finds the relevant documents, then uses the LLM to generate the answer
query = "Please list all hat in a table using markdown and summarize each one."
response = index.query(query, llm)

# 6. Show the result
# Render the response as Markdown and display it
display(Markdown(response))

| Product ID | Name | Category | Gender | Price | Color | Size | Material | Waterproof |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| OC075 | Trekker Trucker Hat | Accessories | Men | $29.99 | Gray/Black | One Size | Polyester/Mesh | No |
| OC113 | Wilderness Wool Cap | Accessories | Women | $39.99 | Gray | One Size | Wool | No |
| OC182 | Ridgeline Hiking Hat - Wide Brim | Accessories | Unisex | $34.99 | Khaki | One Size | Nylon | No |
| OC160 | Mountain Sun Hat - UPF 50+ | Accessories | Unisex | $39.99 | Khaki | One Size | Nylon | No |

Here's a summary of each hat:

* **Trekker Trucker Hat (OC075)**: A classic trucker hat with a breathable mesh back for ventilation, perfect for casual wear.
* **Wilderness Wool Cap (OC113)**: A classic wool cap with a small brim for sun protection and style, suitable for women.
* **Ridgeline Hiking Hat - Wide Brim (OC182)**: A wide-brimmed hiking hat with UPF 50+ protection, ideal for outdoor activities.
* **Mountain Sun Hat - UPF 50+ (OC160)**: A wide-brimmed sun hat with UPF 50+ protection and a moisture-wicking sweatband, designed for outdoor enthusiasts.