In [None]:
from langchain.chains import RetrievalQA  # 检索QA链，在文档上进行检索
# from langchain.chat_models import ChatOpenAI  # openai模型
# from langchain_community.chat_models import ChatOllama
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings  # ollama嵌入
from langchain.document_loaders import CSVLoader  # 文档加载器，采用csv格式存储
# 导入向量存储索引创建器
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch  # 向量存储

from IPython.display import display, Markdown  # 在jupyter显示信息的工具
import pandas as pd

# Location of the product catalog CSV, relative to this notebook
file = "../data/OutdoorClothingCatalog_1000.csv"

# LangChain document loader pointed at the CSV file
loader = CSVLoader(file_path=file)

# Separately read the same file with pandas purely for inspection;
# only the columns at positions 1 and 2 are loaded.
data = pd.read_csv(
    file,
    usecols=[1, 2],
)

# Last expression of the cell: show the first rows of the frame
data.head()

Unnamed: 0,name,description
0,Women's Campside Oxfords,This ultracomfortable lace-to-toe Oxford boast...
1,"Recycled Waterhog Dog Mat, Chevron Weave",Protect your floors from spills and splashing ...
2,Infant and Toddler Girls' Coastal Chill Swimsu...,"She'll love the bright colors, ruffles and exc..."
3,"Refresh Swimwear, V-Neck Tankini Contrasts",Whether you're going for a swim or heading out...
4,EcoFlex 3L Storm Pants,Our new TEK O2 technology makes our four-seaso...


In [19]:
# Chat model served through Ollama; sampling temperature is set to 0.0
llm = ChatOllama(
    model="qwen2.5",
    temperature=0.0,
)

In [25]:
# Create the Ollama embedding model instance ("qwen2.5" can be replaced by
# any other model name installed in your Ollama server).
embeddings = OllamaEmbeddings(model="qwen2.5")

try:
    # Create the document loader
    file = "../data/OutdoorClothingCatalog_1000.csv"
    loader = CSVLoader(file_path=file)

    # Create the vector store index: documents from the loader are embedded
    # with `embeddings` and stored in a DocArrayInMemorySearch store.
    index = VectorstoreIndexCreator(
        vectorstore_cls=DocArrayInMemorySearch,
        embedding=embeddings,  # embedding model passed explicitly
    ).from_loaders([loader])

except Exception as e:
    # Report the failure, then re-raise: swallowing the exception would leave
    # `index` undefined and hide the traceback while the notebook keeps running.
    print(f"Error creating index: {str(e)}")
    raise



In [36]:
# query = "请用markdown表格的方式列出所有具有防晒功能的衬衫，对每件衬衫描述进行总结"


In [None]:
# Build a document loader for the CSV catalog and read all documents from it
file = "../data/OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)
docs_ = loader.load()

# Inspect a single document; each document corresponds to one row of the CSV
docs_[0]

Document(metadata={'source': '../data/OutdoorClothingCatalog_1000.csv', 'row': 0}, page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.")

In [41]:
# Embed one sample sentence to see what the embedding model returns
sample_text = "你好呀，我的名字叫小可爱"
embed = embeddings.embed_query(sample_text)

In [45]:
# Length of the embedding vector produced for the sample sentence
embed_dim = len(embed)
print("\n\033[32m向量表征的长度: \033[0m \n", embed_dim)

# Each element is a number; together they form the vector representation
# of the text. Only the first five are shown.
embed_head = embed[:5]
print("\n\033[32m向量表征前5个元素: \033[0m \n", embed_head)


[32m向量表征的长度: [0m 
 3584

[32m向量表征前5个元素: [0m 
 [-0.003120341, 0.0022507957, -0.0013351035, 0.007695697, 0.0015686533]


In [None]:
# Store the embeddings of the loaded documents in a vector store.
# DocArrayInMemorySearch.from_documents takes the list of documents and the
# embedding model as inputs.
db = DocArrayInMemorySearch.from_documents(docs_, embeddings)

query = "请推荐一件具有防晒功能的衬衫"

# Ask the vector store for documents similar to the query; the result is a
# list of documents.
# NOTE(review): the captured output shows a backpack as the top hit for this
# query, so retrieval quality with this embedding model should be checked.
docs = db.similarity_search(query)

hit_count = len(docs)
print("\n\033[32m返回文档的个数: \033[0m \n", hit_count)

top_hit = docs[0]
print("\n\033[32m第一个文档: \033[0m \n", top_hit)


[32m返回文档的个数: [0m 
 4

[32m第一个文档: [0m 
 page_content=': 669
name: Serene Sky 45 Pack
description: This ultralight pack is designed for the backpacker who counts every ounce. It comes in under 2 lb. yet has all of the comfort and features you'd expect in a multiday hiking pack. Made for weekend backpack trips or even ultralight thru-hikes, the Osprey Levity is for the hiker who wants as light a load as possible. In fact, you won't see pictures of this pack on Instagram because its typical user won't carry the extra weight of a phone. Special lightweight yet tough fabric allows it to weigh under two pounds while keeping great Osprey technology like the Airspeed backpanel and Exoform harness for comfort.

Specs:
Small 
- Capacity: 42 L. 
- Torso Fit: Fits torsos from 16" to 19". 
- Weight: 1.786 lb.
- Dimensions: 24.8"H X 15.75"W X 11.81"D.

Medium 
- Capacity: 45 L. 
- Torso Fit: Fits torsos from 18" to 21". 
- Weight: 1.852 lb. 
- Dimensions: 26.77"H X 15.' metadata={'source': '../d

In [47]:
# Concatenate the text of every retrieved document into one context string
qdocs = "".join(doc.page_content for doc in docs)

# Append the question to the retrieved context and send it to the chat model.
# The question asks for a Markdown table listing all shirts with sun
# protection, with a summary of each shirt's description.
prompt = f"{qdocs}问题：请用markdown表格的方式列出所有具有防晒功能的衬衫，对每件衬衫描述进行总结"

# `call_as_llm` is deprecated in LangChain; `invoke` is the supported entry
# point and returns a message object whose `.content` holds the reply text.
response = llm.invoke(prompt).content

display(Markdown(response))

  response = llm.call_as_llm(


根据提供的信息，实际上没有提到任何具有防晒功能的衬衫。不过，我可以帮助你创建一个示例表格来展示如何格式化这样的数据。假设我们有一些虚构的防晒衬衫产品信息：

```markdown
| 产品名称       | 描述                                                                                   |
|--------------|------------------------------------------------------------------------------------|
| 防晒速干衬衫 A | 采用高科技防晒面料，UPF值高达50+，有效阻挡紫外线，适合户外活动和夏季穿着。轻薄透气，快速吸汗干燥。 |
| 防晒长袖衬衫 B | 特别设计的长袖款式，提供全面的皮肤保护。采用防紫外线技术，保持舒适贴身感。适用于各种户外运动。   |
```

如果你有具体的防晒衬衫产品信息，请提供给我，我可以帮助你填充表格内容。

In [None]:
# Create a retriever on top of the vector store
retriever = db.as_retriever()

# "stuff" chain type: the retrieved documents are placed into a single prompt
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, verbose=True
)

# Build a query and run the chain on it
query = "请用markdown表格的方式列出所有具有防晒功能的衬衫，对每件衬衫描述进行总结"

# `Chain.run` is deprecated in LangChain; `invoke` takes the chain's input
# mapping ("query" for RetrievalQA) and returns a dict whose "result" entry
# is the answer text.
response = qa_stuff.invoke({"query": query})["result"]

display(Markdown(response))