### 准备环境

In [None]:
! pip install -qU \
    langchain==0.0.316 \
    openai==0.28.1  \
    tiktoken==0.5.1  \
    cohere \
    chromadb==0.4.15

In [None]:
import os
from getpass import getpass

from langchain.chat_models import ChatOpenAI

# Keep a key that is already exported in the environment; only ask for one
# (without echoing it) when none is set. The key is stored back in the
# environment because later cells create OpenAI clients without passing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used by every conversation cell below.
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo'
)

OpenAI Python 的例子
```python
[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Knock knock."},
    {"role": "assistant", "content": "Who's there?"},
    {"role": "user", "content": "Orange."},
]
```
https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models


但是 LangChain 需要使用以下的消息对象格式：

In [None]:

from langchain.schema import AIMessage, HumanMessage, SystemMessage

# The same conversation as the OpenAI-style example, expressed with
# LangChain message classes: one object per turn, in chronological order.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Knock knock."),
    AIMessage(content="Who's there?"),
    HumanMessage(content="Orange"),
]

In [None]:
# Send the message list to the chat model; the reply is kept in `res`.
res = chat(messages)
# Bare last expression so the notebook displays the returned message object.
res

因为 `res` 本身也是一个 `AIMessage` 对象，所以可以直接把它追加到 `messages` 中，再次调用模型即可得到下一轮的响应

In [None]:
# Add the model's previous reply to the history, then call the model again
# with the extended list.
# NOTE(review): this mutates `messages`, so every re-run of the cell appends
# one more reply to the history.
messages.append(res)
res = chat(messages)

# Show only the text of the new reply.
print(res.content)

#### 处理LLM存在的缺陷
1. 容易出现幻觉
2. 信息滞后
3. 专业领域深度知识匮乏


In [None]:
# Rebind `messages` to a new two-message conversation (a system message plus
# one user question); the earlier history is no longer referenced by this name.
messages = [
    SystemMessage(content="你是一个专业的知识助手。"),
    HumanMessage(content="你知道baichuan2模型吗？"),
]

In [None]:
# Ask the model the question as-is, with no extra context added to the prompt,
# and print the text of its answer.
res = chat(messages)
print(res.content)

ChatGPT 无法满足我们在某些特定领域的专业需求。我们可以通过知识注入的方式，把相关资料写进 prompt 来解决这个问题：

In [None]:

# Hand-written background statements about Baichuan 2, one string per statement.
baichuan2_information = [
    "Baichuan 2是一个大规模多语言语言模型，它专注于训练在多种语言中表现优异的模型，包括不仅限于英文。这使得Baichuan 2在处理各种语言的任务时能够取得显著的性能提升。",
    "Baichuan 2是从头开始训练的，使用了包括了2.6万亿个标记的庞大训练数据集。相对于以往的模型，Baichuan 2提供了更丰富的数据资源，从而能够更好地支持多语言的开发和应用。",
    "Baichuan 2不仅在通用任务上表现出色，还在特定领域（如医学和法律）的任务中展现了卓越的性能。这为特定领域的应用提供了强有力的支持。"
]

# Merge the statements into a single newline-separated block of text.
source_knowledge = "\n".join(baichuan2_information)

In [None]:
# Show the merged knowledge text exactly as it was joined.
print(source_knowledge)

### 创建一个RAG对话模型

#### 1. 加载数据 

In [None]:
# Import CSVLoader from `langchain` itself: the setup cell pins
# langchain==0.0.316 and does not install the separate `langchain_community`
# package, so importing from there fails on a fresh kernel.
from langchain.document_loaders.csv_loader import CSVLoader

# Read the CSV file into a list of documents.
loader = CSVLoader(file_path='./data/test_attraction_summary_1.csv')
data = loader.load()

print(data)

In [None]:
# Print the loaded documents one at a time so each entry is easier to read
# than in the single list dump.
for document in data:
    print(document)

#### 2. 知识切片：将文档分割成均匀的块，每个块是一段原始文本

In [None]:
# Optional chunking step, currently disabled: the notebook later embeds the
# loaded documents directly (`docs = data`).
# NOTE(review): `pages` is not defined in the visible cells (the loader output
# is named `data`), so this snippet needs adjusting before it is re-enabled.
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size = 500,
#     chunk_overlap = 50,
# )

# docs = text_splitter.split_documents(pages)

In [None]:
# len(docs)

#### 3. 利用embedding模型对每个文本片段进行向量化，并储存到向量数据库中

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# The splitter cell is disabled, so the loaded documents are embedded as-is.
docs = data

# Embedding model used to vectorise the documents.
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Embed every document and store the vectors in a named Chroma collection.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embed_model,
    collection_name="openai_embed",
)

#### 4. 通过向量相似度检索和问题最相关的K个文档。

In [None]:
# Example user question; it is reused later to build the augmented prompt.
query = "What are some fun attractions in Shanghai that you can recommend me and give me pictures and reviews of this attraction?"
# Ask the vector store for the 2 entries it ranks as most similar to the query.
result = vectorstore.similarity_search(query ,k = 2)

In [None]:
# Bare expression so the notebook displays the retrieved documents.
result

#### 5. 将原始 `query` 与检索得到的文本组合起来输入到语言模型，得到最终的回答

In [None]:
def augment_prompt(query: str, k: int = 3, store=None) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Runs a similarity search for ``query`` and places the ``page_content`` of
    the returned documents, one per line, ahead of the query itself.

    Args:
        query: The user's question; used for retrieval and echoed in the prompt.
        k: Number of documents to request from the store. Defaults to 3, the
            value that was previously hard-coded.
        store: Object exposing ``similarity_search(query, k=...)`` whose results
            have a ``page_content`` attribute. When ``None``, the notebook-level
            ``vectorstore`` is used.

    Returns:
        The prompt text: an instruction line, the retrieved contexts, then the
        query.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # Fall back to the global store only when the caller did not supply one.
    retriever = vectorstore if store is None else store
    results = retriever.similarity_search(query, k=k)
    source_knowledge = "\n".join(doc.page_content for doc in results)

    # The layout (including the two-space indent on the context and query
    # lines) is kept exactly as before so the generated prompt is unchanged.
    return (
        "Using the contexts below, answer the query.\n"
        "\n"
        "  contexts:\n"
        f"  {source_knowledge}\n"
        "\n"
        f"  query: {query}"
    )

In [None]:
# Preview the full prompt text produced for the example query.
print(augment_prompt(query))

In [None]:
# Wrap the retrieval-augmented prompt as the user's turn.
prompt = HumanMessage(
    content=augment_prompt(query)
)

# Start a fresh conversation for the RAG question instead of appending to the
# existing `messages`: that list still ends with the unanswered baichuan2
# question, and appending would also grow it on every re-run of this cell.
messages = [
    SystemMessage(content="你是一个专业的知识助手。"),
    prompt,
]

res = chat(messages)

print(res.content)