In [None]:
# 安装所有依赖
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple python-dotenv ipykernel llama-index nest_asyncio
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple timm
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple llama-index-llms-huggingface
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple llama-index-embeddings-huggingface
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple llama-index-embeddings-instructor

In [None]:
##### Option A: OpenAI API key #####
from dotenv import load_dotenv

# Load environment variables from the local .env file before the OpenAI
# clients are imported and constructed (presumably it holds the API key).
load_dotenv(dotenv_path=".env")

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding()

# Smoke test: request a single completion and show the reply.
response = llm.complete("香蕉的颜色是")
print(response)

In [None]:
##### Option B: load local models #####
import os

import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Location of the local Qwen2-7B-Instruct checkpoint. Set the LOCAL_LLM_PATH
# environment variable to point at your own copy; otherwise the original
# default path is used. The same directory provides both model and tokenizer.
LOCAL_LLM_PATH = os.environ.get("LOCAL_LLM_PATH", "G:\\models\\Qwen2-7B-Instruct")

llm = HuggingFaceLLM(
    tokenizer_name=LOCAL_LLM_PATH,
    model_name=LOCAL_LLM_PATH,
    device_map="auto",
    tokenizer_kwargs={"trust_remote_code": True},
    model_kwargs={"trust_remote_code": True},
)

# Embedding model: hkunlp/instructor-base, loaded through HuggingFaceEmbedding.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
embed_args = {
    'model_name': 'hkunlp/instructor-base',
    'max_length': 512,
    'embed_batch_size': 32,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
embed_model = HuggingFaceEmbedding(**embed_args)

In [None]:
from llama_index.core import Settings

# Register the models chosen above on LlamaIndex's shared Settings object so
# later cells do not have to pass them explicitly.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
# Patch asyncio so an event loop can be re-entered from inside the notebook's
# own running loop; the summary query engine below is built with use_async=True.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the PDF data. The example document is "maiyouweng" (The Oil Peddler);
# replace it with any file placed in the datasets folder, or use an absolute path.
pdf_reader = SimpleDirectoryReader(input_files=["./datasets/maiyouweng.pdf"])
documents = pdf_reader.load_data()

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Chunk the loaded text; chunk_size=1024 sets the size of each chunk.
splitter = SentenceSplitter(chunk_size=1024)

# Turn the documents into nodes, one per chunk.
nodes = splitter.get_nodes_from_documents(documents=documents)

In [None]:
# 这里可以打印每一个块的信息！
node_metadata = nodes[0].get_content(metadata_mode=True)
print(len(nodes), node_metadata)

In [None]:
from llama_index.core import SummaryIndex, VectorStoreIndex

# Summary index over the nodes (used for summarization questions).
summary_index = SummaryIndex(nodes=nodes)

# Vector store index over the same nodes (used for retrieval questions).
vector_index = VectorStoreIndex(nodes=nodes)

In [None]:
# 把这些索引，转换为工具，后面雄哥就可以调用这些工具
# 摘要查询引擎
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)

# 向量查询引擎
vector_query_engine = vector_index.as_query_engine()

In [None]:
from llama_index.core.tools import QueryEngineTool

# Descriptions attached to each tool. They are runtime strings, so they are
# kept verbatim.
summary_tool_description = "适用于生成与卖油翁课堂相关的摘要问题。"
vector_tool_description = "适用于检索卖油翁常规上下文的问题。"

# Tool wrapping the summary query engine.
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=summary_tool_description,
)

# Tool wrapping the vector query engine.
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=vector_tool_description,
)

In [None]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# Router over both tools, using a single-choice LLM selector; verbose=True is
# passed through to the router.
single_selector = LLMSingleSelector.from_defaults()
routed_tools = [summary_tool, vector_tool]

query_engine = RouterQueryEngine(
    selector=single_selector,
    query_engine_tools=routed_tools,
    verbose=True,
)

In [None]:
# Ask about the overall content of the lesson and show the answer.
response = query_engine.query("卖油翁课堂有什么内容?")
print(response)

In [None]:
# Number of source nodes attached to the previous response.
print(len(response.source_nodes))

In [None]:
# Ask a more specific question about the article and show the answer.
response = query_engine.query("卖油翁的文章引发了什么思考？")
print(response)

In [None]:
# Show how many source nodes back the last response, then print each of them.
source_nodes = response.source_nodes
print(len(source_nodes))
for source_node in source_nodes:
    print(source_node)