In [2]:
import os

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Locate the course data directory from the environment. Indexing os.environ
# (instead of os.getenv) makes a missing variable fail with a KeyError that
# names it, rather than a TypeError raised from inside os.path.join(None, ...).
datafile = os.path.join(os.environ['GEEKTIME_AI_COURSE_DATA'], 'mr_fujino')
# Read every file under the 'mr_fujino' directory into LlamaIndex documents.
documents = SimpleDirectoryReader(datafile).load_data()

In [None]:
# Build a VectorStoreIndex from the documents loaded in the first cell.
index = VectorStoreIndex.from_documents(documents)

In [4]:
# Reference: https://docs.llamaindex.ai/en/stable/examples/vector_stores/SimpleIndexDemo/
# Tag the index with a fixed id, then write its storage context to ./storage
# so that later cells can reload it from disk.
index.set_index_id('vector_index')
index.storage_context.persist(persist_dir='./storage')

In [4]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild the storage context from the 'storage' directory on disk.
storage_context = StorageContext.from_defaults(persist_dir='storage')
# Load the index by the id it was saved under ('vector_index'). Naming the id
# keeps the load unambiguous if the directory ever holds more than one index.
loaded_index = load_index_from_storage(storage_context, index_id='vector_index')

In [5]:
# Ask the reloaded index a question through its default query engine.
# Another question to try: '鲁迅先生在日本学习医学的老师是谁？'
question = '鲁迅先生去哪里学的医学？'
query_engine = loaded_index.as_query_engine()
response = query_engine.query(question)
print(response)

鲁迅先生去仙台学习医学。


In [4]:
# https://docs.llamaindex.ai/en/stable/module_guides/models/prompts/usage_pattern/#defining-a-custom-prompt
from llama_index.core import PromptTemplate

# Custom QA prompt: the retrieved context is placed between two rulers and the
# model is told to answer from that context rather than from prior knowledge.
template = (
    'Context information is below. \n'
    '------------------------------\n'
    '{context_str}\n'
    '------------------------------\n'
    'Given the context information and not prior knowledge, '
    'answer the question: {query_str}\n'
)
qa_template = PromptTemplate(template=template)

# Query the reloaded index with the custom prompt as its text-QA template.
question = '鲁迅先生在日本学习医学的老师是谁？'
query_engine = loaded_index.as_query_engine(text_qa_template=qa_template)
response = query_engine.query(question)
print(response)

鲁迅先生在日本学习医学的老师是藤野先生。


In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Chinese QA prompt: it states that "我" refers to Lu Xun, asks for an answer
# based on the supplied context, and tells the model to reply "不知道" when it
# does not know.
template = (
    '下面的“我”指的是鲁迅先生\n'
    '-------------\n'
    '{context_str}\n'
    '-------------\n'
    '根据这些信息，请回答问题：{query_str}\n'
    '如果你不知道的话，请回答不知道\n'
)
qa_template = PromptTemplate(template=template)

# Assemble the query engine by hand from a retriever over the reloaded index
# and a response synthesizer that uses the prompt above.
retriever = loaded_index.as_retriever()
synth = get_response_synthesizer(text_qa_template=qa_template)
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=synth)

# Alternative question to try: '请问林黛玉和贾宝玉是什么关系？'
question = '鲁迅在哪儿学的医？'
response = query_engine.query(question)
print(response)

In [3]:
# https://docs.llamaindex.ai/en/stable/examples/index_structs/doc_summary/DocSummary/
from langchain_openai import ChatOpenAI
from langchain.text_splitter import SpacyTextSplitter
from llama_index.core import get_response_synthesizer, SummaryIndex
from llama_index.core.node_parser import LangchainNodeParser
from llama_index.llms.openai import OpenAI

# Split documents into sentence-based chunks with spaCy's Chinese pipeline
# (chunk_size=2048), wrapped so LlamaIndex can use the LangChain splitter as a
# node parser.
text_splitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=2048)
node_parser = LangchainNodeParser(text_splitter)

# Use the LlamaIndex OpenAI LLM (imported above) so the model can be handed
# directly to the response synthesizer; endpoint and key come from the
# environment, and the sampling settings are unchanged.
llm = OpenAI(
    api_base=os.environ['OPENAI_API_BASE'],
    api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
    model='gpt-4o-mini',
    max_tokens=1024,
)

# Bind the LLM to the synthesizer explicitly; without `llm=` the synthesizer
# falls back to the globally configured default model.
response_synthesizer = get_response_synthesizer(
    llm=llm,
    response_mode='tree_summarize',
)

# Build the summary (list) index. Only the node parser is passed here: the LLM
# and response synthesizer are query-time settings and must be supplied when
# the query engine is created, e.g.
# list_index.as_query_engine(response_synthesizer=response_synthesizer).
list_index = SummaryIndex.from_documents(
    documents,
    transformations=[node_parser],
)

In [None]:
# Pass the synthesizer prepared in the previous cell so the query actually uses
# its 'tree_summarize' response mode; a bare as_query_engine() call would
# build a default synthesizer instead.
query_engine = list_index.as_query_engine(response_synthesizer=response_synthesizer)
# Ask for a Chinese summary of the text Lu Xun wrote in the first person.
response = query_engine.query('下面鲁迅先生以第一人称‘我’写的内容，请你用中文总结一下:')
print(response)

## TODO
* [ImageReader](https://docs.llamaindex.ai/en/stable/api_reference/readers/file/#llama_index.readers.file.ImageReader)
* Install PyTorch 2.5.1
* Receipt OCR & query.