# Raw Document Analysis (Optional)

In [1]:
def word_count(file_name: str) -> None:
    # More accurate word count for Chinese
    with open(file_name, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Count Chinese characters (each Chinese char ≈ 1 word)
    chinese_chars = len([c for c in content if '\u4e00' <= c <= '\u9fff'])
    # Count English words
    english_words = len([w for w in content.split() if any('a' <= c.lower() <= 'z' for c in w)])
    
    print(f"Total characters: {len(content)}")
    print(f"Chinese characters: {chinese_chars} (≈ words)")
    print(f"English words: {english_words}")
    print(f"Approximate total words: {chinese_chars + english_words}")

word_count("cn1.md")

Total characters: 8085
Chinese characters: 6433 (≈ words)
English words: 24
Approximate total words: 6457


# Splitting and Chunking Strategy

In [2]:
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_into_chunks(
    doc_file: str, 
    chunk_size: int = 800, 
    chunk_overlap: int = 150
) -> List[str]:
    with open(doc_file, 'r', encoding='utf-8') as file:
        content = file.read()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n## ", "\n# ", "\n\n", "\n", "。", "!", "?", ";", " ", ""]
    )
    return text_splitter.split_text(content)

chunks = split_into_chunks("cn1.md")
print(f"Created {len(chunks)} chunks\n")


Created 15 chunks



# Indexing and Storage
Embedding Model Choice:
- `shibing624/text2vec-base-chinese` (good for Chinese and used in `rag0`)
- `BAAI/bge-base-zh-v1.5` (better Chinese performance)
- `moka-ai/m3e-base` (multilingual Chinese-English)

In [3]:
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List

# ======================================== 
# Initialize models
# ========================================
embedding_model = SentenceTransformer("shibing624/text2vec-base-chinese")


# ======================================== 
# Create ChromaDB with correct settings
# ========================================
def create_db():
    client = chromadb.PersistentClient(path="./chroma_db")

    # If you want to GUARANTEE cosine space, delete+recreate:
    try:
        client.delete_collection(name="default")
        print("Deleted old collection")
    except Exception as e:
        # ok if it doesn't exist; still good to know unexpected errors
        print(f"(delete_collection) {e}")

    collection = client.create_collection(
        name="default",
        metadata={"hnsw:space": "cosine"}
    )

    print(f"Created collection with metadata: {collection.metadata}")
    return collection

chromadb_collection = create_db()


# ======================================== 
# Embed
# ========================================
def embed_chunk(chunk: str) -> List[float]:
    emb = embedding_model.encode(chunk)  # numpy array
    return emb.tolist()


# ======================================== 
# Store with metadata
# ========================================
def save_embeddings(
    collection,
    chunks: List[str],
    embeddings: List[List[float]],
    source_file: str,
) -> None:
    if not chunks:
        raise ValueError("chunks is empty")

    if len(chunks) != len(embeddings):
        raise ValueError(f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) length mismatch")
    
    if not source_file:
        return ValueError("file is empty")

    # Use stable IDs that won't collide across different files
    ids = [f"{source_file}:{i}" for i in range(len(chunks))]

    metadatas = [
        {
            "chunk_id": i,
            "source": source_file,
            "chunk_length": len(chunk),
            "chunk_index": i,
        }
        for i, chunk in enumerate(chunks)
    ]

    # Use upsert so reruns don't explode on duplicate ids
    collection.upsert(
        documents=chunks,
        embeddings=embeddings,
        ids=ids,
        metadatas=metadatas,
    )

    print(f"Saved {len(chunks)} chunks to ChromaDB")

# embeddings
embeddings = [embed_chunk(c) for c in chunks]
print(f"Generated {len(embeddings)} embeddings")
print(f"Embedding dimension: {len(embeddings[0])}")

save_embeddings(chromadb_collection, chunks, embeddings, "cn1.md")


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: shibing624/text2vec-base-chinese
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Deleted old collection
Created collection with metadata: {'hnsw:space': 'cosine'}
Generated 15 embeddings
Embedding dimension: 768
Saved 15 chunks to ChromaDB


# Retrieval

In [4]:
def retrieve(query: str, top_k: int = 5, score_threshold = None):
    query_embedding = embed_chunk(query)
    
    results = chromadb_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['documents', 'distances', 'metadatas']
    )
    
    if not results['documents'][0]:
        print("No results found!")
        return []
    
    retrieved = []
    for i, (doc, dist, meta) in enumerate(zip(
        results['documents'][0],
        results['distances'][0],
        results['metadatas'][0]
    )):
        if score_threshold is None or dist <= score_threshold:
            retrieved.append({
                'document': doc,
                'distance': dist,
                'similarity': 1 / (1 + dist),
                'metadata': meta,
                'rank': i
            })
    
    print(f"Retrieved {len(retrieved)}/{top_k} chunks")
    return retrieved

## Retrieval Testing

In [5]:
# query = "哈利波特用了什么魔法打败了索伦？"
query = "哈利的守护神是什么样子的？"
results = retrieve(query, top_k=5, score_threshold=None)

def print_top_k_result(results):
    print("\n" + "="*70)
    print("Results for ", query)
    print("="*70)
    for i, r in enumerate(results):
        print(f"\nRank {i+1}:")
        print(f"  Distance: {r['distance']:.3f}")  # Should now be 0.0-2.0 range
        print(f"  Similarity: {r['similarity']:.3f}")
        print(f"  Text: {r['document'][:80]}...")

print_top_k_result(results)

Retrieved 5/5 chunks

Results for  哈利的守护神是什么样子的？

Rank 1:
  Distance: 0.454
  Similarity: 0.688
  Text: ## 第二章：邓布利多的召见

哈利意识到自己必须马上通知学校。他立刻从口袋中掏出一枚银色硬币般的护符，深吸一口气，默念咒语：“Expecto Patronum...

Rank 2:
  Distance: 0.455
  Similarity: 0.687
  Text: # 魔戒与魔杖：两个世界的交汇

## 第一章：神秘的传送门

霍格沃茨的禁林，夜色正浓，月光从浓密树冠的缝隙中洒落，投下斑驳的银色光影。空气中弥漫着湿润的苔藓...

Rank 3:
  Distance: 0.465
  Similarity: 0.683
  Text: ## 第五章：魔法的融合

临战前夜，霍格沃茨的钟楼熄灯，取而代之的是城堡上空缓缓旋转的守护星光魔阵。夜色中笼罩着一股难以名状的寂静，仿佛整个世界都屏息等待。
...

Rank 4:
  Distance: 0.482
  Similarity: 0.675
  Text: “我们可以反向布阵，让索伦误以为魔戒就在阵心，引诱他完全现身。”赫敏解释道，“只要他现身，我们的咒语就能发挥最大效力。”

佛罗多脸色惨白：“但那意味着我必须站...

Rank 5:
  Distance: 0.516
  Similarity: 0.659
  Text: “还有我，”赫敏抬起头，“如果允许，我想继续研究裂缝魔法，也许这会是巫师历史新的开端。”

邓布利多微微一笑：“霍格沃茨的图书馆，将永远为你敞开。”

甘道夫从...


# Rerank

In [6]:
from sentence_transformers import CrossEncoder

def rerank(
    query: str, 
    retrieved_results: List[dict], 
    top_k: int
) -> List[dict]:
    cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    
    # Extract documents from retrieve() results
    chunks = [result['document'] for result in retrieved_results]
    pairs = [(query, chunk) for chunk in chunks]
    scores = cross_encoder.predict(pairs)
    
    # Preserve full result dict with rerank score
    for result, score in zip(retrieved_results, scores):
        result['rerank_score'] = float(score)
    
    # Sort by rerank score
    retrieved_results.sort(key=lambda x: x['rerank_score'], reverse=True)
    
    return retrieved_results[:top_k]

## Rerank Testing

In [7]:
retrieved = retrieve(query, top_k=5)
reranked = rerank(query, retrieved, top_k=3)

for i, result in enumerate(reranked):
    print(f"[{i}] Rerank: {result['rerank_score']:.3f} | {result['document'][:80]}\n")

Retrieved 5/5 chunks


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: cross-encoder/mmarco-mMiniLMv2-L12-H384-v1
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


[0] Rerank: 2.337 | ## 第二章：邓布利多的召见

哈利意识到自己必须马上通知学校。他立刻从口袋中掏出一枚银色硬币般的护符，深吸一口气，默念咒语：“Expecto Patronum

[1] Rerank: 1.561 | ## 第五章：魔法的融合

临战前夜，霍格沃茨的钟楼熄灯，取而代之的是城堡上空缓缓旋转的守护星光魔阵。夜色中笼罩着一股难以名状的寂静，仿佛整个世界都屏息等待。


[2] Rerank: -0.051 | # 魔戒与魔杖：两个世界的交汇

## 第一章：神秘的传送门

霍格沃茨的禁林，夜色正浓，月光从浓密树冠的缝隙中洒落，投下斑驳的银色光影。空气中弥漫着湿润的苔藓



# LLM Generation

In [8]:
from dotenv import load_dotenv
from google import genai
from typing import List

load_dotenv()
google_client = genai.Client()

def generate(query: str, chunks: List[str]) -> str:
    # Combine retrieved chunks into context
    context = "\n\n".join([f"[片段 {i+1}]\n{chunk}" for i, chunk in enumerate(chunks)])
    
    prompt = f"""你是一个专业的小说问答助手。请基于以下提供的小说片段来回答用户的问题。

小说片段:
{context}

用户问题: {query}

请根据上述片段提供准确、详细的回答。不要编造信息。如果片段中没有足够信息回答问题,请说明。"""

    response = google_client.models.generate_content(
        model='gemini-2.5-flash',
        contents=prompt
    )
    
    return response.text

## Generation Test

In [9]:
# Usage
query = "哈利的守护神是什么样子的？"
retrieved = retrieve(query, top_k=5)
reranked = rerank(query, retrieved, top_k=3)
reranked_chunks = [result['document'] for result in reranked]

answer = generate(query, reranked_chunks)
print(answer)

Retrieved 5/5 chunks


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: cross-encoder/mmarco-mMiniLMv2-L12-H384-v1
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


根据小说片段的描述，哈利的守护神在不同阶段有不同的形态：

1.  **初始形态（片段1）**：哈利的守护神是**一头牡鹿状的**，伴随一道银光腾空而起。
2.  **融合后的形态（片段2）**：在甘道夫的指引下，并吸收了中土世界最纯净的圣光——阿尔诺之焰后，哈利的守护神发生了变化。它变成了一道**巨大的银色凤凰**，展开羽翼，羽毛上镶嵌着**星光般的符文**，胸口蕴藏着**甘道夫圣光的核心**。这段描述也指出，它既是哈利的守护神，也是光明本身。
