# Raw Document Analysis (Optional)

In [17]:
def word_count(file_name: str) -> None:
    # More accurate word count for Chinese
    with open(file_name, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Count Chinese characters (each Chinese char ≈ 1 word)
    chinese_chars = len([c for c in content if '\u4e00' <= c <= '\u9fff'])
    # Count English words
    english_words = len([w for w in content.split() if any('a' <= c.lower() <= 'z' for c in w)])
    
    print(f"Total characters: {len(content)}")
    print(f"Chinese characters: {chinese_chars} (≈ words)")
    print(f"English words: {english_words}")
    print(f"Approximate total words: {chinese_chars + english_words}")
    print(f"Number of chunks: {len(chunks)}")

word_count("cn.md")

Total characters: 1897
Chinese characters: 1483 (≈ words)
English words: 19
Approximate total words: 1502
Number of chunks: 5


>**5 chunks is actually good enough for a short story (i.e., 1,897 chars).** (i) Fast retrieval speed as only 5 embeddings to search through; And (ii) context preservation since each chunk has ~400 chars - enough context for a complete scene.
>
>**You'd want 10-20+ chunks if** (i) Document is 10,000+ characters (long article/book); (ii) Very specific factual queries requiring granular retrieval; (iii) Multiple topics in one document

# Splitting and Chunking Strategy

In [1]:
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_into_chunks(doc_file: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    with open(doc_file, 'r', encoding='utf-8') as file:
        content = file.read()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", "。", ".", " ", ""]
    )
    return text_splitter.split_text(content)

chunks = split_into_chunks("cn.md")
print(f"Created {len(chunks)} chunks\n")

# for i, c in enumerate(chunks):
#     print(f"[{i}] {c}")

Created 5 chunks



# Indexing and Storage
Embedding Model Choice:
- `shibing624/text2vec-base-chinese` (good for Chinese and used in `rag0`)
- `BAAI/bge-base-zh-v1.5` (better Chinese performance)
- `moka-ai/m3e-base` (multilingual Chinese-English)

In [2]:
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List

# ======================================== 
# Initialize models
# ========================================
embedding_model = SentenceTransformer("shibing624/text2vec-base-chinese")


# ======================================== 
# Create ChromaDB with correct settings
# ========================================
def create_db():
    client = chromadb.PersistentClient(path="./chroma_db")

    # If you want to GUARANTEE cosine space, delete+recreate:
    try:
        client.delete_collection(name="default")
        print("Deleted old collection")
    except Exception as e:
        # ok if it doesn't exist; still good to know unexpected errors
        print(f"(delete_collection) {e}")

    collection = client.create_collection(
        name="default",
        metadata={"hnsw:space": "cosine"}
    )

    print(f"Created collection with metadata: {collection.metadata}")
    return collection

chromadb_collection = create_db()


# ======================================== 
# Embed
# ========================================
def embed_chunk(chunk: str) -> List[float]:
    emb = embedding_model.encode(chunk)  # numpy array
    return emb.tolist()


# ======================================== 
# Store with metadata
# ========================================
def save_embeddings(
    collection,
    chunks: List[str],
    embeddings: List[List[float]],
    source_file: str = "cn.md",
) -> None:
    if not chunks:
        raise ValueError("chunks is empty")

    if len(chunks) != len(embeddings):
        raise ValueError(f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) length mismatch")

    # Use stable IDs that won't collide across different files
    ids = [f"{source_file}:{i}" for i in range(len(chunks))]

    metadatas = [
        {
            "chunk_id": i,
            "source": source_file,
            "chunk_length": len(chunk),
            "chunk_index": i,
        }
        for i, chunk in enumerate(chunks)
    ]

    # Use upsert so reruns don't explode on duplicate ids
    collection.upsert(
        documents=chunks,
        embeddings=embeddings,
        ids=ids,
        metadatas=metadatas,
    )

    print(f"Saved {len(chunks)} chunks to ChromaDB")

# embeddings
embeddings = [embed_chunk(c) for c in chunks]
print(f"Generated {len(embeddings)} embeddings")
print(f"Embedding dimension: {len(embeddings[0])}")

save_embeddings(chromadb_collection, chunks, embeddings, "cn.md")




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: shibing624/text2vec-base-chinese
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


(delete_collection) Collection [default] does not exist
Created collection with metadata: {'hnsw:space': 'cosine'}
Generated 5 embeddings
Embedding dimension: 768
Saved 5 chunks to ChromaDB


# Retrieval

In [4]:
def retrieve(query: str, top_k: int = 5, score_threshold = None):
    query_embedding = embed_chunk(query)
    
    results = chromadb_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['documents', 'distances', 'metadatas']
    )
    
    if not results['documents'][0]:
        print("No results found!")
        return []
    
    retrieved = []
    for i, (doc, dist, meta) in enumerate(zip(
        results['documents'][0],
        results['distances'][0],
        results['metadatas'][0]
    )):
        if score_threshold is None or dist <= score_threshold:
            retrieved.append({
                'document': doc,
                'distance': dist,
                'similarity': 1 / (1 + dist),
                'metadata': meta,
                'rank': i
            })
    
    print(f"Retrieved {len(retrieved)}/{top_k} chunks")
    return retrieved

## Retrieval Testing

In [16]:
# Test with same query
query = "哈利波特用了什么魔法打败了索伦？"
results = retrieve(query, top_k=3, score_threshold=None)

print("\n" + "="*70)
print("Results for ", query)
print("="*70)
for i, r in enumerate(results):
    print(f"\nRank {i+1}:")
    print(f"  Distance: {r['distance']:.3f}")  # Should now be 0.0-2.0 range
    print(f"  Similarity: {r['similarity']:.3f}")
    print(f"  Text: {r['document'][:80]}...")


Retrieved 3/3 chunks

Results for  哈利波特用了什么魔法打败了索伦？

Rank 1:
  Distance: 0.358
  Similarity: 0.736
  Text: ## 第八章：最后一击

索伦发出震耳欲聋的咆哮，但还没有被完全击败。他将所有的黑暗力量集中到一点，准备做最后的反击。就在这个关键时刻，佛罗多举起至尊魔戒，大喊...

Rank 2:
  Distance: 0.370
  Similarity: 0.730
  Text: ## 第五章：魔法的融合

在甘道夫的指导下，哈利学会了如何将他的守护神咒与中土世界的光明魔法结合。他们制定了一个大胆的计划：哈利将使用"Expecto Pat...

Rank 3:
  Distance: 0.416
  Similarity: 0.706
  Text: # 魔戒与魔杖：两个世界的交汇

## 第一章：神秘的传送门

霍格沃茨的禁林深处，哈利·波特正在寻找独角兽的踪迹，完成海格布置的神奇生物课作业。突然，一道耀眼...


# Rerank

In [None]:
from sentence_transformers import CrossEncoder

def rerank(query: str, retrieved_chunks: List[str], top_k: int) -> List[str]:
    corss_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    pairs = [(query, chunk) for chunk in retrieved_chunks]
    scores = corss_encoder.predict(pairs)

    scored_chunks = [(chunk, score) for chunk, score in zip(retrieved_chunks, scores)]
    scored_chunks.sort(key=lambda pair: pair[1], reverse=True)

    # we only return text chunk but get rid of score
    return [chunk for chunk, _ in scored_chunks][:top_k]

reranked_chunks = rerank(query, retrieved_chunks, 3)

# Print top-K result out
for i, chunk in enumerate(reranked_chunks):
    print(f"[{i}] {chunk}\n")

# LLM Generation