In [28]:
# Combine similarity search and MMR with TF-IDF or BM25

In [1]:
import os
import re

from dotenv import load_dotenv

# Load variables from a .env file (if present) so os.getenv can see them.
load_dotenv()

# API key and model name come from the environment. GEMINI_API_KEY is None
# when the variable is unset; GEMINI_MODEL falls back to the default given here.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "models/gemini-2.5-flash-lite")


In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model client built from the environment-derived model name and API key.
model = ChatGoogleGenerativeAI(model=GEMINI_MODEL, api_key=GEMINI_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample question used as the retrieval query in the cells that follow.
query = "what is deepseek model doing?"

In [4]:
import chromadb

# Chroma client backed by the on-disk store at ./store_emb (relative to the
# notebook's working directory).
client = chromadb.PersistentClient(path="./store_emb")

In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
import time
from tqdm import tqdm

# Fail early with a clear message: assigning None into os.environ would raise
# an opaque "str expected, not NoneType" TypeError when the key is missing.
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; define it in the environment or in a .env file."
    )

# The embeddings client below is constructed without an explicit key, so the
# key is exported as GOOGLE_API_KEY (presumably the variable the client reads).
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")


In [6]:
from langchain_chroma import Chroma


# LangChain vector-store wrapper over the "rag_collection" collection of the
# persistent client, using `embeddings` as its embedding function.
db = Chroma(collection_name="rag_collection", client=client, embedding_function=embeddings)


In [62]:
# Query the underlying Chroma collection directly (rather than through the
# LangChain wrapper) for the 20 nearest neighbours of the embedded query,
# requesting the texts, metadata and raw distances.
sim_results = db._collection.query(
    n_results=20,
    include=["documents", "metadatas", "distances"],
    query_embeddings=embeddings.embed_query(query),
)

In [63]:
from langchain_core.documents import Document

# Wrap each returned (text, metadata) pair in a Document. Index 0 selects the
# result list belonging to the single query that was issued.
sim_docs = [
    Document(metadata=meta, page_content=text)
    for text, meta in zip(sim_results["documents"][0], sim_results["metadatas"][0])
]

In [64]:
# MMR retriever: selects k=10 documents out of fetch_k=20 fetched candidates.
# lambda_mult=0.5 is the relevance/diversity trade-off setting (per LangChain's
# MMR search options — confirm the exact semantics against the installed version).
mmr_retriever = db.as_retriever(search_type="mmr", search_kwargs={"k":10, "fetch_k":20, "lambda_mult":0.5})

In [65]:
# Run the MMR retriever on the sample query.
mmr_docs = mmr_retriever.invoke(query)

In [66]:
# Display the MMR-selected documents for inspection.
mmr_docs

[Document(id='21af96dc89d8ee66129759b8e11549dd', metadata={'page_number': 2, 'source': '../data/DeepSeek-R1.pdf'}, page_content='Introduction Contributions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Summary of Evaluation Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeepSeek-R1-Zero: Reinforcement Learning on the Base Model . . . . . . . . . . Reinforcement Learning Algorithm . . . . . . . . . . . . . . . . . . . . . . Reward Modeling . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Training Template . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Performance, Self-evolution Process and Aha Moment of DeepSeek-R1-Zero DeepSeek-R1: Reinforcement Learning with Cold Start . . . . . . . . . . . . . . . Cold Start . . . . . . . . . . . . . . . .'),
 Document(id='5ed9c492191c6a5986ec8cfe9af2ff9b', metadata={'source':

In [29]:
from rank_bm25 import BM25Okapi

In [67]:
# Build a BM25 index over the similarity candidates and score the query.
# Tokens are lower-cased runs of word characters, so "deepseek" in the query
# matches "DeepSeek-R1" in the text and punctuation ("doing?") cannot block a
# match; plain str.split() is case- and punctuation-sensitive.
corpus = [doc.page_content for doc in sim_docs]
tokenized_corpus = [re.findall(r"\w+", text.lower()) for text in corpus]
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = bm25.get_scores(re.findall(r"\w+", query.lower()))

In [68]:
# Hybrid-scoring settings: number of results to keep (k), weight of the
# semantic-similarity score (alpha) and weight of the MMR rank score.
# Note: the hybrid_retrieve call further down passes literal values rather
# than these names.
k=10
alpha=0.6
weight_mmr=0.3

In [73]:
from collections import defaultdict
import numpy as np

def normalize(arr):
    """
    Min-max scale `arr` into the range [0, 1].

    Parameters
    ----------
    arr : array-like of numbers
        Values to rescale; converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray
        Float array shaped like the input. An empty input yields an empty
        array. When the spread (max - min) is below 1e-8, every entry is 0.5,
        because no meaningful ordering can be derived from constant values.
    """
    values = np.array(arr, dtype=float)
    if values.size == 0:
        # min()/max() raise ValueError on empty arrays; there is nothing to scale.
        return values
    lo, hi = values.min(), values.max()
    if hi - lo < 1e-8:
        return np.full_like(values, 0.5)
    return (values - lo) / (hi - lo)

# --- Hybrid retrieval (similarity + MMR + BM25) ---
def hybrid_retrieve(query, chroma_db, embeddings, k=10, alpha=0.6, weight_mmr=0.3):
    """
    Combines similarity, MMR, and BM25 retrieval into a normalized hybrid score.
    alpha: weight for semantic similarity
    weight_mmr: weight for MMR diversity
    (1 - alpha - weight_mmr): weight for BM25 lexical relevance
    """

    # Step 1: similarity docs
    sim_results = chroma_db._collection.query(
        query_embeddings=embeddings.embed_query(query),
        n_results=20,
        include=["documents", "metadatas", "distances"]
    )
    sim_docs = [
        Document(page_content=d, metadata=m)
        for d, m in zip(sim_results["documents"][0], sim_results["metadatas"][0])
    ]
    sim_dists = np.array(sim_results["distances"][0])
    sim_scores = 1 / (1 + sim_dists)  # convert distance â†’ similarity
    sim_norm = normalize(sim_scores)

    # Step 2: mmr docs
    mmr_docs = chroma_db.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 10, "fetch_k": 20, "lambda_mult": 0.5}
    ).invoke(query)

    # Step 3: BM25
    corpus = [doc.page_content for doc in sim_docs]
    tokenized_corpus = [c.split() for c in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = normalize(bm25.get_scores(query.split()))

    # Step 4: combine scores using stable doc IDs
    doc_scores = defaultdict(float)
    doc_id_map = {id(doc): doc for doc in sim_docs}

    # semantic similarity
    for i, doc in enumerate(sim_docs):
        doc_scores[id(doc)] += alpha * sim_norm[i]

    # MMR reciprocal-rank
    for i, doc in enumerate(mmr_docs):
        key = id(doc)
        doc_scores[key] += weight_mmr / (i + 1)

    # BM25 lexical
    for i, score in enumerate(bm25_scores):
        doc_scores[id(sim_docs[i])] += (1 - alpha - weight_mmr) * score

    # Step 5: rank and attach metadata
    ranked = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:k]
    ranked_docs = []
    for key, score in ranked:
        doc = doc_id_map.get(key)
        if doc:
            doc.metadata["hybrid_score"] = float(score)
            ranked_docs.append(doc)

    return ranked_docs

In [74]:
# Retrieve up to 10 hybrid-ranked documents for the sample query.
ranked_docs = hybrid_retrieve(query, db, embeddings, k=10, alpha=0.6, weight_mmr=0.3)

In [75]:
# Display the hybrid-ranked documents (each carries metadata["hybrid_score"]).
ranked_docs

[Document(metadata={'source': '../data/DeepSeek-R1.pdf', 'page_number': 2, 'hybrid_score': 0.6}, page_content='Introduction Contributions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Summary of Evaluation Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeepSeek-R1-Zero: Reinforcement Learning on the Base Model . . . . . . . . . . Reinforcement Learning Algorithm . . . . . . . . . . . . . . . . . . . . . . Reward Modeling . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Training Template . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Performance, Self-evolution Process and Aha Moment of DeepSeek-R1-Zero DeepSeek-R1: Reinforcement Learning with Cold Start . . . . . . . . . . . . . . . Cold Start . . . . . . . . . . . . . . . .'),
 Document(metadata={'page_number': 1, 'source': '../data/DeepSeek-R1.pdf', 'hybrid_sco

In [76]:
from sentence_transformers import CrossEncoder
from scipy.special import softmax


# Module-level cross-encoder instance; rerank_with_cross_encoder reads this global.
cross_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # precise reranker

def rerank_with_cross_encoder(query, candidates, weight_cross=0.7, weight_hybrid=0.3):
    """
    Re-order `candidates` by a blend of cross-encoder and hybrid scores.

    Each candidate is scored against `query` with the module-level
    `cross_model`. The raw scores are passed through a softmax, the candidates'
    existing metadata["hybrid_score"] values (0.0 when absent) are min-max
    scaled with `normalize`, and the two are mixed as
    weight_cross * cross_norm + weight_hybrid * hybrid_norm.

    Side effect: every candidate's metadata gains "cross_score", "cross_norm",
    "hybrid_norm" and "final_score".

    Returns a new list of the same documents sorted by "final_score",
    highest first; an empty list when `candidates` is empty.

    NOTE(review): softmax values sum to 1 over all candidates while
    hybrid_norm spans [0, 1], so the balance between the two terms shifts
    with the number of candidates.
    """
    if not candidates:
        return []

    raw_scores = cross_model.predict([[query, doc.page_content] for doc in candidates])
    soft_scores = softmax(raw_scores)

    prior_scores = normalize(
        np.array([doc.metadata.get("hybrid_score", 0.0) for doc in candidates])
    )

    blended = weight_cross * soft_scores + weight_hybrid * prior_scores

    # Record every intermediate value on the document for later inspection.
    for idx, doc in enumerate(candidates):
        doc.metadata.update(
            {
                "cross_score": float(raw_scores[idx]),
                "cross_norm": float(soft_scores[idx]),
                "hybrid_norm": float(prior_scores[idx]),
                "final_score": float(blended[idx]),
            }
        )

    return sorted(candidates, key=lambda doc: doc.metadata["final_score"], reverse=True)

In [77]:
# Re-rank the hybrid results. Despite its name, `cross_score` holds the
# re-ranked list of Document objects, not a single score.
cross_score = rerank_with_cross_encoder(query, ranked_docs, weight_cross=0.7, weight_hybrid=0.3)

In [78]:
# Display the re-ranked documents with their score metadata.
cross_score

[Document(metadata={'page_number': 1, 'source': '../data/DeepSeek-R1.pdf', 'hybrid_score': 0.4617633824754085, 'cross_score': 5.093067646026611, 'cross_norm': 0.2590062618255615, 'hybrid_norm': 0.6839243968159727, 'final_score': 0.38648169934245263}, page_content='DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning DeepSeek-AI research@deepseek.com We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super- vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data befo