In [1]:
! pip install deepeval==3.7.0 \
              langchain==0.3.7 \
              langchain-core==0.3.17 \
              langchain-community==0.3.7 \
              langchain-qdrant==0.2.0 \
              langchain-huggingface==0.1.1 \
              langchain-openai==0.2.3 \
              qdrant-client==1.11.0 \
              sentence-transformers==3.0.1 \
              pymupdf==1.24.7 \
              python-dotenv==1.1.1 \
              opentelemetry-api==1.37.0 \
              razdel==0.5.0 \
              yandex_cloud_ml_sdk==0.17.0 \
              mistralai==1.9.11 \
              pydantic==2.11.7



In [14]:
import gc
import os
import json
import pickle
import time
from pathlib import Path
import asyncio
from typing import Any, Dict, Sequence, Optional, Union, List

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
from google.colab import userdata
from qdrant_client import QdrantClient, models
from langchain_qdrant import Qdrant
import deepeval
from deepeval.models.base_model import DeepEvalBaseLLM
from yandex_cloud_ml_sdk import YCloudML

from langchain.schema import Document
from langchain_core.language_models.llms import LLM, BaseCache
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage
from langchain_core.outputs import Generation, LLMResult
from langchain_core.vectorstores.base import VectorStore
from langchain_core.prompts import PromptTemplate
from langchain.callbacks.manager import Callbacks
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI



# Notebook-wide configuration: paths, credentials and Qdrant tuning knobs.
# NOTE(review): DATA_DIR is not referenced by any cell visible in this notebook.
DATA_DIR = Path("/content/books")
# Credentials and endpoints are read through google.colab.userdata instead of
# being hard-coded in the notebook.
YANDEX_API_KEY = userdata.get('YANDEX_API_KEY')
YANDEX_FOLDER_ID = userdata.get('YANDEX_FOLDER_ID')
OPENROUTER_API_KEY = userdata.get('openrouter_intuition')
OPENROUTER_BASE_URL = userdata.get('openrouter_base_url')
MY_QDRANT_URL = userdata.get('MY_QDRANT_URL')
MY_QDRANT_KEY = userdata.get('MY_QDRANT_KEY')

# NOTE(review): MAX_EMBEDDING_BATCH and QDRANT_TIMEOUT are not used by the visible cells.
MAX_EMBEDDING_BATCH = 50
# Default number of documents sent per add_documents_in_batches() upload call.
QDRANT_BATCH_SIZE = 200
QDRANT_TIMEOUT = 300
# Number of attempts add_documents_in_batches() makes per batch before re-raising.
QDRANT_MAX_RETRIES = 3


# Yandex Embeddings

In [3]:
class YandexCloudEmbeddings(Embeddings):
    """LangChain-compatible wrapper around Yandex Cloud text embeddings.

    Two SDK models are used: ``"doc"`` for documents and ``"query"`` for search
    queries. Every request is followed by a fixed pause of
    ``1 / requests_per_second`` seconds as a simple client-side rate limit.
    """

    def __init__(
        self,
        folder_id: str,
        api_key: str,
        requests_per_second: int = 9,
        timeout: float = 60.0,
    ):
        """Create the SDK client and both embedding models.

        Args:
            folder_id: Yandex Cloud folder identifier passed to ``YCloudML``.
            api_key: Credential passed to ``YCloudML`` as ``auth``.
            requests_per_second: Upper bound used to derive the pause between requests.
            timeout: Timeout forwarded to every ``model.run`` call.

        Raises:
            ValueError: If ``requests_per_second`` is not positive.
        """
        # Fail early with a clear message instead of a ZeroDivisionError (0)
        # or a negative sleep duration (< 0) later on.
        if requests_per_second <= 0:
            raise ValueError(
                f"requests_per_second must be positive, got {requests_per_second}"
            )

        self.sdk = YCloudML(
            folder_id=folder_id,
            auth=api_key
        )
        self.query_model = self.sdk.models.text_embeddings("query")
        self.doc_model = self.sdk.models.text_embeddings("doc")
        # NOTE(review): hard-coded; the smoke test in this notebook shows 256-dim vectors.
        self.vector_size = 256
        self.delay = 1.0 / requests_per_second
        self.timeout = timeout

    def _rate_limited_run(self, model, text: str) -> List[float]:
        """Embed one text with ``model``, then pause to respect the rate limit.

        The SDK result is converted to a plain list of floats so the return value
        matches the LangChain ``Embeddings`` contract.
        """
        result = model.run(text, timeout=self.timeout)
        time.sleep(self.delay)
        # Assumes the SDK result is array-like (np.array() over these results
        # already works in encode()).
        return np.asarray(result, dtype=float).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents one by one, honouring the rate limit."""
        return [self._rate_limited_run(self.doc_model, text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query with the query model."""
        return self._rate_limited_run(self.query_model, text)

    def encode(self, texts: List[str], **kwargs) -> np.ndarray:
        """SentenceTransformer-style helper: embed ``texts`` and return a NumPy array.

        Extra keyword arguments are accepted for interface compatibility and ignored.
        """
        embeddings = self.embed_documents(texts)
        return np.array(embeddings)


# Notebook-level embeddings client; passed to run_evaluation_pipeline() in the last cell.
embeddings = YandexCloudEmbeddings(
    folder_id=YANDEX_FOLDER_ID,
    api_key=YANDEX_API_KEY
)

In [4]:
# Smoke test: embed one string and display the resulting array shape.
vector = embeddings.encode(["example"])
vector.shape

(1, 256)

# Векторизация документов

In [15]:
def add_documents_in_batches(
    qdrant_store: Qdrant,
    documents: List[Document],
    batch_size: int=QDRANT_BATCH_SIZE
  ):
    """Upload ``documents`` to Qdrant in batches, retrying each failed batch.

    Each batch is attempted up to ``QDRANT_MAX_RETRIES`` times (at least once),
    with a 5-second pause between attempts. Progress and failures are printed.

    Args:
        qdrant_store: Vector store whose ``add_documents`` is called per batch.
        documents: Documents to upload; an empty list is a no-op.
        batch_size: Number of documents per upload call.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        Exception: The last upload error once all attempts for a batch are exhausted.
    """
    # A negative step would make range() yield nothing and silently skip the upload.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total_docs = len(documents)
    added_docs = 0
    # Always try at least once, even if the retry constant is set to 0 or below.
    max_attempts = max(1, QDRANT_MAX_RETRIES)

    for start in range(0, total_docs, batch_size):
        batch = documents[start:start + batch_size]

        # NOTE(review): a retry re-sends the whole batch; if a failed call had
        # already written some points, duplicates may result — confirm against
        # how the store assigns point ids.
        for attempt in range(1, max_attempts + 1):
            try:
                qdrant_store.add_documents(batch)
            except Exception as e:
                print(f"[FAIL] Qdrant: –û—à–∏–±–∫–∞ –ø—Ä–∏ –¥–æ–±–∞–≤–ª–µ–Ω–∏–∏ –±–∞—Ç—á–∞ {start//batch_size + 1}, –ø–æ–ø—ã—Ç–∫–∞ {attempt}/{max_attempts}: {e}")
                # Give up immediately after the final attempt instead of sleeping first.
                if attempt >= max_attempts:
                    raise
                time.sleep(5)
            else:
                added_docs += len(batch)
                print(f"[OK] Qdrant: –î–æ–±–∞–≤–ª–µ–Ω–æ {added_docs}/{total_docs} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
                break

In [16]:
def chunk_upload_qdrant(
    pdf_path: str,
    qdrant_store: Qdrant,
    splitter: RecursiveCharacterTextSplitter
):
    """Load a PDF, split it into chunks and upload the chunks to Qdrant.

    Every loaded document gets its ``source`` metadata replaced by the PDF's
    base file name before splitting. The chunk count is printed.

    Args:
        pdf_path: Path to the PDF file.
        qdrant_store: Target vector store.
        splitter: Text splitter applied to the loaded documents.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"–§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω: {pdf_path}")

    loaded_docs = PyMuPDFLoader(pdf_path).load()

    # Store only the file name so retrieval can filter by it.
    source_label = os.path.basename(pdf_path)
    for loaded_doc in loaded_docs:
        loaded_doc.metadata["source"] = source_label

    chunks = splitter.split_documents(loaded_docs)
    print(f"{len(chunks)} —á–∞–Ω–∫–æ–≤")

    add_documents_in_batches(qdrant_store, chunks)

# Yandex GPT

In [17]:
class YandexGPT(LLM):
    """LangChain LLM integration for YandexGPT via the Yandex Cloud ML SDK.

    Generation settings (``temperature``, ``max_tokens``) can be overridden per
    call through keyword arguments. ``stop`` sequences and ``config`` are
    accepted for interface compatibility but are not applied.
    """

    model_name: str = "yandexgpt"
    temperature: float = 0.0
    max_tokens: int = 2000
    folder_id: str = YANDEX_FOLDER_ID
    api_key: str = YANDEX_API_KEY
    sdk: Any = None

    def __init__(self, **kwargs):
        """Validate the fields, then build the SDK client from folder id and API key."""
        super().__init__(**kwargs)
        self.sdk = YCloudML(
            folder_id=self.folder_id,
            auth=self.api_key,
        )

    @property
    def _llm_type(self) -> str:
        """Return the LLM type identifier."""
        return "yandexgpt"

    def _convert_messages_to_yandex_format(
        self,
        messages: Sequence[BaseMessage]
    ) -> List[Dict[str, str]]:
        """Convert LangChain messages into YandexGPT ``{"role", "text"}`` dicts.

        System and human messages map to ``system`` and ``user``. AI messages map
        to ``assistant`` so earlier model turns are kept in the history instead of
        being dropped. Any other message type is skipped.
        """
        yandex_messages = []
        for message in messages:
            if isinstance(message, SystemMessage):
                role = "system"
            elif isinstance(message, HumanMessage):
                role = "user"
            # AIMessage is not imported in this notebook, so match on the type tag.
            elif getattr(message, "type", None) in ("ai", "AIMessageChunk"):
                role = "assistant"
            else:
                continue
            yandex_messages.append({"role": role, "text": message.content})
        return yandex_messages

    def _prompt_to_messages(
        self,
        prompt: str,
        options: Dict[str, Any]
    ) -> List[Dict[str, str]]:
        """Wrap a plain prompt as a user message, prepending ``system_prompt`` if given."""
        messages = [{"role": "user", "text": prompt}]
        if "system_prompt" in options:
            messages.insert(0, {"role": "system", "text": options["system_prompt"]})
        return messages

    def _input_to_messages(self, input: Any) -> List[Dict[str, str]]:
        """Normalise ``invoke``/``ainvoke`` input into YandexGPT message dicts."""
        if isinstance(input, str):
            return [{"role": "user", "text": input}]
        if isinstance(input, BaseMessage):
            # A single message is always sent with the user role.
            return [{"role": "user", "text": input.content}]
        # Prompt values (e.g. produced by a prompt template in a chain) expose
        # to_messages(); unwrap them before conversion.
        if hasattr(input, "to_messages"):
            input = input.to_messages()
        return self._convert_messages_to_yandex_format(input)

    def _configured_completion(self, options: Dict[str, Any]) -> Any:
        """Return an SDK completion object configured from per-call options or defaults."""
        return (
            self.sdk.models.completions(self.model_name)
            .configure(
                temperature=options.get("temperature", self.temperature),
                max_tokens=options.get("max_tokens", self.max_tokens)
            )
        )

    def _first_alternative_text(self, result: Any) -> str:
        """Return the text of the first alternative, or a fixed placeholder if the result is empty."""
        if result:
            return result[0].text
        return "–ù–µ—Ç –æ—Ç–≤–µ—Ç–∞"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call YandexGPT with a single prompt and return the generated text.

        Supported keyword arguments: ``system_prompt``, ``temperature``, ``max_tokens``.
        SDK errors propagate to the caller unchanged.
        """
        messages = self._prompt_to_messages(prompt, kwargs)
        result = self._configured_completion(kwargs).run(messages)
        return self._first_alternative_text(result)

    def _generate(
        self,
        prompts: List[str],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> LLMResult:
        """Generate one completion per prompt and wrap them in an ``LLMResult``."""
        generations = []
        for prompt in prompts:
            messages = self._prompt_to_messages(prompt, kwargs)
            result = self._configured_completion(kwargs).run(messages)
            generations.append([Generation(text=self._first_alternative_text(result))])

        return LLMResult(generations=generations)

    def invoke(
        self,
        input: str | BaseMessage | List[BaseMessage],
        config: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> str:
        """Call the model with a string, a single message, or a list of messages.

        Returns the generated text as a plain string. ``config`` is ignored.
        """
        messages = self._input_to_messages(input)
        result = self._configured_completion(kwargs).run(messages)
        return self._first_alternative_text(result)

    async def ainvoke(
        self,
        input: str | BaseMessage | List[BaseMessage],
        config: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> str:
        """Asynchronous variant of ``invoke``.

        The synchronous SDK request runs in the default executor so the event
        loop is not blocked while waiting for the response.
        """
        messages = self._input_to_messages(input)
        completion = self._configured_completion(kwargs)

        # Inside a coroutine the running loop is the one to use;
        # get_event_loop() is deprecated for this purpose.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, completion.run, messages)
        return self._first_alternative_text(result)

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return the parameters that identify this model configuration."""
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "folder_id": self.folder_id
        }


# Notebook-level YandexGPT instance built from the default folder id / API key fields.
llm = YandexGPT()

In [8]:
# Smoke test: send a short greeting and display the model's reply.
llm.invoke("–ü—Ä–∏–≤–µ—Ç")

'–ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ! –ß–µ–º —è –º–æ–≥—É –≤–∞–º –ø–æ–º–æ—á—å?'

# Other LLMs Interface

In [22]:
# Settings for an OpenAI-compatible chat model reached through the OpenRouter endpoint.
openrouter_llm_temperature = 0.0
openrouter_llm_max_tokens = 2000
rag_llm_model = "deepseek/deepseek-chat-v3.1"

# NOTE(review): presumably a workaround for pydantic's "class not fully defined"
# error on ChatOpenAI — confirm it is still required with the pinned versions.
ChatOpenAI.model_rebuild()
# NOTE(review): this rebinds the notebook-level name `llm`, which the previous
# section bound to a YandexGPT instance; later cells see whichever ran last.
llm = ChatOpenAI(
    model=rag_llm_model,
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base=OPENROUTER_BASE_URL,
    temperature=openrouter_llm_temperature,
    max_tokens=openrouter_llm_max_tokens,
)

In [23]:
# Smoke test: unlike YandexGPT.invoke, this returns a message object (see output below).
llm.invoke("–ü—Ä–∏–≤–µ—Ç!")

AIMessage(content='–ü—Ä–∏–≤–µ—Ç! üòä –ö–∞–∫ —è –º–æ–≥—É –ø–æ–º–æ—á—å –≤–∞–º —Å–µ–≥–æ–¥–Ω—è?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 8, 'total_tokens': 21, 'completion_tokens_details': {'accepted_prediction_tokens': None, 'audio_tokens': None, 'reasoning_tokens': 0, 'rejected_prediction_tokens': None, 'image_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'video_tokens': 0}, 'cost': 1.516e-05, 'is_byok': False, 'cost_details': {'upstream_inference_cost': None, 'upstream_inference_prompt_cost': 2.16e-06, 'upstream_inference_completions_cost': 1.3e-05}}, 'model_name': 'deepseek/deepseek-chat-v3.1', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a769d9f8-6ef2-46dc-ac81-a4cdd1ed4445-0', usage_metadata={'input_tokens': 8, 'output_tokens': 13, 'total_tokens': 21, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'reasoning': 0

# RAG

In [9]:
class RAG:
    """Minimal retrieval-augmented generation pipeline.

    Retrieves the top-k chunks for a question from a vector store (restricted to
    one source document), formats them into a fixed prompt and asks the LLM.
    """

    def __init__(
        self,
        llm: LLM,
        vector_store: VectorStore,
        top_k_docs: int = 5,
    ):
        """Store the collaborators and build the answer prompt template.

        Args:
            llm: Model used to generate answers; its ``invoke`` is called with the prompt text.
            vector_store: Store queried for relevant chunks.
            top_k_docs: Number of chunks to retrieve per question.
        """
        self.llm = llm
        self.vector_store = vector_store
        self.top_k_docs = top_k_docs
        self.prompt = PromptTemplate(
            input_variables=["context", "question"],
            template="""
          –¢—ã ‚Äî —ç–∫—Å–ø–µ—Ä—Ç –ø–æ —Ä—É—Å—Å–∫–æ–π –∫–ª–∞—Å—Å–∏—á–µ—Å–∫–æ–π –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–µ.
          –û—Ç–≤–µ—Ç—å —Ç–æ—á–Ω–æ –∏ –ø–æ –¥–µ–ª—É, –∏—Å–ø–æ–ª—å–∑—É—è –¢–û–õ–¨–ö–û –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –∏–∑ –ø—Ä–∏–≤–µ–¥—ë–Ω–Ω–æ–≥–æ –Ω–∏–∂–µ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞.

          –ö–æ–Ω—Ç–µ–∫—Å—Ç:
          {context}

          –í–æ–ø—Ä–æ—Å:
          {question}

          –°—Ç–∏–ª—å –æ—Ç–≤–µ—Ç–∞
          1. –Ø—Å–Ω—ã–π, —ç–Ω–µ—Ä–≥–∏—á–Ω—ã–π —Ç–æ–Ω, –æ–±—Ä–∞–∑–Ω—ã–µ –ø—Ä–∏–º–µ—Ä—ã.
          2. –ü–∏—à–∏ —è—Å–Ω–æ, –ø–æ-—Ä—É—Å—Å–∫–∏, –±–µ–∑ –ª–∏—à–Ω–µ–π –≤–æ–¥—ã.
          3. –û—Ç–≤–µ—á–∞–π –ø–æ–¥—Ä–æ–±–Ω–æ.

          –û—Ç–≤–µ—Ç:"""
        )

    def retrieve_documents(
        self,
        question: str,
        source_name: str
    ) -> List[Document]:
        """Return the ``top_k_docs`` chunks most relevant to ``question``.

        The search is restricted with a ``{"source": source_name}`` filter so only
        chunks of one source document are considered.
        """
        retriever = self.vector_store.as_retriever(
            search_kwargs={
                "filter": {
                    "source": source_name
                },
                "k": self.top_k_docs
            }
        )
        return retriever.invoke(question)

    def generate_answer(
        self,
        question: str,
        relevant_docs: List[Document]
    ) -> str:
        """Build the prompt from the retrieved chunks and return the model's answer text."""
        context = "\n\n".join(doc.page_content for doc in relevant_docs)

        prompt = self.prompt.format(
            context=context,
            question=question
        )

        answer = self.llm.invoke(prompt)
        # Chat models (such as the ChatOpenAI instance configured in this notebook)
        # return a message object rather than a string; unwrap its text so callers
        # always get the same kind of value.
        if not isinstance(answer, str):
            answer = getattr(answer, "content", answer)
        return answer

    def run_rag_pipeline(
        self,
        question: str,
        source_name: str
    ) -> Dict[str, Any]:
        """Run retrieval and generation for one question.

        Returns:
            A dict with ``"answer"`` (the generated answer) and ``"context"``
            (the retrieved chunk texts as a list of strings).
        """
        relevant_docs = self.retrieve_documents(
            question=question,
            source_name=source_name
        )

        answer = self.generate_answer(
            question=question,
            relevant_docs=relevant_docs
        )

        return {
            "answer": answer,
            "context": [str(doc.page_content) for doc in relevant_docs]
        }

# Оценка RAG с помощью DeepEval

In [10]:
# NOTE(review): same model_rebuild() call as in the OpenRouter cell; presumably
# repeated so this cell also works when run on its own — confirm.
ChatOpenAI.model_rebuild()
# Model identifier used by ScorerLLM as the DeepEval judge.
scorer_llm_model = "openai/gpt-4o-mini"

class ScorerLLM(DeepEvalBaseLLM):
    """DeepEval judge model backed by a ``ChatOpenAI`` client.

    The client is configured from the notebook-level OpenRouter credentials and
    ``scorer_llm_model``, with temperature 0 and a fixed seed.
    """

    def __init__(self):
        """Create the chat client used for all scoring calls."""
        self._client = ChatOpenAI(
            model=scorer_llm_model,
            openai_api_key=OPENROUTER_API_KEY,
            openai_api_base=OPENROUTER_BASE_URL,
            temperature=0.0,
            seed=42
        )
        self._model = scorer_llm_model

    def load_model(self):
        """Return the underlying chat client."""
        return self._client

    def get_model_name(self):
        """Return the identifier of the scoring model."""
        return self._model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single human message and return the reply text."""
        messages = [HumanMessage(content=prompt)]
        response = self._client.invoke(messages)
        return response.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``.

        Awaits the client's async API instead of calling the blocking
        ``generate``, so the event loop stays free during the request.
        """
        messages = [HumanMessage(content=prompt)]
        response = await self._client.ainvoke(messages)
        return response.content

In [11]:
def create_deepeval_dataset(dataset, rag):
    """Run the RAG pipeline over every dataset entry and build DeepEval test cases.

    Args:
        dataset: Indexable collection of entries with ``source_name``,
            ``question`` and ``answer`` keys.
        rag: Object exposing ``run_rag_pipeline(question, source_name)`` that
            returns a dict with ``context`` and ``answer``.

    Returns:
        A list of ``LLMTestCase`` objects, one per entry, in dataset order.
    """

    def build_case(position):
        """Answer one entry with the RAG pipeline and wrap the outcome as a test case."""
        record = dataset[position]
        book = record['source_name']
        query = record['question']
        reference = record['answer']

        pipeline_out = rag.run_rag_pipeline(query, book)
        retrieved, generated = pipeline_out['context'], pipeline_out['answer']

        return deepeval.test_case.LLMTestCase(
            input=query,
            actual_output=generated,
            expected_output=reference,
            retrieval_context=retrieved
        )

    return [build_case(position) for position in tqdm(range(len(dataset)))]

# Evaluation Loop

In [12]:
def get_metrics_single_book(
        source_name: str,
        eval_res_data: list,
        raw_metrics_save_path: str,
        chunk_size: int,
        overlap: int,
        topk: int,
        llm: str,
        embeddings_model: str,
        embedding_size: int
  ) -> pd.DataFrame:
    """Flatten per-test-case metric results into a table and save it as CSV.

    Args:
        source_name: Name of the evaluated book; stored in every row.
        eval_res_data: One list per test case holding three
            ``(success, score, threshold)`` tuples. The column names assume the
            order answer relevancy, contextual recall, faithfulness.
        raw_metrics_save_path: Destination CSV path.
        chunk_size, overlap, topk, llm, embeddings_model, embedding_size:
            Experiment parameters stored as constant columns.

    Returns:
        The DataFrame that was written to ``raw_metrics_save_path``.
    """
    metric_columns = [
        "is_success_answer_relevant", "answer_relevant_score", "answer_relevant_thr",
        "is_success_contextuall_recall", "contextuall_recall_score", "contextuall_recall_thr",
        "is_success_ans_faithfulness", "answer_faithfulness_score", "answer_faithfulness_thr"
    ]

    # Each test case becomes one row: its metric tuples are laid out side by side.
    rows = [
        [item for tup in sublist for item in tup]
        for sublist in eval_res_data
    ]

    # Scalars broadcast to every row, so no per-row lists are needed.
    df = pd.DataFrame(rows, columns=metric_columns).assign(
        source_name=source_name,
        chunk_size=chunk_size,
        overlap=overlap,
        topk=topk,
        llm=llm,
        embeddings_model=embeddings_model,
        embedding_size=embedding_size,
    )
    df.to_csv(raw_metrics_save_path, index=False)
    # The signature promises a DataFrame; return it instead of None.
    return df


def get_all_metrics(
    raw_metrics_dfs_dir: str,
    detail_metrics_save_path: str,
    compressed_metrics_save_path: str
):
    """Combine per-book raw metric CSVs and write detailed and aggregated tables.

    Args:
        raw_metrics_dfs_dir: Directory holding the per-book raw metric CSV files.
        detail_metrics_save_path: Destination for the concatenation of all raw rows.
        compressed_metrics_save_path: Destination for the per-``source_name``
            summary (mean success rates plus the experiment parameters).

    Returns:
        The aggregated per-source DataFrame that was written.

    Raises:
        ValueError: If the directory contains no CSV files.
    """
    # Only regular *.csv files are read (a notebook checkpoint folder or stray
    # file would otherwise break read_csv); sorting makes row order deterministic.
    csv_names = sorted(
        name for name in os.listdir(raw_metrics_dfs_dir)
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(raw_metrics_dfs_dir, name))
    )
    if not csv_names:
        raise ValueError(f"No raw metric CSV files found in {raw_metrics_dfs_dir!r}")

    # Read everything first and concatenate once instead of growing a frame in a loop.
    frames = [
        pd.read_csv(os.path.join(raw_metrics_dfs_dir, name))
        for name in csv_names
    ]
    full_metrics_df = pd.concat(frames, ignore_index=True)
    full_metrics_df.to_csv(detail_metrics_save_path, index=False)

    agg_df = full_metrics_df.groupby('source_name').agg({
        'is_success_answer_relevant': 'mean',
        'is_success_contextuall_recall': 'mean',
        'is_success_ans_faithfulness': 'mean',
        'chunk_size': 'first',
        'overlap': 'first',
        'topk': 'first',
        'llm': 'first',
        'embeddings_model': 'first',
        'embedding_size': 'first',
    }).reset_index()
    agg_df.to_csv(compressed_metrics_save_path, index=False)
    return agg_df

In [20]:
def _ensure_collection(client, collection_name: str, embedding_size: int) -> None:
    """Create the Qdrant collection if it is missing and index the source field used for filtering."""
    existing_collections = {c.name for c in client.get_collections().collections}

    if collection_name not in existing_collections:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=embedding_size,
                distance=models.Distance.COSINE
            )
        )

    client.create_payload_index(
        collection_name=collection_name,
        field_name="metadata.source",
        field_schema=models.PayloadSchemaType.KEYWORD
    )


def _delete_book_points(client, collection_name: str, source_name: str) -> None:
    """Delete points already stored for one source so a re-upload does not duplicate its chunks."""
    client.delete(
        collection_name=collection_name,
        points_selector=models.FilterSelector(
            filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="metadata.source",
                        match=models.MatchValue(value=source_name)
                    )
                ]
            )
        )
    )


def _evaluate_with_retries(test_cases, scorer, retry_count: int, cache_path: str):
    """Run DeepEval up to ``retry_count`` times; cache and return the first non-empty result, else None."""
    for retry in range(retry_count):
        try:
            eval_res = deepeval.evaluate(
                test_cases=test_cases,
                metrics=[
                    deepeval.metrics.AnswerRelevancyMetric(model=scorer, async_mode=False),
                    deepeval.metrics.ContextualRecallMetric(model=scorer, async_mode=False),
                    deepeval.metrics.FaithfulnessMetric(model=scorer, async_mode=False),
                ],
            )
            if eval_res:
                with open(cache_path, 'wb') as f:
                    pickle.dump(eval_res, f)
                return eval_res
        except Exception as ex:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ—Ü–µ–Ω–∫–µ (–ø–æ–ø—ã—Ç–∫–∞ {retry + 1}): {ex}")
            continue
    return None


def run_evaluation_pipeline(
    base_dir: str,
    book_golden_dict: dict,
    embeddings,
    MY_QDRANT_URL: str,
    MY_QDRANT_KEY: str,
    chunk_sizes: List[int],
    overlaps: List[int],
    topk_docs_rels: List[int],
    llm_test_name: str,
    embeddings_model_name: str,
    embedding_size: int,
    retry_count: int = 10,
    raw_metrics_dir: str = "raw_metrics",
    computed_metrics_dir: str = "computed_metrics",
    compressed_metrics_dir: str = "compressed_metrics",
    computed_filename: str = "computed.csv",
    compressed_filename: str = "compressed.csv"
):
    """Grid-search RAG settings and score every book with DeepEval.

    For each (chunk size, overlap, top-k) combination the function prepares a
    Qdrant collection, answers each book's golden questions with the RAG
    pipeline, scores the answers, and writes raw, detailed and aggregated
    metric CSVs under ``base_dir/llm_test_name/<run name>/``. Generated answers
    and evaluation results are cached as pickles so an interrupted run can be
    resumed. The collection is deleted once a combination is finished.

    Args:
        base_dir: Root output directory.
        book_golden_dict: Maps a book name to ``(book_path, golden_json_path)``.
            The golden JSON must contain a ``"dataset"`` list.
        embeddings: Embeddings object handed to the Qdrant vector store.
        MY_QDRANT_URL, MY_QDRANT_KEY: Qdrant connection settings.
        chunk_sizes, overlaps, topk_docs_rels: Parameter grids to iterate over.
        llm_test_name: Label of the tested LLM, used in paths and result tables.
        embeddings_model_name: Label of the embeddings model, used likewise.
        embedding_size: Vector size of the Qdrant collection.
        retry_count: Maximum number of DeepEval attempts per book.
        raw_metrics_dir, computed_metrics_dir, compressed_metrics_dir:
            Sub-directory names for the three kinds of output.
        computed_filename, compressed_filename: File names of the combined outputs.
    """
    client = QdrantClient(url=MY_QDRANT_URL, api_key=MY_QDRANT_KEY)
    llm = YandexGPT()
    scorer = ScorerLLM()

    os.makedirs(os.path.join(base_dir, llm_test_name), exist_ok=True)

    for chunk_size in chunk_sizes:
        for overlap in overlaps:
            for topk_docs_rel in topk_docs_rels:

                print(f"Test params: ch{chunk_size}_ov{overlap}_topk{topk_docs_rel}")
                cur_save_dir = os.path.join(
                    base_dir,
                    llm_test_name,
                    f"{llm_test_name}_embmodel_{embeddings_model_name}_ch{chunk_size}_ov{overlap}_topk{topk_docs_rel}"
                )
                os.makedirs(cur_save_dir, exist_ok=True)

                # Output locations depend only on the parameter combination, so they
                # are fixed before the book loop. Binding them inside the loop left
                # them undefined (or pointing at the previous combination) whenever
                # every book was skipped.
                for subdir in [raw_metrics_dir, computed_metrics_dir, compressed_metrics_dir]:
                    os.makedirs(os.path.join(cur_save_dir, subdir), exist_ok=True)
                computed_metrics_path = os.path.join(
                    cur_save_dir, computed_metrics_dir, computed_filename
                )
                compressed_metrics_path = os.path.join(
                    cur_save_dir, compressed_metrics_dir, compressed_filename
                )

                collection_name = f"test_collection_ch{chunk_size}_ov{overlap}_topk{topk_docs_rel}"
                _ensure_collection(client, collection_name, embedding_size)

                qdrant_store = Qdrant(
                    client=client,
                    collection_name=collection_name,
                    embeddings=embeddings
                )
                rag = RAG(
                    llm=llm,
                    vector_store=qdrant_store,
                    top_k_docs=topk_docs_rel
                )
                # The splitter depends only on chunk size and overlap, so one
                # instance serves every book of this combination.
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=overlap,
                    separators=["\n\n", "\n", ".", " ", ""],
                    length_function=len,
                    is_separator_regex=False,
                    strip_whitespace=True
                )

                evaluated_books = 0
                for book_name, (book_path, golden_path) in book_golden_dict.items():
                    # splitext keeps the full name when there is no extension
                    # (rpartition('.') would yield an empty stem and colliding cache files).
                    book_stem = os.path.splitext(book_name)[0]

                    pickled_test_cases_path = os.path.join(
                        cur_save_dir, f"{book_stem}.pkl"
                    )

                    # NOTE: pickle.load can execute arbitrary code; these caches must
                    # only ever be files written by this notebook.
                    if os.path.exists(pickled_test_cases_path):
                        print("–ò—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–Ω—ã–µ –æ—Ç–≤–µ—Ç—ã —Ä–∞–Ω–µ–µ")
                        with open(pickled_test_cases_path, 'rb') as f:
                            test_cases = pickle.load(f)
                    else:
                        # Read the golden set first so a missing file fails before
                        # any embedding/upload work is spent.
                        with open(golden_path, 'r', encoding='utf-8') as f:
                            dataset = json.load(f)["dataset"]
                        for elem in dataset:
                            elem["source_name"] = book_name

                        # The collection can survive an interrupted run; drop this
                        # book's earlier points so the upload does not duplicate
                        # chunks. chunk_upload_qdrant stores the file's base name
                        # as the source.
                        _delete_book_points(
                            client, collection_name, os.path.basename(book_path)
                        )
                        chunk_upload_qdrant(
                            pdf_path=book_path,
                            qdrant_store=qdrant_store,
                            splitter=splitter
                        )

                        test_cases = create_deepeval_dataset(dataset, rag)
                        with open(pickled_test_cases_path, 'wb') as f:
                            pickle.dump(test_cases, f)

                    data_eval_pickle = os.path.join(
                        cur_save_dir, f"metrics_{book_stem}.pkl"
                    )

                    if os.path.exists(data_eval_pickle):
                        print("–ò—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è –ø–æ—Å—á–∏—Ç–∞–Ω–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ —Ä–∞–Ω–µ–µ")
                        with open(data_eval_pickle, 'rb') as f:
                            eval_res = pickle.load(f)
                    else:
                        eval_res = _evaluate_with_retries(
                            test_cases, scorer, retry_count, data_eval_pickle
                        )

                    if eval_res is None:
                        print(f"–ù–µ —É–¥–∞–ª–æ—Å—å –æ—Ü–µ–Ω–∏—Ç—å {book_name} ‚Äî –ø—Ä–æ–ø—É—Å–∫.")
                        continue

                    # One row per test case: (success, score, threshold) for each metric.
                    data = [
                        [
                            (md.success, md.score, md.threshold)
                            for md in test_result.metrics_data
                        ]
                        for test_result in eval_res.test_results
                    ]

                    raw_metrics_path = os.path.join(
                        cur_save_dir, raw_metrics_dir, f"raw_{book_stem}.csv"
                    )

                    get_metrics_single_book(
                        source_name=book_name,
                        eval_res_data=data,
                        raw_metrics_save_path=raw_metrics_path,
                        chunk_size=chunk_size,
                        overlap=overlap,
                        topk=topk_docs_rel,
                        llm=llm_test_name,
                        embeddings_model=embeddings_model_name,
                        embedding_size=embedding_size
                    )
                    evaluated_books += 1

                # Aggregate only when this combination produced at least one raw table.
                if evaluated_books:
                    get_all_metrics(
                        raw_metrics_dfs_dir=os.path.join(cur_save_dir, raw_metrics_dir),
                        detail_metrics_save_path=computed_metrics_path,
                        compressed_metrics_save_path=compressed_metrics_path
                    )
                else:
                    print(
                        f"No books were evaluated for ch{chunk_size}_ov{overlap}_topk{topk_docs_rel}; "
                        "skipping aggregation."
                    )

                client.delete_collection(collection_name)

In [None]:
# Experiment driver: build the book -> (book path, golden JSON path) mapping and run the grid.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no
# mount call is visible in this notebook.
base_dir = "/content/drive/MyDrive/rag-evaluation"
books_dir = f"{base_dir}/books"
# Only regular, non-hidden files count as books (directory listings can also
# contain checkpoint folders or system files); sorting fixes the processing order.
books = sorted(
    name for name in os.listdir(books_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(books_dir, name))
)
golden_data = f"{base_dir}/golden_data"
# The golden dataset of a book is the JSON file with the same base name.
book_golden_dict = {
    book: [f"{books_dir}/{book}", f"{golden_data}/{os.path.splitext(book)[0]}.json"]
    for book in books
}

# Parameter grid: every combination of the three lists is evaluated.
chunk_sizes = [2000, 1500, 1000]
overlaps = [200, 150, 100]
topk_docs_rels = [10, 5]

# Labels used in output paths and result tables.
llm_test_name = "yagpt"
embeddings_model_name = "ya_embeddings"
embedding_size = 256

run_evaluation_pipeline(
    base_dir=base_dir,
    book_golden_dict=book_golden_dict,
    embeddings=embeddings,
    MY_QDRANT_URL=MY_QDRANT_URL,
    MY_QDRANT_KEY=MY_QDRANT_KEY,
    chunk_sizes=chunk_sizes,
    overlaps=overlaps,
    topk_docs_rels=topk_docs_rels,
    llm_test_name=llm_test_name,
    embeddings_model_name=embeddings_model_name,
    embedding_size=embedding_size
)