In [1]:
import os
import re
from typing import Optional, List, Dict, Any
from deepeval.models import DeepEvalBaseLLM
from openai import OpenAI, AsyncOpenAI
from deepeval.models import DeepEvalBaseEmbeddingModel

In [2]:
class InfinityEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding model backed by an OpenAI-compatible embeddings endpoint.

    Synchronous and asynchronous OpenAI clients are created lazily on first
    use and then reused. Every embedding method is best-effort: if the request
    raises, the error is printed and an empty vector (or one empty vector per
    input text) is returned instead of propagating the exception.
    """

    def __init__(self, model_name: str, base_url: str, api_key: Optional[str] = None):
        """Store the connection settings; no client is created here.

        Args:
            model_name: Model identifier sent as ``model`` with every request.
            base_url: Base URL handed to the OpenAI clients.
            api_key: Key handed to the OpenAI clients. When ``None``, the
                ``OPENAI_API_KEY`` environment variable is used, falling back
                to the placeholder ``"123"``.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY", "123")
        # Clients are built on demand by load_model() / load_async_model().
        self._sync_client: Optional[OpenAI] = None
        self._async_client: Optional[AsyncOpenAI] = None

    def load_model(self) -> OpenAI:
        """Return the cached synchronous client, creating it on first call."""
        if self._sync_client is None:
            self._sync_client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._sync_client

    def load_async_model(self) -> AsyncOpenAI:
        """Return the cached asynchronous client, creating it on first call."""
        if self._async_client is None:
            self._async_client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._async_client

    def embed_text(self, text: str) -> List[float]:
        """Embed a single text synchronously.

        Returns:
            The embedding of ``text``, or ``[]`` if the request failed (the
            error is printed together with the first 50 characters of the text).
        """
        client = self.load_model()
        try:
            response = client.embeddings.create(
                model=self.model_name,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error during synchronous Infinity embedding for text '{text[:50]}...': {e}")
            return []

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts synchronously with one request.

        Returns:
            One embedding per item of the response data. An empty batch yields
            ``[]`` without contacting the server. If the request fails, the
            error is printed and one empty list per input text is returned.
        """
        # Nothing to embed: skip the round trip, which could only produce []
        # (or an error that would be turned into [] below).
        if not texts:
            return []
        client = self.load_model()
        try:
            response = client.embeddings.create(
                model=self.model_name,
                input=texts
            )
            return [data.embedding for data in response.data]
        except Exception as e:
            print(f"Error during synchronous Infinity batch embedding: {e}")
            return [[] for _ in texts]

    async def a_embed_text(self, text: str) -> List[float]:
        """Embed a single text asynchronously.

        Returns:
            The embedding of ``text``, or ``[]`` if the request failed (the
            error is printed together with the first 50 characters of the text).
        """
        client = self.load_async_model()
        try:
            response = await client.embeddings.create(
                model=self.model_name,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error during asynchronous Infinity embedding for text '{text[:50]}...': {e}")
            return []

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts asynchronously with one request.

        Returns:
            One embedding per item of the response data. An empty batch yields
            ``[]`` without contacting the server. If the request fails, the
            error is printed and one empty list per input text is returned.
        """
        # Same short-circuit as embed_texts(): an empty batch needs no request.
        if not texts:
            return []
        client = self.load_async_model()
        try:
            response = await client.embeddings.create(
                model=self.model_name,
                input=texts
            )
            return [data.embedding for data in response.data]
        except Exception as e:
            print(f"Error during asynchronous Infinity batch embedding: {e}")
            return [[] for _ in texts]

    def get_model_name(self) -> str:
        """
        Returns the name of the embedding model.
        """
        return self.model_name

In [None]:
import os
import re
import json
from typing import Optional, Union, Dict, List, Any
from pydantic import BaseModel
import instructor
from openai import OpenAI, AsyncOpenAI
from deepeval.models import DeepEvalBaseLLM

class LLMModel(DeepEvalBaseLLM):
    """DeepEval LLM wrapper around an OpenAI-compatible chat-completions endpoint.

    The OpenAI clients are created lazily and patched with ``instructor`` so
    that a schema can be passed as ``response_model``. Text responses can have
    ``<think>...</think>`` blocks removed (see ``cleaning_method``). Generation
    is best-effort: on any exception the error is printed and a placeholder
    result is returned instead of raising.
    """

    COGITO_THINKING_INSTRUCTION = "Enable deep thinking subroutine."
    # Schema fields that receive 0.0 (instead of "") in the error-fallback object.
    FALLBACK_NUMERIC_FIELDS = ("clarity", "depth", "structure", "relevance")

    def __init__(self,
                 model_name: str,
                 base_url: str,
                 api_key: Optional[str] = None,
                 attempt_thinking_mode: bool = False,
                 cleaning_method: Optional[str] = None,
                 max_tokens: int = 2000
                ):
        """Store the settings and run the base-class initialiser.

        Args:
            model_name: Model identifier sent with every request; a lower-cased
                copy is kept for the model-specific checks.
            base_url: Base URL handed to the OpenAI clients.
            api_key: Key handed to the OpenAI clients. When ``None``, the
                ``OPENAI_API_KEY`` environment variable is used, falling back
                to ``"EMPTY"``.
            attempt_thinking_mode: When True and the model name contains
                ``"cogito"``, a system message with
                ``COGITO_THINKING_INSTRUCTION`` is prepended to the request.
            cleaning_method: ``"rsplit"`` or ``"regex"`` to strip think blocks
                from text responses; any other value only strips whitespace.
            max_tokens: Value sent as ``max_tokens`` with every request.
        """
        self.model_name_original = model_name
        self.model_name_lower = model_name.lower()
        self.base_url = base_url
        self.api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY", "EMPTY")
        # Clients are created (and patched with instructor) on first use.
        self._sync_client: Optional[OpenAI] = None
        self._async_client: Optional[AsyncOpenAI] = None
        self.attempt_thinking_mode = attempt_thinking_mode
        self.cleaning_method = cleaning_method
        self.max_tokens_to_generate = max_tokens
        # Called last so every attribute above exists before the base class runs.
        super().__init__(model_name=model_name)

    def load_model(self) -> OpenAI:
        """Return the cached instructor-patched sync client, creating it on first call."""
        if self._sync_client is None:
            self._sync_client = instructor.patch(OpenAI(base_url=self.base_url, api_key=self.api_key))
        return self._sync_client

    def load_async_model(self) -> AsyncOpenAI:
        """Return the cached instructor-patched async client, creating it on first call."""
        if self._async_client is None:
            self._async_client = instructor.patch(AsyncOpenAI(base_url=self.base_url, api_key=self.api_key))
        return self._async_client

    def _clean_response(self, raw_response: str) -> str:
        """Strip whitespace and, depending on ``cleaning_method``, think blocks.

        * ``"rsplit"``: splits at the last ``</think>``. If a ``<think>`` tag
          precedes it, the text from the last such tag through the closing tag
          is removed; otherwise everything up to and including the closing tag
          is dropped.
        * ``"regex"``: removes every ``<think>...</think>`` block together with
          the whitespace that follows it.
        * anything else: only surrounding whitespace is stripped.

        Returns:
            The cleaned text; ``""`` when ``raw_response`` is empty or ``None``.
        """
        if not raw_response:
            return ""

        cleaned = raw_response.strip()

        if self.cleaning_method == "rsplit":
            closing_tag = '</think>'
            if closing_tag in cleaned:
                before, after = cleaned.rsplit(closing_tag, 1)
                think_block_start = before.rfind('<think>')
                if think_block_start != -1:
                    cleaned = before[:think_block_start].rstrip() + after.lstrip()
                else:
                    cleaned = after.lstrip()
            return cleaned.strip()

        if self.cleaning_method == "regex":
            pattern = r'<think>.*?</think>\s*'
            return re.sub(pattern, '', cleaned, flags=re.DOTALL).strip()

        return cleaned

    def _prepare_api_call_args(self, prompt: str) -> Dict[str, Any]:
        """Build the message list and extra request parameters for ``prompt``.

        Returns:
            ``{"messages": [...], "api_extra_params": {...}}``. The extra
            parameters are currently always empty.
        """
        messages: List[Dict[str, str]] = [{"role": "user", "content": prompt}]
        api_extra_params: Dict[str, Any] = {}

        if self.attempt_thinking_mode:
            if "cogito" in self.model_name_lower:
                messages.insert(0, {"role": "system", "content": self.COGITO_THINKING_INSTRUCTION})

        return {"messages": messages, "api_extra_params": api_extra_params}

    def _build_request_kwargs(self, prompt: str, schema: Optional[BaseModel]) -> Dict[str, Any]:
        """Assemble the keyword arguments for ``chat.completions.create``."""
        call_args = self._prepare_api_call_args(prompt)
        request_kwargs: Dict[str, Any] = {
            "model": self.model_name_original,
            "messages": call_args["messages"],
            "max_tokens": self.max_tokens_to_generate,
        }
        if schema is not None:
            # instructor turns response_model into a parsed schema instance.
            request_kwargs["response_model"] = schema
        request_kwargs.update(call_args["api_extra_params"])
        return request_kwargs

    def _build_fallback(self, schema: BaseModel) -> BaseModel:
        """Build the placeholder schema instance returned after a failed request.

        Fields named in ``FALLBACK_NUMERIC_FIELDS`` are set to ``0.0`` and all
        other fields to ``""``. Field names come from ``model_fields`` when the
        schema exposes it, so inherited fields are covered too; otherwise the
        class ``__annotations__`` are used. Constructing the schema may itself
        raise if those placeholder values do not validate.
        """
        field_names = getattr(schema, "model_fields", None) or schema.__annotations__
        default_values = {
            field_name: 0.0 if field_name in self.FALLBACK_NUMERIC_FIELDS else ""
            for field_name in field_names
        }
        return schema(**default_values)

    def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Union[str, BaseModel]:
        """Generate a response synchronously.

        Args:
            prompt: User prompt.
            schema: Optional schema class; when given it is passed as
                ``response_model`` and the parsed object is returned as is.

        Returns:
            With a schema: the object returned by the patched client. Without:
            the cleaned text of the first choice. On any exception the error is
            printed and a placeholder is returned: the fallback schema
            instance, or ``""`` in text mode.
        """
        client = self.load_model()
        request_kwargs = self._build_request_kwargs(prompt, schema)

        try:
            response = client.chat.completions.create(**request_kwargs)
            if schema is not None:
                return response
            return self._clean_response(response.choices[0].message.content)
        except Exception as e:
            print(f"Error during synchronous generation for model '{self.model_name_original}', prompt '{prompt[:50]}...': {e}")
            if schema is not None:
                return self._build_fallback(schema)
            return ""

    async def a_generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Union[str, BaseModel]:
        """Generate a response asynchronously.

        Args:
            prompt: User prompt.
            schema: Optional schema class; when given it is passed as
                ``response_model`` and the parsed object is returned as is.

        Returns:
            With a schema: the object returned by the patched client. Without:
            the cleaned text of the first choice. On any exception the error is
            printed and a placeholder is returned: the fallback schema
            instance, or ``""`` in text mode.
        """
        client = self.load_async_model()
        request_kwargs = self._build_request_kwargs(prompt, schema)

        try:
            response = await client.chat.completions.create(**request_kwargs)
            if schema is not None:
                return response
            return self._clean_response(response.choices[0].message.content)
        except Exception as e:
            print(f"Error during asynchronous generation for model '{self.model_name_original}', prompt '{prompt[:50]}...': {e}")
            if schema is not None:
                return self._build_fallback(schema)
            return ""

    def get_model_name(self) -> str:
        """Return the model name exactly as passed to the constructor."""
        return self.model_name_original

In [None]:
import re  
import os  
import json  
from typing import Optional, Union, Tuple, Dict  
from deepeval.models import DeepEvalBaseLLM  
from openai import OpenAI, AsyncOpenAI  
from pydantic import BaseModel  
from lmformatenforcer import JsonSchemaParser  
  
class SGlangModel(DeepEvalBaseLLM):
    """DeepEval LLM wrapper for a model served behind an OpenAI-compatible API.

    Text mode (no schema): for models whose name contains ``"qwen3"`` a
    ``/think`` or ``/no_think`` switch is appended to the prompt and a leading
    ``<think>...</think>`` block is removed from the answer.

    Schema mode: when the base URL contains ``"vllm"`` the JSON schema is sent
    as ``guided_json`` in ``extra_body``; otherwise the schema is appended to
    the prompt as an instruction. In both cases the answer is parsed as JSON
    and validated against the schema.

    ``generate`` / ``a_generate`` return a ``(result, cost)`` tuple; the cost
    is currently always ``0``. Errors are printed and re-raised.
    """

    def __init__(self,
                 model_name: str,
                 base_url: str,
                 api_key: Optional[str] = "NET",
                 enable_thinking: bool = False):
        """
        Initialise the SGlang model wrapper.

        Args:
            model_name (str): Model name sent with every request.
            base_url (str): Base URL of the API.
            api_key (Optional[str]): API key. Defaults to "NET"; when ``None``
                the ``OPENAI_API_KEY`` environment variable is used.
            enable_thinking (bool): Controls the Qwen3 prompt switch in text
                mode. If True, "/think" is appended to the prompt; if False,
                "/no_think" is appended. Defaults to False.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY")
        self.enable_thinking = enable_thinking
        # Clients are built on demand by load_model() / load_async_model().
        self._sync_client: Optional[OpenAI] = None
        self._async_client: Optional[AsyncOpenAI] = None

    def load_model(self) -> OpenAI:
        """Return the cached synchronous client, creating it on first call."""
        if self._sync_client is None:
            self._sync_client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._sync_client

    def load_async_model(self) -> AsyncOpenAI:
        """Return the cached asynchronous client, creating it on first call."""
        if self._async_client is None:
            self._async_client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._async_client

    def _clean_qwen3_output(self, text_response: str) -> str:
        """
        Remove a leading <think>...</think> block from a Qwen3 response.

        An empty or ``None`` response (the API may return no content) yields
        ``""`` instead of raising.
        """
        if not text_response:
            return ""
        pattern = r'^\s*<think>.*?</think>\s*'
        return re.sub(pattern, '', text_response, count=1, flags=re.DOTALL)

    def _trim_and_load_json(self, input_string: str) -> Dict:
        """
        Extract the outermost ``{...}`` span from a string and parse it as JSON.

        A missing closing brace is appended, and trailing commas before ``]``
        or ``}`` are removed before parsing.

        Raises:
            ValueError: if the input is not a string or the extracted text is
                not valid JSON.
            Exception: wrapping any other error raised while parsing.
        """
        error_str = "–ú–æ–¥–µ–ª—å –≤—ã–≤–µ–ª–∞ –Ω–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–π JSON. –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –∏—Å–ø–æ–ª—å–∑—É–π—Ç–µ –±–æ–ª–µ–µ –Ω–∞–¥–µ–∂–Ω—É—é –º–æ–¥–µ–ª—å."
        # A response without content arrives as None; report it like any other
        # unparsable output instead of failing with AttributeError on .find().
        if not isinstance(input_string, str):
            raise ValueError(error_str)

        start = input_string.find("{")
        end = input_string.rfind("}") + 1
        if end == 0 and start != -1:
            # Opening brace but no closing one: close the object ourselves.
            input_string = input_string + "}"
            end = len(input_string)
        jsonStr = input_string[start:end] if start != -1 and end != 0 else ""
        # Drop trailing commas such as {"a": 1,} which json.loads rejects.
        jsonStr = re.sub(r",\s*([\]}])", r"\1", jsonStr)
        try:
            return json.loads(jsonStr)
        except json.JSONDecodeError as decode_error:
            raise ValueError(error_str) from decode_error
        except Exception as e:
            raise Exception(f"–ü—Ä–æ–∏–∑–æ—à–ª–∞ –Ω–µ–ø—Ä–µ–¥–≤–∏–¥–µ–Ω–Ω–∞—è –æ—à–∏–±–∫–∞: {str(e)}") from e

    def _is_qwen3(self) -> bool:
        """Return True when the model name contains "qwen3" (case-insensitive)."""
        return "qwen3" in self.model_name.lower()

    def _prepare_prompt(self, prompt: str, schema: Optional[BaseModel]) -> str:
        """Apply the Qwen3 thinking switch to a text-mode prompt.

        For any other model, or when a schema is given, the prompt is returned
        unchanged. Otherwise an existing trailing "/think" or "/no_think" is
        removed and the switch selected by ``enable_thinking`` is appended.
        """
        if schema is not None or not self._is_qwen3():
            return prompt
        processed_prompt = re.sub(r'\s*/think\s*$', '', prompt, flags=re.IGNORECASE).strip()
        processed_prompt = re.sub(r'\s*/no_think\s*$', '', processed_prompt, flags=re.IGNORECASE).strip()
        return processed_prompt + (" /think" if self.enable_thinking else " /no_think")

    def _build_request_kwargs(self, processed_prompt: str, schema: Optional[BaseModel]) -> Dict:
        """Assemble the keyword arguments for ``chat.completions.create``."""
        if schema is None:
            return {
                "model": self.model_name,
                "messages": [{"role": "user", "content": processed_prompt}],
            }

        if "vllm" in self.base_url.lower():
            # Base URL mentions vLLM: request guided JSON decoding via extra_body.
            return {
                "model": self.model_name,
                "messages": [{"role": "user", "content": processed_prompt}],
                "extra_body": {
                    "guided_json": schema.model_json_schema(),
                    "guided_decoding_backend": "lm-format-enforcer",
                },
            }

        # Other endpoints: describe the schema in the prompt and validate afterwards.
        schema_json = json.dumps(schema.model_json_schema(), indent=2)
        json_prompt = f"{processed_prompt}\n\n–û—Ç–≤–µ—Ç –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –≤ —Ñ–æ—Ä–º–∞—Ç–µ JSON, —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—â–µ–º —Å–ª–µ–¥—É—é—â–µ–π —Å—Ö–µ–º–µ:\n{schema_json}\n\n–í–∞–∂–Ω–æ: –û—Ç–≤–µ—Ç –¥–æ–ª–∂–µ–Ω —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Ç–æ–ª—å–∫–æ –≤–∞–ª–∏–¥–Ω—ã–π JSON –±–µ–∑ –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞."
        return {
            "model": self.model_name,
            "messages": [{"role": "user", "content": json_prompt}],
        }

    def _parse_response(self, response, schema: Optional[BaseModel]) -> Tuple[Union[str, BaseModel], float]:
        """Turn a chat-completion response into the ``(result, cost)`` tuple."""
        raw_content = response.choices[0].message.content
        # Placeholder: no real cost calculation is implemented, so this is always 0.
        cost = 0

        if schema is None:
            if self._is_qwen3():
                # Qwen3 text mode: always strip the leading <think> block.
                return self._clean_qwen3_output(raw_content), cost
            # Other models: return the raw content untouched.
            return raw_content, cost

        if self._is_qwen3():
            # Braces inside a leading <think> block would confuse the JSON
            # extraction below, so drop the block first.
            raw_content = self._clean_qwen3_output(raw_content)
        json_data = self._trim_and_load_json(raw_content)
        return schema.model_validate(json_data), cost

    def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Tuple[Union[str, BaseModel], float]:
        """
        Generate a response from the model synchronously.

        For a 'Qwen3' model without a schema:
        - "/think" or "/no_think" is appended to the prompt depending on
          self.enable_thinking.
        - The leading <think> block is always removed from the final answer.

        Returns:
            Tuple[Union[str, BaseModel], float]: The (result, cost) tuple.

        Raises:
            Any exception raised while requesting or parsing the response is
            printed and re-raised.
        """
        client = self.load_model()
        processed_prompt = self._prepare_prompt(prompt, schema)

        try:
            request_kwargs = self._build_request_kwargs(processed_prompt, schema)
            response = client.chat.completions.create(**request_kwargs)
            return self._parse_response(response, schema)
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —Å–∏–Ω—Ö—Ä–æ–Ω–Ω–æ–π –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –¥–ª—è –ø—Ä–æ–º–ø—Ç–∞ '{prompt[:50]}...': {e}")
            raise

    async def a_generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Tuple[Union[str, BaseModel], float]:
        """
        Generate a response from the model asynchronously.

        For a 'Qwen3' model without a schema:
        - "/think" or "/no_think" is appended to the prompt depending on
          self.enable_thinking.
        - The leading <think> block is always removed from the final answer.

        Returns:
            Tuple[Union[str, BaseModel], float]: The (result, cost) tuple.

        Raises:
            Any exception raised while requesting or parsing the response is
            printed and re-raised.
        """
        client = self.load_async_model()
        processed_prompt = self._prepare_prompt(prompt, schema)

        try:
            request_kwargs = self._build_request_kwargs(processed_prompt, schema)
            response = await client.chat.completions.create(**request_kwargs)
            return self._parse_response(response, schema)
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∞—Å–∏–Ω—Ö—Ä–æ–Ω–Ω–æ–π –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –¥–ª—è –ø—Ä–æ–º–ø—Ç–∞ '{prompt[:50]}...': {e}")
            raise

    def get_model_name(self) -> str:
        """Return the model name passed to the constructor."""
        return self.model_name

In [4]:
import os
import re
from typing import Optional, List, Dict, Any
from deepeval.models import DeepEvalBaseLLM
from openai import OpenAI, AsyncOpenAI

class SGlangModel(DeepEvalBaseLLM):
    """
    Generic DeepEval LLM wrapper for models served via an
    OpenAI-compatible API (e.g., SGLang, vLLM).

    Can attempt to enable model-specific thinking/reasoning modes based on
    ``model_name`` and removes ``<think>...</think>`` blocks from responses
    if they appear.

    Error contract: ``generate`` and ``a_generate`` never raise for a failed
    API call; they print the error and return an empty string.
    """

    # System prompt that switches Cogito models into their thinking mode.
    COGITO_THINKING_INSTRUCTION = "Enable deep thinking subroutine."

    def __init__(self,
                 model_name: str,
                 base_url: str,
                 api_key: Optional[str] = None,
                 attempt_thinking_mode: bool = False,
                 cleaning_method: str = "rsplit",
                 max_tokens: int = 8192
                 ):
        """
        Initializes the SGlangModel wrapper.

        Args:
            model_name: Name of the model being served (e.g., "Qwen/Qwen3-30B-A3B", "deepcogito/cogito-v1...").
                Sent verbatim as the ``model`` request field; a lower-cased copy drives
                the model-family detection.
            base_url: The base URL of the OpenAI-compatible API endpoint.
            api_key: Optional API key for the endpoint. Defaults to the ``OPENAI_API_KEY``
                environment variable, or "EMPTY" when that is unset.
            attempt_thinking_mode: If True, tries to enable thinking mode based on model_name. Defaults to False.
            cleaning_method: Method to use for removing <think> tags ('rsplit' or 'regex'). Defaults to 'rsplit'.
            max_tokens: Maximum number of new tokens to generate. Defaults to 8192.
        """
        self.model_name_original = model_name  # exact name, used in requests and logs
        self.model_name_lower = model_name.lower()  # case-insensitive family matching
        self.base_url = base_url
        self.api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY", "EMPTY")
        # Clients are created lazily by load_model() / load_async_model().
        self._sync_client: Optional[OpenAI] = None
        self._async_client: Optional[AsyncOpenAI] = None
        self.attempt_thinking_mode = attempt_thinking_mode
        self.cleaning_method = cleaning_method
        self.max_tokens_to_generate = max_tokens

    def load_model(self) -> OpenAI:
        """Loads or returns the synchronous OpenAI client."""
        if self._sync_client is None:
            self._sync_client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._sync_client

    def load_async_model(self) -> AsyncOpenAI:
        """Loads or returns the asynchronous OpenAI client."""
        if self._async_client is None:
            self._async_client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)
        return self._async_client

    def _clean_response(self, raw_response: str) -> str:
        """
        Helper function to remove <think>...</think> blocks from the raw response string.
        Uses the method specified in self.cleaning_method.

        Args:
            raw_response: Text returned by the model; falsy values (None, "") yield "".

        Returns:
            The whitespace-stripped answer with the reasoning block removed.
            With an unknown ``cleaning_method`` a warning is printed and the
            stripped raw text is returned.
        """
        if not raw_response:
            return ""

        if self.cleaning_method == "rsplit":
            closing_tag = '</think>'
            if closing_tag in raw_response:
                # Keep only what follows the LAST closing tag; this also works
                # when the opening <think> tag is missing from the output.
                return raw_response.rsplit(closing_tag, 1)[-1].strip()
            # No tag present: just trim surrounding whitespace.
            return raw_response.strip()

        if self.cleaning_method == "regex":
            # Remove every complete <think>...</think> block plus trailing
            # whitespace; DOTALL lets '.' span line breaks.
            pattern = r'<think>.*?</think>\s*'
            return re.sub(pattern, '', raw_response, flags=re.DOTALL).strip()

        # Neither 'rsplit' nor 'regex': return the text as-is, trimmed.
        print(f"Warning: Unknown cleaning_method '{self.cleaning_method}'. Returning raw response.")
        return raw_response.strip()

    def _prepare_api_call_args(self, prompt: str) -> Dict[str, Any]:
        """
        Prepares the messages list and a dictionary of extra API parameters
        based on the model name and the attempt_thinking_mode flag.

        Returns:
            ``{"messages": [...], "api_extra_params": {...}}`` where
            ``api_extra_params`` holds keyword arguments that are unpacked into
            ``chat.completions.create`` (possibly empty).
        """
        # Base message list: only the user prompt.
        messages: List[Dict[str, str]] = [{"role": "user", "content": prompt}]
        # Extra keyword arguments for the create() call.
        api_extra_params: Dict[str, Any] = {}

        if self.attempt_thinking_mode:
            if "cogito" in self.model_name_lower:
                # Cogito: thinking is requested via a system prompt placed FIRST.
                messages.insert(0, {"role": "system", "content": self.COGITO_THINKING_INSTRUCTION})
                print(f"Info: Enabling Cogito thinking mode via system prompt for model '{self.model_name_original}'.")

            elif "qwen3" in self.model_name_lower:
                # Qwen3: a bare `enable_thinking=True` keyword is not a parameter
                # of the SDK's create() call, so the flag is forwarded in the
                # request body via `extra_body` instead.
                # NOTE(review): the `chat_template_kwargs` key follows the Qwen3
                # serving docs for vLLM/SGLang; confirm against your server.
                api_extra_params["extra_body"] = {
                    "chat_template_kwargs": {"enable_thinking": True}
                }
                print(f"Info: Attempting to enable Qwen3 thinking mode via API parameter for model '{self.model_name_original}'.")

            else:
                # No dedicated handling for this model family.
                print(f"Warning: 'attempt_thinking_mode' is True, but no specific handling defined for model '{self.model_name_original}'. Making standard call.")

        return {"messages": messages, "api_extra_params": api_extra_params}

    def _build_request_kwargs(self, prompt: str) -> Dict[str, Any]:
        """Assembles the full keyword arguments for ``chat.completions.create``."""
        call_args = self._prepare_api_call_args(prompt)
        return {
            "model": self.model_name_original,
            "messages": call_args["messages"],
            "max_tokens": self.max_tokens_to_generate,
            **call_args["api_extra_params"],
        }

    def _report_generation_error(self, mode: str, prompt: str, error: Exception,
                                 request_kwargs: Dict[str, Any]) -> None:
        """Prints a failed generation and, if thinking mode was requested, a hint."""
        print(f"Error during {mode} generation for model '{self.model_name_original}', prompt '{prompt[:50]}...': {error}")
        if "extra_body" in request_kwargs:
            print("Hint: Check the exact API documentation for SGLang/vLLM OpenAI-compatible endpoint for enabling Qwen3 thinking mode.")

    def generate(self, prompt: str) -> str:
        """
        Generates a response synchronously.
        If attempt_thinking_mode is True, it modifies the request based on the model.
        It always cleans the response to remove <think> tags.

        Returns:
            The cleaned answer, or "" if the API call failed (the error is printed).
        """
        client = self.load_model()
        request_kwargs = self._build_request_kwargs(prompt)

        try:
            response = client.chat.completions.create(**request_kwargs)
            return self._clean_response(response.choices[0].message.content)
        except Exception as e:
            self._report_generation_error("synchronous", prompt, e, request_kwargs)
            return ""  # best-effort contract: empty string on failure

    async def a_generate(self, prompt: str) -> str:
        """
        Generates a response asynchronously.
        If attempt_thinking_mode is True, it modifies the request based on the model.
        It always cleans the response to remove <think> tags.

        Returns:
            The cleaned answer, or "" if the API call failed (the error is printed).
        """
        client = self.load_async_model()
        request_kwargs = self._build_request_kwargs(prompt)

        try:
            response = await client.chat.completions.create(**request_kwargs)
            return self._clean_response(response.choices[0].message.content)
        except Exception as e:
            self._report_generation_error("asynchronous", prompt, e, request_kwargs)
            return ""  # best-effort contract: empty string on failure

    def get_model_name(self) -> str:
        """
        Returns the original name of the model used for initialization.
        """
        return self.model_name_original


In [5]:
# Active LLM wrapper used by the cells below: Cogito behind the
# OpenAI-compatible endpoint, with default thinking/cleaning settings.
CogitoLLM = SGlangModel(
    base_url="http://85.143.167.11:30000/v1",
    model_name="deepcogito/cogito-v1-preview-llama-8B",
)

# Alternative Qwen3 configurations, kept disabled.
# NOTE(review): these reference `LLMModel`, which is not defined in this part
# of the notebook -- confirm the intended class before re-enabling.

# Qwen3_30_Reasoning = LLMModel(model_name="qwen3-30-lmstudio", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=True, cleaning_method="rsplit")
# Qwen3_30 = LLMModel(model_name="qwen3-30-lmstudio", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=False)

# Qwen3_32_Reasoning = LLMModel(model_name="Qwen/Qwen3-32B-AWQ", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=True, cleaning_method="rsplit")
# Qwen3_32 = LLMModel(model_name="Qwen/Qwen3-32B-AWQ", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=False)

# Qwen3_8_Reasoning = LLMModel(model_name="Qwen/Qwen3-8B-FP8", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=True, cleaning_method="rsplit")
# Qwen3_8 = LLMModel(model_name="Qwen/Qwen3-8B-FP8", base_url="http://85.143.167.11:30000/v1", attempt_thinking_mode=False)



In [6]:
# Three embedding wrappers that differ only in model name; all target the
# same endpoint at 127.0.0.1:7997.
BertaEmbeddings = InfinityEmbeddingModel(
    base_url="http://127.0.0.1:7997",
    model_name="sergeyzh/BERTA",
)
USER2Embeddings = InfinityEmbeddingModel(
    base_url="http://127.0.0.1:7997",
    model_name="deepvk/USER2-base",
)
RuEnROBERTAEmbeddings = InfinityEmbeddingModel(
    base_url="http://127.0.0.1:7997",
    model_name="ai-forever/ru-en-RoSBERTa",
)

In [7]:
from deepeval.synthesizer import Synthesizer, Evolution  
from deepeval.synthesizer.config import StylingConfig, EvolutionConfig, ContextConstructionConfig, FiltrationConfig  
  
# Context-construction settings: CogitoLLM acts as the critic and
# BertaEmbeddings as the embedder.
# NOTE(review): units of chunk_size / chunk_overlap are not visible here --
# confirm against the deepeval documentation.
context_construction_config = ContextConstructionConfig(
    embedder=BertaEmbeddings,
    critic_model=CogitoLLM,
    max_retries=2,
    context_quality_threshold=0.4,
    max_contexts_per_document=3,
    chunk_overlap=0,
    chunk_size=2048,
)
  
# Styling instructions handed to the Synthesizer via `styling_config=`.
# Each of the four fields is a free-text, multi-line prompt fragment.
# NOTE(review): in this copy the literals look mis-encoded (Cyrillic UTF-8
# rendered as mojibake) -- verify them against the original notebook. They are
# runtime prompt text, so they are left exactly as found.
styling_config = StylingConfig(  
    input_format="""–°–ª–æ–∂–Ω—ã–µ –∞–∫–∞–¥–µ–º–∏—á–µ—Å–∫–∏–µ –≤–æ–ø—Ä–æ—Å—ã –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–µ –Ω–∞ —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–∏ –º–µ–¥–∏—Ü–∏–Ω—Å–∫–∏—Ö —É—á–µ–±–Ω–∏–∫–æ–≤.   
    –í–æ–ø—Ä–æ—Å—ã –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å —Å—Ñ–æ—Ä–º—É–ª–∏—Ä–æ–≤–∞–Ω—ã —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º —Ç–æ—á–Ω–æ–π –º–µ–¥–∏—Ü–∏–Ω—Å–∫–æ–π —Ç–µ—Ä–º–∏–Ω–æ–ª–æ–≥–∏–∏,   
    —Ç—Ä–µ–±–æ–≤–∞—Ç—å –≥–ª—É–±–æ–∫–æ–≥–æ –ø–æ–Ω–∏–º–∞–Ω–∏—è –º–∞—Ç–µ—Ä–∏–∞–ª–∞ –∏ –ø—Ä–æ–≤–µ—Ä—è—Ç—å —Å–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å –ø—Ä–∏–º–µ–Ω—è—Ç—å —Ç–µ–æ—Ä–µ—Ç–∏—á–µ—Å–∫–∏–µ –∑–Ω–∞–Ω–∏—è   
    –∫ –∫–ª–∏–Ω–∏—á–µ—Å–∫–∏–º —Å–∏—Ç—É–∞—Ü–∏—è–º.""",  
      
    expected_output_format="""–°—Ç—Ä—É–∫—Ç—É—Ä–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ, –Ω–∞—É—á–Ω–æ –æ–±–æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–µ –æ—Ç–≤–µ—Ç—ã –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ,   
    –≤–∫–ª—é—á–∞—é—â–∏–µ –∫–ª—é—á–µ–≤—ã–µ –∫–æ–Ω—Ü–µ–ø—Ü–∏–∏ –∏–∑ —É—á–µ–±–Ω–∏–∫–∞, —Ç–æ—á–Ω—ã–µ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è, –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏ –∏ –º–µ—Ö–∞–Ω–∏–∑–º—ã.   
    –û—Ç–≤–µ—Ç—ã –¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –æ—Ä–≥–∞–Ω–∏–∑–æ–≤–∞–Ω—ã –≤ –ª–æ–≥–∏—á–µ—Å–∫–æ–π –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –ø–æ–¥–∑–∞–≥–æ–ª–æ–≤–∫–æ–≤, –≥–¥–µ —ç—Ç–æ —É–º–µ—Å—Ç–Ω–æ.""",  
      
    task="""–°–æ–∑–¥–∞–Ω–∏–µ –≤—ã—Å–æ–∫–æ–∫–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö —ç–∫–∑–∞–º–µ–Ω–∞—Ü–∏–æ–Ω–Ω—ã—Ö –≤–æ–ø—Ä–æ—Å–æ–≤ –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ,   
    –∫–æ—Ç–æ—Ä—ã–µ —Ç–æ—á–Ω–æ –æ—Ç—Ä–∞–∂–∞—é—Ç —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ —É—á–µ–±–Ω–∏–∫–∞ –∏ –ø—Ä–æ–≤–µ—Ä—è—é—Ç –≥–ª—É–±–∏–Ω—É –ø–æ–Ω–∏–º–∞–Ω–∏—è –º–∞—Ç–µ—Ä–∏–∞–ª–∞.""",  
      
    scenario="""–ü—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å –º–µ–¥–∏—Ü–∏–Ω—Å–∫–æ–≥–æ –≤—É–∑–∞ —Å–æ–∑–¥–∞–µ—Ç –±–∞–Ω–∫ –≤–æ–ø—Ä–æ—Å–æ–≤ –¥–ª—è —ç–∫–∑–∞–º–µ–Ω–æ–≤,   
    —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è –∏ —Å–∞–º–æ–ø—Ä–æ–≤–µ—Ä–∫–∏ —Å—Ç—É–¥–µ–Ω—Ç–æ–≤. –í–æ–ø—Ä–æ—Å—ã –¥–æ–ª–∂–Ω—ã —Ç–æ—á–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–æ–≤–∞—Ç—å   
    —Å–æ–¥–µ—Ä–∂–∞–Ω–∏—é —É—á–µ–±–Ω–∏–∫–æ–≤ –∏ –±—ã—Ç—å –ø—Ä–∏–≥–æ–¥–Ω—ã–º–∏ –¥–ª—è –æ—Ü–µ–Ω–∫–∏ –∫–æ–º–ø–µ—Ç–µ–Ω—Ü–∏–π —Å—Ç—É–¥–µ–Ω—Ç–æ–≤   
    —Ä–∞–∑–Ω—ã—Ö –∫—É—Ä—Å–æ–≤ –º–µ–¥–∏—Ü–∏–Ω—Å–∫–æ–≥–æ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è.""",  
)  
  
# Evolution settings: two evolution rounds, with each of the four listed
# evolution types mapped to the same value of 0.25.
# The Synthesizer construction in this cell currently leaves
# `evolution_config=` commented out, so this object is defined but not passed.
evolution_config = EvolutionConfig(
    evolutions=dict.fromkeys(
        (
            Evolution.CONCRETIZING,
            Evolution.MULTICONTEXT,
            Evolution.COMPARATIVE,
            Evolution.CONSTRAINED,
        ),
        0.25,
    ),
    num_evolutions=2,
)

# Filtration settings with CogitoLLM as the critic model.
# The Synthesizer construction in this cell currently leaves
# `filtration_config=` commented out, so this object is defined but not passed.
filtration_config = FiltrationConfig(
    critic_model=CogitoLLM,
    max_quality_retries=2,
    synthetic_input_quality_threshold=0.5,
)

# Synthesizer backed by CogitoLLM, running in async mode with up to
# 8 concurrent tasks and the styling prompts defined above.
synthesizer = Synthesizer(
    max_concurrent=8,
    async_mode=True,
    styling_config=styling_config,
    model=CogitoLLM,
    # Currently disabled; uncomment to pass the configs defined in this cell:
    # evolution_config=evolution_config,
    # filtration_config=filtration_config,
)
  


In [8]:
goldens = await synthesizer.a_generate_goldens_from_docs(  
    document_paths=['/mnt/sdb1/PycharmProjects/CODUP/AI-tutor-other/docs/for_golds/Anatomia_cheloveka_1_tom_2-52-57.pdf',
                    '/mnt/sdb1/PycharmProjects/CODUP/AI-tutor-other/docs/for_golds/Kapandzhi_-_Pozvonochnik-276-284.pdf'],  
    include_expected_output=True,  
    max_goldens_per_context=3,  
    context_construction_config=context_construction_config  
)  
  

‚ú® üöÄ ‚ú® Loading Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.00s/it]
‚ú® üìö ‚ú® Chunking Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.03it/s]
‚ú® üß© ‚ú® Generating Contexts:   0%|          | 0/12 [00:00<?, ?it/s]
[A

ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.