In [41]:
import os
import pandas as pd

from langchain.document_loaders import DataFrameLoader
from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)
from ragas.testset.transforms.extractors.llm_based import (
    NERExtractor,
    HeadlinesExtractor,
    KeyphrasesExtractor
)
from ragas.testset.transforms.splitters import (
    HeadlineSplitter,
)
from ragas import evaluate, EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.testset.persona import Persona
from ragas.run_config import RunConfig

In [42]:
# Location of the exported Telegram posts (CSV).
data_path = "/Users/danilovsnnv/Work/data/telegram_posts.csv"

# Read the posts and discard rows with no 'text' value in a single chained
# expression.
df = pd.read_csv(data_path).dropna(subset=['text'])

In [43]:
# Sanity check: (rows, columns) of the frame after rows with a missing 'text' were dropped.
df.shape

(4508, 9)

In [44]:
# Down-sample for development runs.
# - random_state pins the subset so that re-running the notebook feeds the same
#   posts into the (expensive, LLM-backed) test set generation.
# - min() guards against a ValueError when fewer than 500 rows are available,
#   since sampling without replacement cannot return more rows than exist.
df = df.sample(n=min(500, len(df)), random_state=42)

In [45]:
# Sanity check: (rows, columns) of the frame after the development down-sampling.
df.shape

(500, 9)

In [47]:
# Only the 'name' and 'text' columns are handed to the loader; 'text' is
# declared as the page content column.
loader_columns = ['name', 'text']
loader = DataFrameLoader(
    df[loader_columns],
    page_content_column="text",
)
docs = loader.load()

In [48]:
# MistralAI models
# NOTE(review): the OpenAI cell that follows rebinds both `llm` and
# `embeddings`, so on a top-to-bottom run these Mistral objects are replaced
# before they are used. Run only one of the two provider cells.
llm = LangchainLLMWrapper(
    ChatMistralAI(api_key=os.getenv('MISTRAL_API_KEY')),
)
embeddings = LangchainEmbeddingsWrapper(
    MistralAIEmbeddings(
        model="mistral-embed",
        api_key=os.getenv('MISTRAL_API_KEY'),
    ),
)



In [49]:
# OpenAI models
# Chat model used for generation, wrapped for Ragas.
llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o-mini"),
)
# Embedding model, wrapped for Ragas.
embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model="text-embedding-3-small"),
)

In [50]:
# Configure retry/timeout behaviour of the embeddings wrapper.
# Assigning `max_retries` / `max_timeout` directly on the wrapper only creates
# ad-hoc attributes that Ragas never reads; the wrapper takes these settings
# from a RunConfig (which has `timeout`, not `max_timeout`).
embeddings.set_run_config(RunConfig(timeout=180, max_retries=20))

In [51]:
distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=llm), 1.0),
]

for query, _ in distribution:
    prompts = await query.adapt_prompts("russian", llm=llm)
    query.set_prompts(**prompts)

In [52]:
# Building blocks of the knowledge-graph transform pipeline.

# Extracts headlines from each document using the configured LLM.
headline_extractor = HeadlinesExtractor(llm=llm)

# Splits documents into chunks along the extracted headlines.
headline_splitter = HeadlineSplitter(min_tokens=300, max_tokens=1000)

# Extracts up to 10 keyphrases per node into the "keyphrases" property.
keyphrase_extractor = KeyphrasesExtractor(
    llm=llm, property_name="keyphrases", max_num=10
)

# Named-entity extractor. It is given the same `llm` as the other extractors;
# constructed without one it falls back to the library's default LLM, which
# would ignore the provider selected earlier in the notebook.
nere_extractor = NERExtractor(llm=llm)

In [53]:
# Transform pipeline, listed in the order it is passed to the generator.
transforms = [
    headline_extractor,
    headline_splitter,
    keyphrase_extractor,
    nere_extractor,
]

In [54]:
# Single persona used for question generation. The role description is in
# Russian; roughly: "You are a person interested in various facts, news and
# events who reads Telegram channels. Speak only Russian and ask simple
# questions to learn facts from Telegram channels."
telegram_user = Persona(
    name="Telegram User",
    role_description="Ты - человек, интересующийся различными фактами, новостями и событиями, читающий каналы в Telegram. Говори только на русском языке, задавай простые вопросы, чтобы узнать факты из Telegram каналов",
)
personas = [telegram_user]

In [55]:
# Generator wired to the selected LLM, embeddings and persona list.
generator = TestsetGenerator(
    llm=llm,
    embedding_model=embeddings,
    persona_list=personas,
)

# Execution limits for the generation run: 180 s timeout, up to 10 retries,
# 180 s maximum wait, and a single worker.
generation_run_config = RunConfig(
    timeout=180,
    max_retries=10,
    max_wait=180,
    max_workers=1,
)

# Build a 100-sample test set from the loaded documents using the custom
# transforms and query distribution; raise_exceptions=False is passed so the
# call is made in non-raising mode.
testdataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=100,
    transforms=transforms,
    query_distribution=distribution,
    run_config=generation_run_config,
    raise_exceptions=False,
)

Applying HeadlinesExtractor:   0%|          | 0/500 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/500 [00:00<?, ?it/s]

Applying KeyphrasesExtractor:   0%|          | 0/1008 [00:00<?, ?it/s]

Property 'keyphrases' already exists in node 'eca0e0'. Skipping!
Property 'keyphrases' already exists in node '396c1b'. Skipping!
Property 'keyphrases' already exists in node '04a974'. Skipping!
Property 'keyphrases' already exists in node '1a11fb'. Skipping!
Property 'keyphrases' already exists in node 'e1e66c'. Skipping!
Property 'keyphrases' already exists in node '788d73'. Skipping!
Property 'keyphrases' already exists in node '3fb891'. Skipping!
Property 'keyphrases' already exists in node '3ef909'. Skipping!
Property 'keyphrases' already exists in node '7a2b2c'. Skipping!
Property 'keyphrases' already exists in node '0e7916'. Skipping!
Property 'keyphrases' already exists in node '92dda6'. Skipping!
Property 'keyphrases' already exists in node '9c7009'. Skipping!
Property 'keyphrases' already exists in node '09345c'. Skipping!
Property 'keyphrases' already exists in node 'dcd18d'. Skipping!
Property 'keyphrases' already exists in node 'e119ba'. Skipping!
Property 'keyphrases' alr

Applying NERExtractor:   0%|          | 0/1008 [00:00<?, ?it/s]

Property 'entities' already exists in node '396c1b'. Skipping!
Property 'entities' already exists in node '9c7009'. Skipping!
Property 'entities' already exists in node '04a974'. Skipping!
Property 'entities' already exists in node '788d73'. Skipping!
Property 'entities' already exists in node '4b84bb'. Skipping!
Property 'entities' already exists in node '3fb891'. Skipping!
Property 'entities' already exists in node 'e1e66c'. Skipping!
Property 'entities' already exists in node '09345c'. Skipping!
Property 'entities' already exists in node '92dda6'. Skipping!
Property 'entities' already exists in node '1a11fb'. Skipping!
Property 'entities' already exists in node '7a2b2c'. Skipping!
Property 'entities' already exists in node 'dcd18d'. Skipping!
Property 'entities' already exists in node 'eca0e0'. Skipping!
Property 'entities' already exists in node 'e119ba'. Skipping!
Property 'entities' already exists in node '3a752b'. Skipping!
Property 'entities' already exists in node '034374'. Sk

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]

In [56]:
# Convert the generated test set into a Ragas EvaluationDataset.
# The previous version read from `dataset`, a name that is never defined in
# this notebook, so it raised NameError on a fresh kernel. The generated
# samples live in `testdataset`.
evaluation_dataset = testdataset.to_evaluation_dataset()

TestsetSample(eval_sample=SingleTurnSample(user_input='Что произошло с Габеном в последнее время?', retrieved_contexts=None, reference_contexts=['POV – казуальный Габен, когда все ждут геймплейный патч 🤳\n\n__UPD: __[__ДРОПНУЛИ!!!__](https://www.dota2.com/newsentry/4178852863469164127)__ ____🎧__\n__UPD 2: Но не патч ____👏__'], response=None, multi_responses=None, reference='Недавно Габен стал казуальным, когда все ждали геймплейный патч. В обновлении сообщается, что что-то дропнули, но это не патч.', rubrics=None), synthesizer_name='single_hop_specifc_query_synthesizer')

In [59]:
# Persist the generated test set as CSV for later inspection / evaluation.
testset_frame = testdataset.to_pandas()
testset_frame.to_csv('~/Work/data/q-insigt-testset.csv')