# Журнал разработки и тестирования агента/мультиагента, специализирующегося на Data Science + Machine Learning + Deep Learning

In [21]:
from datasets import load_dataset, get_dataset_split_names

# Question/answer datasets used for the evaluation. Each entry holds:
#   "NAME" - dataset identifier passed to `get_dataset_split_names` / `load_dataset`
#   "Q"    - name of the column containing the question sent to the model
#   "A"    - name of the column containing the reference answer
datasets_loads = [
    # Disabled alternative source:
    # {"NAME": "soufyane/DATA_SCIENCE_QA", "Q": "Question", "A": "Answer"},
    {"NAME": "team-bay/data-science-qa", "Q": "question", "A": "answer"},
]

In [22]:
from langchain_mistralai import ChatMistralAI
from dotenv import load_dotenv
import evaluate

# Load environment variables from a local .env file
# (presumably this is where the Mistral API key comes from - verify).
load_dotenv()

# Chat model reused by the evaluation loop further down; temperature=0 keeps
# sampling randomness to a minimum, max_retries=2 bounds retries per request.
llm = ChatMistralAI(model="mistral-small-latest", temperature=0, max_retries=2)

# Smoke test: a single translation request to confirm the client is usable.
messages = [
    ("system", "You are a helpful assistant that translates English to French. Translate the user sentence."),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg.content

"J'adore programmer."

In [28]:
# Accumulators filled by the evaluation loop: model outputs and the matching
# reference answers, kept in the same order. Two independent empty lists.
predictions, true_answers = [], []

In [29]:
# Upper bound on how many rows of each split are sent to the model.
MAX_ROWS_PER_SPLIT = 100

# Query the LLM with the first rows of every split of every configured dataset
# and collect (prediction, reference answer) pairs for the metric cells below.
#
# NOTE: results are appended to the `predictions` / `true_answers` lists that
# are created in a separate cell; re-run that cell before re-running this one,
# otherwise every example is stored twice.
for ds_load in datasets_loads:
    splits = get_dataset_split_names(ds_load["NAME"])
    for split in splits:
        ds_split = load_dataset(ds_load["NAME"], split=split)

        # Read both columns once per split instead of re-reading the whole
        # column on every iteration.
        questions = ds_split[ds_load["Q"]]
        answers = ds_split[ds_load["A"]]

        # Never index past the end of a split that has fewer rows than the cap.
        n_rows = min(MAX_ROWS_PER_SPLIT, ds_split.num_rows)

        for i in range(n_rows):
            # The denominator is the full split size, not the number of rows processed.
            print(f"Dataset '{ds_load['NAME']}', split '{split}', row {i+1}/{ds_split.num_rows}.")
            messages = [
                (
                    "system",
                    "You are a helpful assistant that answers on questions, which are related to data science or machine learning.",
                ),
                ("human", questions[i]),
            ]
            ai_msg = llm.invoke(messages)
            predictions.append(ai_msg.content)
            true_answers.append(answers[i])

Dataset 'team-bay/data-science-qa', split 'train', row 1/473.
Dataset 'team-bay/data-science-qa', split 'train', row 2/473.
Dataset 'team-bay/data-science-qa', split 'train', row 3/473.
Dataset 'team-bay/data-science-qa', split 'train', row 4/473.
Dataset 'team-bay/data-science-qa', split 'train', row 5/473.
Dataset 'team-bay/data-science-qa', split 'train', row 6/473.
Dataset 'team-bay/data-science-qa', split 'train', row 7/473.
Dataset 'team-bay/data-science-qa', split 'train', row 8/473.
Dataset 'team-bay/data-science-qa', split 'train', row 9/473.
Dataset 'team-bay/data-science-qa', split 'train', row 10/473.
Dataset 'team-bay/data-science-qa', split 'train', row 11/473.
Dataset 'team-bay/data-science-qa', split 'train', row 12/473.
Dataset 'team-bay/data-science-qa', split 'train', row 13/473.
Dataset 'team-bay/data-science-qa', split 'train', row 14/473.
Dataset 'team-bay/data-science-qa', split 'train', row 15/473.
Dataset 'team-bay/data-science-qa', split 'train', row 16/473.
D

In [31]:
# Corpus-level BLEU of the model answers against the reference answers.
# NOTE(review): the recorded output shows length_ratio of about 16, i.e. the
# predictions are far longer than the references - keep that in mind when
# reading the n-gram precisions.
bleu = evaluate.load("bleu")
results = bleu.compute(
    references=true_answers,
    predictions=predictions,
)
results

{'bleu': 0.01793607127263799,
 'precisions': [0.05018583501173148,
  0.023803083581282122,
  0.012489314234482183,
  0.006936754351141848],
 'brevity_penalty': 1.0,
 'length_ratio': 16.11274673803948,
 'translation_length': 48161,
 'reference_length': 2989}

In [33]:
# ROUGE scores of the model answers against the same reference answers;
# `results` is overwritten with the ROUGE output.
rouge = evaluate.load("rouge")
results = rouge.compute(
    references=true_answers,
    predictions=predictions,
)
results

{'rouge1': 0.13003004792455541,
 'rouge2': 0.06808846128378661,
 'rougeL': 0.10825821395233363,
 'rougeLsum': 0.11872244724027176}

In [39]:
import numpy as np

# BLEURT with the "bleurt-large-512" checkpoint. The output holds one score
# per prediction under the "scores" key, which the next cell summarises.
bleurt = evaluate.load(
    "bleurt",
    module_type="metric",
    checkpoint="bleurt-large-512",
)
results = bleurt.compute(
    references=true_answers,
    predictions=predictions,
)

print(results)



{'scores': [-0.44078388810157776, -0.13905742764472961, -0.1375528872013092, -0.6471426486968994, -0.0185844786465168, -0.24806255102157593, -0.20272958278656006, -0.43642058968544006, -0.1816762387752533, -0.38655850291252136, -0.44491833448410034, -0.6149903535842896, -0.37793418765068054, -0.8058252930641174, -0.5447061061859131, -0.4497118890285492, -0.01762155070900917, -0.6250327229499817, -0.1843060553073883, -0.32236918807029724, -0.20398402214050293, -0.1957666575908661, -0.018202949315309525, -0.3393728733062744, -0.43493419885635376, -0.11338627338409424, -0.12296149134635925, -0.15445387363433838, -0.32360365986824036, -0.815139889717102, -0.19238919019699097, -0.3668462336063385, -0.41167300939559937, -0.5887979865074158, 0.14859294891357422, -0.2813679873943329, -0.37445324659347534, -0.46711575984954834, -0.3773594796657562, -0.1781224012374878, -0.5071555972099304, 0.010943692177534103, -0.0038436688482761383, -0.4347352981567383, -0.2485138475894928, -0.412445038557052

In [40]:
# Summary of the per-example BLEURT scores: count, mean, standard deviation.
print(*(stat(results["scores"]) for stat in (len, np.mean, np.std)))

100 -0.31261963926255704 0.21429681531101655
