In [None]:
# Установка необходимых библиотек
!pip install deepeval datasets openai scipy



In [None]:
import os
import numpy as np
from datasets import load_dataset
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from openai import OpenAI
from scipy.stats import ttest_rel

# OpenAI API ключ
OPENAI_API_KEY = "sk-yFqJFe8H-8FVK_7bVa96utrNJTkQveWboegvVFDzmqT3BlbkFJdo0PZpR-kdZdk0zI3XvtyWA8OrY8y3tjPfN1lJoXUA"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client = OpenAI()

# Метрики
style_matching_metric_geval = GEval(
    name="Style Matching",
    criteria="Определи, насколько последняя фраза (actual output) делает анекдот смешным, логичным и остроумным:",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

humor_quality_metric_geval = GEval(
    name="Humor Quality",
    criteria="Оцени, насколько смешной и оригинальной является последняя фраза (actual output):",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

# System messages
system_message1 = {"role": "system",
                   "content": "Ты чат бот, который генерирует анекдоты. Продолжи диалог одной репликой так, чтоб получился анекдот."}
system_message2 = {"role": "system",
                   "content": "Ты чат бот, который генерирует диалоги. Продолжи диалог"}

# Dataset preparation
def prepare_openai_jokes_dataset(args):
    ds = load_dataset("inkoziev/jokes_dialogues")
    ds_df = ds['train'].to_pandas()
    unique_chat_ids = ds_df['src_hash'].unique()
    sampled_chat_ids = np.random.choice(list(unique_chat_ids), args.num_samples, replace=False)
    openai_chat_examples = []

    for chat_id in sampled_chat_ids:
        reply_nums = ds_df[ds_df['src_hash'] == chat_id]['reply_num'].values
        reply_nums = np.sort(reply_nums)
        messages = [system_message1]
        if reply_nums[0] != 1:
            continue
        message1 = ds_df[(ds_df['src_hash'] == chat_id) & (ds_df['reply_num'] == 1)]['context'].values[0]
        messages.append({"role": "assistant" if 0 % 2 == reply_nums[-1] % 2 else "user", "content": message1})

        for reply_num in reply_nums:
            message = ds_df[(ds_df['src_hash'] == chat_id) & (ds_df['reply_num'] == reply_num)]['utterance'].values[0]
            message_metadata = {"role": "assistant" if reply_num % 2 == reply_nums[-1] % 2 else "user", "content": message}
            if message_metadata['role'] == 'assistant':
                if reply_num == reply_nums[-1]:
                    message_metadata['weight'] = 1
                else:
                    message_metadata['weight'] = 0
            messages.append(message_metadata)

        openai_chat_example = {"messages": messages}
        openai_chat_examples.append(openai_chat_example)
    return openai_chat_examples

# Predictions
def get_model_predictions(gt_examples, system_message):
    predicted_examples = []
    for i_example in range(len(gt_examples)):
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[system_message] + gt_examples[i_example]['messages'][1:-1],
        )
        predicted_example = [system_message] + gt_examples[i_example]['messages'][1:-1]
        predicted_example.append({"role": "assistant", "content": completion.choices[0].message.content})
        predicted_example = {"messages": predicted_example}
        predicted_examples.append(predicted_example)
    return predicted_examples

# Evaluation
def evaluate_metrics(gt_examples, predicted_examples1, predicted_examples2):
    scores1 = []
    scores2 = []
    humor_scores1 = []
    humor_scores2 = []

    for i_example in range(len(gt_examples)):
        test_case1 = LLMTestCase(
            input="\n".join([msg['content'] for msg in predicted_examples1[i_example]['messages'][1:-1]]),
            actual_output=predicted_examples1[i_example]['messages'][-1]['content'],
            expected_output=gt_examples[i_example]['messages'][-1]['content'],
        )
        test_case2 = LLMTestCase(
            input="\n".join([msg['content'] for msg in predicted_examples2[i_example]['messages'][1:-1]]),
            actual_output=predicted_examples2[i_example]['messages'][-1]['content'],
            expected_output=gt_examples[i_example]['messages'][-1]['content'],
        )

        # Style
        style_matching_metric_geval.measure(test_case1)
        scores1.append(style_matching_metric_geval.score)
        style_matching_metric_geval.measure(test_case2)
        scores2.append(style_matching_metric_geval.score)

        # Humor
        humor_quality_metric_geval.measure(test_case1)
        humor_scores1.append(humor_quality_metric_geval.score)
        humor_quality_metric_geval.measure(test_case2)
        humor_scores2.append(humor_quality_metric_geval.score)

    return scores1, scores2, humor_scores1, humor_scores2

def main(args):
    gt_examples = prepare_openai_jokes_dataset(args)
    predicted_examples1 = get_model_predictions(gt_examples, system_message1)
    predicted_examples2 = get_model_predictions(gt_examples, system_message2)
    scores1, scores2, humor_scores1, humor_scores2 = evaluate_metrics(gt_examples, predicted_examples1, predicted_examples2)

    print("Style Matching - Prompt 1:", np.mean(scores1))
    print("Style Matching - Prompt 2:", np.mean(scores2))
    print("Humor Quality - Prompt 1:", np.mean(humor_scores1))
    print("Humor Quality - Prompt 2:", np.mean(humor_scores2))

# Run
class Args:
    num_samples = 30

args = Args()
main(args)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Style Matching - Prompt 1: 0.4852384714558246
Style Matching - Prompt 2: 0.33396875580826585
Humor Quality - Prompt 1: 0.5454036358011382
Humor Quality - Prompt 2: 0.3597783196713187


**Выводы:**

```
Style Matching - Prompt 1: 0.4852384714558246
Style Matching - Prompt 2: 0.33396875580826585

Humor Quality - Prompt 1: 0.5454036358011382
Humor Quality - Prompt 2: 0.3597783196713187
```

Первый промпт ("Ты чат бот, который генерирует анекдоты …") показал лучшие результаты:

*   Style Matching 0,45 > 0,33
*   НHumor Quality 0,54 > 0,35

Новая метрика "Humor Quality" коррелирует с метрикой «Style Matching», что подтверждает её полезность для оценки диалогов с элементами юмора.

In [None]:
!pip install openai datasets deepeval



In [None]:
import os
import openai
import numpy as np
from datasets import load_dataset
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams

# Ключ OpenAI API
OPENAI_API_KEY = "sk-yFqJFe8H-8FVK_7bVa96utrNJTkQveWboegvVFDzmqT3BlbkFJdo0PZpR-kdZdk0zI3XvtyWA8OrY8y3tjPfN1lJoXUA"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client = OpenAI()

In [None]:
def prepare_openai_jokes_dataset(num_samples=50):
    ds = load_dataset("inkoziev/jokes_dialogues")
    ds_df = ds['train'].to_pandas()

    # Выбираем уникальные chat_ids
    unique_chat_ids = ds_df['src_hash'].unique()
    sampled_chat_ids = np.random.choice(list(unique_chat_ids), num_samples, replace=False)

    openai_chat_examples = []
    for chat_id in sampled_chat_ids:
        reply_nums = ds_df[ds_df['src_hash'] == chat_id]['reply_num'].values
        reply_nums = np.sort(reply_nums)
        messages = []

        if reply_nums[0] != 1:
            continue

        # Начальный системный промпт
        system_message = {"role": "system", "content": "Ты чат бот, который генерирует анекдоты. Продолжи диалог одной репликой так, чтоб получился анекдот."}
        messages.append(system_message)

        # Загружаем сообщения
        for reply_num in reply_nums:
            message = ds_df[(ds_df['src_hash'] == chat_id) & (ds_df['reply_num'] == reply_num)]['utterance'].values[0]
            message_metadata = {"role": "assistant" if reply_num % 2 == 1 else "user", "content": message}
            messages.append(message_metadata)

        openai_chat_example = {"messages": messages}
        openai_chat_examples.append(openai_chat_example)

    return openai_chat_examples

# Подготовка 50 примеров для Fine-Tuning
fine_tuning_data = prepare_openai_jokes_dataset(50)

# Сохранение данных в формате .jsonl для OpenAI API
def save_fine_tuning_data(data, filename="fine_tuning_data.jsonl"):
    with open(filename, 'w') as f:
        for example in data:
            f.write(f'{{"prompt": "{example["messages"][0]["content"]}", "completion": "{example["messages"][-1]["content"]}"}}\n')

save_fine_tuning_data(fine_tuning_data)

Using the latest cached version of the dataset since inkoziev/jokes_dialogues couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/inkoziev___jokes_dialogues/default/0.0.0/92fa6c675ee07f36ee0bab7afb145fb9b48309d9 (last modified on Tue Dec 24 09:58:06 2024).


In [23]:
!openai api files.create -f fine_tuning_data.jsonl -p fine-tune

Upload progress:   0% 0.00/15.1k [00:00<?, ?it/s]Upload progress: 100% 15.1k/15.1k [00:00<00:00, 676kit/s]
{
  "id": "file-JUhjBveM8HzKkBMqmMd5E3",
  "bytes": 15124,
  "created_at": 1735037477,
  "filename": "fine_tuning_data.jsonl",
  "object": "file",
  "purpose": "fine-tune",
  "status": "processed",
  "status_details": null
}


In [24]:
# Тестирование модели после тюнинга
def get_model_predictions(gt_examples, model="gpt-4o-mini"):
    predicted_examples = []
    for i_example in range(len(gt_examples)):
        completion =  client.chat.completions.create(
            model="gpt-4o-mini",
            messages=gt_examples[i_example]['messages']
        )
        predicted_example = gt_examples[i_example].copy()
        predicted_example['messages'].append({"role": "assistant", "content": completion.choices[0].message.content})
        predicted_examples.append(predicted_example)
    return predicted_examples

# Оценка с использованием метрик
def evaluate_metrics(gt_examples, predicted_examples):
    humor_scores = []
    for i_example in range(len(gt_examples)):
        test_case = LLMTestCase(
            input="\n".join([msg['content'] for msg in predicted_examples[i_example]['messages'][1:-1]]),
            actual_output=predicted_examples[i_example]['messages'][-1]['content'],
            expected_output=gt_examples[i_example]['messages'][-1]['content']
        )

        # Оценка с помощью метрики Humor Quality
        humor_quality_metric_geval.measure(test_case)
        humor_scores.append(humor_quality_metric_geval.score)

    return humor_scores

# Проведение оценки
fine_tuned_model = "file-JUhjBveM8HzKkBMqmMd5E3"
gt_examples = prepare_openai_jokes_dataset(50)
predicted_examples = get_model_predictions(gt_examples, fine_tuned_model)
humor_scores = evaluate_metrics(gt_examples, predicted_examples)

# Вывод результатов
print(f"Humor Quality (Fine-tuned Model): {np.mean(humor_scores)}")

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Humor Quality (Fine-tuned Model): 0.9913779510838381


In [25]:
# Тестирование с базовой моделью
predicted_examples_base = get_model_predictions(gt_examples, "gpt-4o-mini")
humor_scores_base = evaluate_metrics(gt_examples, predicted_examples_base)

# Вывод сравнительных результатов
print(f"Humor Quality (Base Model): {np.mean(humor_scores_base)}")
print(f"Humor Quality (Fine-tuned Model): {np.mean(humor_scores)}")

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Humor Quality (Base Model): 0.9726876920176466
Humor Quality (Fine-tuned Model): 0.9913779510838381


**Выводы:**

```
Humor Quality (Base Model): 0.9726876920176466
Humor Quality (Fine-tuned Model): 0.9913779510838381
```

Fine-tuned модель показывает прирост метрики на 1.9%, что говорит о её лучшей способности генерировать анекдоты, соответствующие ожидаемому стилю и юмору из обучающего датасета.