In [32]:
import json
from langchain_openai import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import LlamaCpp
import pandas as pd
import yaml
import os
from tqdm.auto import tqdm

In [7]:
with open('secrets.yaml', 'r') as f:
    secrets = yaml.safe_load(f)

openai_key = secrets['openai_key']

In [8]:
template = """You are provided text, question and list of answers. You need to select the correct answer for the given question. The correct answer is the one that is most relevant to the question.
Return the index of the correct answer. Your answer should be a number 1 or 2. Do not include any other information in your answer.

text: {text}

question: {question}

list of answers:
1: {answer1}
2: {answer2}
Answer: """

In [9]:
prompt = ChatPromptTemplate.from_template(template)

In [10]:
llm = OpenAI(api_key=openai_key)

In [11]:
eval_chain = prompt | llm

In [12]:
text = "Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe."
question = "Who directed Bleach: Hell Verse?"
answer1 = "Noriyuki Abe"
answer2 = "Bleach: Hell Verse is a 2010 Japanese animated film directed by Noriyuki Abe."


In [13]:
res = eval_chain.invoke({"text": text, "question": question, "answer1": answer1, "answer2": answer2})
res

'1'

# Сравнение резльутатов

- Сравниваем Rag_0 и серию [cosine_reranked_ft, cosine_reranked_ftpp, cosine_reranked_gt]
- Сравниваем тольк те, в ответах которых есть различия
- Сравнивать со стороым подходом реранжирования (llm) нет смысла, так как он нигде не менял

In [40]:
metrics_list = []

for domain in ['movie', 'computer', 'nature']:

    df = pd.read_csv(os.path.join('artifacts', domain, 'eval_dataset.csv'))

    # фильтрум те, в которых были разные rag, то есть реранжирование вообще имело смысл
    df = df[df.uniq_count > 1]
    total_rows = df.shape[0]

    # Фильтруем, где вообщее есть различия
    df = df[(df.rag_0 != df.cosine_reranked_ft) | (df.rag_0 != df.cosine_reranked_ftpp) | (df.rag_0 != df.cosine_reranked_gt)]
    diff_rows = df.shape[0]

    columns = ['cosine_reranked_ft', 'cosine_reranked_ftpp', 'cosine_reranked_gt']
    for column in columns:
        print(f'{domain} {column}')

        rag_better_count = 0
        reranked_better_count = 0
        unknown_count = 0

        for index in tqdm(df.index):
            text = df.loc[index, 'sent']
            question = df.loc[index, 'question']
            answer1 = df.loc[index, 'rag_0']
            answer2 = df.loc[index, column]
            res = eval_chain.invoke({"text": text, "question": question, "answer1": answer1, "answer2": answer2})
            if res.strip(' .)') == '1':
                rag_better_count += 1
            elif res.strip(' .)') == '2':
                reranked_better_count += 1
            else:
                print(f'Error parse llm output: {res}')
                unknown_count += 1

        metrics_list.append({
            'domain': domain,
            'algorithm': column,
            'total_rows': total_rows,
            'diff_rows': diff_rows,
            'rag_better_count': rag_better_count,
            'reranked_better_count': reranked_better_count,
            'unknown_count': unknown_count
        })

metrics_list

movie cosine_reranked_ft


100%|██████████| 41/41 [00:16<00:00,  2.48it/s]


movie cosine_reranked_ftpp


100%|██████████| 41/41 [00:15<00:00,  2.63it/s]


movie cosine_reranked_gt


100%|██████████| 41/41 [00:15<00:00,  2.60it/s]


computer cosine_reranked_ft


  9%|▉         | 2/22 [00:00<00:06,  3.23it/s]

Error parse llm output: 2



100%|██████████| 22/22 [00:08<00:00,  2.59it/s]


computer cosine_reranked_ftpp


100%|██████████| 22/22 [00:08<00:00,  2.48it/s]


computer cosine_reranked_gt


100%|██████████| 22/22 [00:07<00:00,  2.85it/s]


nature cosine_reranked_ft


100%|██████████| 38/38 [00:15<00:00,  2.51it/s]


nature cosine_reranked_ftpp


100%|██████████| 38/38 [00:15<00:00,  2.53it/s]


nature cosine_reranked_gt


  3%|▎         | 1/38 [00:00<00:17,  2.06it/s]

Error parse llm output: 1: Sapotaceae


100%|██████████| 38/38 [00:15<00:00,  2.53it/s]


[{'domain': 'movie',
  'algorithm': 'cosine_reranked_ft',
  'total_rows': 214,
  'diff_rows': 41,
  'rag_better_count': 27,
  'reranked_better_count': 14,
  'unknown_count': 0},
 {'domain': 'movie',
  'algorithm': 'cosine_reranked_ftpp',
  'total_rows': 214,
  'diff_rows': 41,
  'rag_better_count': 24,
  'reranked_better_count': 17,
  'unknown_count': 0},
 {'domain': 'movie',
  'algorithm': 'cosine_reranked_gt',
  'total_rows': 214,
  'diff_rows': 41,
  'rag_better_count': 26,
  'reranked_better_count': 15,
  'unknown_count': 0},
 {'domain': 'computer',
  'algorithm': 'cosine_reranked_ft',
  'total_rows': 62,
  'diff_rows': 22,
  'rag_better_count': 15,
  'reranked_better_count': 6,
  'unknown_count': 1},
 {'domain': 'computer',
  'algorithm': 'cosine_reranked_ftpp',
  'total_rows': 62,
  'diff_rows': 22,
  'rag_better_count': 13,
  'reranked_better_count': 9,
  'unknown_count': 0},
 {'domain': 'computer',
  'algorithm': 'cosine_reranked_gt',
  'total_rows': 62,
  'diff_rows': 22,
  'r

In [42]:
metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv(os.path.join('artifacts', 'rag_metrics.csv'), index=False)
metrics_df

Unnamed: 0,domain,algorithm,total_rows,diff_rows,rag_better_count,reranked_better_count,unknown_count
0,movie,cosine_reranked_ft,214,41,27,14,0
1,movie,cosine_reranked_ftpp,214,41,24,17,0
2,movie,cosine_reranked_gt,214,41,26,15,0
3,computer,cosine_reranked_ft,62,22,15,6,1
4,computer,cosine_reranked_ftpp,62,22,13,9,0
5,computer,cosine_reranked_gt,62,22,17,5,0
6,nature,cosine_reranked_ft,114,38,21,17,0
7,nature,cosine_reranked_ftpp,114,38,17,21,0
8,nature,cosine_reranked_gt,114,38,25,12,1


**Вывод**:
- Количество ответов, где RAG был лучше, больше чем RAG+RR
- Есть только один случай nature+cosine_reranked_ftpp, где RAG+RR немного лушче RAG
- можно отметить, что ftpp (построцессинговые файнтюненые) триплеты на всех доменах показали лучший результат, и даже лушче gt
- возможно llm (второй подход к ранжирвованию) была права, оставляя ранжирование исходного RAG	