In [None]:
from typing import List
# suppress pandas future warning
import warnings
import pandas as pd

# run opentrons_simulate in the terminal and get the stdout
# and save it to a file

import os
import subprocess
import uuid
import shutil
import datetime
import glob
from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, TypeVar, Generic, Type, cast, overload

warnings.filterwarnings("ignore", category=FutureWarning)

from utils import init_df_eval, save_prompt_and_answer_with_modelname, extract_markdown_code_blocks, prepare_python_script, run_opentrons_simulate, check_filename_extract_model_info

In [None]:
# Directory holding the prompt/answer files; evaluate() also writes df_eval.csv into it.
dataset_path = "./question_and_answer"
# Build the evaluation table from that directory (init_df_eval is a project helper from utils).
df_eval = init_df_eval(dataset_path)
# Show the freshly initialised table.
df_eval

In [None]:
def evaluate():
    """
    Simulate every answer listed in the module-level `df_eval` and record the outcome.

    For each row, the answer file in `prompt_answer_file_path` is turned into a
    Python script with `prepare_python_script`, run through
    `run_opentrons_simulate`, and the results are written back into `df_eval`
    in place (result file path, raw output, last output line, result type,
    model, finetuned flag, prompt version and script path).

    After all rows are processed the table is saved to
    `<dataset_path>/df_eval.csv`.

    Returns:
        None. `df_eval` is mutated in place.
    """
    for row in df_eval.itertuples():
        file_path = row.prompt_answer_file_path
        model, finetuned, prompt_ver = check_filename_extract_model_info(os.path.basename(file_path))
        python_script_path = prepare_python_script(file_path)
        # NOTE(review): the same 'eva.txt' name is passed for every row — confirm that
        # run_opentrons_simulate derives a unique output path from it.
        output_path, output_text, result_type = run_opentrons_simulate(python_script_path, 'eva.txt')
        print(f'prompt_ver: {prompt_ver}, model: {model}, result_type: {result_type}')

        # An empty simulator output has no lines; store '' instead of raising IndexError.
        output_lines = output_text.splitlines()
        last_line = output_lines[-1] if output_lines else ''

        # Insertion order matches the order in which the columns were originally created.
        row_values = {
            'opentrons_simulate_result_file_path': output_path,
            'opentrons_simulate_result_raw_text': output_text,
            'opentrons_simulate_result_last_line': last_line,
            'opentrons_simulate_result': result_type,
            'model': model,
            'finetuned': finetuned,
            'prompt_ver': prompt_ver,
            'python_script_file_path': python_script_path,
        }
        for column, value in row_values.items():
            df_eval.at[row.Index, column] = value

    # save df_eval
    df_eval.to_csv(dataset_path + '/df_eval.csv', index=False)

# Run the simulation for every row; this fills df_eval in place and writes df_eval.csv.
evaluate()

In [None]:
# Inspect the table after evaluate() has added the simulation result columns.
df_eval

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Known model names. The position in this list is the sort order plot_result uses for the
# x axis, and the deprecated table builders iterate over it. Names ending in `_N` are the
# N-th turn of a conversational run (see add_conversation_id).
MODEL_LIST = [ 'ada', 'finetuned-davinci', 'code-davinci-002', 'text-davinci-003',
       'chatgpt', 'gpt-3.5-turbo', 'gpt-4', 'gpt-3.5-turbo_1', 'gpt-3.5-turbo_2', 'gpt-3.5-turbo_3',
       'gpt-3.5-turbo_4', 'gpt-3.5-turbo_5','gpt-4_1', 'gpt-4_2', 'gpt-4_3', 'gpt-4_4', 'gpt-4_5']

def add_error_kind(df_eval: pd.DataFrame):
    """
    Classify each simulation result and store the label in an `error_kind` column.

    The label is derived in this order (later rules override earlier ones):

    1. The alphabetic word immediately before the first ':' in
       `opentrons_simulate_result_last_line` (e.g. ``NameError``).
    2. If `opentrons_simulate_result_raw_text` mentions one of
       ``ExceptionInProtocolError``, ``ApiDeprecationError``,
       ``MalformedProtocolError`` or ``FileNotFoundError``, that name is used
       (checked in this order, so the last match wins).
    3. Rows whose `opentrons_simulate_result` is ``'error'`` but that still have
       no label become ``'UnknownError'``.
    4. Rows whose `opentrons_simulate_result` is ``'ok'`` become ``'success'``.

    Args:
        df_eval: Evaluation table; modified in place.

    Returns:
        The same `df_eval` object with the `error_kind` column added/overwritten.
    """
    last_line = df_eval['opentrons_simulate_result_last_line']
    raw_text = df_eval['opentrons_simulate_result_raw_text']

    # extract before ':' (expand=False yields a Series rather than a one-column frame)
    df_eval['error_kind'] = last_line.str.extract(r'([a-zA-Z]+):', expand=False)

    # Errors that are recognised anywhere in the raw output rather than on the last line.
    # na=False keeps the mask strictly boolean when the raw text is missing; without it
    # `.loc` rejects the mask. regex=False matches the names literally.
    raw_text_error_names = (
        'ExceptionInProtocolError',
        'ApiDeprecationError',
        'MalformedProtocolError',
        'FileNotFoundError',
    )
    for error_name in raw_text_error_names:
        mentioned = raw_text.str.contains(error_name, regex=False, na=False)
        df_eval.loc[mentioned, 'error_kind'] = error_name

    # if error and no label could be extracted, add UnknownError
    failed = df_eval['opentrons_simulate_result'] == 'error'
    unlabeled = df_eval['error_kind'].isnull() | (df_eval['error_kind'] == '')
    df_eval.loc[failed & unlabeled, 'error_kind'] = 'UnknownError'

    # if opentrons_simulate_result == 'ok', set error_kind to 'success'
    df_eval.loc[df_eval['opentrons_simulate_result'] == 'ok', 'error_kind'] = 'success'
    return df_eval

def filter_prompt_ver(df_eval: pd.DataFrame, prompt_ver: List[str] = ['v1', 'v2']):
    """
    Keep only the rows whose `prompt_ver` is one of the requested versions.

    Args:
        df_eval: Evaluation table with a `prompt_ver` column.
        prompt_ver: Prompt versions to keep (the default list is only read, never mutated).

    Returns:
        The matching rows of `df_eval`.
    """
    wanted_rows = df_eval['prompt_ver'].isin(prompt_ver)
    return df_eval.loc[wanted_rows]

def plot_result(df_eval: pd.DataFrame, prompt_ver: str = 'v1'):
    """
    Plot the error-kind distribution per model for one prompt version.

    Two figures are produced and saved under `./fig` (created if missing):

    * `result_prompt_ver_<prompt_ver>.pdf` — count plot of `error_kind` per model.
    * `result_prompt_ver_<prompt_ver>_table.png` — heatmap of the model x error_kind
      cross-tabulation.

    Rows for the `chatgpt` model are excluded. Models are ordered by their
    position in the module-level `MODEL_LIST`; models not listed there are
    placed last. Colours come from the module-level `ERROR_LIST`
    (`success` is always light green).

    Args:
        df_eval: Evaluation table with `prompt_ver`, `model` and `error_kind` columns.
        prompt_ver: Prompt version to plot.

    Returns:
        None.
    """
    df_eval = df_eval[df_eval['prompt_ver'] == prompt_ver]
    # filter out 'chatgpt'
    df_eval = df_eval[df_eval['model'] != 'chatgpt']
    if len(df_eval) == 0:
        print(f'no data for prompt_ver={prompt_ver}')
        return

    # sort by model name; unknown models sort after every known one instead of raising
    model_rank = {name: rank for rank, name in enumerate(MODEL_LIST)}
    df_eval = df_eval.sort_values(
        by='model',
        key=lambda models: models.map(lambda name: model_rank.get(name, len(model_rank))),
    )

    # One colour per known error kind. Kinds present in the data but missing from
    # ERROR_LIST are appended so seaborn does not reject the palette dictionary.
    error_kinds = list(ERROR_LIST)
    for kind in df_eval['error_kind'].dropna().unique():
        if kind not in error_kinds:
            error_kinds.append(kind)
    base_palette = sns.color_palette("flare", len(error_kinds))
    color_palette = {key: base_palette[i] if key != 'success' else 'lightgreen' for i, key in enumerate(error_kinds)}

    os.makedirs('./fig', exist_ok=True)

    # count plot
    plt.rcParams['font.size'] = 18
    plt.figure(figsize=(20, 12))
    sns.countplot(x='model', hue='error_kind', data=df_eval, palette=color_palette)
    plt.title(f'result, prompt_ver={prompt_ver}')
    # rotate x labels so they do not overlap
    plt.xlabel(None)
    plt.xticks(rotation=60)
    plt.tight_layout()
    plt.savefig(f'./fig/result_prompt_ver_{prompt_ver}.pdf', dpi=300)

    # table (heatmap); the title must be set before saving to appear in the file
    plt.figure(figsize=(20, 12))
    sns.heatmap(pd.crosstab(df_eval['model'], df_eval['error_kind']), annot=True, fmt='d', cmap='Blues')
    plt.title(f'result, prompt_ver={prompt_ver}')
    plt.savefig(f'./fig/result_prompt_ver_{prompt_ver}_table.png', dpi=300)
    return



In [None]:
PROMPT_VAR_LIST = ['v1', 'v2', 'v3', 'v4', 'v3.1', 'v4.1']
UNIQUE_MODEL_LIST = ["gpt-4", "gpt-3.5-turbo", "ada", "text-davinci-003"]

# Error kinds that get a colour in plot_result (None covers rows without a label).
ERROR_LIST = ['ApiDeprecationError', 'UnknownError', 'IndentationError', 'RuntimeError', 'SyntaxError', 'ExceptionInProtocolError', None, 'AttributeError', 'success', 'models', 'NameError', 'TabError', 'ValueError', 'MalformedProtocolError', 'FileNotFoundError', 'TypeError']

USE_MODEL_LIST = ["gpt-4_1", "gpt-4_2", "gpt-4_3", "gpt-4_4", "gpt-4_5", "gpt-3.5-turbo_1", "gpt-3.5-turbo_2", "gpt-3.5-turbo_3", "gpt-3.5-turbo_4", "gpt-3.5-turbo_5"]
USE_MODEL_GROUP_LIST = ["gpt-4", "gpt-3.5-turbo"]
USE_COLUMNS = ['prompt_ver', 'model_group', 'error_kind', 'iteration', 'conversation_id']

# Prompt versions iterated by create_table and calculate_success_rate_by_n_turns.
# It has to exist at module level, otherwise those functions raise NameError.
USE_PROMPT_VER = ['v1', 'v2', 'v3', 'v4', 'v3.1', 'v4.1', 'v2.cell2', 'v2.medium2', 'v2.cell2.medium2', 'v2-B']
# Optional: restrict the whole analysis to those versions.
# df_eval = filter_prompt_ver(df_eval, prompt_ver=USE_PROMPT_VER)

# Prompt versions to plot. Other versions used in earlier experiments:
# 'v2.cell2', 'v2.medium2', 'v2.cell2.medium2', 'v2-B', 'v2-B.cell2', 'v2-B.target2',
# 'v2-B.cell2.target2', 'v3', 'v4', 'v3.1', 'v4.1'.
PLOT_PROMPT_VERS = ['v1', 'v2']

# Label the rows once (add_error_kind works in place and returns the same frame),
# then draw one pair of figures per prompt version.
df_eval = add_error_kind(df_eval)
for plot_prompt_ver in PLOT_PROMPT_VERS:
    plot_result(df_eval, prompt_ver=plot_prompt_ver)

In [None]:
# Distinct error kinds present in the data (compare against ERROR_LIST used for plot colours).
list(df_eval.error_kind.unique())

In [None]:
def add_conversation_id(df_eval: pd.DataFrame) -> pd.DataFrame:
    """
    Add `model_group`, `iteration` and `conversation_id` columns to `df_eval`.

    Conversational runs are stored as one model name per turn, e.g. gpt-4_1,
    gpt-4_2, ... gpt-4_5. For such rows (model name ends in `_<digits>`):

    * `model_group` is the model name without that numeric suffix (gpt-4_5 -> gpt-4),
    * `iteration` is the numeric suffix (gpt-4_5 -> 5),
    * `conversation_id` is the 4th `_`-separated token of the answer file name,
      or the 5th token when the 4th is the literal ``last``
      (e.g. chat_loop_1_2c2cef27-69dc-401c-a8af-01bbb295e342_gpt-4_... ->
      2c2cef27-69dc-401c-a8af-01bbb295e342). If the file name has too few
      tokens the id is ''.

    For every other row `model_group` is the model name itself, `iteration`
    is 0 and `conversation_id` is ''.

    E.g.
        | model (str) | prompt_ver (str) | model_group (str) | iteration (int) | conversation_id (str)
        | --- | --- | --- | --- | --- |
        | gpt-4_1 | v1 | gpt-4 | 1 | uuid |
        | gpt-4_2 | v1 | gpt-4 | 2 | uuid |
        | gpt-4_3 | v1 | gpt-4 | 3 | uuid |

    Args:
        df_eval: Evaluation table with `model` and `prompt_answer_file_path`
            columns; modified in place.

    Returns:
        The same `df_eval` object with the three columns added.
    """
    model_groups = []
    iterations = []
    conversation_ids = []
    for row in df_eval.itertuples():
        # Split only at the LAST underscore so that just the trailing turn number is removed.
        base_name, separator, suffix = row.model.rpartition('_')
        if separator and suffix.isdigit():
            name_parts = os.path.basename(row.prompt_answer_file_path).split('_')
            # File names may carry an extra 'last' token in front of the id.
            id_position = 4 if len(name_parts) > 3 and name_parts[3] == 'last' else 3
            model_groups.append(base_name)
            iterations.append(int(suffix))
            conversation_ids.append(name_parts[id_position] if len(name_parts) > id_position else '')
        else:
            model_groups.append(row.model)
            iterations.append(0)
            conversation_ids.append('')

    # Column-wise assignment keeps `iteration` an integer column and also creates the
    # columns when the frame has no rows.
    df_eval['model_group'] = model_groups
    df_eval['iteration'] = iterations
    df_eval['conversation_id'] = conversation_ids
    return df_eval

# add_conversation_id fills the columns on df_eval itself and returns it, so
# df_eval_conversation refers to the same object as df_eval.
df_eval_conversation = add_conversation_id(df_eval)

In [None]:
def use_some_columns(df_eval: pd.DataFrame):
    """
    Reduce the evaluation table to the conversational rows and columns needed
    for plotting and tables.

    Keeps only the columns in `USE_COLUMNS`, only the rows whose `model_group`
    is in `USE_MODEL_GROUP_LIST`, casts `iteration` to int and drops rows with
    iteration 0 (rows that are not a turn of a conversation).

    Args:
        df_eval: Table produced by `add_conversation_id`.

    Returns:
        A new DataFrame; `df_eval` is not modified.
    """
    in_used_group = df_eval['model_group'].isin(USE_MODEL_GROUP_LIST)
    # Explicit copy: the cast below must write to an independent frame, not to a
    # slice of df_eval (which triggers SettingWithCopyWarning).
    df = df_eval.loc[in_used_group, USE_COLUMNS].copy()
    # iteration to int
    df['iteration'] = df['iteration'].astype(int)
    # filter iteration zero
    return df[df['iteration'] != 0]

# Compact table of conversational turns only (columns limited to USE_COLUMNS).
df_eval_short = use_some_columns(df_eval_conversation)
df_eval_short.head(10)

In [None]:

def format_dataframe(df: pd.DataFrame, model: str, prompt_ver: str) -> Tuple[pd.DataFrame, list]:
    """
    Create the raw per-conversation result table for the paper.

    Rows of `df` are restricted to one `model_group` and one `prompt_ver`. Each
    conversation becomes one output row ("Sample No." counts conversations in
    sorted `conversation_id` order) and the columns '1'..'5' hold the error
    kind observed at that turn ('' when the conversation has no such turn).

    The table is also written to `./table/all_<model>_<prompt_ver>.csv`; the
    directory is created if it does not exist.

    Args:
        df: Table with `model_group`, `prompt_ver`, `conversation_id`,
            `iteration` and `error_kind` columns.
        model: Model group to keep, e.g. 'gpt-4'.
        prompt_ver: Prompt version to keep, e.g. 'v1'.

    Returns:
        (result_df, results): the formatted table and the list of intermediate
        per-turn rows it was built from.

    NOTE(review): the merge below is keyed on 'Sample No.' only, so the returned
    table carries `conversation_id_x` and `conversation_id_y` instead of a
    single `conversation_id` column. Left unchanged to keep the CSV layout
    stable — confirm whether consumers rely on it.
    """
    # keep the requested model group only
    df = df[df['model_group'] == model]
    # keep the requested prompt version only
    df = df[df['prompt_ver'] == prompt_ver]
    grouped_db = df.groupby(['conversation_id', 'iteration']).error_kind.apply(list).reset_index()

    result_df = pd.DataFrame(columns=['Model', 'Prompt', 'Sample No.', 'conversation_id', '1', '2', '3', '4', '5'])
    # Placeholder header row; it is dropped again at the end of the function.
    result_df.loc[0] = ['Model', 'Prompt', 'Sample No.', 'conversation_id',  '1', '2', '3', '4', '5']

    results = []
    sample_no = 1
    current_conversation_id = grouped_db['conversation_id'][0]
    print(f'unique conversation id length: {len(grouped_db["conversation_id"].unique())}')
    for _, row in grouped_db.iterrows():
        # grouped_db is sorted by conversation_id, so a change of id starts a new sample
        if row['conversation_id'] != current_conversation_id:
            current_conversation_id = row['conversation_id']
            sample_no += 1

        row_data = [f'{model}', f'{prompt_ver}', str(sample_no), current_conversation_id]

        # one cell per turn: the list of error kinds for this turn, '' for the others
        for i in range(1, 6):
            if row['iteration'] == i:
                row_data.append(row['error_kind'])
            else:
                row_data.append('')

        # append row
        result_df.loc[len(result_df)] = [*row_data]
        results.append(row_data)

    # collapse the per-turn rows of each sample into one row of lists
    df_grouped = result_df.groupby(['Sample No.', 'Model', 'Prompt', 'conversation_id']).agg({str(i): lambda x: x.dropna().tolist() for i in range(1, 6)})
    result_df = result_df[['Sample No.', 'Model', 'Prompt', 'conversation_id']].drop_duplicates().merge(df_grouped.reset_index(), on='Sample No.')
    # drop ['Model_y', 'Prompt_y'] and then rename ['Model_x', 'Prompt_x'] to ['Model', 'Prompt']
    result_df = result_df.drop(['Model_y', 'Prompt_y'], axis=1).rename(columns={'Model_x': 'Model', 'Prompt_x': 'Prompt'})
    # for [1, 2, 3, 4, 5] columns, convert list to string
    for i in range(1, 6):
        result_df[str(i)] = result_df[str(i)].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", ''))
        # also, replace ', ' to ''
        result_df[str(i)] = result_df[str(i)].apply(lambda x: x.replace(', ', ''))
    # drop first row (the placeholder header row)
    result_df = result_df.drop(result_df.index[0])

    # save csv; make sure the output directory exists first
    os.makedirs('./table', exist_ok=True)
    result_df.to_csv(f'./table/all_{model}_{prompt_ver}.csv', index=False)
    return result_df, results


# Build one formatted table per (model group, prompt version) from df_eval_short.
# Each call also writes ./table/all_<model>_<prompt_ver>.csv.
df_format_gpt4_v1, aa_tmp_list = format_dataframe(df_eval_short, 'gpt-4', 'v1')
df_format_gpt4_v2, _ = format_dataframe(df_eval_short, 'gpt-4', 'v2')
df_format_gpt3_5_v1, _ = format_dataframe(df_eval_short, 'gpt-3.5-turbo', 'v1')
df_format_gpt3_5_v2, _ = format_dataframe(df_eval_short, 'gpt-3.5-turbo', 'v2')

# Concatenate the four tables and save the combined CSV.
df_format = pd.concat([df_format_gpt4_v1, df_format_gpt4_v2, df_format_gpt3_5_v1, df_format_gpt3_5_v2])
df_format.to_csv('./table/all.csv', index=False)

# Print the resulting DataFrame
print(df_format)


# Deprecated

In [None]:
def group_by_error_kind_and_model_and_prompt_ver(df: pd.DataFrame):
    """
    Count rows per (error_kind, model, prompt_ver) combination.

    Only models listed in the module-level `USE_MODEL_LIST` are kept.

    Returns:
        DataFrame with columns `error_kind`, `model`, `prompt_ver`, `count`.
    """
    group_keys = ['error_kind', 'model', 'prompt_ver']
    counts = df.groupby(group_keys).size().reset_index(name='count')
    # keep only the per-turn conversational models
    is_used_model = counts['model'].isin(USE_MODEL_LIST)
    return counts[is_used_model]

# Row counts per error kind / model / prompt version, restricted to USE_MODEL_LIST.
df_eval_groupby = group_by_error_kind_and_model_and_prompt_ver(df_eval)


In [None]:

def save_table_latex_csv(df: pd.DataFrame, file_name: str):
    """
    Write `df` to `./table/<file_name>` in three formats.

    Creates `./table` when it does not exist, then saves the frame as CSV
    (`.csv`), LaTeX (`.txt`) and pickle (`.pkl`). The index is omitted from
    the CSV and LaTeX output.
    """
    save_path = './table'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # common path without extension, shared by the three output files
    target_stem = os.path.join(save_path, file_name)
    df.to_csv(f'{target_stem}.csv', index=False)
    df.to_latex(f'{target_stem}.txt', index=False)
    df.to_pickle(f'{target_stem}.pkl')




def group_by_model_prompt_ver(df_eval: pd.DataFrame):
    """Unimplemented placeholder; does nothing and returns None."""
    pass

def create_table(df_eval: pd.DataFrame):
    """
    Summarize success/error counts per model and prompt version.

    Iterates over the module-level `MODEL_LIST` and `USE_PROMPT_VER`.
    The models finetuned-davinci, chatgpt, gpt-4, gpt-3.5-turbo and
    code-davinci-002 are skipped, as are combinations without data.
    `success_rate` is only filled for ada and text-davinci-003.

    E.g.
        | model (str) | prompt_ver (str) | success | error | success_rate |
        | --- | --- | --- | --- | --- |
        | ada | v1 | 0 | 0 | None |

    The table is saved through `save_table_latex_csv` as `success_error_count`.

    Args:
        df_eval: Evaluation table with `model`, `prompt_ver` and
            `opentrons_simulate_result` columns.

    Returns:
        DataFrame with columns model, prompt_ver, success, error, success_rate.
    """
    success_rate_models = ['ada', 'text-davinci-003']
    skipped_models = ['finetuned-davinci', 'chatgpt', 'gpt-4', 'gpt-3.5-turbo', 'code-davinci-002']

    # Collect plain records and build the frame once; DataFrame.append no longer
    # exists in pandas 2.x and was quadratic anyway.
    records = []
    for model in MODEL_LIST:
        if model in skipped_models:
            continue
        for prompt_ver in USE_PROMPT_VER:
            df = df_eval[(df_eval['model'] == model) & (df_eval['prompt_ver'] == prompt_ver)]
            if df.empty:
                continue
            # count success by opentrons_simulate_result == 'ok'
            res_count = df['opentrons_simulate_result'].value_counts()
            success_count = int(res_count.get('ok', 0))
            print(f'{model}, {prompt_ver}, {success_count}')
            error_count = int(res_count.get('error', 0))

            judged = success_count + error_count
            if model in success_rate_models and judged > 0:
                success_rate = success_count / judged
            else:
                # not a single-shot model, or no row was judged ok/error
                success_rate = None

            records.append({'model': model, 'prompt_ver': prompt_ver, 'success': success_count, 'error': error_count, 'success_rate': success_rate})

    df_table = pd.DataFrame(records, columns=['model', 'prompt_ver', 'success', 'error', 'success_rate'])
    save_table_latex_csv(df_table, 'success_error_count')
    return df_table

def calculate_success_rate_by_n_turns(df_table: pd.DataFrame):
    """
    Compute cumulative success rates per conversation turn.

    `df_table` is the output of `create_table`, e.g. for one prompt version:

        success, error, turn, model
        13, 9, 1, gpt-4_1
        5,  4, 2, gpt-4_2
        1,  3, 3, gpt-4_3
        1,  2, 4, gpt-4_4
        2,  0, 5, gpt-4_5

    The number of conversations is the number of first calls (13 + 9), so:

        success rate up to turn 1: 13 / (13 + 9)
        success rate up to turn 2: (13 + 5) / (13 + 9)
        success rate up to turn 3: (13 + 5 + 1) / (13 + 9)

    For the single-shot models ada and text-davinci-003 the `success_rate`
    from `df_table` is copied as-is.

    Iterates over the module-level `USE_PROMPT_VER` and `MODEL_LIST` and saves
    the result through `save_table_latex_csv` as `success_by_n_turns`.

    Returns:
        DataFrame with columns model, prompt_ver, success_by_turn_1..5, success_rate.
    """
    columns = ['model', 'prompt_ver', 'success_by_turn_1', 'success_by_turn_2', 'success_by_turn_3', 'success_by_turn_4', 'success_by_turn_5', 'success_rate']
    records = []
    conversational_models = ['gpt-4', 'gpt-3.5-turbo']
    for model in conversational_models:
        for prompt_ver in USE_PROMPT_VER:
            # the total number of conversations is the number of first calls
            df_first_call = df_table[(df_table['model'] == f'{model}_1') & (df_table['prompt_ver'] == prompt_ver)]
            total = df_first_call['success'].sum() + df_first_call['error'].sum()
            previous_success = 0
            new_row = {'model': model, 'prompt_ver': prompt_ver}
            for turn in range(1, 6):
                model_turn = f'{model}_{turn}'
                df = df_table[(df_table['model'] == model_turn) & (df_table['prompt_ver'] == prompt_ver)]
                # no data for this turn, or no first call to normalise by
                if df.empty or total == 0:
                    continue
                current_success = df['success'].sum()
                success_rate = (previous_success + current_success) / total
                # keyed by the actual turn so a missing turn cannot shift later labels
                new_row[f'success_by_turn_{turn}'] = success_rate
                previous_success += current_success
                print(f'{model_turn}, {prompt_ver}, {success_rate}, {current_success=}, {previous_success=}, {total=}')
            records.append(new_row)

    # for other models "ada", "text-davinci-003", simply add success_rate column
    for model in MODEL_LIST:
        if model not in ['ada', 'text-davinci-003']:
            continue
        for prompt_ver in USE_PROMPT_VER:
            df = df_table[(df_table['model'] == model) & (df_table['prompt_ver'] == prompt_ver)]
            if df.empty:
                continue
            records.append({'model': model, 'prompt_ver': prompt_ver, 'success_rate': df['success_rate'].values[0]})

    # built in one go; DataFrame.append no longer exists in pandas 2.x
    df_table_success_rate = pd.DataFrame(records, columns=columns)
    save_table_latex_csv(df_table_success_rate, 'success_by_n_turns')
    return df_table_success_rate


def create_conversation_each_success_table(df_eval: pd.DataFrame):
    """
    Build one row per conversation describing at which turn it succeeded.

    | conversation_id (str) | model_group (str) | count (int) | prompt_ver | 1_success (bool) | ... | 5_success (bool) | success_after_n_call (int) |
    | --- | --- | --- | --- | --- | --- | --- | --- |
    | 2c2cef27-69dc-401c-a8af-01bbb295e342 | gpt-4 | 1 | v1 | True | ... | False | 1 |

    For each conversation only its last turn (largest turn number) is kept;
    `count` is that turn number and `<count>_success` tells whether the turn
    simulated successfully. `success_after_n_call` is the first turn flagged
    successful, or 0 when none is.

    Rows without a conversation (`conversation_id == ''`) are appended as
    single-call samples with `count` 1.

    The turn number is read from the `iteration` column (as written by
    `add_conversation_id`); a `count` column is used as a fallback when
    `iteration` is absent.

    The table is saved through `save_table_latex_csv` as
    `table_conversation_each_success`.

    Raises:
        KeyError: if `df_eval` has neither an `iteration` nor a `count` column.
    """
    success_columns = [f'{turn}_success' for turn in range(1, 6)]
    base_columns = ['conversation_id', 'model_group', 'count', 'prompt_ver', *success_columns]

    # `row.count` on an itertuples row is the tuple method unless a `count` column
    # exists, so the turn column is resolved explicitly.
    turn_column = 'iteration' if 'iteration' in df_eval.columns else 'count'
    if turn_column not in df_eval.columns:
        raise KeyError("df_eval needs an 'iteration' (or 'count') column")

    turn_records = []
    for row in df_eval.itertuples():
        turn = getattr(row, turn_column)
        if pd.isna(row.conversation_id) or pd.isna(turn) or int(turn) == 0:
            continue
        turn = int(turn)
        record = {
            'conversation_id': row.conversation_id,
            'model_group': row.model_group,
            'count': turn,
            'prompt_ver': row.prompt_ver,
        }
        # every turn flag defaults to False; only the flag of this row's turn is set
        record.update(dict.fromkeys(success_columns, False))
        record[f'{turn}_success'] = row.opentrons_simulate_result == 'ok'
        turn_records.append(record)
    df_table = pd.DataFrame(turn_records, columns=base_columns)

    # keep only the row with the biggest turn number per conversation
    df_table = df_table.sort_values(by=['conversation_id', 'count'], ascending=False)
    df_table = df_table.drop_duplicates(subset=['conversation_id'], keep='first')
    df_table = df_table.reset_index(drop=True)

    # first successful turn, 0 if the conversation never succeeded
    df_table['success_after_n_call'] = [
        next((turn for turn in range(1, 6) if flags[turn - 1]), 0)
        for flags in df_table[success_columns].itertuples(index=False, name=None)
    ]

    # models without a conversation: one call per sample
    single_call_records = []
    for row in df_eval[df_eval['conversation_id'] == ''].itertuples():
        succeeded = row.opentrons_simulate_result == 'ok'
        single_call_records.append({
            'conversation_id': '',
            'model_group': row.model_group,
            'count': 1,
            'prompt_ver': row.prompt_ver,
            '1_success': succeeded,
            '2_success': False,
            '3_success': False,
            '4_success': False,
            '5_success': False,
            'success_after_n_call': 1 if succeeded else 0,
        })
    if single_call_records:
        df_single = pd.DataFrame(single_call_records, columns=df_table.columns)
        # skip empty frames so concat does not have to guess dtypes from them
        frames = [frame for frame in (df_table, df_single) if not frame.empty]
        df_table = pd.concat(frames, ignore_index=True)

    save_table_latex_csv(df_table, 'table_conversation_each_success')
    return df_table

def create_table_conversation_each(df_eval: pd.DataFrame):
    """
    Summarize success and error counts per model (one model name per
    conversation turn) and prompt version.

    E.g.
        | model (str) | prompt_ver (str) | success | error | total | success_rate | error_rate | count |
        | --- | --- | --- | --- | --- | --- | --- | --- |
        | gpt-4_1 | v1 | 0 | 0 | 0 | 0 | 0 | 1 |
        | gpt-4_2 | v1 | 0 | 0 | 0 | 0 | 0 | 2 |

    `count` is the turn number taken from a numeric `_N` suffix of the model
    name (as an int), or 0 for models without such a suffix. A row counts as a
    success when its `error_kind` is 'success'.

    Iterates over the module-level `MODEL_LIST`; combinations without data are
    reported on stdout and skipped.

    Returns:
        DataFrame with the columns shown above.
    """
    columns = ['model', 'prompt_ver', 'success', 'error', 'total', 'success_rate', 'error_rate', 'count']
    records = []
    for model in MODEL_LIST:
        # Turn number from the trailing `_N`. The previous substring test
        # ('gpt-3.5_') never matched the 'gpt-3.5-turbo_N' names.
        _, separator, suffix = model.rpartition('_')
        count = int(suffix) if separator and suffix.isdigit() else 0
        for prompt_ver in ['v1', 'v2', 'v3', 'v4', 'v3.1', 'v4.1']:
            df = df_eval[(df_eval['model'] == model) & (df_eval['prompt_ver'] == prompt_ver)]
            total = len(df)
            if total == 0:
                print(f'no data for {model}, {prompt_ver}')
                continue
            success = len(df[df['error_kind'] == 'success'])
            error = total - success
            records.append({
                'model': model,
                'prompt_ver': prompt_ver,
                'count': count,
                'success': success,
                'error': error,
                'total': total,
                'success_rate': success / total,
                'error_rate': error / total,
            })

    # built in one go; DataFrame.append no longer exists in pandas 2.x
    return pd.DataFrame(records, columns=columns)

def is_conversation_model(model: str):
    """Return True when the model name contains 'gpt-4' or 'gpt-3.5-turbo'."""
    return 'gpt-4' in model or 'gpt-3.5-turbo' in model

def create_n_success(df_eval: pd.DataFrame):
    """
    Cumulative success probability by conversation turn.

    `1_success_rate` is the probability of having succeeded by the first
    call, `2_success_rate` by the second call, and so on up to five.

    Expects one row per sample with `model_group`, `prompt_ver` and
    `success_after_n_call` columns (the output of
    `create_conversation_each_success_table`), where `success_after_n_call`
    is the first successful turn or 0 when the sample never succeeded.

    Iterates over the module-level `UNIQUE_MODEL_LIST`; combinations without
    data are reported on stdout and skipped.

    NOTE(review): the previous implementation could not run (the sample total
    was always 0, and DataFrame.append no longer exists in pandas 2.x). This
    version implements the behaviour described above; `count` now holds the
    number of samples the rates are computed over — confirm this is the
    intended meaning.

    Returns:
        DataFrame with columns model, prompt_ver, count, 1_success_rate..5_success_rate.
    """
    columns = ['model', 'prompt_ver', 'count', '1_success_rate', '2_success_rate', '3_success_rate', '4_success_rate', '5_success_rate']
    records = []
    for model in UNIQUE_MODEL_LIST:
        for prompt_ver in ['v1', 'v2', 'v3', 'v4', 'v3.1', 'v4.1']:
            df = df_eval[(df_eval['model_group'] == model) & (df_eval['prompt_ver'] == prompt_ver)]
            total = len(df)
            if total == 0:
                print(f'no data for {model}, {prompt_ver}')
                continue
            new_row = {
                'model': model,
                'prompt_ver': prompt_ver,
                'count': total,
            }
            # the column may be object-typed after concatenation; force numbers
            first_success_turn = pd.to_numeric(df['success_after_n_call'])
            for n in range(1, 6):
                # succeeded at some turn between 1 and n (0 means never succeeded)
                succeeded_by_n = (first_success_turn >= 1) & (first_success_turn <= n)
                new_row[f'{n}_success_rate'] = int(succeeded_by_n.sum()) / total
            records.append(new_row)
    return pd.DataFrame(records, columns=columns)




# add_conversation_id returns the frame it was given, so df_eval_count is df_eval itself.
df_eval_count = add_conversation_id(df_eval)
# Per-turn success/error summary (not written to disk).
df_table_each = create_table_conversation_each(df_eval_count)
# Success/error counts per model and prompt version; saved as success_error_count.
df_table = create_table(df_eval)
# Cumulative success rate by turn, derived from df_table; saved as success_by_n_turns.
df_table_success_rate = calculate_success_rate_by_n_turns(df_table)


# One row per conversation; saved as table_conversation_each_success.
df_table_each_success = create_conversation_each_success_table(df_eval_count)

# df_table_each_success_rate = create_n_success(df_table_each_success)
