In [4]:
import pandas as pd
import json
import os

In [5]:
# TODO: add the parser from https://github.com/ksreenivasan/gorilla/blob/pallavi_workspace/berkeley-function-call-leaderboard/kartik_analyze_fc_vs_prompt_generations.ipynb so that I can also view generations
def get_acc_df(model='databricks-dbrx-instruct-old-run'):
    # get all json files in score/ directory
    results_dir = f'score/{model}'
    json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]

    results_dict = {'filename': [], 'accuracy': [], 'correct_count': [], 'total_count': []}
    for filename in json_files:
        with open(filename, 'r') as f:
            data = [json.loads(line) for line in f.readlines()]
            accuracy_info = data[0]
            results_dict['filename'].append(filename)
            for key in accuracy_info.keys():
                results_dict[key].append(accuracy_info[key])

    results_df = pd.DataFrame(results_dict)
    results_df['metric'] = results_df['filename'].apply(lambda x: x.split('/')[-1].split('.')[0])

    overall_acc = results_df.correct_count.sum() / results_df.total_count.sum()
    print(f'Overall accuracy: {overall_acc}')
    return results_df

In [6]:
results_df = get_acc_df(model='databricks-dbrx-instruct-old-run')
results_df[['metric', 'accuracy', 'correct_count', 'total_count']]

Overall accuracy: 0.6676470588235294


Unnamed: 0,metric,accuracy,correct_count,total_count
0,javascript_score,0.52,26,50
1,parallel_function_score,0.755,151,200
2,executable_parallel_function_score,0.72,36,50
3,executable_parallel_multiple_function_score,0.45,18,40
4,parallel_multiple_function_score,0.5,100,200
5,java_score,0.37,37,100
6,relevance_score,0.483333,116,240
7,simple_score,0.8275,331,400
8,executable_multiple_function_score,0.76,38,50
9,multiple_function_score,0.79,158,200


In [7]:
fc_results_df = get_acc_df(model='databricks-dbrx-instruct-FC-full-run')
fc_results_df[['metric', 'accuracy', 'correct_count', 'total_count']]

Overall accuracy: 0.28760330578512394


Unnamed: 0,metric,accuracy,correct_count,total_count
0,java_score,0.36,36,100
1,simple_score,0.515,206,400
2,parallel_function_score,0.0,0,200
3,javascript_score,0.4,20,50
4,executable_parallel_function_score,0.0,0,50
5,executable_simple_score,0.69,69,100
6,rest_score,0.1,7,70
7,relevance_score,0.041667,10,240


In [8]:
fc_results_df = get_acc_df(model='databricks-dbrx-instruct-FC')
fc_results_df[['metric', 'accuracy', 'correct_count', 'total_count']]

Overall accuracy: 0.3493670886075949


Unnamed: 0,metric,accuracy,correct_count,total_count
0,javascript_score,0.4,20,50
1,relevance_score,0.066667,16,240
2,java_score,0.33,33,100
3,simple_score,0.5175,207,400


In [9]:
prompt_results_df = get_acc_df(model='databricks-dbrx-instruct')
prompt_results_df[['metric', 'accuracy', 'correct_count', 'total_count']]

Overall accuracy: 0.6471074380165289


Unnamed: 0,metric,accuracy,correct_count,total_count
0,parallel_function_score,0.735,147,200
1,rest_score,0.614286,43,70
2,simple_score,0.8325,333,400
3,relevance_score,0.4125,99,240
4,javascript_score,0.46,23,50
5,executable_parallel_function_score,0.68,34,50
6,java_score,0.25,25,100
7,executable_simple_score,0.79,79,100


In [10]:
fc_results_df = get_acc_df(model='gpt-3.5-turbo-0125-FC')
fc_results_df[['metric', 'accuracy', 'correct_count', 'total_count']]

Overall accuracy: 0.5970588235294118


Unnamed: 0,metric,accuracy,correct_count,total_count
0,relevance_score,0.033333,8,240
1,parallel_function_score,0.905,181,200
2,executable_parallel_function_score,0.7,35,50
3,simple_score,0.59,236,400
4,javascript_score,0.62,31,50
5,executable_parallel_multiple_function_score,0.4,16,40
6,java_score,0.53,53,100
7,multiple_function_score,0.705,141,200
8,executable_multiple_function_score,0.72,36,50
9,rest_score,0.942857,66,70


In [11]:
model_list = ['databricks-dbrx-instruct-FC', 'databricks-dbrx-instruct', 'gpt-3.5-turbo-0125-FC']
df1 = get_acc_df(model=model_list[0])
df2 = get_acc_df(model=model_list[1])

print(f'metric \t\t\t\t\t | {model_list[0]}_acc \t | {model_list[1]}_acc')
for metric in df1['metric'].unique():
    print(f'{metric} \t\t\t\t | {df1[df1["metric"] == metric]["accuracy"].item()} \t | {df2[df2["metric"] == metric]["accuracy"].item()}')

Overall accuracy: 0.3493670886075949
Overall accuracy: 0.6471074380165289
metric 					 | databricks-dbrx-instruct-FC_acc 	 | databricks-dbrx-instruct_acc
javascript_score 				 | 0.4 	 | 0.46
relevance_score 				 | 0.06666666666666667 	 | 0.4125
java_score 				 | 0.33 	 | 0.25
simple_score 				 | 0.5175 	 | 0.8325


In [12]:
def get_results_df(models=['databricks-dbrx-instruct-old-run']):
    error_results_df_list = []
    full_results_df_list = []
    acc_dict = {'model': [], 'filename': [], 'accuracy': [], 'correct_count': [], 'total_count': []}
    for model in models:
        # model = 'gpt-3.5-turbo-0125'
        results_dir = f'score/{model}'
        json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
        for filename in json_files:
            with open(filename, 'r') as f:
                try:
                    data = [json.loads(line) for line in f.readlines()]
                    # skip the accuracy line
                    df = pd.DataFrame(data[1:])
                    df['filename'] = filename.split('/')[-1]
                    error_results_df_list.append(df)
                    # parse out accuracy_info
                    acc_info = data[0]
                    acc_dict['filename'].append(filename.split('/')[-1])
                    acc_info['model'] = model
                    for key in acc_info.keys():
                        acc_dict[key].append(acc_info[key])
                except Exception as e:
                    print(f'Error reading {filename}: {e}')
                    
    # now read full results
    for model in models:
        results_dir = f'result/{model}'
        json_files = [f'{results_dir}/{f}' for f in os.listdir(results_dir) if f.endswith('.json')]
        for filename in json_files:
            with open(filename, 'r') as f:
                try:
                    data = [json.loads(line) for line in f.readlines()]
                    df = pd.DataFrame(data)
                    df['filename'] = filename.split('/')[-1]
                    df['model_name'] = model
                    full_results_df_list.append(df)
                except Exception as e:
                    print(f'Error reading {filename}: {e}')

    acc_df = pd.DataFrame(acc_dict)
    acc_df['metric'] = acc_df['filename'].apply(lambda x: x.split('/')[-1].split('.')[0])
    error_result_df = pd.concat(error_results_df_list)
    full_result_df = pd.concat(full_results_df_list)

    for model in acc_df['model'].unique():
        acc = acc_df[acc_df['model'] == model].correct_count.sum() / acc_df[acc_df['model'] == model].total_count.sum()
        print(f'Model: {model} : Acc = {100.0*acc}%')

    return acc_df, error_result_df, full_result_df

In [13]:
acc_df, error_result_df, full_result_df = get_results_df(models=['databricks-dbrx-instruct-old-run',
                                                                 'databricks-dbrx-instruct-FC',
                                                                 'databricks-dbrx-instruct-FC-full-run',
                                                                 'databricks-dbrx-instruct',
                                                                 'gpt-3.5-turbo-0125-FC'
                                                                 ])

Model: databricks-dbrx-instruct-old-run : Acc = 66.76470588235294%
Model: databricks-dbrx-instruct-FC : Acc = 34.93670886075949%
Model: databricks-dbrx-instruct-FC-full-run : Acc = 28.760330578512395%
Model: databricks-dbrx-instruct : Acc = 64.7107438016529%
Model: gpt-3.5-turbo-0125-FC : Acc = 59.705882352941174%


In [19]:
dbrx_fc_errors = error_result_df[(error_result_df['model_name'] == 'databricks-dbrx-instruct-FC')
                                 & (error_result_df['filename'] == 'simple_score.json')]
dbrx_fc_results = full_result_df[(full_result_df['model_name'] == 'databricks-dbrx-instruct-FC')
                                 & (full_result_df['filename'] == 'gorilla_openfunctions_v1_test_simple_result.json')]
prompt = dbrx_fc_errors.prompt.values[0]
print(f"Idx: {dbrx_fc_errors[dbrx_fc_errors['prompt'] == prompt].id.values[0]}")
print(f"Filename: {dbrx_fc_errors[dbrx_fc_errors['prompt'] == prompt].filename.values[0]}")
print(json.dumps(dbrx_fc_errors[dbrx_fc_errors['prompt'] == prompt].model_result_raw.values[0][0], indent=2))

Idx: 4
Filename: simple_score.json
{
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": 3, \"c\": 2}"
}


In [22]:
print(json.dumps(dbrx_fc_errors[dbrx_fc_errors['prompt'] == prompt].possible_answer.values[0], indent=2))

{
  "algebra.quadratic_roots": {
    "a": [
      1
    ],
    "b": [
      -3
    ],
    "c": [
      2
    ]
  }
}


In [23]:
gpt_3_5_results = full_result_df[(full_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC')
                                 & (full_result_df['filename'] == 'gorilla_openfunctions_v1_test_simple_result.json')]
gpt_3_5_errors = error_result_df[(error_result_df['model_name'] == 'gpt-3.5-turbo-0125-FC') &
                                 (error_result_df['filename'] == 'simple_score.json')]
gpt_3_5_results.head()

Unnamed: 0,idx,result,input_token_count,output_token_count,latency,filename,model_name
0,0,"[{'calculate_triangle_area': '{""base"": 10, ""he...",120,57,2.744657,gorilla_openfunctions_v1_test_simple_result.json,gpt-3.5-turbo-0125-FC
1,1,"[{'math_factorial': '{""number"": 5}'}]",80,30,1.071474,gorilla_openfunctions_v1_test_simple_result.json,gpt-3.5-turbo-0125-FC
2,2,"[{'math_hypot': '{""x"": 4, ""y"": 5}'}, {'math_hy...",146,59,1.66054,gorilla_openfunctions_v1_test_simple_result.json,gpt-3.5-turbo-0125-FC
3,3,"[{'algebra_quadratic_roots': '{""a"": 1, ""b"": -3...",118,41,2.197197,gorilla_openfunctions_v1_test_simple_result.json,gpt-3.5-turbo-0125-FC
4,4,"[{'solve_quadratic_equation': '{""a"": 2, ""b"": 6...",110,40,1.198794,gorilla_openfunctions_v1_test_simple_result.json,gpt-3.5-turbo-0125-FC


In [28]:
dbrx_only_error_ids = list(set(dbrx_fc_errors.id.unique()) - set(gpt_3_5_errors.id.unique()))

def print_error_by_id(idx_list):
    if not isinstance(idx_list, list):
        idx_list = [idx_list]
    for idx in idx_list:
        print("----------------------------------------------------------------------------")
        dbrx_error = dbrx_fc_errors[dbrx_fc_errors['id'] == idx]
        gpt_3_5_result = gpt_3_5_results[gpt_3_5_results['idx'] == idx-1]
        dbrx_result = dbrx_fc_results[dbrx_fc_results['idx'] == idx-1]
        print(f"Idx: {dbrx_error.id.values[0]}")
        print(f"Filename: {dbrx_error.filename.values[0]}")
        # print(f"Prompt: {json.dumps(dbrx_error.prompt.item(), indent=2)}")
        print(f"Question: {json.dumps(dbrx_error.prompt.values[0]['question'], indent=2)}")
        print(f"Dbrx (error_df) result_raw: {json.dumps(dbrx_error.model_result_raw.values[0][0], indent=2)}")
        print(f"Dbrx error: {json.dumps(dbrx_error.error.values[0], indent=2)}")

        print(f"GPT 3.5 result_raw: {json.dumps(gpt_3_5_result.result.values[0][0], indent=2)}")
        print("\n----------------------------------------------------------------------------\n")
        # print(f"Dbrx result_raw: {json.dumps(dbrx_result.result.item()[0], indent=2)}")

In [29]:
print_error_by_id(dbrx_only_error_ids[0])

----------------------------------------------------------------------------
Idx: 4
Filename: simple_score.json
Question: "Find the roots of a quadratic equation with coefficients a=1, b=-3, c=2."
Dbrx (error_df) result_raw: {
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": 3, \"c\": 2}"
}
Dbrx error: [
  "Invalid value for parameter 'b': 3. Expected one of [-3]."
]
GPT 3.5 result_raw: {
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": -3, \"c\": 2}"
}

----------------------------------------------------------------------------



In [30]:
gpt_3_5_errors.iloc[0:5]

Unnamed: 0,id,model_name,test_category,valid,error,error_type,prompt,model_result_raw,model_result_decoded,possible_answer,filename,model_result,decoded_result
0,1,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'Find the area of a triangle with...,"[{'calculate_triangle_area': '{""base"": 10, ""he...","[{'calculate_triangle_area': {'base': 10, 'hei...","{'calculate_triangle_area': {'base': [10], 'he...",simple_score.json,,
1,3,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'Calculate the hypotenuse of a ri...,"[{'math_hypot': '{""x"": 4, ""y"": 5}'}, {'math_hy...","[{'math_hypot': {'x': 4, 'y': 5}}, {'math_hypo...","{'math.hypot': {'x': [4], 'y': [5], 'z': ['', ...",simple_score.json,,
2,6,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'Find the roots of a quadratic eq...,"[{'solve_quadratic': '{""a"": 3, ""b"": -11, ""c"": ...","[{'solve_quadratic': {'a': 3, 'b': -11, 'c': -...","{'solve_quadratic': {'a': [3], 'b': [-11], 'c'...",simple_score.json,,
3,7,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'What are the roots of the quadra...,"[{'solve_quadratic': '{""a"": 2, ""b"": 5, ""c"": 3}...","[{'solve_quadratic': {'a': 2, 'b': 5, 'c': 3}}...","{'solve_quadratic': {'a': [2], 'b': [5], 'c': ...",simple_score.json,,
4,8,gpt-3.5-turbo-0125-FC,simple,False,[Wrong number of functions.],simple_function_checker:wrong_count,{'question': 'What is the circumference of a c...,"[{'calculate_circumference': '{""radius"": 4, ""u...","[{'calculate_circumference': {'radius': 4, 'un...","{'calculate_circumference': {'radius': [4], 'u...",simple_score.json,,


In [31]:
dbrx_fc_results.head()

Unnamed: 0,idx,result,input_token_count,output_token_count,latency,filename,model_name
0,0,"[{'calculate_triangle_area': '{""base"": 10, ""he...",255,12,0.796675,gorilla_openfunctions_v1_test_simple_result.json,databricks-dbrx-instruct-FC
1,1,"[{'math_factorial': '{""number"": 5}'}]",245,8,0.529542,gorilla_openfunctions_v1_test_simple_result.json,databricks-dbrx-instruct-FC
2,2,"[{'math_hypot': '{""x"": 4, ""y"": 5}'}]",259,12,0.669622,gorilla_openfunctions_v1_test_simple_result.json,databricks-dbrx-instruct-FC
3,3,"[{'algebra_quadratic_roots': '{""a"": 1, ""b"": 3,...",256,18,0.699065,gorilla_openfunctions_v1_test_simple_result.json,databricks-dbrx-instruct-FC
4,4,"[{'solve_quadratic_equation': '{""a"": 2, ""b"": 6...",253,18,0.829437,gorilla_openfunctions_v1_test_simple_result.json,databricks-dbrx-instruct-FC


In [32]:
dbrx_fc_errors.iloc[0:5]

Unnamed: 0,id,model_name,test_category,valid,error,error_type,prompt,model_result_raw,model_result_decoded,possible_answer,filename,model_result,decoded_result
0,4,databricks-dbrx-instruct-FC,simple,False,[Invalid value for parameter 'b': 3. Expected ...,value_error:others,{'question': 'Find the roots of a quadratic eq...,"[{'algebra_quadratic_roots': '{""a"": 1, ""b"": 3,...","[{'algebra_quadratic_roots': {'a': 1, 'b': 3, ...","{'algebra.quadratic_roots': {'a': [1], 'b': [-...",simple_score.json,,
1,6,databricks-dbrx-instruct-FC,simple,False,[Invalid value for parameter 'b': 11. Expected...,value_error:others,{'question': 'Find the roots of a quadratic eq...,"[{'solve_quadratic': '{""a"": 3, ""b"": 11, ""c"": 4...","[{'solve_quadratic': {'a': 3, 'b': 11, 'c': 4}}]","{'solve_quadratic': {'a': [3], 'b': [-11], 'c'...",simple_score.json,,
2,13,databricks-dbrx-instruct-FC,simple,False,[Invalid value for parameter 'units': 'units'....,value_error:string,{'question': 'Calculate the circumference of a...,"[{'geometry_circumference': '{  ""radius"": 3, ...","[{'geometry_circumference': {'radius': 3, 'uni...","{'geometry.circumference': {'radius': [3], 'un...",simple_score.json,,
3,14,databricks-dbrx-instruct-FC,simple,False,[Invalid value for parameter 'function': 'y = ...,value_error:string,{'question': 'Calculate the area under the cur...,"[{'calculate_area_under_curve': '{""function"": ...",[{'calculate_area_under_curve': {'function': '...,{'calculate_area_under_curve': {'function': ['...,simple_score.json,,
4,15,databricks-dbrx-instruct-FC,simple,False,[Invalid value for parameter 'x_value': 1.0. E...,value_error:others,{'question': 'Calculate the derivative of the ...,"[{'calculate_derivative': '{""function"": ""3*x**...",[{'calculate_derivative': {'function': '3*x**2...,{'calculate_derivative': {'function': ['3x^2 +...,simple_score.json,,


In [33]:
print_error_by_id(dbrx_only_error_ids[0:5])

----------------------------------------------------------------------------
Idx: 4
Filename: simple_score.json
Question: "Find the roots of a quadratic equation with coefficients a=1, b=-3, c=2."
Dbrx (error_df) result_raw: {
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": 3, \"c\": 2}"
}
Dbrx error: [
  "Invalid value for parameter 'b': 3. Expected one of [-3]."
]
GPT 3.5 result_raw: {
  "algebra_quadratic_roots": "{\"a\": 1, \"b\": -3, \"c\": 2}"
}

----------------------------------------------------------------------------

----------------------------------------------------------------------------
Idx: 13
Filename: simple_score.json
Question: "Calculate the circumference of a circle with radius 3"
Dbrx (error_df) result_raw: {
  "geometry_circumference": "{\n  \"radius\": 3,\n  \"units\": \"units\"\n}"
}
Dbrx error: [
  "Invalid value for parameter 'units': 'units'. Expected one of ['cm', '']. Case insensitive."
]
GPT 3.5 result_raw: {
  "geometry_circumference": "{\"radius\":3}