In [1]:
import os

In [2]:
base_dir="/home/user/outputs_kd-llm/evalplus_results/humaneval"

In [3]:
llama_31_8b_base = {
'file' : os.path.join(base_dir, "workspace--outputs_kd-llm--8b_lora-Llama-3.1-8B-alpaca-code-120k_vllm_temp_0.0_eval_results.json"),
'desc' : "Llama-3.1-8B_base"
}
llama_32_1b_base = {
'file' : os.path.join(base_dir, "meta-llama--Llama-3.2-1B_vllm_temp_0.0-perf-instruct_eval_results.json"),
'desc' : "Llama-3.2-1B_base"
}

llama_32_1b_ft = {
'file' : os.path.join(base_dir, "workspace--outputs_kd-llm--1b_lora-Llama-3.2-1B-alpaca-code-120k_vllm_temp_0.0-perf-instruct_eval_results.json"),
'desc' : "Llama-3.2-1B_ft"
}

llama_32_1b_fkld = {
    'file' : os.path.join(base_dir, "workspace--outputs_kd-llm----train_llama3_1-8b-llama3_2-1b_fkd_2-20241130-2026_vllm_temp_0.0_eval_results.json"),
    'desc' : "Llama-3.2-1B_fkld"
}

llama_32_1b_sym_kld = {
'file' : os.path.join(base_dir, "workspace--outputs_kd-llm----train_llama3_1-8b-llama3_2-1b_sym_kld_7-20241130-1620_vllm_temp_0.0_eval_results.json"),
'desc' : "Llama-3.2-1B_sym_kld"
}

In [4]:
llama_tests = [llama_31_8b_base, llama_32_1b_base, llama_32_1b_ft, llama_32_1b_fkld, llama_32_1b_sym_kld]

In [5]:
import json
def load_and_save(llama_tests):
    base_stats = {}
    plus_stats = {}
    for test in llama_tests:
        with open(test['file'], 'r') as f:
            data = json.load(f)
            # print(data)
        output_str = f"\nFile: {test['file']}\n" + "="*80 + "\n"
        eval_result = data['eval']
        base_stat = {}
        plus_stat = {}
        for k, v in eval_result.items():
            assert len(v) == 1
            result = v[0]
            output_str += "="*80 + "\n"
            output_str += f"task_id: {result['task_id']}" + "\n"
            output_str += f"base_status: {result['base_status']}" + "\n"
            output_str += f"plus_status: {result['plus_status']}" + "\n"
            output_str += result['solution'] + "\n"
            output_str += "="*80 + "\n\n"
            base_stat[result['task_id']] = result['base_status']
            plus_stat[result['task_id']] = result['plus_status']
        pass_at_k = data['pass_at_k']
        output_str += f"base_score: {pass_at_k['base']}" + "\n"
        output_str += f"plus_score: {pass_at_k['plus']}" + "\n"

        base_stats[test['desc']] = base_stat
        plus_stats[test['desc']] = plus_stat
        # print(output_str)
        # save output_str to file
        with open(test['desc'] + ".txt", 'w') as f:
            f.write(output_str)
    # save base_stat and plus_stat to file
    with open("Llama-32_base_stat.json", 'w') as f:
        json.dump(base_stats, f)
    with open("Llam-32_plus_stat.json", 'w') as f:
        json.dump(plus_stats, f)

In [6]:
load_and_save(llama_tests)

In [7]:
# load json dict file (Llama-3.2-1B_base_base_stat.json) to df 
import pandas as pd
import json
def load_json_to_df(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
    df = pd.DataFrame(data.items(), columns=['task_id', 'base_status'])
    return df




In [8]:
stat_files = ["Llama-32_base_stat.json", "Llam-32_plus_stat.json"]

In [9]:
df = load_json_to_df(stat_files[0])
df_expanded = df.set_index('task_id')['base_status'].apply(pd.Series).reset_index()
# df_expanded.plot(kind='bar', x='task_id', figsize=(15, 7))
print(df_expanded.T)

                               0                  1                2  \
task_id        Llama-3.1-8B_base  Llama-3.2-1B_base  Llama-3.2-1B_ft   
HumanEval/6                 fail               fail             fail   
HumanEval/5                 pass               fail             fail   
HumanEval/1                 fail               fail             fail   
HumanEval/2                 pass               pass             pass   
...                          ...                ...              ...   
HumanEval/156               fail               fail             fail   
HumanEval/63                pass               fail             fail   
HumanEval/31                pass               pass             pass   
HumanEval/130               fail               fail             fail   
HumanEval/15                pass               fail             pass   

                               3                     4  
task_id        Llama-3.2-1B_fkld  Llama-3.2-1B_sym_kld  
HumanEval/6          

In [10]:
!pip install seaborn



In [11]:
import seaborn as sns

def stat_analysis(stat_files):
    for file in stat_files:
        df = load_json_to_df(stat_files[0])
        df_expanded = df.set_index('task_id')['base_status'].apply(pd.Series).reset_index()
        # df_expanded.plot(kind='bar', x='task_id', figsize=(15, 7))
        print(f"File: {file}")
        print(df_expanded.T.to_string())
        import matplotlib.pyplot as plt

        def color_pass_fail(val):
            color = 'red' if val == 'fail' else 'green'
            return f'color: {color}'

        styled_df = df_expanded.style.applymap(color_pass_fail, subset=pd.IndexSlice[:, df_expanded.columns != 'task_id'])
        display(styled_df)
    return df_expanded

In [12]:
# def stat_analysis(stat_files):
#     for file in stat_files:
#         df = load_json_to_df(stat_files[0])
#         df_expanded = df.set_index('task_id')['base_status'].apply(pd.Series).reset_index()
#         # df_expanded.plot(kind='bar', x='task_id', figsize=(15, 7))
#         print(f"File: {file}")
#         print(df_expanded.T.to_string())
        

In [13]:
expended_df = stat_analysis(stat_files[0:1])

File: Llama-32_base_stat.json
                               0                  1                2                  3                     4
task_id        Llama-3.1-8B_base  Llama-3.2-1B_base  Llama-3.2-1B_ft  Llama-3.2-1B_fkld  Llama-3.2-1B_sym_kld
HumanEval/6                 fail               fail             fail               fail                  fail
HumanEval/5                 pass               fail             fail               fail                  fail
HumanEval/1                 fail               fail             fail               fail                  fail
HumanEval/2                 pass               pass             pass               pass                  pass
HumanEval/0                 pass               fail             pass               pass                  pass
HumanEval/3                 pass               fail             fail               fail                  fail
HumanEval/9                 pass               pass             fail               fail   

  styled_df = df_expanded.style.applymap(color_pass_fail, subset=pd.IndexSlice[:, df_expanded.columns != 'task_id'])


Unnamed: 0,task_id,HumanEval/6,HumanEval/5,HumanEval/1,HumanEval/2,HumanEval/0,HumanEval/3,HumanEval/9,HumanEval/8,HumanEval/7,HumanEval/10,HumanEval/11,HumanEval/4,HumanEval/13,HumanEval/12,HumanEval/14,HumanEval/19,HumanEval/17,HumanEval/24,HumanEval/18,HumanEval/16,HumanEval/26,HumanEval/20,HumanEval/23,HumanEval/37,HumanEval/40,HumanEval/36,HumanEval/39,HumanEval/38,HumanEval/43,HumanEval/41,HumanEval/32,HumanEval/33,HumanEval/22,HumanEval/28,HumanEval/44,HumanEval/49,HumanEval/46,HumanEval/35,HumanEval/45,HumanEval/51,HumanEval/52,HumanEval/34,HumanEval/29,HumanEval/30,HumanEval/42,HumanEval/27,HumanEval/50,HumanEval/47,HumanEval/48,HumanEval/56,HumanEval/54,HumanEval/53,HumanEval/21,HumanEval/59,HumanEval/62,HumanEval/64,HumanEval/25,HumanEval/57,HumanEval/58,HumanEval/65,HumanEval/61,HumanEval/67,HumanEval/70,HumanEval/72,HumanEval/68,HumanEval/69,HumanEval/73,HumanEval/66,HumanEval/71,HumanEval/76,HumanEval/75,HumanEval/77,HumanEval/78,HumanEval/74,HumanEval/79,HumanEval/82,HumanEval/81,HumanEval/84,HumanEval/80,HumanEval/87,HumanEval/85,HumanEval/86,HumanEval/89,HumanEval/91,HumanEval/97,HumanEval/95,HumanEval/93,HumanEval/92,HumanEval/88,HumanEval/90,HumanEval/94,HumanEval/96,HumanEval/99,HumanEval/98,HumanEval/102,HumanEval/101,HumanEval/103,HumanEval/109,HumanEval/105,HumanEval/110,HumanEval/108,HumanEval/106,HumanEval/113,HumanEval/111,HumanEval/104,HumanEval/100,HumanEval/116,HumanEval/107,HumanEval/114,HumanEval/115,HumanEval/112,HumanEval/117,HumanEval/119,HumanEval/127,HumanEval/124,HumanEval/128,HumanEval/126,HumanEval/122,HumanEval/125,HumanEval/129,HumanEval/123,HumanEval/118,HumanEval/120,HumanEval/121,HumanEval/83,HumanEval/55,HumanEval/60,HumanEval/139,HumanEval/138,HumanEval/134,HumanEval/137,HumanEval/131,HumanEval/132,HumanEval/135,HumanEval/133,HumanEval/136,HumanEval/145,HumanEval/140,HumanEval/143,HumanEval/149,HumanEval/162,HumanEval/144,HumanEval/161,HumanEval/153,HumanEval/160,HumanEval/141,HumanEval/150,HumanEval/147,HumanEval/157,HumanEval/155,HumanEval/151,HumanEval/152,HumanEval/158,HumanEval/142,HumanEval/154,HumanEval/146,HumanEval/159,HumanEval/148,HumanEval/163,HumanEval/156,HumanEval/63,HumanEval/31,HumanEval/130,HumanEval/15
0,Llama-3.1-8B_base,fail,pass,fail,pass,pass,pass,pass,fail,pass,fail,pass,pass,pass,pass,pass,fail,pass,fail,pass,pass,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,pass,fail,pass,pass,pass,pass,pass,pass,pass,pass,pass,pass,pass,pass,pass,fail,pass,pass,pass,fail,fail,fail,pass,fail,pass,fail,pass,fail,fail,pass,fail,fail,fail,pass,pass,fail,fail,fail,pass,fail,pass,fail,fail,fail,pass,fail,fail,pass,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,pass,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,pass,fail,fail,pass,fail,fail,pass,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,fail,pass,pass,fail,fail,pass,fail,fail,pass,pass,fail,pass
1,Llama-3.2-1B_base,fail,fail,fail,pass,fail,fail,pass,fail,pass,fail,fail,pass,pass,pass,fail,fail,fail,fail,pass,pass,fail,fail,pass,fail,pass,fail,fail,fail,pass,fail,fail,fail,pass,pass,fail,fail,fail,pass,pass,fail,pass,pass,pass,pass,pass,pass,fail,fail,pass,fail,fail,pass,fail,fail,fail,fail,pass,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail
2,Llama-3.2-1B_ft,fail,fail,fail,pass,pass,fail,fail,fail,pass,fail,pass,pass,pass,pass,pass,fail,fail,fail,pass,pass,fail,fail,pass,fail,fail,fail,fail,fail,pass,fail,fail,fail,pass,pass,fail,fail,fail,pass,pass,pass,pass,pass,pass,pass,pass,fail,pass,pass,pass,fail,fail,pass,fail,fail,fail,fail,pass,fail,pass,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,pass,fail,pass
3,Llama-3.2-1B_fkld,fail,fail,fail,pass,pass,fail,fail,fail,pass,fail,pass,pass,pass,pass,pass,fail,fail,fail,pass,pass,fail,fail,pass,fail,pass,fail,fail,fail,pass,fail,fail,fail,pass,pass,fail,fail,fail,pass,pass,pass,pass,pass,pass,pass,pass,fail,fail,pass,pass,fail,fail,pass,fail,fail,fail,fail,pass,fail,pass,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,pass,fail,pass
4,Llama-3.2-1B_sym_kld,fail,fail,fail,pass,pass,fail,fail,fail,pass,fail,pass,pass,pass,pass,pass,fail,fail,fail,pass,pass,fail,fail,pass,fail,pass,fail,fail,fail,pass,fail,fail,fail,pass,pass,fail,fail,fail,pass,pass,pass,pass,pass,pass,pass,pass,pass,fail,pass,pass,fail,fail,pass,fail,fail,fail,fail,pass,fail,pass,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,pass,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,fail,pass,fail,fail,pass,fail,fail,fail,fail,fail,fail,fail,pass,fail,pass


In [23]:
164*0.39

63.96

In [20]:
def count_cases(df, target_sequence):
    import pandas as pd

    # Assuming expended_df is your DataFrame
    # Define the target sequence
    # target_sequence = ['pass', 'fail', 'fail', 'fail', 'fail']

    # Function to check if a column matches the target sequence
    def matches_target_sequence(column):
        return list(column) == target_sequence

    # Apply the function to each column and count the matches
    matching_columns_count = expended_df.apply(matches_target_sequence).sum()

    print(f"Number of columns with 'pass', 'fail', 'fail', 'fail', 'fail' in order: {matching_columns_count}")

In [21]:
only_teacher_passes = ['pass', 'fail', 'fail', 'fail', 'fail']
count_cases(expended_df, target_sequence=only_teacher_passes)


Number of columns with 'pass', 'fail', 'fail', 'fail', 'fail' in order: 26


In [26]:
only_teacher_passes = ['pass', 'fail', 'pass', 'pass', 'pass']
count_cases(expended_df, target_sequence=only_teacher_passes)


Number of columns with 'pass', 'fail', 'fail', 'fail', 'fail' in order: 9


In [27]:
only_teacher_passes = ['fail', 'fail', 'fail', 'fail', 'pass']
count_cases(expended_df, target_sequence=only_teacher_passes)

Number of columns with 'pass', 'fail', 'fail', 'fail', 'fail' in order: 1


In [17]:
import json

with open(llama_32_1b_base_file, "r") as f:
    base_data = json.load(f)

NameError: name 'llama_32_1b_base_file' is not defined

In [None]:
base_data['eval']

{'HumanEval/0': [{'task_id': 'HumanEval/0',
   'solution': 'from typing import List\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n    return any(\n        abs(numbers[i] - numbers[i + 1]) < threshold\n        for i in range(len(numbers) - 1)\n    )',
   'base_status': 'fail',
   'plus_status': 'fail',
   'base_fail_tests': [[[1.0, 2.0, 5.9, 4.0, 5.0], 0.95]],
   'plus_fail_tests': [[[0.1, 0.5, 1.0, 1.5, 2.0, 0.5], 0.4]]}],
 'HumanEval/1': [{'task_id': 'HumanEval/1',
   'solution': 'from typing import List\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into sep

In [None]:
base_data

{'date': '2024-11-28 05:15',
 'hash': 'fe585eb4df8c88d844eeb463ea4d0302',
 'eval': {'HumanEval/0': [{'task_id': 'HumanEval/0',
    'solution': 'from typing import List\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n    return any(\n        abs(numbers[i] - numbers[i + 1]) < threshold\n        for i in range(len(numbers) - 1)\n    )',
    'base_status': 'fail',
    'plus_status': 'fail',
    'base_fail_tests': [[[1.0, 2.0, 5.9, 4.0, 5.0], 0.95]],
    'plus_fail_tests': [[[0.1, 0.5, 1.0, 1.5, 2.0, 0.5], 0.4]]}],
  'HumanEval/1': [{'task_id': 'HumanEval/1',
    'solution': 'from typing import List\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    """ Input to this function is a string containing