## Revised CoT Length Analysis

There are several plots we can create with this data:

1. Simple percentile plot (x-axis is the percentile of word count)
2. NLP Task type (for each NLP task type, we visualize how the word output length impacts performance)
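3. Per-model breakdown (for each model, the average performance in each percentile bucket; see "Calculate By Model" below)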

In [6]:
import os
import json
import pandas as pd
import numpy as np

# Root directory whose sub-folders (one per word-count percentile bucket) hold
# the per-task performance JSON files.
# NOTE(review): absolute, machine-specific path — point it at your local copy of the data.
folder_path = "/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/performance_prompt_10"

# Keep only sub-directories. This drops the unused
# "performance_prompt_10.tar.gz" archive (and any other stray file, e.g.
# .DS_Store) without raising ValueError when the archive is not present.
# The names are sorted in ascending (lexicographic) order.
percentile_folders = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
)



In [7]:
percentile_folders  # Display the sorted list of percentile folder names

['performance_t_10',
 'performance_t_10_20',
 'performance_t_20_30',
 'performance_t_30_40',
 'performance_t_40_50',
 'performance_t_50_60',
 'performance_t_60_70',
 'performance_t_70_80',
 'performance_t_80_90',
 'performance_t_90_100']

In [8]:
# Average performance per percentile folder, pooled over every model and task.
# Keys are the folder's position in percentile_folders; values are the mean of
# all collected metric means (0 when a folder yields no usable metric).
avg_performances = {}

# Metric read from each model entry, tried in this priority order
# (the first key present wins).
metric_keys = ("accuracy", "f1_event", "rouge")

print(percentile_folders)

for i, folder in enumerate(percentile_folders):
    path = os.path.join(folder_path, folder)

    # Every metric mean found in this folder, across all files and models.
    avg_perf = []

    for file in os.listdir(path):
        if not file.endswith(".json"):
            print(f"Skipping non-JSON file: {file}")
            continue

        with open(os.path.join(path, file), "r") as f:
            data = json.load(f)

        # Each top-level key is a model name mapping to its metrics dict.
        for model_name, metrics in data.items():
            metric = next((key for key in metric_keys if key in metrics), None)
            if metric is None:
                print(f"Unknown metric in file: {file}")
                continue
            avg_perf.append(metrics[metric]["mean"])

    if avg_perf:
        avg_performance = np.mean(avg_perf)
    else:
        # Report the folder: the loop variable `file` is undefined when the
        # first folder is empty and otherwise names only the last file seen.
        print('No valid performance metrics found in folder:', folder)
        avg_performance = 0

    avg_performances[i] = avg_performance



['performance_t_10', 'performance_t_10_20', 'performance_t_20_30', 'performance_t_30_40', 'performance_t_40_50', 'performance_t_50_60', 'performance_t_60_70', 'performance_t_70_80', 'performance_t_80_90', 'performance_t_90_100']


In [9]:
# Show the pooled average per folder; keys follow the order of percentile_folders.
avg_performances

{0: 31.25172788831009,
 1: 33.7172816350533,
 2: 35.55663068957776,
 3: 35.88112448533007,
 4: 36.45400975033148,
 5: 37.18352553965084,
 6: 38.2295622675733,
 7: 38.48318778516153,
 8: 39.00902062770076,
 9: 40.89801245327306}

In [10]:
# Sample payload for a single model: each metric name maps to its mean,
# standard deviation and a two-element confidence interval.
# NOTE(review): the metric names here carry a "_subject" suffix, so none of
# them equals the plain "accuracy" / "f1_event" / "rouge" keys looked up by
# the aggregation loops — confirm how such files should be handled.
d = {
    "Llama-3.3-70B-Instruct": {
        "accuracy_subject": {
            "mean": 58.246511627906976,
            "std": 7.712689242645014,
            "ci": [57.76790312150295, 58.725120134311005],
        },
        "precision_subject": {
            "mean": 66.97410551806024,
            "std": 5.64778851748585,
            "ci": [66.62363377703717, 67.32457725908331],
        },
    },
}


## Calculate By Model

In [21]:
# Print the folder order again for reference before the per-model pass.
print(percentile_folders)

['performance_t_10', 'performance_t_10_20', 'performance_t_20_30', 'performance_t_30_40', 'performance_t_40_50', 'performance_t_50_60', 'performance_t_60_70', 'performance_t_70_80', 'performance_t_80_90', 'performance_t_90_100']


In [None]:
# Per-model average performance for each percentile folder:
#   {folder position in percentile_folders: {model name: mean score, 2 d.p.}}
avg_performances = {}

# Metric read from each model entry, tried in this priority order
# (the first key present wins).
metric_keys = ("accuracy", "f1_event", "rouge")

for i, folder in enumerate(percentile_folders):
    print(folder)
    path = os.path.join(folder_path, folder)

    # model name -> metric means collected from every JSON file in this folder
    avg_perf = {}

    for file in os.listdir(path):
        if not file.endswith(".json"):
            print(f"Skipping non-JSON file: {file}")
            continue

        with open(os.path.join(path, file), "r") as f:
            data = json.load(f)

        # Each top-level key is a model name mapping to its metrics dict.
        for model_name, metrics in data.items():
            # Register the model before the metric lookup so that it still
            # appears in the result when none of its metrics is recognised.
            scores = avg_perf.setdefault(model_name, [])

            metric = next((key for key in metric_keys if key in metrics), None)
            if metric is None:
                print(f"Unknown metric in file: {file}")
                continue
            scores.append(metrics[metric]["mean"])

    # A model without any recognised metric gets NaN explicitly; np.mean([])
    # would return the same NaN but also emit a RuntimeWarning.
    out = {
        model: round(np.mean(scores), 2) if scores else np.nan
        for model, scores in avg_perf.items()
    }

    avg_performances[i] = out



performance_t_10
82.CHIP-CTC.cot.performance.json
43.IMCS-V2-NER.cot.performance.json
1-2.ADE-ADE relation.cot.performance.json
85.IMCS-V2-SR.cot.performance.json
105.MIMIC-IV CDM.cot.performance.json
8.CARES.icd10_chapter.cot.performance.json
63.MTSamples-temporal annotation.cot.performance.json
27.DiSMed.cot.performance.json
55.MedNLI.cot.performance.json
41.n2c2 2014 - De-identification.cot.performance.json
8.CARES.icd10_sub_block.cot.performance.json
48.meddocan.cot.performance.json
87.IMCS-V2-DAC.cot.performance.json
38-3.i2b2-2010-Relations-Challenge-relation.cot.performance.json
51.MEDIQA_2019_Task2_RQE.cot.performance.json
76-3.MTS-Dialog-MEDIQA-2023-sum-task-B.cot.performance.json
7.Cantemist.CODING.cot.performance.json
33.GOUT-CC.consensus.cot.performance.json
106.MIMIC-III Outcome.LoS.cot.performance.json
107.MIMIC-IV BHC.cot.performance.json
31.Ex4CDS.cot.performance.json
29.EHRQA.qa.cot.performance.json
6.Brateca.mortality.cot.performance.json
17-1.CLEF_eHealth_2020_CodiEs

In [32]:
# `out` is rebound on every iteration of the per-model loop, so this shows the
# per-model averages of the last folder processed only.
out

{'Llama-3.3-70B-Instruct': 41.94,
 'MeLLaMA-70B-chat': 29.44,
 'Qwen2.5-72B-Instruct': 45.14,
 'gpt-4o': 47.07}

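Desired output (one entry per model, mapping each percentile index to that model's average score):

{
    model1: {
        0: <avg>,
        1: <avg>,
        2: <avg>,
        3: <avg>,
        ...
    },

    model2: {
        ...
    }
}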

In [33]:
# Show the per-folder results; each value is a {model name: rounded mean score} dict.
avg_performances


{0: {'Llama-3.3-70B-Instruct': 32.52,
  'MeLLaMA-70B-chat': 23.59,
  'Qwen2.5-72B-Instruct': 33.84,
  'gpt-4o': 35.05},
 1: {'Llama-3.3-70B-Instruct': 34.34,
  'MeLLaMA-70B-chat': 26.57,
  'Qwen2.5-72B-Instruct': 36.26,
  'gpt-4o': 37.71},
 2: {'Llama-3.3-70B-Instruct': 35.88,
  'MeLLaMA-70B-chat': 29.69,
  'Qwen2.5-72B-Instruct': 37.52,
  'gpt-4o': 39.14},
 3: {'Llama-3.3-70B-Instruct': 35.97,
  'MeLLaMA-70B-chat': 29.57,
  'Qwen2.5-72B-Instruct': 38.41,
  'gpt-4o': 39.58},
 4: {'Llama-3.3-70B-Instruct': 36.68,
  'MeLLaMA-70B-chat': 30.38,
  'Qwen2.5-72B-Instruct': 38.54,
  'gpt-4o': 40.22},
 5: {'Llama-3.3-70B-Instruct': 37.25,
  'MeLLaMA-70B-chat': 31.43,
  'Qwen2.5-72B-Instruct': 39.22,
  'gpt-4o': 40.83},
 6: {'Llama-3.3-70B-Instruct': 38.07,
  'MeLLaMA-70B-chat': 31.78,
  'Qwen2.5-72B-Instruct': 40.43,
  'gpt-4o': 42.64},
 7: {'Llama-3.3-70B-Instruct': 38.94,
  'MeLLaMA-70B-chat': 30.08,
  'Qwen2.5-72B-Instruct': 41.27,
  'gpt-4o': 43.65},
 8: {'Llama-3.3-70B-Instruct': 39.5,
  '

In [34]:
# Row labels for the ten percentile folders: position 0 -> "t10", ..., 9 -> "t100".
index_labels = {position: f"t{10 * (position + 1)}" for position in range(10)}

# One row per percentile folder, one column per model. The columns already
# carry the model names used as keys in avg_performances, so no column rename
# is applied (an integer-keyed column mapping would never match them).
df = (
    pd.DataFrame.from_dict(avg_performances, orient="index")
    .rename(index=index_labels)
)

# Persist the table; the row labels are written as the first CSV column.
df.to_csv("performance_prompt_10.csv", index=True)
