# Figure Creation Notebook

-------------------

## 0. Helpful Initializations

In [2]:
import pandas as pd
import numpy as np
import math
import json
import os
import re


def load_sheet(sheet_path):
    '''
    Input: path to excel sheet
    Output: pandas dataframe of entire excel spreadsheet
    '''
    sheet = pd.ExcelFile(sheet_path)

    return sheet


sheet_path = "/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx"
google_sheet = load_sheet(sheet_path)

In [3]:
model_size_mapping = {'DeepSeek-R1-Distill-Qwen-1.5B': 1.5, 'DeepSeek-R1-Distill-Qwen-7B': 7,
                       'DeepSeek-R1-Distill-Llama-8B': 8, 'DeepSeek-R1-Distill-Qwen-14B': 14,
                         'DeepSeek-R1-Distill-Qwen-32B': 32, 'DeepSeek-R1-Distill-Llama-70B': 70, 
                         'DeepSeek-R1': 671, 'Baichuan-M1-14B-Instruct': 14, 'gemma-2-9b-it': 9, 
                         'gemma-2-27b-it': 27, 'gemma-3-1b-it': 1, 'gemma-3-4b-it': 4, 'gemma-3-12b-it': 12, 
                         'gemma-3-27b-it': 27, 'Llama-3.1-8B-Instruct': 8, 'Llama-3.1-70B-Instruct': 70, 'Llama-3.2-1B-Instruct': 1, 'Llama-3.2-3B-Instruct': 3, 'Llama-3.3-70B-Instruct': 70, 'Llama-4-Scout-17B-16E-Instruct': 109, 'Llama-3.1-Nemotron-70B-Instruct-HF': 70, 'meditron-7b': 7, 'meditron-70b': 70, 'MeLLaMA-13B-chat': 13, 'MeLLaMA-70B-chat': 70, 'Llama3-OpenBioLLM-8B': 8, 'Llama3-OpenBioLLM-70B': 70, 'MMed-Llama-3-8B': 8, 'Llama-3.1-8B-UltraMedical': 8, 'Llama-3-70B-UltraMedical': 70, 'Ministral-8B-Instruct-2410': 8, 'Mistral-Small-Instruct-2409': 22, 'Mistral-Small-24B-Instruct-2501': 24, 'Mistral-Small-3.1-24B-Instruct-2503': 24, 'Mistral-Large-Instruct-2411': 123, 'BioMistral-7B': 7, 'Phi-3.5-mini-instruct': 4, 'Phi-3.5-MoE-instruct': 42, 'Phi-4': 14, 'Qwen2.5-1.5B-Instruct': 1.5, 'Qwen2.5-3B-Instruct': 3, 'Qwen2.5-7B-Instruct': 7, 'Qwen2.5-72B-Instruct': 72, 'QwQ-32B-Preview': 32, 'QWQ-32B': 32, 'Athene-V2-Chat': 72, 'Yi-1.5-9B-Chat-16K': 9, 'Yi-1.5-34B-Chat-16K': 34, 'gpt-35-turbo-0125': '/', 'gpt-4o-0806': '/', 'gemini-2.0-flash-001': '/', 'gemini-1.5-pro-002': '/'}

In [4]:
models = [
    "DeepSeek-R1",
    "gpt-4o-0806",
    "gemini-1.5-pro-002",
    "gemini-2.0-flash-001",
    "Athene-V2-Chat",
    "Mistral-Large-Instruct-2411",
    "Qwen2.5-72B-Instruct",
    "gemma-3-27b-it",
    "Llama-3.3-70B-Instruct",
    "DeepSeek-R1-Distill-Llama-70B",
    "Llama-3.1-70B-Instruct",
    "QWQ-32B",
    "DeepSeek-R1-Distill-Qwen-32B",
    "gemma-3-12b-it",
    "Baichuan-M1-14B-Instruct",
    "gemma-2-27b-it",
    "Phi-4",
    "Mistral-Small-3.1-24B-Instruct-2503",
    "gpt-35-turbo-0125",
    "DeepSeek-R1-Distill-Qwen-14B",
    "Mistral-Small-24B-Instruct-2501",
    "gemma-2-9b-it",
    "Llama-3-70B-UltraMedical",
    "Llama3-OpenBioLLM-70B",
    "Mistral-Small-Instruct-2409",
    "Llama-4-Scout-17B-16E-Instruct",
    "Qwen2.5-7B-Instruct",
    "Yi-1.5-34B-Chat-16K",
    "QwQ-32B-Preview",
    "Llama-3.1-8B-Instruct",
    "Llama-3.1-Nemotron-70B-Instruct-HF",
    "Ministral-8B-Instruct-2410",
    "MeLLaMA-70B-chat",
    "gemma-3-4b-it",
    "Yi-1.5-9B-Chat-16K",
    "Phi-3.5-MoE-instruct",
    "Qwen2.5-3B-Instruct",
    "DeepSeek-R1-Distill-Llama-8B",
    "Llama-3.2-3B-Instruct",
    "Phi-3.5-mini-instruct",
    "MMed-Llama-3-8B",
    "Qwen2.5-1.5B-Instruct",
    "DeepSeek-R1-Distill-Qwen-7B",
    "MeLLaMA-13B-chat",
    "Llama-3.1-8B-UltraMedical",
    "Llama3-OpenBioLLM-8B",
    "meditron-70b",
    "gemma-3-1b-it",
    "BioMistral-7B",
    "DeepSeek-R1-Distill-Qwen-1.5B",
    "Llama-3.2-1B-Instruct",
    "meditron-7b"
]

In [5]:
all_english_tasks = {
    "ADE-Identification",
    "ADE-Extraction",
    "ADE-Drug dosage",
    "BrainMRI-AIS",
    "ClinicalNotes-UPMC",
    "CLIP",
    "GOUT-CC-Consensus",
    "n2c2 2006-De-identification",
    "Medication extraction",
    "n2c2 2010-Concept",
    "n2c2 2010-Assertion",
    "n2c2 2010-Relation",
    "n2c2 2014-De-identification",
    "MEDIQA 2019-RQE",
    "MedNLI",
    "MedSTS",
    "MTS",
    "MTS-Temporal",
    "n2c2 2018-ADE&medication",
    "MEDIQA 2023-chat-A",
    "MEDIQA 2023-sum-A",
    "MEDIQA 2023-sum-B",
    "n2c2 2014-Diabetes",
    "n2c2 2014-CAD",
    "n2c2 2014-Hyperlipidemia",
    "n2c2 2014-Hypertension",
    "n2c2 2014-Medication",
    "icliniq-10k",
    "HealthCareMagic-100k",
    "MIMIC-IV CDM",
    "MIMIC-III Outcome.LoS",
    "MIMIC-III Outcome.Mortality",
    "MIMIC-IV BHC",
    "MIMIC-IV DiReCT.Dis",
    "MIMIC-IV DiReCT.PDD"
}

In [6]:
data = google_sheet.parse("All-Sheet")

model_domain_mapping = {}

for row_idx, model_name in enumerate(data["Model Name"]):
    if model_name not in model_domain_mapping:
        domain = data["Model Domain"][row_idx]
        if domain == "gen":
            domain = "General"
        elif domain == "med":
            domain = "Medical"
        model_domain_mapping[model_name] = domain

print(model_domain_mapping)

{'Baichuan-M1-14B-Instruct': 'Medical', 'DeepSeek-R1': 'General', 'DeepSeek-R1-Distill-Llama-8B': 'General', 'DeepSeek-R1-Distill-Llama-70B': 'General', 'DeepSeek-R1-Distill-Qwen-1.5B': 'General', 'DeepSeek-R1-Distill-Qwen-7B': 'General', 'DeepSeek-R1-Distill-Qwen-14B': 'General', 'DeepSeek-R1-Distill-Qwen-32B': 'General', 'gemma-2-9b-it': 'General', 'gemma-2-27b-it': 'General', 'gemma-3-1b-it': 'General', 'gemma-3-4b-it': 'General', 'gemma-3-12b-it': 'General', 'gemma-3-27b-it': 'General', 'Llama-3.1-8B-Instruct': 'General', 'Llama-3.1-70B-Instruct': 'General', 'Llama-3.2-1B-Instruct': 'General', 'Llama-3.2-3B-Instruct': 'General', 'Llama-3.3-70B-Instruct': 'General', 'Llama-4-Scout-17B-16E-Instruct': 'General', 'Llama-3.1-Nemotron-70B-Instruct-HF': 'General', 'meditron-7b': 'Medical', 'meditron-70b': 'Medical', 'MeLLaMA-13B-chat': 'Medical', 'MeLLaMA-70B-chat': 'Medical', 'Llama3-OpenBioLLM-8B': 'Medical', 'Llama3-OpenBioLLM-70B': 'Medical', 'MMed-Llama-3-8B': 'Medical', 'Llama-3.1-8

In [7]:
accessibility_mapping = {
    "Baichuan-M1-14B-Instruct": "open source",
    "DeepSeek-R1": "open source",
    "DeepSeek-R1-Distill-Llama-8B": "open source",
    "DeepSeek-R1-Distill-Llama-70B": "open source",
    "DeepSeek-R1-Distill-Qwen-1.5B": "open source",
    "DeepSeek-R1-Distill-Qwen-7B": "open source",
    "DeepSeek-R1-Distill-Qwen-14B": "open source",
    "DeepSeek-R1-Distill-Qwen-32B": "open source",
    "gemma-2-9b-it": "open source",
    "gemma-2-27b-it": "open source",
    "gemma-3-1b-it": "open source",
    "gemma-3-4b-it": "open source",
    "gemma-3-12b-it": "open source",
    "gemma-3-27b-it": "open source",
    "Llama-3.1-8B-Instruct": "open source",
    "Llama-3.1-70B-Instruct": "open source",
    "Llama-3.2-1B-Instruct": "open source",
    "Llama-3.2-3B-Instruct": "open source",
    "Llama-3.3-70B-Instruct": "open source",
    "Llama-4-Scout-17B-16E-Instruct": "open source",
    "Llama-3.1-Nemotron-70B-Instruct-HF": "open source",
    "meditron-7b": "open source",
    "meditron-70b": "open source",
    "MeLLaMA-13B-chat": "open source",
    "MeLLaMA-70B-chat": "open source",
    "Llama3-OpenBioLLM-8B": "open source",
    "Llama3-OpenBioLLM-70B": "open source",
    "MMed-Llama-3-8B": "open source",
    "Llama-3.1-8B-UltraMedical": "open source",
    "Llama-3-70B-UltraMedical": "open source",
    "Ministral-8B-Instruct-2410": "open source",
    "Mistral-Small-Instruct-2409": "open source",
    "Mistral-Small-24B-Instruct-2501": "open source",
    "Mistral-Small-3.1-24B-Instruct-2503": "open source",
    "Mistral-Large-Instruct-2411": "open source",
    "BioMistral-7B": "open source",
    "Phi-3.5-mini-instruct": "open source",
    "Phi-3.5-MoE-instruct": "open source",
    "Phi-4": "open source",
    "Qwen2.5-1.5B-Instruct": "open source",
    "Qwen2.5-3B-Instruct": "open source",
    "Qwen2.5-7B-Instruct": "open source",
    "Qwen2.5-72B-Instruct": "open source",
    "QwQ-32B-Preview": "open source",
    "QWQ-32B": "open source",
    "Athene-V2-Chat": "open source",
    "Yi-1.5-9B-Chat-16K": "open source",
    "Yi-1.5-34B-Chat-16K": "open source",
    "gemini-1.5-pro-002": "commercial",
    "gemini-2.0-flash-001": "commercial",
    "gpt-4o-0806": "commercial",
    "gpt-35-turbo-0125": "commercial"
}

In [8]:
task_to_type_map = {
    "ADE-Identification": "Text Classification",
    "ADE-Extraction": "Event Extraction",
    "ADE-Drug dosage": "Event Extraction",
    "BARR2": "Event Extraction",
    "BrainMRI-AIS": "Text Classification",
    "Brateca-Hospitalization": "Text Classification",
    "Brateca-Mortality": "Text Classification",
    "Cantemist-Coding": "Normalization and Coding",
    "Cantemis-NER": "Named Entity Recognition",
    "Cantemis-Norm": "Normalization and Coding",
    "CARES-Area": "Text Classification",
    "CARES ICD10 Block": "Normalization and Coding",
    "CARES-ICD10 Chapter": "Normalization and Coding",
    "CARES-ICD10 Subblock": "Normalization and Coding",
    "CHIP-CDEE": "Event Extraction",
    "C-EMRS": "Text Classification",
    "CodiEsp-ICD-10-CM": "Normalization and Coding",
    "CodiEsp-ICD-10-PCS": "Normalization and Coding",
    "ClinicalNotes-UPMC": "Text Classification",
    "PPTS": "Text Classification",
    "CLINpt-NER": "Named Entity Recognition",
    "CLIP": "Text Classification",
    "cMedQA": "Question Answering",
    "DialMed": "Text Classification",
    "DiSMed-NER": "Named Entity Recognition",
    "MIE": "Event Extraction",
    "EHRQA-Primary department": "Text Classification",
    "EHRQA-QA": "Question Answering",
    "EHRQA-Sub department": "Text Classification",
    "Ex4CDS": "Named Entity Recognition",
    "GOUT-CC-Consensus": "Text Classification",
    "n2c2 2006-De-identification": "Named Entity Recognition",
    "Medication extraction": "Event Extraction",
    "n2c2 2010-Concept": "Named Entity Recognition",
    "n2c2 2010-Assertion": "Named Entity Recognition",
    "n2c2 2010-Relation": "Event Extraction",
    "n2c2 2014-De-identification": "Named Entity Recognition",
    "IMCS-V2-NER": "Named Entity Recognition",
    "JP-STS": "Semantic Similarity",
    "meddocan": "Named Entity Recognition",
    "MEDIQA 2019-RQE": "Natural Language Inference",
    "MedNLI": "Natural Language Inference",
    "MedSTS": "Semantic Similarity",
    "MTS": "Text Classification",
    "MTS-Temporal": "Named Entity Recognition",
    "n2c2 2018-ADE&medication": "Event Extraction",
    "NorSynthClinical-NER": "Named Entity Recognition",
    "NorSynthClinical-RE": "Event Extraction",
    "NUBES": "Event Extraction",
    "MEDIQA 2023-chat-A": "Summarization",
    "MEDIQA 2023-sum-A": "Text Classification",
    "MEDIQA 2023-sum-B": "Summarization",
    "RuMedDaNet": "Natural Language Inference",
    "CBLUE-CDN": "Normalization and Coding",
    "CHIP-CTC": "Text Classification",
    "CHIP-MDCFNPC": "Event Extraction",
    "MedDG": "Question Answering",
    "IMCS-V2-SR": "Event Extraction",
    "IMCS-V2-MRG": "Summarization",
    "IMCS-V2-DAC": "Text Classification",
    "n2c2 2014-Diabetes": "Event Extraction",
    "n2c2 2014-CAD": "Event Extraction",
    "n2c2 2014-Hyperlipidemia": "Event Extraction",
    "n2c2 2014-Hypertension": "Event Extraction",
    "n2c2 2014-Medication": "Event Extraction",
    "CAS-label": "Event Extraction",
    "CAS-evidence": "Summarization",
    "RuMedNLI": "Natural Language Inference",
    "RuDReC-NER": "Named Entity Recognition",
    "NorSynthClinical-PHI": "Named Entity Recognition",
    "RuCCoN": "Named Entity Recognition",
    "CLISTER": "Semantic Similarity",
    "BRONCO150-NER&Status": "Event Extraction",
    "CARDIO-DE": "Named Entity Recognition",
    "GraSSCo PHI": "Named Entity Recognition",
    "IFMIR-Incident type": "Text Classification",
    "IFMIR-NER": "Named Entity Recognition",
    "IFMIR - NER&factuality": "Event Extraction",
    "iCorpus": "Named Entity Recognition",
    "icliniq-10k": "Question Answering",
    "HealthCareMagic-100k": "Question Answering",
    "MIMIC-IV CDM": "Text Classification",
    "MIMIC-III Outcome.LoS": "Text Classification",
    "MIMIC-III Outcome.Mortality": "Text Classification",
    "MIMIC-IV BHC": "Summarization",
    "MIMIC-IV DiReCT.Dis": "Text Classification",
    "MIMIC-IV DiReCT.PDD": "Text Classification"
}


In [9]:
def sort_by_reference(reference, toSort):
    # Create a name-to-index mapping from the reference list
    order = {name: idx for idx, (name, _) in enumerate(reference)}

    # Step 2: Reorder cot_performance using those indices
    ans = sorted(toSort, key=lambda x: order.get(x[0], float('inf')))

    return ans

_____

## 1. Model vs. Performance

This creates the first figure in the data analysis Overleaf

____

In [10]:
# sheet_path = "/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx"
sheet_path = "/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx"
google_sheet = load_sheet(sheet_path)

def get_model_performances(google_sheet, sheet_name, cot):
    '''
    Inputs:
        - google_sheet: loaded google sheet (entire .xlsx file)
        - sheet_name: name of the sheet you want to load
        - cot: boolean indicating whether to get CoT or DA performance
    
    Output:
        - model_performances: Dictionary mapping { model name : list of all task performances (relative difference) with that model}
    '''

    model_performances = {}

    data = google_sheet.parse(sheet_name)

    if cot:
        col = "CoT Score"

    else:
        col = "Direct Score"


    for row, model in enumerate(data["Model Name"]):

        if model not in model_performances:
            model_performances[model] = []
        
        model_performances[model].append(data[col][row])

    return model_performances


In [11]:
clf_model_performances = get_model_performances(google_sheet, 'CLF-Difference', cot=True)
ext_model_performances = get_model_performances(google_sheet, 'EXT-Difference', cot=True)
gen_model_performances = get_model_performances(google_sheet, 'Gen-Difference', cot=True)


combined_model_performances = {}

# Combine into one
for model in clf_model_performances:
    combined_model_performances[model] = clf_model_performances[model]
    combined_model_performances[model].extend(ext_model_performances[model])
    combined_model_performances[model].extend(gen_model_performances[model])

# Average performances
for model in combined_model_performances:
    combined_model_performances[model] = round(sum(combined_model_performances[model]) / len(combined_model_performances[model]), 2)

In [12]:
print(sorted(combined_model_performances.items(), key=lambda x: x[1], reverse=True))

[('DeepSeek-R1', 42.1), ('gemini-2.0-flash-001', 41.98), ('gpt-4o-0806', 40.66), ('gemini-1.5-pro-002', 40.53), ('Athene-V2-Chat', 39.34), ('DeepSeek-R1-Distill-Llama-70B', 38.95), ('Mistral-Large-Instruct-2411', 38.9), ('Qwen2.5-72B-Instruct', 38.86), ('DeepSeek-R1-Distill-Qwen-32B', 38.72), ('gemma-3-27b-it', 37.55), ('QWQ-32B', 37.03), ('Llama-3.3-70B-Instruct', 36.83), ('Mistral-Small-3.1-24B-Instruct-2503', 36.23), ('gemma-3-12b-it', 35.37), ('Llama-3.1-70B-Instruct', 35.1), ('DeepSeek-R1-Distill-Qwen-14B', 34.79), ('Baichuan-M1-14B-Instruct', 34.36), ('gemma-2-27b-it', 34.22), ('Phi-4', 32.59), ('gpt-35-turbo-0125', 31.63), ('Mistral-Small-24B-Instruct-2501', 31.59), ('Mistral-Small-Instruct-2409', 31.17), ('Qwen2.5-7B-Instruct', 30.25), ('gemma-2-9b-it', 29.94), ('Yi-1.5-34B-Chat-16K', 29.57), ('Llama-3-70B-UltraMedical', 29.44), ('Llama-3.1-8B-Instruct', 29.4), ('Llama-4-Scout-17B-16E-Instruct', 29.38), ('MeLLaMA-70B-chat', 29.25), ('Llama3-OpenBioLLM-70B', 28.78), ('gemma-3-4b

### Get Model Sizes

In [14]:
sheet = pd.ExcelFile("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/Reference for Clinical Benchmark and LLM.xlsx")

model_data = sheet.parse("Models (Simplified)")


In [15]:
model_size_mapping = {}

for row, model_name in enumerate(model_data["Name"]):
    if model_name in models:
        model_size_mapping[model_name] = model_data["Size (B)"][row]
    else:
        print(f"Model {model_name} not found in the list of models.")

In [16]:
print(model_size_mapping)

{'DeepSeek-R1-Distill-Qwen-1.5B': 1.5, 'DeepSeek-R1-Distill-Qwen-7B': 7, 'DeepSeek-R1-Distill-Llama-8B': 8, 'DeepSeek-R1-Distill-Qwen-14B': 14, 'DeepSeek-R1-Distill-Qwen-32B': 32, 'DeepSeek-R1-Distill-Llama-70B': 70, 'DeepSeek-R1': 671, 'Baichuan-M1-14B-Instruct': 14, 'gemma-2-9b-it': 9, 'gemma-2-27b-it': 27, 'gemma-3-1b-it': 1, 'gemma-3-4b-it': 4, 'gemma-3-12b-it': 12, 'gemma-3-27b-it': 27, 'Llama-3.1-8B-Instruct': 8, 'Llama-3.1-70B-Instruct': 70, 'Llama-3.2-1B-Instruct': 1, 'Llama-3.2-3B-Instruct': 3, 'Llama-3.3-70B-Instruct': 70, 'Llama-4-Scout-17B-16E-Instruct': 109, 'Llama-3.1-Nemotron-70B-Instruct-HF': 70, 'meditron-7b': 7, 'meditron-70b': 70, 'MeLLaMA-13B-chat': 13, 'MeLLaMA-70B-chat': 70, 'Llama3-OpenBioLLM-8B': 8, 'Llama3-OpenBioLLM-70B': 70, 'MMed-Llama-3-8B': 8, 'Llama-3.1-8B-UltraMedical': 8, 'Llama-3-70B-UltraMedical': 70, 'Ministral-8B-Instruct-2410': 8, 'Mistral-Small-Instruct-2409': 22, 'Mistral-Small-24B-Instruct-2501': 24, 'Mistral-Small-3.1-24B-Instruct-2503': 24, 'M

____

## 2. NLP Task vs. Performance

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_nlp_task_performances(google_sheet, sheet_names, is_CoT):
    '''
    Inputs:
    '''

    nlp_performances = {}  # maps nlp task --> average performances

    for sheet_name in sheet_names:
        data = google_sheet.parse(sheet_name)

        for row_idx, task_type in enumerate(data['Task Type']):

            if task_type not in nlp_performances:
                nlp_performances[task_type] = []

            if is_CoT:
                score = data["CoT Score"][row_idx]
            else:
                score = data["Direct Score"][row_idx]

            nlp_performances[task_type].append(score)

    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        
    return nlp_performances


In [None]:
perf = get_nlp_task_performances(google_sheet, ['CLF-Difference', 'EXT-Difference', 'Gen-Difference'], is_CoT=True)

df = pd.DataFrame(perf, index=[0])



df.to_csv('testingg.csv')

In [None]:
print(perf)

_____

## 3. Types of LLM: Medical vs. General  (Performance Drops)

In [10]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_domain_task_performances(google_sheet, sheet_names, is_CoT, model_domain):
    # Want our output to be: 
    # dictionary mapping { nlp task --> performance on ONLY MEDICAL MODELS }

    nlp_performances = {}  # maps nlp task --> average performances of a certain kind of model

    for sheet_name in sheet_names:
        data = google_sheet.parse(sheet_name)

        for row_idx, task_type in enumerate(data['Task Type']):
            model_type = data['Model Domain'][row_idx]

            # Add the task type to our final dictionary
            if task_type not in nlp_performances:
                nlp_performances[task_type] = []

            # Add the correct metric (direct or CoT) to the dictionary IFF the model type matches the model domain (gen, or med)
            if is_CoT and model_type == model_domain:
                score = data["CoT Score"][row_idx]
                nlp_performances[task_type].append(score)
            elif not is_CoT and model_type == model_domain:
                score = data["Direct Score"][row_idx]
                nlp_performances[task_type].append(score)

    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        
    # print(nlp_performances)
    return nlp_performances

#  Need to generate 4 lists
#  CoT + gen
#  CoT + med
#  Direct + gen
#  Direct + med

sheet_names = ['CLF-Difference', 'EXT-Difference', 'Gen-Difference']

cot_general = get_domain_task_performances(google_sheet, sheet_names, is_CoT=True, model_domain="gen")
cot_medical = get_domain_task_performances(google_sheet, sheet_names, is_CoT=True, model_domain="med")
direct_general = get_domain_task_performances(google_sheet, sheet_names, is_CoT=False, model_domain="gen")
direct_medical = get_domain_task_performances(google_sheet, sheet_names, is_CoT=False, model_domain="med")



In [11]:
print(cot_general)
print(cot_medical)
print(direct_general)
print(direct_medical)


{'Text Classification': 56.31, 'Normalization and Coding': 3.38, 'Semantic Similarity': 36.13, 'Natural Language Inference': 76.54, 'Event Extraction': 13.67, 'Named Entity Recognition': 24.79, 'Question Answering': 13.46, 'Summarization': 23.05}
{'Text Classification': 42.35, 'Normalization and Coding': 1.31, 'Semantic Similarity': 29.11, 'Natural Language Inference': 63.25, 'Event Extraction': 6.5, 'Named Entity Recognition': 9.46, 'Question Answering': 10.99, 'Summarization': 15.47}
{'Text Classification': 58.07, 'Normalization and Coding': 3.69, 'Semantic Similarity': 38.1, 'Natural Language Inference': 77.81, 'Event Extraction': 17.55, 'Named Entity Recognition': 28.37, 'Question Answering': 16.0, 'Summarization': 29.39}
{'Text Classification': 45.95, 'Normalization and Coding': 1.54, 'Semantic Similarity': 31.21, 'Natural Language Inference': 67.34, 'Event Extraction': 9.27, 'Named Entity Recognition': 13.02, 'Question Answering': 11.8, 'Summarization': 19.93}


____

## 4. Types of LLM: Commercial vs. Open Source (Performance Drops)

Very similar to the previous section. This section visualizes how the use of commercial vs. open source models relates to the performance drop associated with using CoT vs. Direct Answering


From the previous section, we can conclude that their is a much larger performance drop in medical models using CoT vs. when using DA.

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_domain_task_performances(google_sheet, sheet_names, is_CoT, model_accessibility):
    # Want our output to be: 
    # dictionary mapping { nlp task --> performance on ONLY OPEN SOURCE MODELS }, wlog

    nlp_performances = {}  # maps nlp task --> average performances of a certain kind of model

    for sheet_name in sheet_names:
        data = google_sheet.parse(sheet_name)

        for row_idx, task_type in enumerate(data['Task Type']):
            model_name = data['Model Name'][row_idx]

            model_type = accessibility_mapping[model_name]   # open source or commercial

            # # Add the task type to our final dictionary
            if task_type not in nlp_performances:
                nlp_performances[task_type] = []


            if is_CoT and model_type == model_accessibility:   # CoT for commercial/open source:
                score = data["CoT Score"][row_idx]
                nlp_performances[task_type].append(score)

            elif not is_CoT and model_type == model_accessibility:
                score = data["Direct Score"][row_idx]
                nlp_performances[task_type].append(score)

    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        
    # print(nlp_performances)
    return nlp_performances

#  Need to generate 4 lists
#  CoT + open
#  CoT + commercial
#  Direct + open
#  Direct + commerical

sheet_names = ['CLF-Difference', 'EXT-Difference', 'Gen-Difference']

cot_open = get_domain_task_performances(google_sheet, sheet_names, is_CoT=True, model_accessibility="open source")
cot_comm = get_domain_task_performances(google_sheet, sheet_names, is_CoT=True, model_accessibility="commercial")
direct_open = get_domain_task_performances(google_sheet, sheet_names, is_CoT=False, model_accessibility="open source")
direct_comm = get_domain_task_performances(google_sheet, sheet_names, is_CoT=False, model_accessibility="commercial")

In [None]:
print(cot_open)
print(cot_comm)
print(direct_open)
print(direct_comm)


____

## 5. Individual Task vs. Performance


Each sheet has different tasks (mutually exclusive). Should iterate through each sheet.

In [None]:
def get_task_name_performance(google_sheet, sheet_names, is_CoT):
    task_performances = {}  # maps each TASK NAME (i.e. ADE-Identification) to their performances

    for sheet_name in sheet_names:

        data = google_sheet.parse(sheet_name)

        for row_idx, task_name in enumerate(data["Task Name"]):
            if task_name not in task_performances:
                task_performances[task_name] = []

            if is_CoT:
                score = data["CoT Score"][row_idx]

            else:
                score = data["Direct Score"][row_idx]

            task_performances[task_name].append(score)

    # Average the performances
    for task, values_list in task_performances.items():
        task_performances[task] = round(sum(values_list) / len(values_list), 2)

    return task_performances

In [None]:
sheet_names = ['CLF-Difference', 'EXT-Difference', 'Gen-Difference']

# Get performances on each task via CoT prompting
cot_task_performances = get_task_name_performance(google_sheet, sheet_names, is_CoT=True)

# Get performances on each task via Direct answering
direct_task_performances = get_task_name_performance(google_sheet, sheet_names, is_CoT=False)

In [None]:
print(cot_task_performances)
print(direct_task_performances)

#### Create CSV file for this data


- Header (Columns): Task Name, Task Type, Direct Score, CoT Score, Difference, Relative Difference

In [None]:
final_csv = {
    "Task Name": [],
    "Task Type": [],
    "Direct Score": [],
    "CoT Score": [],
    "Difference": [],
    "Relative Difference": []
} 

for task_name, score in cot_task_performances.items():
    final_csv["Task Name"].append(task_name)
    final_csv["Task Type"].append(task_to_type_map[task_name])
    final_csv["CoT Score"].append(score)

for task_name, score in direct_task_performances.items():
    final_csv["Direct Score"].append(score)

cot_scores = list(cot_task_performances.values())
direct_scores = list(direct_task_performances.values())

for direct, cot in zip(direct_scores, cot_scores):
    final_csv["Difference"].append(round(direct - cot, 2))
    final_csv["Relative Difference"].append(round((direct - cot) / direct * 100, 2))

df = pd.DataFrame(final_csv)
df.to_csv("Task Name vs. Performance.csv")

In [None]:
df = pd.DataFrame(final_csv)
df.to_csv("Task Name vs. Performance.csv")

___

____
____

___

___

### ?. English-Task Only + Model Specific

In [None]:
# MIMIC-III Outcome.Mortality
# ADE-Extraction
# n2c2 2014-De-identification
# MedNLI
# HealthCareMagic-100k
# MedSTS
# MIMIC-IV BHC

In [None]:
english_tasks = {'MIMIC-III Outcome.Mortality', 'ADE-Extraction', 'n2c2 2014-De-identification', 'MedNLI', 'HealthCareMagic-100k', 'MedSTS', 'MIMIC-IV BHC'}

models = {'Qwen2.5-72B-Instruct', 'Mistral-Large-Instruct-2411', 'Athene-V2-Chat', 'Llama-3.3-70B-Instruct'}

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

data = google_sheet.parse('All-Sheet')


def get_scores(google_sheet, isCoT):
    result = {}  # mapping of task: [list of scores]

    for row_idx, task in enumerate(data["Task Name"]):
        if task in english_tasks:

            if task not in result:
                result[task] = []

            model_name = data["Model Name"][row_idx]

            if model_name in models:
                if isCoT:
                    result[task].append(data["CoT Score"][row_idx])
                else:
                    result[task].append(data["Direct Score"][row_idx])

        
    return result

cot = get_scores(google_sheet, True)
direct = get_scores(google_sheet, False)
        

In [None]:
print(cot)
print(direct)

#### Compute Differences

In [None]:
# # Average values for each task
# for key, values_ls in cot.items():
#     cot[key] = round(sum(values_ls) / len(values_ls), 2)

# for key, values_ls in direct.items():
#     direct[key] = round(sum(values_ls) / len(values_ls), 2)

print('CoT:', cot)
print('Direct:', direct)

# Compute relative differences (%) for each task   --> (direct - cot) / direct
relative_differences = {}
for key in cot:
    relative_diff = round( ((direct[key] - cot[key]) / direct[key]) * 100, 2)

    relative_differences[key] = relative_diff

print(relative_differences)



___

___

## 6. High-Low LLM vs. Performance

In [None]:
def get_best_models(google_sheet):
    data = google_sheet.parse("All-Sheet")

    results = {}

    for row_idx, model_name in enumerate(data["Model Name"]):

        if model_name not in results:
            results[model_name] = []

        direct_score = data["Direct Score"][row_idx]
        cot_score = data["CoT Score"][row_idx]

        results[model_name].append(float(direct_score))
        results[model_name].append(float(cot_score))

    # Average Performances
    for model_name in results:
        results[model_name] = round( sum(results[model_name]) / len(results[model_name]), 2)

    return results

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

results = get_best_models(google_sheet)


#### Sort the results by average performances

In [None]:
sorted_models = sorted(results.items(), key = lambda x: x[1], reverse=True )
print(sorted_models)


In [2]:
top_20 = [('DeepSeek-R1', 43.17), ('gemini-2.0-flash-001', 42.51), ('gpt-4o-0806', 42.43), ('gemini-1.5-pro-002', 42.19), ('Mistral-Large-Instruct-2411', 40.59), ('Athene-V2-Chat', 40.52), ('Qwen2.5-72B-Instruct', 40.24), ('DeepSeek-R1-Distill-Llama-70B', 39.37), ('DeepSeek-R1-Distill-Qwen-32B', 39.23), ('gemma-3-27b-it', 38.72)]
top_20_40 = [('Llama-3.3-70B-Instruct', 38.35), ('QWQ-32B', 38.2), ('Mistral-Small-3.1-24B-Instruct-2503', 37.98), ('Llama-3.1-70B-Instruct', 37.1), ('gemma-3-12b-it', 36.35), ('gemma-2-27b-it', 36.22), ('Baichuan-M1-14B-Instruct', 35.22), ('Mistral-Small-24B-Instruct-2501', 34.56), ('DeepSeek-R1-Distill-Qwen-14B', 34.54), ('Phi-4', 34.36)]
top_40_60 = [('gpt-35-turbo-0125', 33.46), ('Mistral-Small-Instruct-2409', 33.18), ('gemma-2-9b-it', 32.51), ('Llama-4-Scout-17B-16E-Instruct', 32.25), ('Llama-3-70B-UltraMedical', 31.42), ('Llama3-OpenBioLLM-70B', 30.9), ('Yi-1.5-34B-Chat-16K', 30.85), ('Qwen2.5-7B-Instruct', 30.78), ('MeLLaMA-70B-chat', 30.76), ('Llama-3.1-8B-Instruct', 29.19)]
top_60_80 = [('Llama-3.1-Nemotron-70B-Instruct-HF', 28.42), ('gemma-3-4b-it', 28.38), ('Ministral-8B-Instruct-2410', 28.14), ('DeepSeek-R1-Distill-Llama-8B', 27.91), ('QwQ-32B-Preview', 27.53), ('Phi-3.5-MoE-instruct', 27.41), ('Yi-1.5-9B-Chat-16K', 27.1), ('Qwen2.5-3B-Instruct', 26.01), ('Phi-3.5-mini-instruct', 24.66), ('DeepSeek-R1-Distill-Qwen-7B', 24.57), ('Llama-3.2-3B-Instruct', 22.25)]
bottom_20 = [('Qwen2.5-1.5B-Instruct', 20.82), ('MeLLaMA-13B-chat', 20.51), ('Llama-3.1-8B-UltraMedical', 19.25), ('MMed-Llama-3-8B', 18.27), ('BioMistral-7B', 15.63), ('gemma-3-1b-it', 14.63), ('meditron-70b', 14.43), ('DeepSeek-R1-Distill-Qwen-1.5B', 13.84), ('Llama3-OpenBioLLM-8B', 13.74), ('Llama-3.2-1B-Instruct', 12.29), ('meditron-7b', 9.52)]

# print(len(top_20))
# print(len(top_20_40))
# print(len(top_40_60))
# print(len(top_60_80))
# print(len(bottom_20))

top_20 = [tp[0] for tp in top_20]
top_20_40 = [tp[0] for tp in top_20_40]
top_40_60 = [tp[0] for tp in top_40_60]
top_60_80 = [tp[0] for tp in top_60_80]
bottom_20 = [tp[0] for tp in bottom_20]

print(bottom_20)

['Qwen2.5-1.5B-Instruct', 'MeLLaMA-13B-chat', 'Llama-3.1-8B-UltraMedical', 'MMed-Llama-3-8B', 'BioMistral-7B', 'gemma-3-1b-it', 'meditron-70b', 'DeepSeek-R1-Distill-Qwen-1.5B', 'Llama3-OpenBioLLM-8B', 'Llama-3.2-1B-Instruct', 'meditron-7b']


#### Get the average performances for tasks evaluated with each group of models

In [53]:
def get_average_performances(google_sheet, valid_models):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("All-Sheet")
    
    cot_scores = []
    direct_scores = []

    # want to determien the relationshpi between the average performance of a gorup of models AND how CoT/direct impacts
    # Average the performance of all tasks with 
    for row_idx, model_name in enumerate(data["Model Name"]):
        # Must be in the valid models
        if model_name in valid_models:
            cot_scores.append(data["CoT Score"][row_idx])
            direct_scores.append(data["Direct Score"][row_idx])

    avg_cot_perf = round(sum(cot_scores) / len(cot_scores), 2)
    avg_direct_perf = round(sum(direct_scores) / len(direct_scores), 2)

    # Relative difference
    relative_diff = round(((avg_direct_perf - avg_cot_perf) / avg_direct_perf) * 100, 2)

    return relative_diff

top_20_diff = get_average_performances(google_sheet, top_20)
top_40_diff = get_average_performances(google_sheet, top_20_40)
top_60_diff = get_average_performances(google_sheet, top_40_60)
top_80_diff = get_average_performances(google_sheet, top_60_80)
bottom_20_diff = get_average_performances(google_sheet, bottom_20)

print(top_20_diff)
print(top_40_diff)
print(top_60_diff)
print(top_80_diff)
print(bottom_20_diff)

5.42
7.81
9.95
11.62
14.07


___
___

### 6.2 Top LLMs. vs. Invalid Rate

In [29]:
def get_average_performances(google_sheet, valid_models):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("All-Sheet")
    
    invalid_scores = []

    # want to determien the relationshpi between the average performance of a gorup of models AND how CoT/direct impacts
    # Average the performance of all tasks with 
    for row_idx, model_name in enumerate(data["Model Name"]):
    
        # Must be in the valid models
        if model_name in valid_models:
            if data["Invalid Difference"][row_idx] == 0:
                continue
            invalid_scores.append(data["Invalid Difference"][row_idx])

   
    avg_invalid_diff = np.mean(invalid_scores)

    return avg_invalid_diff

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")
top_20_diff = get_average_performances(google_sheet, top_20)
top_40_diff = get_average_performances(google_sheet, top_20_40)
top_60_diff = get_average_performances(google_sheet, top_40_60)
top_80_diff = get_average_performances(google_sheet, top_60_80)
bottom_20_diff = get_average_performances(google_sheet, bottom_20)

print(top_20_diff)
print(top_40_diff)
print(top_60_diff)
print(top_80_diff)
print(bottom_20_diff)


0.6281995133819953
2.58007722007722
1.1230293159609122
4.091622023809523
13.103409669211194


___

____

## 7. English-Only Analysis


____

#### ALL ENGLISH TASKS

In [None]:
all_english_tasks = {
    "ADE-Identification",
    "ADE-Extraction",
    "ADE-Drug dosage",
    "BrainMRI-AIS",
    "ClinicalNotes-UPMC",
    "CLIP",
    "GOUT-CC-Consensus",
    "n2c2 2006-De-identification",
    "Medication extraction",
    "n2c2 2010-Concept",
    "n2c2 2010-Assertion",
    "n2c2 2010-Relation",
    "n2c2 2014-De-identification",
    "MEDIQA 2019-RQE",
    "MedNLI",
    "MedSTS",
    "MTS",
    "MTS-Temporal",
    "n2c2 2018-ADE&medication",
    "MEDIQA 2023-chat-A",
    "MEDIQA 2023-sum-A",
    "MEDIQA 2023-sum-B",
    "n2c2 2014-Diabetes",
    "n2c2 2014-CAD",
    "n2c2 2014-Hyperlipidemia",
    "n2c2 2014-Hypertension",
    "n2c2 2014-Medication",
    "icliniq-10k",
    "HealthCareMagic-100k",
    "MIMIC-IV CDM",
    "MIMIC-III Outcome.LoS",
    "MIMIC-III Outcome.Mortality",
    "MIMIC-IV BHC",
    "MIMIC-IV DiReCT.Dis",
    "MIMIC-IV DiReCT.PDD"
}

print(len(all_english_tasks))

____

___

#### 7.1 Overall Performance of LLMs Across English Tasks and Different Inference Strategies

In [None]:
def get_model_performance(google_sheet, isCoT):
    data = google_sheet.parse("All-Sheet")

    results = {}   # maps model --> performances iff task is english

    for row_idx, model_name in enumerate(data["Model Name"]):
        if model_name not in results:
            results[model_name] = []

        task = data["Task Name"][row_idx]

        if task in all_english_tasks:
            if isCoT:
                cot_score = data["CoT Score"][row_idx]
                results[model_name].append(cot_score)

            else:
                direct_score = data["Direct Score"][row_idx]
                results[model_name].append(direct_score)

    for key in results:
        results[key] = round(np.mean(results[key]), 2)

    return sorted(results.items(), key = lambda x: x[1], reverse=True)


direct_performances = get_model_performance(google_sheet, False)
cot_performances = get_model_performance(google_sheet, True)



# SORT COT TO HAVE THE SAME ORDER

# Create a name-to-index mapping from the reference list
order = {name: idx for idx, (name, _) in enumerate(direct_performances)}

# Step 2: Reorder cot_performance using those indices
sorted_cot = sorted(cot_performances, key=lambda x: order.get(x[0], float('inf')))


# Sorted by direct scores
print(direct_performances)
print(sorted_cot)

#### Construct CSV

In [None]:
table = {
    "Model": [],
    "Model Size": [],
    "Model Type": [],
    "Model Domain": [],
    "Direct Score": [],
    "CoT Score": [],
    "Difference": [],
    "Relative Difference": []
}


# Add Direct and Cot Scores
for model, score in direct_performances:
    table["Model"].append(model)
    table["Model Size"].append(model_size_mapping[model])
    table["Model Type"].append(accessibility_mapping[model])
    table["Model Domain"].append(model_domain_mapping[model])
    table["Direct Score"].append(score)

for model, score in sorted_cot:
    table["CoT Score"].append(score)

# Fix Accessibility stuff
for i, access in enumerate(table["Model Type"]):
    if access == 'commercial':
        table["Model Type"][i] = "Commercial"

    elif access == "open source":
        table["Model Type"][i] = "Open-Source"
        

# Calculate Differences!

for idx, direct_score in enumerate(table["Direct Score"]):
    diff = round(table["CoT Score"][idx] - direct_score, 2)

    table['Difference'].append(diff)

    relative_diff = round( (diff / direct_score) * 100, 2 )

    table["Relative Difference"].append(relative_diff)


#### CREATE CSV FILE

In [None]:
df = pd.DataFrame(table)
df.to_csv("English-LLM-Performance.csv")

____

____

### 7.2 NLP Task vs. Performance for English Tasks

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_nlp_task_performances(google_sheet, sheet_names, is_CoT):
    '''
    Inputs:
    '''

    nlp_performances = {}  # maps nlp task --> average performances

    # for sheet_name in sheet_names:
    counter = 0
    data = google_sheet.parse("All-Sheet")

    for row_idx, task_type in enumerate(data['Task Type']):
        task_name = data["Task Name"][row_idx]
        if task_name in all_english_tasks:
            if task_type not in nlp_performances:
                nlp_performances[task_type] = []

            if is_CoT:
                score = data["CoT Score"][row_idx]
            else:
                score = data["Direct Score"][row_idx]

            nlp_performances[task_type].append(score)


    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        # nlp_performances[nlp_task] = round(np.mean(values_list))

    return nlp_performances

In [None]:
cot_perf = get_nlp_task_performances(google_sheet, ['CLF-Difference', 'EXT-Difference', 'Gen-Difference'], is_CoT=True)
direct_perf = get_nlp_task_performances(google_sheet, ['CLF-Difference', 'EXT-Difference', 'Gen-Difference'], is_CoT=False)


direct_perf = sorted(direct_perf.items(), key = lambda x: x[1], reverse=True)
cot_perf = sorted(cot_perf.items(), key = lambda x: x[1], reverse=True)


cor_perf = sort_by_reference(direct_perf, cot_perf)

In [None]:
print(direct_perf)
print(cot_perf)

#### Create CSV!

In [None]:
table = {
    "NLP Task": [],
    "Task Type": [],
    "Direct Score": [],
    "CoT Score": [],
    "Difference": [],
    "Relative Difference": []
}

for task, score in direct_perf:
    table["NLP Task"].append(task)
    table["Task Type"].append('1')
    table["Direct Score"].append(score)

for task, score in cot_perf:
    table['CoT Score'].append(score)

for idx, direct_score in enumerate(table["Direct Score"]):
    diff = round( table["CoT Score"][idx] - direct_score, 2)

    table['Difference'].append(diff)

    relative_diff = round( (diff / direct_score) * 100, 2 )

    table["Relative Difference"].append(relative_diff)

In [None]:
df = pd.DataFrame(table)
df.to_csv("English-NLP-Task.csv")

___

___

#### 7.3 English-Only: Model Domain (Medical vs. General)

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_domain_task_performances(google_sheet, is_CoT, model_domain):
    # Want our output to be: 
    # dictionary mapping { nlp task --> performance on ONLY MEDICAL MODELS }

    nlp_performances = {}  # maps nlp task --> average performances of a certain kind of model

    data = google_sheet.parse("All-Sheet")

    for row_idx, task_type in enumerate(data['Task Type']):
        task_name = data["Task Name"][row_idx]
        if task_name in all_english_tasks:
            model_type = data['Model Domain'][row_idx]
            # Add the task type to our final dictionary
            if task_type not in nlp_performances:
                nlp_performances[task_type] = []

            # Add the correct metric (direct or CoT) to the dictionary IFF the model type matches the model domain (gen, or med)
            if is_CoT and model_type == model_domain:
                score = data["CoT Score"][row_idx]
                nlp_performances[task_type].append(score)

            elif not is_CoT and model_type == model_domain:
                score = data["Direct Score"][row_idx]
                nlp_performances[task_type].append(score)

    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        
    # print(nlp_performances)
    return nlp_performances

#  Need to generate 4 lists
#  CoT + gen
#  CoT + med
#  Direct + gen
#  Direct + med

sheet_names = ['CLF-Difference', 'EXT-Difference', 'Gen-Difference']

cot_general = get_domain_task_performances(google_sheet, is_CoT=True, model_domain="gen")
cot_medical = get_domain_task_performances(google_sheet, is_CoT=True, model_domain="med")
direct_general = get_domain_task_performances(google_sheet, is_CoT=False, model_domain="gen")
direct_medical = get_domain_task_performances(google_sheet, is_CoT=False, model_domain="med")


### Create CSV

In [None]:
print(cot_general)
print(cot_medical)
print(direct_general)
print(direct_medical)

In [None]:
table = {
    "NLP Task": [],
    "D, Gen": [],
    "CoT, Gen": [],
    "D, Med": [],
    "CoT, Med": [],
    "Gen Diff": [],
    "Med Diff": [],
    "Gen Relative Diff": [],
    "Med Relative Diff": []
}

for task, score in cot_general.items():
    table["NLP Task"].append(task)
    table["CoT, Gen"].append(score)

for task, score in cot_medical.items():
    table["CoT, Med"].append(score)

for task, score in direct_general.items():
    table["D, Gen"].append(score)

for task, score in direct_medical.items():
    table["D, Med"].append(score)


for i, score in enumerate(table["D, Gen"]):
    gen_diff = round(table["CoT, Gen"][i] - table["D, Gen"][i], 2)
    med_diff = round(table["CoT, Med"][i] - table["D, Med"][i], 2)

    table["Gen Diff"].append(gen_diff)
    table["Med Diff"].append(med_diff)

    gen_rel_diff = round( (gen_diff / table["D, Gen"][i]) * 100, 2 )
    med_rel_diff = round( (med_diff / table["D, Med"][i]) * 100, 2 )

    table["Gen Relative Diff"].append(gen_rel_diff)
    table["Med Relative Diff"].append(med_rel_diff)



In [None]:
df = pd.DataFrame(table)
df.to_csv("English LLM Domain vs. Performance.csv")

___

___

### 7. 4 English Only: Commercial vs. Open Source

In [None]:
google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

def get_domain_task_performances(google_sheet, is_CoT, model_accessibility):
    # Want our output to be: 
    # dictionary mapping { nlp task --> performance on ONLY OPEN SOURCE MODELS }, wlog

    nlp_performances = {}  # maps nlp task --> average performances of a certain kind of model


    data = google_sheet.parse("All-Sheet")

    for row_idx, task_type in enumerate(data['Task Type']):

        task_name = data["Task Name"][row_idx]

        if task_name in all_english_tasks:
            model_name = data['Model Name'][row_idx]

            model_type = accessibility_mapping[model_name]   # open source or commercial

            # # Add the task type to our final dictionary
            if task_type not in nlp_performances:
                nlp_performances[task_type] = []


            if is_CoT and model_type == model_accessibility:   # CoT for commercial/open source:
                score = data["CoT Score"][row_idx]
                nlp_performances[task_type].append(score)

            elif not is_CoT and model_type == model_accessibility:
                score = data["Direct Score"][row_idx]
                nlp_performances[task_type].append(score)

    # Average the performances
    for nlp_task, values_list in nlp_performances.items():
        nlp_performances[nlp_task] = round(sum(values_list) / len(values_list), 2)
        
    # print(nlp_performances)
    return nlp_performances

#  Need to generate 4 lists
#  CoT + open
#  CoT + commercial
#  Direct + open
#  Direct + commerical

sheet_names = ['CLF-Difference', 'EXT-Difference', 'Gen-Difference']

cot_open = get_domain_task_performances(google_sheet, is_CoT=True, model_accessibility="open source")
cot_comm = get_domain_task_performances(google_sheet, is_CoT=True, model_accessibility="commercial")
direct_open = get_domain_task_performances(google_sheet, is_CoT=False, model_accessibility="open source")
direct_comm = get_domain_task_performances(google_sheet, is_CoT=False, model_accessibility="commercial")

In [None]:
table = {
    "NLP Task": [],
    "Dir, Open": [],
    "CoT, Open": [],
    "Dir, Comm": [],
    "CoT, Comm": [],
    "Open Diff": [],
    "Comm Diff": [],
    "Open Relative Diff": [],
    "Comm Relative Diff": []
}

for task, score in cot_open.items():
    table["NLP Task"].append(task)
    table["CoT, Open"].append(score)

for task, score in cot_comm.items():
    table["CoT, Comm"].append(score)

for task, score in direct_open.items():
    table["Dir, Open"].append(score)

for task, score in direct_comm.items():
    table["Dir, Comm"].append(score)


for i, score in enumerate(table["Dir, Open"]):
    open_diff = round(table["CoT, Open"][i] - table["Dir, Open"][i], 2)
    comm_diff = round(table["CoT, Comm"][i] - table["Dir, Comm"][i], 2)

    table["Open Diff"].append(open_diff)
    table["Comm Diff"].append(comm_diff)

    open_rel_diff = round( (open_diff / table["Dir, Open"][i]) * 100, 2 )
    comm_rel_diff = round( (comm_diff / table["Dir, Comm"][i]) * 100, 2 )

    table["Open Relative Diff"].append(open_rel_diff)
    table["Comm Relative Diff"].append(comm_rel_diff)



In [None]:
df = pd.DataFrame(table)
df.to_csv("English LLM Domain vs. Accessibility.csv")

df

In [None]:
def get_task_name_performance(google_sheet, is_CoT):
    task_performances = {}  # maps each TASK NAME (i.e. ADE-Identification) to their performances

    data = google_sheet.parse("All-Sheet")

    for row_idx, task_name in enumerate(data["Task Name"]):
        if task_name in all_english_tasks:
            if task_name not in task_performances:
                task_performances[task_name] = []

            if is_CoT:
                score = data["CoT Score"][row_idx]

            else:
                score = data["Direct Score"][row_idx]

            task_performances[task_name].append(score)

    # Average the performances
    for task, values_list in task_performances.items():
        task_performances[task] = round(sum(values_list) / len(values_list), 2)

    return task_performances

In [None]:
# Get performances on each task via CoT prompting
cot_task_performances = get_task_name_performance(google_sheet, is_CoT=True)

# Get performances on each task via Direct answering
direct_task_performances = get_task_name_performance(google_sheet, is_CoT=False)

In [None]:
final_csv = {
    "Task Name": [],
    "Task Type": [],
    "Direct Score": [],
    "CoT Score": [],
    "Difference": [],
    "Relative Difference": []
} 

for task_name, score in cot_task_performances.items():
    final_csv["Task Name"].append(task_name)
    final_csv["Task Type"].append(task_to_type_map[task_name])
    final_csv["CoT Score"].append(score)

for task_name, score in direct_task_performances.items():
    final_csv["Direct Score"].append(score)

cot_scores = list(cot_task_performances.values())
direct_scores = list(direct_task_performances.values())

for direct, cot in zip(direct_scores, cot_scores):
    final_csv["Difference"].append(round(direct - cot, 2))
    final_csv["Relative Difference"].append(round((direct - cot) / direct * 100, 2))


In [None]:
df = pd.DataFrame(final_csv)
df.to_csv("English Task Name vs. Performance.csv")

___

## 8. Length of CoT Output --> SEE "word_count_analysis.ipynb"


___

___

## 9. Multi-Prompt Analysis (see other file "prompt_analysis")

___

## 10. Exclude Invalid Tasks

A follow-up to section 6, where we only evaluate and analyze the performance drops for tasks the have NO INVALID score for both CoT and Direct. We compare the performance drops across different model groups to see whether the drops in performance are due to invalid rates, or the models other features as well

In [3]:
# Load Sheet with only tasks that have 0 invalid rate on both CoT and Direct
# We already have the model splits (by 20% in performance)
# Get drops in performance for each group
# Issue is that different models have different tasks upon which they got 0 invalid rate

In [16]:
def get_performances_no_invalid(google_sheet, valid_models):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("No-Invalid-Task-Sheet")
    
    relative_diff = []

    # want to determien the relationshpi between the average performance of a gorup of models AND how CoT/direct impacts
    # Average the performance of all tasks with 
    for row_idx, model_name in enumerate(data["Model Name"]):
    
        # Must be in the valid models
        if model_name in valid_models:
            relative_diff.append(data["Relative Difference"][row_idx])

   
    avg_relative_diff = np.mean(relative_diff)


    return avg_relative_diff

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")

top_20_diff = get_performances_no_invalid(google_sheet, top_20)
top_40_diff = get_performances_no_invalid(google_sheet, top_20_40)
top_60_diff = get_performances_no_invalid(google_sheet, top_40_60)
top_80_diff = get_performances_no_invalid(google_sheet, top_60_80)
bottom_20_diff = get_performances_no_invalid(google_sheet, bottom_20)

print(top_20_diff)
print(top_40_diff)
print(top_60_diff)
print(top_80_diff)
print(bottom_20_diff)


0.03239651416122004
-0.06096590909090909
0.11828125
0.1392982456140351
0.023976608187134492


In [7]:
print(len(bottom_20))

11


In [13]:
print(len(top_20_40))

10


___

## 11. Revised CoT Output Length Analysis

See word_count_analysis_2.0

___


## 12. Group Models by Model Size

Similar to part 6, but we bucket by model sizes

In [6]:
import numpy as np

def get_model_sizes(google_sheet):
    data = google_sheet.parse("All-Sheet")

    no = set()

    results = {}

    for row_idx, model_name in enumerate(data["Model Name"]):
        if np.isnan(data["Model Size"][row_idx]):
            no.add(model_name)
            continue
        
        if model_name not in results:
            results[model_name] = []

        results[model_name] = data["Model Size"][row_idx]

    return results

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet.xlsx")

results = get_model_sizes(google_sheet)

sorted_models = sorted(results.items(), key = lambda x: x[1], reverse=True )


In [7]:
grouped = {}

for model, size in sorted_models:
    if size not in grouped:
        grouped[size] = []

    grouped[size].append(model)

grouped

{671.0: ['DeepSeek-R1'],
 123.0: ['Mistral-Large-Instruct-2411'],
 109.0: ['Llama-4-Scout-17B-16E-Instruct'],
 72.0: ['Qwen2.5-72B-Instruct', 'Athene-V2-Chat'],
 70.0: ['DeepSeek-R1-Distill-Llama-70B',
  'Llama-3.1-70B-Instruct',
  'Llama-3.3-70B-Instruct',
  'Llama-3.1-Nemotron-70B-Instruct-HF',
  'meditron-70b',
  'MeLLaMA-70B-chat',
  'Llama3-OpenBioLLM-70B',
  'Llama-3-70B-UltraMedical'],
 42.0: ['Phi-3.5-MoE-instruct'],
 34.0: ['Yi-1.5-34B-Chat-16K'],
 32.0: ['DeepSeek-R1-Distill-Qwen-32B', 'QwQ-32B-Preview', 'QWQ-32B'],
 27.0: ['gemma-2-27b-it', 'gemma-3-27b-it'],
 24.0: ['Mistral-Small-24B-Instruct-2501',
  'Mistral-Small-3.1-24B-Instruct-2503'],
 22.0: ['Mistral-Small-Instruct-2409'],
 14.0: ['Baichuan-M1-14B-Instruct', 'DeepSeek-R1-Distill-Qwen-14B', 'Phi-4'],
 13.0: ['MeLLaMA-13B-chat'],
 12.0: ['gemma-3-12b-it'],
 9.0: ['gemma-2-9b-it', 'Yi-1.5-9B-Chat-16K'],
 8.0: ['DeepSeek-R1-Distill-Llama-8B',
  'Llama-3.1-8B-Instruct',
  'Llama3-OpenBioLLM-8B',
  'MMed-Llama-3-8B',
  'L

In [11]:
def get_size_performances(google_sheet, grouped_models):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("All-Sheet")
    output = {}

    for model_size, models_ls in grouped_models.items():
        if model_size not in output:
            output[model_size] = {}

        # Compute the performances for all the models in that size range
        valid_models = models_ls[:]

        cot_scores = []
        direct_scores = []

        # want to determien the relationshpi between the average performance of a gorup of models AND how CoT/direct impacts
        # Average the performance of all tasks with 
        for row_idx, model_name in enumerate(data["Model Name"]):
            # Must be in the valid models (correct size bucket)
            if model_name in valid_models:
                cot_scores.append(data["CoT Score"][row_idx])
                direct_scores.append(data["Direct Score"][row_idx])

        # Average the CoT and Direct Scores
        avg_cot_score = np.mean(cot_scores)
        avg_direct_score = np.mean(direct_scores)

        avg_diff = round(avg_direct_score - avg_cot_score, 2)
        avg_rel_diff = round(avg_diff / avg_direct_score * 100, 2)

        # differences = []
        # for i in range(len(direct_scores)):
        #     differences.append(direct_scores[i] - cot_scores[i])

        # relative_differences_list = []

        # for i in range(len(differences)):
        #     if direct_scores[i] == 0:
        #         relative_differences_list.append(0)
        #         continue
        #     relative_differences_list.append( round(differences[i] / direct_scores[i] * 100, 2)   )

        # avg_cot_perf = round(sum(cot_scores) / len(cot_scores), 2)
        # avg_direct_perf = round(sum(direct_scores) / len(direct_scores), 2)

        # Relative difference
        # relative_diff = round(((avg_direct_perf - avg_cot_perf) / avg_direct_perf) * 100, 2)

        output[model_size]["Direct Score"] = round(avg_direct_score, 2)
        output[model_size]["CoT Score"] = round(avg_cot_score, 2)
        output[model_size]["Difference"] = avg_diff
        output[model_size]["Relative Difference"] = avg_rel_diff


    return output

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")

get_size_performances(google_sheet, grouped)

{671.0: {'Direct Score': 44.25,
  'CoT Score': 42.1,
  'Difference': 2.15,
  'Relative Difference': 4.86},
 123.0: {'Direct Score': 42.28,
  'CoT Score': 38.9,
  'Difference': 3.37,
  'Relative Difference': 7.97},
 109.0: {'Direct Score': 35.12,
  'CoT Score': 29.38,
  'Difference': 5.74,
  'Relative Difference': 16.34},
 72.0: {'Direct Score': 41.66,
  'CoT Score': 39.1,
  'Difference': 2.56,
  'Relative Difference': 6.15},
 70.0: {'Direct Score': 33.23,
  'CoT Score': 29.45,
  'Difference': 3.78,
  'Relative Difference': 11.38},
 42.0: {'Direct Score': 29.54,
  'CoT Score': 25.27,
  'Difference': 4.27,
  'Relative Difference': 14.45},
 34.0: {'Direct Score': 32.12,
  'CoT Score': 29.57,
  'Difference': 2.55,
  'Relative Difference': 7.94},
 32.0: {'Direct Score': 36.95,
  'CoT Score': 33.02,
  'Difference': 3.94,
  'Relative Difference': 10.66},
 27.0: {'Direct Score': 39.06,
  'CoT Score': 35.88,
  'Difference': 3.17,
  'Relative Difference': 8.12},
 24.0: {'Direct Score': 38.63,
  

In [12]:
model_size_out = {671.0: {'Direct Score': 44.25,
  'CoT Score': 42.1,
  'Difference': 2.15,
  'Relative Difference': 4.86},
 123.0: {'Direct Score': 42.28,
  'CoT Score': 38.9,
  'Difference': 3.37,
  'Relative Difference': 7.97},
 109.0: {'Direct Score': 35.12,
  'CoT Score': 29.38,
  'Difference': 5.74,
  'Relative Difference': 16.34},
 72.0: {'Direct Score': 41.66,
  'CoT Score': 39.1,
  'Difference': 2.56,
  'Relative Difference': 6.15},
 70.0: {'Direct Score': 33.23,
  'CoT Score': 29.45,
  'Difference': 3.78,
  'Relative Difference': 11.38},
 42.0: {'Direct Score': 29.54,
  'CoT Score': 25.27,
  'Difference': 4.27,
  'Relative Difference': 14.45},
 34.0: {'Direct Score': 32.12,
  'CoT Score': 29.57,
  'Difference': 2.55,
  'Relative Difference': 7.94},
 32.0: {'Direct Score': 36.95,
  'CoT Score': 33.02,
  'Difference': 3.94,
  'Relative Difference': 10.66},
 27.0: {'Direct Score': 39.06,
  'CoT Score': 35.88,
  'Difference': 3.17,
  'Relative Difference': 8.12},
 24.0: {'Direct Score': 38.63,
  'CoT Score': 33.91,
  'Difference': 4.71,
  'Relative Difference': 12.19},
 22.0: {'Direct Score': 35.19,
  'CoT Score': 31.17,
  'Difference': 4.02,
  'Relative Difference': 11.42},
 14.0: {'Direct Score': 35.5,
  'CoT Score': 33.91,
  'Difference': 1.58,
  'Relative Difference': 4.45},
 13.0: {'Direct Score': 20.76,
  'CoT Score': 20.26,
  'Difference': 0.5,
  'Relative Difference': 2.41},
 12.0: {'Direct Score': 37.32,
  'CoT Score': 35.37,
  'Difference': 1.95,
  'Relative Difference': 5.23},
 9.0: {'Direct Score': 31.94,
  'CoT Score': 27.67,
  'Difference': 4.27,
  'Relative Difference': 13.37},
 8.0: {'Direct Score': 23.76,
  'CoT Score': 21.74,
  'Difference': 2.02,
  'Relative Difference': 8.5},
 7.0: {'Direct Score': 21.63,
  'CoT Score': 18.62,
  'Difference': 3.02,
  'Relative Difference': 13.96},
 4.0: {'Direct Score': 26.99,
  'CoT Score': 26.05,
  'Difference': 0.93,
  'Relative Difference': 3.45},
 3.0: {'Direct Score': 24.74,
  'CoT Score': 23.52,
  'Difference': 1.22,
  'Relative Difference': 4.93},
 1.5: {'Direct Score': 18.21,
  'CoT Score': 16.45,
  'Difference': 1.76,
  'Relative Difference': 9.66},
 1.0: {'Direct Score': 14.22,
  'CoT Score': 12.69,
  'Difference': 1.53,
  'Relative Difference': 10.76}}


df = pd.DataFrame(model_size_out).T

df.to_csv("Revised Model Size vs. Performance.csv")

___ 

## 13. Model Family Analysis



In [18]:
model_families = {
    "Baichuan": ["Baichuan-M1-14B-Instruct"],

    "DeepSeek": [
        "DeepSeek-R1",
        "DeepSeek-R1-Distill-Llama-8B",
        "DeepSeek-R1-Distill-Llama-70B",
        "DeepSeek-R1-Distill-Qwen-1.5B",
        "DeepSeek-R1-Distill-Qwen-7B",
        "DeepSeek-R1-Distill-Qwen-14B",
        "DeepSeek-R1-Distill-Qwen-32B",
    ],

    "Gemma": [
        "gemma-2-9b-it",
        "gemma-2-27b-it",
        "gemma-3-1b-it",
        'gemma-3-4b-it',
        'gemma-3-12b-it',
        'gemma-3-27b-it',
    ],

    "Llama": [
        'Llama-3.1-8B-Instruct',
        'Llama-3.1-70B-Instruct',
        'Llama-3.2-1B-Instruct',
        'Llama-3.2-3B-Instruct',
        'Llama-3.3-70B-Instruct',
        'Llama-4-Scout-17B-16E-Instruct',
        "Llama-3.1-Nemotron-70B-Instruct-HF",
    ],

    "Meditron": [
        'meditron-7b',
        'meditron-70b',
    ],

    "MeLLaMA": [
        "MeLLaMA-13B-chat",
        "MeLLaMA-70B-chat"
    ],

    "Llama3-OpenBioLLM": [
        "Llama3-OpenBioLLM-8B",
        "Llama3-OpenBioLLM-70B"
    ],

    "MMed-Llama": [
        "MMed-Llama-3-8B"
    ],

    "Llama-UltraMedical": [
        "Llama-3.1-8B-UltraMedical",
        "Llama-3-70B-UltraMedical"
    ],

    "Mistral": [
        "Ministral-8B-Instruct-2410",
        "Mistral-Small-Instruct-2409",
        "Mistral-Small-24B-Instruct-2501",
        "Mistral-Small-3.1-24B-Instruct-2503",
        "Mistral-Large-Instruct-2411"
    ],

    "BioMistral": [
        "BioMistral-7B"
    ],

    "Phi": [
        "Phi-3.5-mini-instruct",
        "Phi-3.5-MoE-instruct",
        "Phi-4"
    ],

    "Qwen2.5": [
        "Qwen2.5-1.5B-Instruct",
        "Qwen2.5-3B-Instruct",
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-72B-Instruct"
    ],

    "QwQ": [
        "QwQ-32B-Preview",
        "QWQ-32B"
    ],

    "Athene": [
        "Athene-V2-Chat"
    ],

    "Yi": [
        "Yi-1.5-9B-Chat-16K",
        "Yi-1.5-34B-Chat-16K"
    ],

    "GPT": [
        "gpt-35-turbo-0125",
        "gpt-4o-0806"
    ],

    "Gemini": [
        "gemini-2.0-flash-001",
        "gemini-1.5-pro-002"
    ]

}


In [20]:
def get_family_performances(google_sheet, families):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("All-Sheet")
    output = {}

    for model_family, models_ls in families.items():
        if model_family not in output:
            output[model_family] = {}

        # Compute the performances for all the models in that size range
        valid_models = models_ls[:]

        cot_scores = []
        direct_scores = []

        # want to determien the relationshpi between the average performance of a gorup of models AND how CoT/direct impacts
        # Average the performance of all tasks with 
        for row_idx, model_name in enumerate(data["Model Name"]):
            # Must be in the valid models
            if model_name in valid_models:
                cot_scores.append(data["CoT Score"][row_idx])
                direct_scores.append(data["Direct Score"][row_idx])

        avg_direct_score = np.mean(direct_scores)

        avg_cot_score = np.mean(cot_scores)

        avg_diff = avg_direct_score - avg_cot_score

        avg_rel_diff = round ( avg_diff / avg_direct_score * 100, 2)

        output[model_family]["Direct Score"] = round(avg_direct_score, 2)
        output[model_family]["CoT Score"] = round(avg_cot_score, 2)
        output[model_family]["Difference"] = round(avg_diff, 2)
        output[model_family]["Relative Difference"] = round(avg_rel_diff, 2)
        

    return output

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")

family = get_family_performances(google_sheet, model_families)

# Positive indicates Direct did better. Negative indicates CoT did better

In [21]:
df = pd.DataFrame(family).T
df.to_csv("Revised Model Family vs. Performance.csv")

___

## 14. Clinical Context of Task Analysis

Answers the question: How does CoT vs. Direct performance vary across different clinical contexts?

In [13]:
# # # Get all clinical context: [tasks] mappings

context_to_task_mapping = {}

sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/Clinical Benchmark and LLM.xlsx")

data = sheet.parse("Task-all")

In [14]:
for row_idx, clinical_contexts in enumerate(data["Clinical context"]):

    task = data["Task name"][row_idx]

    # Check if multiple contexts:
    if "," in clinical_contexts:
        clinical_contexts = clinical_contexts.split(",")

        for context in clinical_contexts:
            context = context.strip()
            if context not in context_to_task_mapping:
                context_to_task_mapping[context] = []

            context_to_task_mapping[context].append(task)

    else:
        clinical_contexts = clinical_contexts.strip()
        if clinical_contexts not in context_to_task_mapping:
            context_to_task_mapping[clinical_contexts] = []

        context_to_task_mapping[clinical_contexts].append(task)


In [16]:
def get_context_performances(google_sheet, contexts):
    '''
    valid_models: list containing the __ 20% of models
    '''
    data = google_sheet.parse("All-Sheet")
    output = {}

    for context, tasks_ls in contexts.items():
        if context not in output:
            output[context] = {}

        valid_tasks = tasks_ls[:]

        cot_scores = []
        direct_scores = []

        for row_idx, task_name in enumerate(data["Task Name"]):
            # Must be in the valid models
            if task_name in valid_tasks:
                cot_scores.append(data["CoT Score"][row_idx])
                direct_scores.append(data["Direct Score"][row_idx])

        # Compute Averages
        avg_cot_score = np.mean(cot_scores)
        avg_direct_score = np.mean(direct_scores)

        avg_diff = avg_direct_score - avg_cot_score

        avg_rel_diff = round( avg_diff / avg_direct_score * 100 , 2)

        output[context]["Direct Score"] = round(avg_direct_score, 2)
        output[context]["CoT Score"] = round(avg_cot_score, 2)
        output[context]["Difference"] = round(avg_diff, 2)
        output[context]["Relative Difference"] = avg_rel_diff

    return output

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")

contexts = get_context_performances(google_sheet, context_to_task_mapping)

# Positive indicates Direct did better. Negative indicates CoT did better

In [17]:
df = pd.DataFrame(contexts).T
df.to_csv("Revised Clinical Context vs. Performance.csv")

In [73]:
context_to_task_mapping

{'Pharmacology': ['ADE-Identification',
  'ADE-Extraction',
  'ADE-Drug dosage',
  'DialMed',
  'Medication extraction',
  'n2c2 2018-ADE&medication',
  'RuDReC-NER',
  'IFMIR-Incident type',
  'IFMIR-NER',
  'IFMIR - NER&factuality',
  'ADE-Identification',
  'ADE-Extraction',
  'ADE-Drug dosage',
  'DialMed',
  'Medication extraction',
  'n2c2 2018-ADE&medication',
  'RuDReC-NER',
  'IFMIR-Incident type',
  'IFMIR-NER',
  'IFMIR - NER&factuality',
  'ADE-Identification',
  'ADE-Extraction',
  'ADE-Drug dosage',
  'DialMed',
  'Medication extraction',
  'n2c2 2018-ADE&medication',
  'RuDReC-NER',
  'IFMIR-Incident type',
  'IFMIR-NER',
  'IFMIR - NER&factuality'],
 'General': ['BARR2',
  'Brateca-Hospitalization',
  'Brateca-Mortality',
  'CHIP-CDEE',
  'CodiEsp-ICD-10-CM',
  'CodiEsp-ICD-10-PCS',
  'ClinicalNotes-UPMC',
  'cMedQA',
  'EHRQA-Primary department',
  'EHRQA-QA',
  'EHRQA-Sub department',
  'JP-STS',
  'meddocan',
  'MEDIQA 2019-RQE',
  'MedSTS',
  'MTS',
  'NUBES',
  'ME

____

## 15. Combine All Tasks (Sections 5 and 9)

In [95]:
google_sheet = pd.ExcelFile("New-Prompt-Sheet.xlsx")

out = {
    "Task": [],
    "Task Type": [],
    "Direct Score": [],
    "CoT Score": [],
    "Direct - Cot": [],
    "Relative Difference": []
}



In [99]:
def get_task_performances(google_sheet, all_sheets):
    all_task_info = {}
    
    for sheet in all_sheets:
        data = google_sheet.parse(sheet)
        # print(data)

        for row_idx, task_name in enumerate(data["Task Name"]):
            if task_name in all_english_tasks:
                if task_name not in all_task_info:
                    all_task_info[task_name] = {}
                    all_task_info[task_name]["Task Type"] = data["Task Type"][row_idx]
                    all_task_info[task_name]["Direct Score"] = []
                    all_task_info[task_name]["CoT Score"] = []
                    all_task_info[task_name]["Direct - Cot"] = []
                    all_task_info[task_name]["Relative Difference"] = []

                all_task_info[task_name]["Direct Score"].append(data["Direct Score"][row_idx])
                all_task_info[task_name]["CoT Score"].append(data["CoT Score"][row_idx])
                all_task_info[task_name]["Direct - Cot"].append(data["Difference"][row_idx])
                all_task_info[task_name]["Relative Difference"].append(data["Relative Difference"][row_idx])


    return all_task_info


out = get_task_performances(google_sheet, ["Prompt 1", "Prompt 2", "Prompt 3", "Prompt 4",])

with open("section15_output_english_only_1234.json", "w") as f:
    json.dump(out, f, indent=4)

In [100]:
def create_final_table(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    out = {
        "Task": [],
        "Task Type": [],
        "Direct Score": [],
        "CoT Score": [],
        "Direct - Cot": [],
        "Relative Difference": []
    }

    # For each task, get the average of the scores and add to the final output
    for task_name in data:
        out["Task"].append(task_name)

        for task_attr in data[task_name]:
            # if task_attr == "Relative Difference":  # skip this calculation
            #     continue

            if task_attr == "Task Type":
                out[task_attr].append(data[task_name][task_attr])
                continue

            out[task_attr].append(round(np.mean(data[task_name][task_attr]), 2))

    return out


avg_out = create_final_table("section15_output_english_only_1234.json")


In [101]:
# Turn into CSV Dataframe
df = pd.DataFrame(avg_out)
df.to_csv("section15_output_english_only_1234.csv", index=False)

___

## 16. All Models

In [None]:
# Define task name to simplified task name mapping
google_sheet = pd.ExcelFile("Clinical Benchmark and LLM.xlsx")

data = google_sheet.parse("Task-all")

task_name_mapping = {}

for row_idx, task_name in enumerate(data["Task-Original"]):
    task_name_mapping[data["Task name"][row_idx]] = data["Task name"][row_idx]
    task_name_mapping[task_name] = data["Task name"][row_idx]


In [69]:
def get_model_performance(google_sheet, isCoT):
    data = google_sheet.parse("Revised-All-Sheet")

    results = {}  

    for row_idx, model_name in enumerate(data["Model Name"]):
        # model_name = task_name_mapping[model_name]
        if model_name not in results:
            results[model_name] = []

        task = data["Task Name"][row_idx]

        if isCoT:
            cot_score = data["CoT Score"][row_idx]
            results[model_name].append(cot_score)

        else:
            direct_score = data["Direct Score"][row_idx]
            results[model_name].append(direct_score)

    for key in results:
        results[key] = round(np.mean(results[key]), 2)

    return sorted(results.items(), key = lambda x: x[1], reverse=True)

google_sheet = load_sheet("/Users/kevinxie/Desktop/LLM CoT/LLM-CoT/CoT-Difference-Sheet-With-Invalid.xlsx")

direct_performances = get_model_performance(google_sheet, False)
cot_performances = get_model_performance(google_sheet, True)


# SORT COT TO HAVE THE SAME ORDER
# Create a name-to-index mapping from the reference list
order = {name: idx for idx, (name, _) in enumerate(direct_performances)}

# Step 2: Reorder cot_performance using those indices
sorted_cot = sorted(cot_performances, key=lambda x: order.get(x[0], float('inf')))


# Sorted by direct scores
print(direct_performances)
print(sorted_cot)

[('DeepSeek-R1', 44.25), ('gpt-4o-0806', 44.2), ('gemini-1.5-pro-002', 43.85), ('gemini-2.0-flash-001', 43.03), ('Mistral-Large-Instruct-2411', 42.17), ('Athene-V2-Chat', 41.29), ('Qwen2.5-72B-Instruct', 41.2), ('gemma-3-27b-it', 39.9), ('Llama-3.3-70B-Instruct', 39.9), ('DeepSeek-R1-Distill-Llama-70B', 39.79), ('DeepSeek-R1-Distill-Qwen-32B', 39.75), ('Mistral-Small-3.1-24B-Instruct-2503', 39.73), ('QWQ-32B', 39.37), ('Llama-3.1-70B-Instruct', 39.09), ('gemma-2-27b-it', 38.22), ('Mistral-Small-24B-Instruct-2501', 37.53), ('gemma-3-12b-it', 37.32), ('Phi-4', 36.13), ('Baichuan-M1-14B-Instruct', 36.08), ('gpt-35-turbo-0125', 35.3), ('Mistral-Small-Instruct-2409', 35.19), ('Llama-4-Scout-17B-16E-Instruct', 35.12), ('gemma-2-9b-it', 35.07), ('DeepSeek-R1-Distill-Qwen-14B', 34.28), ('Llama-3-70B-UltraMedical', 33.4), ('Llama3-OpenBioLLM-70B', 33.01), ('Llama-3.1-Nemotron-70B-Instruct-HF', 32.75), ('MeLLaMA-70B-chat', 32.26), ('Yi-1.5-34B-Chat-16K', 32.12), ('QwQ-32B-Preview', 31.74), ('Qwe

In [70]:
table = {
    "Model": [],
    "Model Size": [],
    "Model Type": [],
    "Model Domain": [],
    "Direct Score": [],
    "CoT Score": [],
    "Difference": [],
    "Relative Difference": []
}


# Add Direct and Cot Scores
for model, score in direct_performances:
    table["Model"].append(model)
    table["Model Size"].append(model_size_mapping[model])
    table["Model Type"].append(accessibility_mapping[model])
    table["Model Domain"].append(model_domain_mapping[model])
    table["Direct Score"].append(score)

for model, score in sorted_cot:
    table["CoT Score"].append(score)

# Fix Accessibility stuff
for i, access in enumerate(table["Model Type"]):
    if access == 'commercial':
        table["Model Type"][i] = "Commercial"

    elif access == "open source":
        table["Model Type"][i] = "Open-Source"
        

# Calculate Differences!

for idx, direct_score in enumerate(table["Direct Score"]):
    diff = round(table["CoT Score"][idx] - direct_score, 2)

    table['Difference'].append(diff)

    relative_diff = round( (diff / direct_score) * 100, 2 )

    table["Relative Difference"].append(relative_diff)


In [71]:
df = pd.DataFrame(table)
df.to_csv("Overall-LLM-Performance.csv")

___

## 17. Revised Multi-Prompt Analysis

Instead of analyzing individual task performances (i.e. 1-1.ADE-ADE Identification), we analyze the overall changes in performance across all NLP Tasks

In [35]:
def get_by_nlp_task(sheet_data):

    out = {}

    for row_idx, task_type in enumerate(sheet_data["Task Type"]):
        if task_type not in out:
            out[task_type] = {}
            out[task_type]["Direct Score"] = []
            out[task_type]["CoT Score"] = []


        dir_score = sheet_data["Direct Score"][row_idx]
        cot_score = sheet_data["CoT Score"][row_idx]

        out[task_type]["Direct Score"].append(dir_score)
        out[task_type]["CoT Score"].append(cot_score)
        

    for task_type, d in out.items():
        # out[k] = round( np.mean(out[k]), 2 )
        d["Difference"] = []

        for idx, dir_value in enumerate(d['Direct Score']):
            d["Difference"].append( dir_value - d['CoT Score'][idx ])


        out[task_type]['Relative Difference'] = []

        for idx, diff_value in enumerate(out[task_type]["Difference"]):
            dir_score = out[task_type]["Direct Score"][idx]

            if dir_score == 0:
                continue
            out[task_type]["Relative Difference"].append( round( diff_value / dir_score * 100, 2) )


    final_out = {}
    # Compute Averages
    for task_type, d in out.items():
        final_out[task_type] = {}

        # for key, values_ls in d.items():
        final_out[task_type]["Direct Score"] = round(np.mean( d["Direct Score"] ), 2)

        final_out[task_type]["CoT Score"] = round( np.mean( d["CoT Score"] ), 2)

        final_out[task_type]["Difference"] = round( -np.mean( d["Difference"] ), 2)

    for task_type, d in final_out.items():
        diff = final_out[task_type]["Direct Score"] - final_out[task_type]["CoT Score"]
        dir_score = final_out[task_type]["Direct Score"]

        final_out[task_type]["Relative Difference"] = round( -diff / dir_score * 100, 2)


    return final_out


all_prompt_sheet = pd.ExcelFile("New-Prompt-Sheet.xlsx")

current_prompt = "prompt_4"
all_prompt_data = all_prompt_sheet.parse(current_prompt)

out = get_by_nlp_task(all_prompt_data)

In [36]:
import pandas as pd

df = pd.DataFrame(out).T

df.to_csv(f"Revised Multiprompt NLP-{current_prompt}.csv")

In [37]:
out

{'Text Classification': {'Direct Score': 67.0,
  'CoT Score': 66.05,
  'Difference': -0.95,
  'Relative Difference': -1.42},
 'Event Extraction': {'Direct Score': 26.28,
  'CoT Score': 21.08,
  'Difference': -5.2,
  'Relative Difference': -19.79},
 'Named Entity Recognition': {'Direct Score': 39.31,
  'CoT Score': 37.02,
  'Difference': -2.29,
  'Relative Difference': -5.83},
 'Question Answering': {'Direct Score': 17.12,
  'CoT Score': 15.8,
  'Difference': -1.32,
  'Relative Difference': -7.71},
 'Summarization': {'Direct Score': 32.21,
  'CoT Score': 28.35,
  'Difference': -3.87,
  'Relative Difference': -11.98},
 'Normalization and Coding': {'Direct Score': 5.45,
  'CoT Score': 6.2,
  'Difference': 0.74,
  'Relative Difference': 13.76},
 'Semantic Similarity': {'Direct Score': 45.27,
  'CoT Score': 44.2,
  'Difference': -1.07,
  'Relative Difference': -2.36},
 'Natural Language Inference': {'Direct Score': 83.56,
  'CoT Score': 83.46,
  'Difference': -0.1,
  'Relative Difference': 

____

## 18. Revised CoT Token Length Analysis


Steps:

1. For each model-task pair output file, determine the token lengths of each output. Then, split those into the 10 buckets, each representing 10 percentiles of output length.
    - this is impossible given my current data...

2. 