In [1]:
import pandas as pd
import json

In [17]:
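# One-off preprocessing, kept commented out for provenance: it rewrites every
# record key with the "llm_" substring removed and saves the result as
# Fixed_Deepseek_r1_14b.json, the file loaded for the DeepSeek model further down.
# input_file = "DeepSeek_R1_14b_Output_additonal_prompt.json"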
# output_file = "Fixed_Deepseek_r1_14b.json"

# with open(input_file, "r") as f:
#     data = json.load(f)  

# fixed_data = []
# for record in data:
#     fixed_record = {k.replace("llm_", ""): v for k, v in record.items()}
#     fixed_data.append(fixed_record)

# with open(output_file, "w") as f:
#     json.dump(fixed_data, f, indent=2)

In [8]:
# Record keys whose True/False values are tallied by make_table1 and compared
# field-by-field by compute_accuracy; the order here is the row order of the
# resulting tables.
schema_fields = [
    "imatinib_mentioned",
    "related_drugs_mentioned",
    "cml_diagnosed",
    "cml_in_regression",
    "aml_diagnosed",
    "blast_phase_cml",
    "bmt_history",
    "acute_phase_cml"
]


In [6]:
def make_table1(filepath, model_name=None, *, fields=None):
    """
    Create a Table 1 summary of counts (True/False) for schema fields
    from an LLM output JSON file.

    Parameters:
    -----------
    filepath : str
        Path to the results file. Either a single JSON document (a list of
        records) or JSON Lines (one JSON object per line; blank lines are
        skipped).
    model_name : str, optional
        Name of the model. When truthy, it is inserted as a leading
        "Model" column.
    fields : iterable of str, optional (keyword-only)
        Record keys to tabulate. Defaults to the module-level
        ``schema_fields``.

    Returns:
    --------
    pd.DataFrame
        One row per field with columns "Field", "True" and "False"
        (preceded by "Model" when ``model_name`` is given). Only values
        equal to True or False are tallied; null or missing values are left
        out, so the two counts may sum to less than the number of records.

    Raises:
    -------
    KeyError
        If a requested field is absent from every record.
    json.JSONDecodeError
        If the file is neither valid JSON nor valid JSON Lines.
    """
    # Decode explicitly as UTF-8: the default text encoding of open() depends
    # on the platform locale, while JSON interchange files are UTF-8.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # Not a single JSON document: retry as JSON Lines. Blank lines are
        # skipped because json.loads would reject them.
        with open(filepath, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Resolve the default lazily so the global is only needed when the caller
    # does not pass an explicit field list.
    if fields is None:
        fields = schema_fields

    rows = []
    for field in fields:
        # value_counts() drops nulls by default, hence the .get(..., 0) fallbacks.
        counts = df[field].value_counts().to_dict()
        rows.append({
            "Field": field,
            "True": counts.get(True, 0),
            "False": counts.get(False, 0),
        })

    table1_df = pd.DataFrame(rows, columns=["Field", "True", "False"])

    if model_name:
        table1_df.insert(0, "Model", model_name)

    return table1_df

In [6]:
# Build one summary table per label source. The generator yields the tables in
# the order of the (file, label) pairs, which matches the names on the left.
t1_model1, t1_model2, t1_model3, manual_labels = (
    make_table1(results_path, model_name=label)
    for results_path, label in [
        ("labelled_notes_additonal_prompts_medgemma4b.json", "Google-MedGemma-4b"),
        ("open_hermes_test_added_Prompt.json", "OpenHermes-2.5-7B"),
        ("Fixed_Deepseek_r1_14b.json", "DeepSeek_R1_14b"),
        ("merged_manual_label.json", "Manual"),
    ]
)


In [22]:
# Show the summary table built for Google-MedGemma-4b.
t1_model1

Unnamed: 0,Model,Field,True,False
0,Google-MedGemma-4b,imatinib_mentioned,316,1446
1,Google-MedGemma-4b,related_drugs_mentioned,487,1275
2,Google-MedGemma-4b,cml_diagnosed,781,981
3,Google-MedGemma-4b,cml_in_regression,435,1327
4,Google-MedGemma-4b,aml_diagnosed,35,1727
5,Google-MedGemma-4b,blast_phase_cml,18,1744
6,Google-MedGemma-4b,bmt_history,322,1440
7,Google-MedGemma-4b,acute_phase_cml,34,1728


In [23]:
# Show the summary table built for OpenHermes-2.5-7B.
t1_model2

Unnamed: 0,Model,Field,True,False
0,OpenHermes-2.5-7B,imatinib_mentioned,328,1434
1,OpenHermes-2.5-7B,related_drugs_mentioned,371,1391
2,OpenHermes-2.5-7B,cml_diagnosed,524,1238
3,OpenHermes-2.5-7B,cml_in_regression,351,1411
4,OpenHermes-2.5-7B,aml_diagnosed,145,1617
5,OpenHermes-2.5-7B,blast_phase_cml,75,1687
6,OpenHermes-2.5-7B,bmt_history,381,1381
7,OpenHermes-2.5-7B,acute_phase_cml,92,1670


In [24]:
# Show the summary table built for DeepSeek_R1_14b.
t1_model3

Unnamed: 0,Model,Field,True,False
0,DeepSeek_R1_14b,imatinib_mentioned,191,1571
1,DeepSeek_R1_14b,related_drugs_mentioned,186,1576
2,DeepSeek_R1_14b,cml_diagnosed,257,1505
3,DeepSeek_R1_14b,cml_in_regression,24,1738
4,DeepSeek_R1_14b,aml_diagnosed,99,1663
5,DeepSeek_R1_14b,blast_phase_cml,26,1736
6,DeepSeek_R1_14b,bmt_history,195,1567
7,DeepSeek_R1_14b,acute_phase_cml,26,1736


## Comparison of each model with manual labels (300 notes)

In [7]:
# Show the summary table built from the manual labels.
manual_labels

Unnamed: 0,Model,Field,True,False
0,Manual,imatinib_mentioned,19,281
1,Manual,related_drugs_mentioned,36,264
2,Manual,cml_diagnosed,37,263
3,Manual,cml_in_regression,11,289
4,Manual,aml_diagnosed,18,282
5,Manual,blast_phase_cml,3,297
6,Manual,bmt_history,22,278
7,Manual,acute_phase_cml,4,296


In [4]:
def make_table1(filepath, model_name=None, sample_limit=300, *, fields=None):
    """
    Create a Table 1 summary of counts (True/False) for schema fields
    from an LLM output JSON file, capped at a sample limit.

    This definition replaces the earlier uncapped ``make_table1`` in this
    notebook; pass ``sample_limit=None`` to count every record.

    Parameters
    ----------
    filepath : str
        Path to the results file. Either a single JSON document (a list of
        records) or JSON Lines (one JSON object per line; blank lines are
        skipped).
    model_name : str, optional
        Name of the model. When truthy, it is inserted as a leading
        "Model" column.
    sample_limit : int or None, default=300
        Maximum number of records to include, taken from the start of the
        file. ``None`` disables the cap.
    fields : iterable of str, optional (keyword-only)
        Record keys to tabulate. Defaults to the module-level
        ``schema_fields``.

    Returns
    -------
    pd.DataFrame
        One row per field with columns "Field", "True" and "False"
        (preceded by "Model" when ``model_name`` is given). Only values
        equal to True or False are tallied; null or missing values are left
        out, so the two counts may sum to less than the number of records.

    Raises
    ------
    ValueError
        If ``sample_limit`` is negative.
    KeyError
        If a requested field is absent from every retained record.
    json.JSONDecodeError
        If the file is neither valid JSON nor valid JSON Lines.
    """
    # A negative limit would slice records off the END of the list instead of
    # capping the count, silently producing a misleading table.
    if sample_limit is not None and sample_limit < 0:
        raise ValueError("sample_limit must be non-negative or None")

    # Decode explicitly as UTF-8: the default text encoding of open() depends
    # on the platform locale, while JSON interchange files are UTF-8.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # Not a single JSON document: retry as JSON Lines. Blank lines are
        # skipped because json.loads would reject them.
        with open(filepath, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]

    # Apply sample limit (data[:None] keeps everything).
    data = data[:sample_limit]

    df = pd.DataFrame(data)

    # Resolve the default lazily so the global is only needed when the caller
    # does not pass an explicit field list.
    if fields is None:
        fields = schema_fields

    rows = []
    for field in fields:
        # value_counts() drops nulls by default, hence the .get(..., 0) fallbacks.
        counts = df[field].value_counts().to_dict()
        rows.append({
            "Field": field,
            "True": counts.get(True, 0),
            "False": counts.get(False, 0),
        })

    table1_df = pd.DataFrame(rows, columns=["Field", "True", "False"])

    if model_name:
        table1_df.insert(0, "Model", model_name)

    return table1_df

In [9]:
# Rebuild the four summary tables with the make_table1 currently in scope.
# The mapping is label -> results file; dicts keep insertion order, so the
# tables come out in the order of the names on the left.
t1_model1, t1_model2, t1_model3, manual_labels = [
    make_table1(json_path, model_name=display_name)
    for display_name, json_path in {
        "Google-MedGemma-4b": "labelled_notes_additonal_prompts_medgemma4b.json",
        "OpenHermes-2.5-7B": "open_hermes_test_added_Prompt.json",
        "DeepSeek_R1_14b": "Fixed_Deepseek_r1_14b.json",
        "Manual": "merged_manual_label.json",
    }.items()
]


In [10]:
# Show the Google-MedGemma-4b summary rebuilt with the default 300-record cap.
t1_model1

Unnamed: 0,Model,Field,True,False
0,Google-MedGemma-4b,imatinib_mentioned,53,247
1,Google-MedGemma-4b,related_drugs_mentioned,72,228
2,Google-MedGemma-4b,cml_diagnosed,131,169
3,Google-MedGemma-4b,cml_in_regression,65,235
4,Google-MedGemma-4b,aml_diagnosed,5,295
5,Google-MedGemma-4b,blast_phase_cml,2,298
6,Google-MedGemma-4b,bmt_history,43,257
7,Google-MedGemma-4b,acute_phase_cml,3,297


In [11]:
# Show the OpenHermes-2.5-7B summary rebuilt with the default 300-record cap.
t1_model2

Unnamed: 0,Model,Field,True,False
0,OpenHermes-2.5-7B,imatinib_mentioned,37,263
1,OpenHermes-2.5-7B,related_drugs_mentioned,31,269
2,OpenHermes-2.5-7B,cml_diagnosed,57,243
3,OpenHermes-2.5-7B,cml_in_regression,26,274
4,OpenHermes-2.5-7B,aml_diagnosed,24,276
5,OpenHermes-2.5-7B,blast_phase_cml,4,296
6,OpenHermes-2.5-7B,bmt_history,51,249
7,OpenHermes-2.5-7B,acute_phase_cml,11,289


In [12]:
# Show the DeepSeek_R1_14b summary rebuilt with the default 300-record cap.
t1_model3

Unnamed: 0,Model,Field,True,False
0,DeepSeek_R1_14b,imatinib_mentioned,20,280
1,DeepSeek_R1_14b,related_drugs_mentioned,13,287
2,DeepSeek_R1_14b,cml_diagnosed,21,279
3,DeepSeek_R1_14b,cml_in_regression,5,295
4,DeepSeek_R1_14b,aml_diagnosed,14,286
5,DeepSeek_R1_14b,blast_phase_cml,2,298
6,DeepSeek_R1_14b,bmt_history,24,276
7,DeepSeek_R1_14b,acute_phase_cml,1,299


In [13]:
# Show the manual-label summary rebuilt with the default 300-record cap.
manual_labels

Unnamed: 0,Model,Field,True,False
0,Manual,imatinib_mentioned,19,281
1,Manual,related_drugs_mentioned,36,264
2,Manual,cml_diagnosed,37,263
3,Manual,cml_in_regression,11,289
4,Manual,aml_diagnosed,18,282
5,Manual,blast_phase_cml,3,297
6,Manual,bmt_history,22,278
7,Manual,acute_phase_cml,4,296


In [31]:
# Load the three label sources compared below. Files are decoded explicitly
# as UTF-8 because the default encoding of open() depends on the platform
# locale, while JSON interchange files are UTF-8.

# Tuned-model outputs (used below as a mapping via tuned.items()).
with open("parsed_outputs.json", "r", encoding="utf-8") as f:
    tuned = json.load(f)

# Manually labelled notes (iterated below as a sequence of records).
with open("100_manual_notes_test.json", "r", encoding="utf-8") as f:
    manual = json.load(f)

# Original (baseline) model outputs (indexed by position below).
with open("open_hermes_test_added_Prompt.json", "r", encoding="utf-8") as f:
    original = json.load(f)


In [33]:
# Wrap each loaded payload in a DataFrame, in the same order as the names.
df_tuned, df_manual, df_original = (
    pd.DataFrame(payload) for payload in (tuned, manual, original)
)

In [39]:
# Key each manual record by the integer found in the second whitespace-separated
# token of its note_text (presumably text of the form "Note <n> ..." - TODO
# confirm), keeping only the schema fields. A repeated note number keeps the
# last record seen.
manual_dict = {
    int(record["note_text"].split()[1]): {name: record[name] for name in schema_fields}
    for record in manual
}


In [41]:
# Re-key the tuned outputs by the integer that follows the first underscore
# in each original key.
tuned_dict = dict(
    (int(note_key.split("_")[1]), predictions)
    for note_key, predictions in tuned.items()
)

In [43]:
# Number the baseline records consecutively starting at 201, keeping only the
# schema fields.
# NOTE(review): this assumes record i of `original` corresponds to note 201+i;
# confirm against how the file was generated.
original_dict = {
    note_id: {name: record[name] for name in schema_fields}
    for note_id, record in enumerate(original, start=201)
}

In [45]:
# Note numbers present in all three sources; only these are scored.
common_notes = set(manual_dict).intersection(tuned_dict, original_dict)

In [47]:
def compute_accuracy(pred_dict, manual_dict, schema_fields, notes):
    """
    Per-field agreement between predicted and manual labels.

    Parameters
    ----------
    pred_dict : dict
        Maps note id -> {field: predicted value}.
    manual_dict : dict
        Maps note id -> {field: manual (reference) value}.
    schema_fields : iterable of str
        Fields to score; one output row per field, in this order.
    notes : iterable
        Note ids to score. Every id must be a key of both dictionaries.
        Any iterable is accepted (set, list, generator).

    Returns
    -------
    pd.DataFrame
        Columns "Field", "Correct", "Total" and "Accuracy (%)", where
        accuracy is ``100 * Correct / Total`` rounded to two decimals.
        When ``notes`` is empty, Total is 0 and accuracy is NaN.

    Raises
    ------
    KeyError
        If a note id or field is missing from either dictionary.
    """
    # Materialise once: supports one-shot iterators (which would otherwise be
    # exhausted after the first field) and makes len() well defined.
    notes = list(notes)
    total = len(notes)

    results = []
    for field in schema_fields:
        correct = sum(pred_dict[n][field] == manual_dict[n][field] for n in notes)
        # Accuracy is undefined with nothing to score; report NaN rather than
        # raising ZeroDivisionError.
        acc = round(100 * correct / total, 2) if total else float("nan")
        results.append([field, correct, total, acc])
    return pd.DataFrame(results, columns=["Field", "Correct", "Total", "Accuracy (%)"])


In [48]:
# Score the baseline outputs first, then the tuned outputs, against the same
# manual labels and the same set of shared notes.
baseline_acc, tuned_acc = (
    compute_accuracy(predictions, manual_dict, schema_fields, common_notes)
    for predictions in (original_dict, tuned_dict)
)

In [57]:
# Show per-field accuracy of the original (baseline) outputs vs. manual labels.
baseline_acc

Unnamed: 0,Field,Correct,Total,Accuracy (%)
0,imatinib_mentioned,68,84,80.95
1,related_drugs_mentioned,61,84,72.62
2,cml_diagnosed,60,84,71.43
3,cml_in_regression,71,84,84.52
4,aml_diagnosed,71,84,84.52
5,blast_phase_cml,81,84,96.43
6,bmt_history,61,84,72.62
7,acute_phase_cml,79,84,94.05


In [58]:
# Show per-field accuracy of the tuned outputs vs. manual labels.
tuned_acc

Unnamed: 0,Field,Correct,Total,Accuracy (%)
0,imatinib_mentioned,69,84,82.14
1,related_drugs_mentioned,65,84,77.38
2,cml_diagnosed,63,84,75.0
3,cml_in_regression,76,84,90.48
4,aml_diagnosed,76,84,90.48
5,blast_phase_cml,81,84,96.43
6,bmt_history,64,84,76.19
7,acute_phase_cml,81,84,96.43
