In [None]:
import re 
import pandas as pd

# Load the tab-separated predictions file; the benchmark below reads the
# config columns listed in `configs` and the "icd10" column from this frame.
df = pd.read_csv("doc/predictions.txt", sep="\t")

# Columns of `df` to benchmark, one per model/configuration
configs = [
    "llama3_base", "mistral_base",
    "llama3_ft", "mistral_ft",
    "llama3_base_rag1", "mistral_base_rag1",
    "llama3_base_rag2", "mistral_base_rag2",
    "llama3_ft_rag1", "mistral_ft_rag1",
    "llama3_ft_rag2", "mistral_ft_rag2",
]

# Result accumulators: each receives one count per config, in `configs` order.
# The generator yields five distinct list objects (not one shared list).
full_list, partial_list, error_code_list, hallucination_list, no_match_list = (
    [] for _ in range(5)
)

# Benchmark each configuration

# First code-shaped token in a response: one capital letter (or "Æ"), two
# digits and an optional single decimal, e.g. "A01" or "Æ99.9".
CODE_PATTERN = re.compile(r'[A-ZÆ]\d{2}(?:\.\d)?')


def check_match(pred: str, gold: str) -> str:
    """Classify an extracted code against the gold code.

    The checks run in order and the first one that fits wins:

    * ``"full"``          - prediction and gold are identical
    * ``"partial"``       - the first three characters are identical
    * ``"error_code"``    - the prediction starts with "Æ99.9"
    * ``"hallucination"`` - anything else

    Args:
        pred: Code extracted from the model response.
        gold: Gold code for the same row.

    Returns:
        The category name, usable directly as a key into the counts dict.
    """
    if pred == gold:
        return "full"
    if pred[0:3] == gold[0:3]:
        return "partial"
    if pred[0:5] == "Æ99.9":
        return "error_code"
    return "hallucination"


def percentage(count: int, total: int) -> float:
    """Return `count` as a percentage of `total` (0.0 when `total` is 0)."""
    return (count / total) * 100 if total else 0.0


# Row count is the same for every config, so compute it once.
total = len(df)

for config in configs:
    counts = {"full": 0, "partial": 0, "error_code": 0, "hallucination": 0, "no_match": 0}

    # Walk the two needed columns in lockstep instead of df.iterrows(), which
    # builds a full Series per row.
    for response, gold in zip(df[config], df["icd10"]):
        prediction = CODE_PATTERN.search(str(response))
        if prediction is None:
            # Response contains nothing that looks like a code.
            counts["no_match"] += 1
        else:
            # Both sides are coerced to str so a non-string gold value (e.g. a
            # missing label read as NaN) cannot raise when it is sliced; such a
            # row then falls through to "error_code"/"hallucination".
            counts[check_match(prediction.group(), str(gold))] += 1

    full = counts["full"]
    partial = counts["partial"]
    error_code = counts["error_code"]
    hallucination = counts["hallucination"]
    no_match = counts["no_match"]

    full_list.append(full)
    partial_list.append(partial)
    error_code_list.append(error_code)
    hallucination_list.append(hallucination)
    no_match_list.append(no_match)

    print(
        f"Config: {config}, "
        f"Full: {full} ({percentage(full, total):.0f}%), "
        f"Partial: {partial} ({percentage(partial, total):.0f}%), "
        f"Error code: {error_code} ({percentage(error_code, total):.0f}%), "
        f"Hallucination: {hallucination} ({percentage(hallucination, total):.0f}%)  "
        f"No match: {no_match} ({percentage(no_match, total):.0f}%)"
    )

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Setup data for dataframe.
# "Config" reuses `configs` from the benchmark cell: the count lists were
# appended in exactly that order, so sharing the list keeps every label aligned
# with its counts instead of relying on a second hand-maintained copy.
data = {
    "Config": list(configs),
    "Full": full_list,
    "Partial": partial_list,
    "Error code": error_code_list,
    "Hallucination": hallucination_list,
    "No code": no_match_list
}

# NOTE: this rebinds `df` from the raw predictions to the per-config summary.
df = pd.DataFrame(data)

# Normalize data to percentages of each config's total row count
match_columns = ["Full", "Partial", "Error code", "Hallucination", "No code"]
df["Total"] = df[match_columns].sum(axis=1)
for column in match_columns:
    df[column] = df[column] / df["Total"] * 100

# Config labels are very wide, need to shorten.
# Replacements are applied in this order to every label.
label_abbreviations = [
    ("_", "\n"),
    ("llama3", "L3"),
    ("mistral", "M"),
    ("base", "B"),
    ("ft", "FT"),
    ("rag1", "R1"),
    ("rag2", "R2"),
]
config_labels = []
for label in df["Config"]:
    for long_form, short_form in label_abbreviations:
        label = label.replace(long_form, short_form)
    config_labels.append(label)

# Stacked bar chart: one bar per config, one coloured segment per match type
fig, ax = plt.subplots(figsize=(8, 6))

segment_columns = ["Full", "Partial", "Error code", "Hallucination", "No code"]
segment_colors = ["green", "yellow", "orange", "purple", "red"]
percentages = df.set_index("Config")[segment_columns]
percentages.plot.bar(stacked=True, ax=ax, color=segment_colors)

ax.set(
    ylabel="Percentage (%)",
    xlabel="Model and Configuration",
    ylim=(0, 100),
)
# Swap the wide config names on the x axis for the shortened labels
ax.set_xticklabels(config_labels, rotation=0)
# Anchor the legend outside the axes, to the right of the bars
ax.legend(title="Match Type", bbox_to_anchor=(1.05, 1), loc="upper left")
ax.set_title("Model Predictions")

# Show plot with tight, compressed layout
fig.tight_layout()
plt.show()

In [None]:
# Report the mean character length of the responses in each config column.
# The predictions file is re-read because `df` was rebound to the summary
# frame by the plotting cell.
df = pd.read_csv("doc/predictions.txt", sep="\t")

configs = [
    "llama3_base", "mistral_base",
    "llama3_ft", "mistral_ft",
    "llama3_base_rag1", "mistral_base_rag1",
    "llama3_base_rag2", "mistral_base_rag2",
    "llama3_ft_rag1", "mistral_ft_rag1",
    "llama3_ft_rag2", "mistral_ft_rag2",
]

for config in configs:
    response_lengths = df[config].str.len()
    mean_length = response_lengths.mean()
    print(f"{config}, Average length: {mean_length:.1f}")