In [None]:
import glob
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from collections import defaultdict

In [None]:
# Load the accession -> species-name lookup table used to label the plots.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("accession_translator.json", "r", encoding="utf-8") as handle:
    accession_translator = json.load(handle)

In [None]:
# Use seaborn's "whitegrid" theme for the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Raw dataset (directory) name -> display label used in the figures.
# Insertion order matters: it defines the category order of the "dataset" column.
# Dataset names that are not listed here get a missing (NaN) label when the
# BUSCO table is built.
datasets = {
    "augustus": "AUGUSTUS",
    # Currently excluded from the comparison:
    # "braker3_noprotdb": "BRAKER3 (no orthoDB)",
    "braker3": "BRAKER3",
    "helixer": "Helixer",
    "GeneML800_c657g_ncbi_for_benchmarking_ep10": "geneML",
    "reference": "Reference",
}

In [None]:
# Collect the BUSCO summary metrics of every run into one table (one row per JSON report).
# NOTE(review): this is an absolute, machine-specific location — consider deriving it
# from a configurable data directory.
BUSCO_SUMMARY_GLOB = "/home/lisvad/mnt/nisin/geneml/outputs/*/busco2_*/*.json"

# glob returns matches in arbitrary order; sort so the row order is reproducible.
busco_summary_files = sorted(glob.glob(BUSCO_SUMMARY_GLOB))
if not busco_summary_files:
    # Without this guard an empty match only shows up further down as a KeyError
    # on a column that was never created.
    raise FileNotFoundError(f"No BUSCO summary JSON files match {BUSCO_SUMMARY_GLOB!r}")

busco_stats = defaultdict(list)
for summary_file in busco_summary_files:
    with open(summary_file, "r", encoding="utf-8") as handle:
        summary = json.load(handle)

    # The dataset and accession are read from fixed positions of the input path
    # recorded in the report: component 5 is the dataset, component 6 the protein
    # file whose ".faa" is stripped to give the accession.
    # NOTE(review): these indices depend on the directory depth of that path —
    # confirm whenever the output layout changes.
    path_parts = summary["parameters"]["in"].split('/')
    results = summary["results"]

    busco_stats["accession"].append(path_parts[6].replace(".faa", ''))
    busco_stats["dataset"].append(path_parts[5])
    busco_stats["completeness"].append(results["Complete percentage"])
    busco_stats["fragmented"].append(results["Fragmented percentage"])
    busco_stats["missing"].append(results["Missing percentage"])

busco_df = pd.DataFrame(busco_stats)

# Translate raw dataset names to display labels and fix their plotting order.
# Names that are not keys of `datasets` become NaN.
busco_df["dataset"] = pd.Categorical(
    busco_df["dataset"].map(datasets),
    categories=list(datasets.values()),
    ordered=True,
)

# Categorical categories must be unique, but several accessions may translate to
# the same species name: de-duplicate while keeping first-seen order.
species_order = list(dict.fromkeys(accession_translator.values()))
busco_df["species"] = pd.Categorical(
    busco_df["accession"].map(accession_translator),
    categories=species_order,
    ordered=True,
)

In [None]:
# Display label of each annotation source -> hex colour used when plotting by "dataset".
tool_palette = {
    "Reference": "#000000",
    "AUGUSTUS": "#5A749F",
    "BRAKER3": "#B53535",
    "Helixer": "#622870",
    "geneML": "#FFAA00",
}

In [None]:
# BUSCO completeness per species, one point estimate per tool, without connecting lines.
ax = sns.pointplot(
    busco_df,
    y="species",
    x="completeness",
    hue="dataset",
    linestyle="none",
    palette=tool_palette,
)
ax.set_xlabel("BUSCO completeness (%)")
# Completeness is a percentage, so pin the axis to the full 0-100 range.
ax.set_xlim(0, 100)

In [None]:
# Reshape to long format: one row per (accession, dataset, species, metric) so the
# three BUSCO percentages can be drawn as hue levels of a single plot.
busco_df_long = (
    busco_df
    .rename(columns={"completeness": "complete"})
    .melt(
        id_vars=["accession", "dataset", "species"],
        value_vars=["complete", "missing", "fragmented"],
        var_name="metric",
        value_name="percentage",
    )
)

In [None]:
# BUSCO metric name (as used in the "metric" column) -> hex colour.
busco_palette = dict(
    complete="#63A4D3",
    fragmented="#FF8F21",
    missing="#E63127",
)

In [None]:
# Distribution of each BUSCO metric across all rows, grouped by tool.
ax = sns.boxplot(
    busco_df_long,
    x="percentage",
    y="dataset",
    hue="metric",
    palette=busco_palette,
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(title="BUSCO percentage", bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Missing and fragmented BUSCO percentages per species, for geneML only.
is_geneml = busco_df_long["dataset"] == "geneML"
not_complete = busco_df_long["metric"] != "complete"
data = busco_df_long[not_complete & is_geneml]
sns.barplot(
    data,
    x="percentage",
    y="species",
    hue="metric",
    palette=busco_palette,
)