In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Read the tab-separated per-sample precision table and keep only the
# columns that the rest of the notebook uses, in this order.
columns_to_keep = ["tool", "sample", "precision"]
df = pd.read_csv("precision_per_sample.tsv", sep="\t")[columns_to_keep]
df

Unnamed: 0,tool,sample,precision
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,0.994285
1,pandora_illumina_nodenovo_global_genotyping,CFT073,0.993584
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1A,0.994114
3,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.993481
4,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_7C,0.994334
...,...,...,...
1035,samtools_CP010170.1,Escherichia_coli_MSB1_8G,0.983412
1036,samtools_CP010170.1,Escherichia_coli_MSB1_9D,0.907875
1037,samtools_CP010170.1,Escherichia_coli_MSB2_1A,0.989890
1038,samtools_CP010170.1,H131800734,0.983826


In [3]:
# # removing samtools
# df = df[["samtools" not in tool for tool in df.tool]]
# df

Unnamed: 0,tool,sample,precisions_wrt_truth_probes
0,snippy_NC_004431.1,Escherichia_coli_MSB1_6C,0.821731
1,snippy_NC_010498.1,Escherichia_coli_MINF_7C,0.889135
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MSB1_7C,0.887915
3,snippy_NZ_CP008697.1,Escherichia_coli_MSB2_1A,0.885606
11,snippy_CP010121.1,CFT073,0.844107
...,...,...,...
1031,snippy_NZ_CP008697.1,Escherichia_coli_MSB1_3B,0.894887
1035,snippy_CP018206.1,Escherichia_coli_MSB1_3B,0.927632
1036,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.895750
1038,snippy_CP010116.1,CFT073,0.813402


In [3]:
# making things look nicer

def get_tool_and_ref(tool):
    """Build the human-readable "<tool> / <reference>" label for a raw tool id.

    Parameters
    ----------
    tool : str
        Raw identifier from the ``tool`` column, e.g.
        ``"pandora_illumina_nodenovo_global_genotyping"`` or
        ``"snippy_NC_004431.1"``.

    Returns
    -------
    str
        ``"Pandora illumina no denovo / PRG"`` or
        ``"Pandora illumina with denovo / PRG"`` for ids containing
        "pandora"; otherwise ``"Snippy / <ref>"`` or ``"Samtools / <ref>"``,
        where ``<ref>`` is everything after the first underscore of ``tool``.

    Raises
    ------
    AssertionError
        If ``tool`` contains none of "pandora", "snippy" or "samtools".
    ValueError
        If a snippy/samtools id contains no underscore.
    """
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo / PRG"
        return "Pandora illumina with denovo / PRG"

    if "snippy" in tool:
        label_prefix = "Snippy / "
    elif "samtools" in tool:
        label_prefix = "Samtools / "
    else:
        # Raised explicitly instead of `assert False`: assert statements are
        # removed when Python runs with -O, which would let an unknown tool
        # silently map to None.
        raise AssertionError("We should not be here")

    # The reference name is whatever follows the first underscore.
    return label_prefix + tool[tool.index("_") + 1:]
    
def get_ref(tool):
    """Return the reference name encoded in a raw tool id.

    Ids that start with "pandora" map to the fixed label "PRG". For every
    other id the reference is the text after the first underscore
    (``str.index`` raises ``ValueError`` when there is no underscore).
    """
    if not tool.startswith("pandora"):
        first_underscore = tool.index("_")
        return tool[first_underscore + 1:]
    return "PRG"

def get_tool(tool):
    """Map a raw tool id to a human-readable tool name (without reference).

    Parameters
    ----------
    tool : str
        Raw identifier from the ``tool`` column, e.g.
        ``"pandora_illumina_nodenovo_global_genotyping"`` or
        ``"samtools_CP010170.1"``.

    Returns
    -------
    str
        One of ``"Pandora illumina no denovo"``,
        ``"Pandora illumina with denovo"``, ``"Snippy"`` or ``"Samtools"``.

    Raises
    ------
    AssertionError
        If ``tool`` contains none of "pandora", "snippy" or "samtools".
    """
    if "pandora" in tool:
        # Pandora runs are distinguished only by the "nodenovo" marker.
        if "nodenovo" in tool:
            return "Pandora illumina no denovo"
        return "Pandora illumina with denovo"
    if "snippy" in tool:
        return "Snippy"
    if "samtools" in tool:
        return "Samtools"
    # Raised explicitly instead of `assert False`: assert statements are
    # removed when Python runs with -O, which would let an unknown tool
    # silently map to None.
    raise AssertionError("We should not be here")

# Derive the display columns from the raw tool ids. All three helpers need the
# *raw* id, so the "tool" column itself is overwritten last.
# NOTE: this cell rewrites "tool" in place; running it a second time feeds the
# already-prettified names (e.g. "Snippy") to the helpers, which match only the
# lowercase raw prefixes and will therefore raise.
raw_tool_ids = df["tool"]
df["tool_and_ref"] = raw_tool_ids.apply(get_tool_and_ref)
df["ref"] = raw_tool_ids.apply(get_ref)
df["tool"] = raw_tool_ids.apply(get_tool)

# Order rows by the combined label so each tool/reference pair is contiguous.
df = df.sort_values(by="tool_and_ref")
df

Unnamed: 0,tool,sample,precision,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,0.994285,Pandora illumina no denovo / PRG,PRG
19,Pandora illumina no denovo,ST38,0.994986,Pandora illumina no denovo / PRG,PRG
18,Pandora illumina no denovo,H131800734,0.994247,Pandora illumina no denovo / PRG,PRG
17,Pandora illumina no denovo,Escherichia_coli_MSB2_1A,0.994054,Pandora illumina no denovo / PRG,PRG
16,Pandora illumina no denovo,Escherichia_coli_MSB1_9D,0.993678,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
522,Snippy,Escherichia_coli_MINF_1A,0.999708,Snippy / NZ_NG941718.1,NZ_NG941718.1
521,Snippy,CFT073,0.999553,Snippy / NZ_NG941718.1,NZ_NG941718.1
539,Snippy,ST38,0.999736,Snippy / NZ_NG941718.1,NZ_NG941718.1
528,Snippy,Escherichia_coli_MSB1_3B,0.999708,Snippy / NZ_NG941718.1,NZ_NG941718.1


In [5]:
# Fix some ref names: the snippy script replaces some non-ACGT characters with N, even in the ref name (e.g. CU928163.2 became CN928163.2)

def fix_ref_name(ref):
    """Return the corrected reference name for ``ref``.

    Three reference names appear in the data with an altered character; they
    are mapped back to their correct spelling. Any other value is returned
    unchanged.
    """
    corrected_names = {
        "CN928163.2": "CU928163.2",
        "NZ_LN995446.1": "NZ_LM995446.1",
        "NZ_NG941718.1": "NZ_HG941718.1",
    }
    return corrected_names.get(ref, ref)


def fix_tool_and_ref_name(tool_and_ref):
    """Return ``tool_and_ref`` with an altered reference name corrected.

    If the label ends with one of the three known altered reference names,
    every occurrence of that name in the label is replaced by the corrected
    spelling. Labels with any other ending are returned unchanged.
    """
    corrections = (
        ("CN928163.2", "CU928163.2"),
        ("NZ_LN995446.1", "NZ_LM995446.1"),
        ("NZ_NG941718.1", "NZ_HG941718.1"),
    )
    for altered, corrected in corrections:
        if tool_and_ref.endswith(altered):
            return tool_and_ref.replace(altered, corrected)
    return tool_and_ref

# Correct the altered reference names in both columns that carry them.
column_fixers = (
    ("ref", fix_ref_name),
    ("tool_and_ref", fix_tool_and_ref_name),
)
for column_name, fixer in column_fixers:
    df[column_name] = df[column_name].apply(fixer)
df

Unnamed: 0,tool,sample,precision,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,0.994285,Pandora illumina no denovo / PRG,PRG
19,Pandora illumina no denovo,ST38,0.994986,Pandora illumina no denovo / PRG,PRG
18,Pandora illumina no denovo,H131800734,0.994247,Pandora illumina no denovo / PRG,PRG
17,Pandora illumina no denovo,Escherichia_coli_MSB2_1A,0.994054,Pandora illumina no denovo / PRG,PRG
16,Pandora illumina no denovo,Escherichia_coli_MSB1_9D,0.993678,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
522,Snippy,Escherichia_coli_MINF_1A,0.999708,Snippy / NZ_HG941718.1,NZ_HG941718.1
521,Snippy,CFT073,0.999553,Snippy / NZ_HG941718.1,NZ_HG941718.1
539,Snippy,ST38,0.999736,Snippy / NZ_HG941718.1,NZ_HG941718.1
528,Snippy,Escherichia_coli_MSB1_3B,0.999708,Snippy / NZ_HG941718.1,NZ_HG941718.1


In [6]:
# One bar-chart panel per tool/reference label, nine panels per row, with
# samples on the x axis and precision on the y axis.
fig = px.bar(
    df,
    x="sample",
    y="precision",
    facet_col="tool_and_ref",
    facet_col_wrap=9,
    width=4000,
    height=1000,
)
# Save the figure to disk as a static image.
fig.write_image("fig.png")

In [7]:
# Write the relabelled table to CSV, leaving out the row index.
df.to_csv(path_or_buf="precision_per_ref_per_clade.csv", index=False)