In [1]:
import pandas as pd
import plotly.express as px

In [2]:
df = pd.read_csv("recall_per_sample.tsv", sep="\t")
df = df[["tool", "sample", "recalls_wrt_truth_probes"]]
df

Unnamed: 0,tool,sample,recalls_wrt_truth_probes
0,snippy_NC_004431.1,Escherichia_coli_MSB1_6C,0.821731
1,snippy_NC_010498.1,Escherichia_coli_MINF_7C,0.889135
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MSB1_7C,0.887915
3,snippy_NZ_CP008697.1,Escherichia_coli_MSB2_1A,0.885606
4,samtools_NZ_CP013483.1,Escherichia_coli_MSB1_4I,0.804207
...,...,...,...
1035,snippy_CP018206.1,Escherichia_coli_MSB1_3B,0.927632
1036,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.895750
1037,samtools_NZ_CP008697.1,Escherichia_coli_MSB1_1A,0.912384
1038,snippy_CP010116.1,CFT073,0.813402


In [3]:
# removing samtools
df = df[["samtools" not in tool for tool in df.tool]]
df

Unnamed: 0,tool,sample,recalls_wrt_truth_probes
0,snippy_NC_004431.1,Escherichia_coli_MSB1_6C,0.821731
1,snippy_NC_010498.1,Escherichia_coli_MINF_7C,0.889135
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MSB1_7C,0.887915
3,snippy_NZ_CP008697.1,Escherichia_coli_MSB2_1A,0.885606
11,snippy_CP010121.1,CFT073,0.844107
...,...,...,...
1031,snippy_NZ_CP008697.1,Escherichia_coli_MSB1_3B,0.894887
1035,snippy_CP018206.1,Escherichia_coli_MSB1_3B,0.927632
1036,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.895750
1038,snippy_CP010116.1,CFT073,0.813402


In [4]:
# making things look nicer

def get_tool_and_ref(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo / PRG"
        else:
            return "Pandora illumina with denovo / PRG"
    else:
        return "Snippy / " + tool[tool.index("_")+1:]
    
def get_ref(tool):
    if tool.startswith("pandora"):
        return "PRG"
    else:
        return tool[tool.index("_")+1:]

def get_tool(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo"
        else:
            return "Pandora illumina with denovo"
    else:
        return "Snippy"

df["tool_and_ref"] = df["tool"].apply(get_tool_and_ref)
df["ref"] = df["tool"].apply(get_ref)
df["tool"] = df["tool"].apply(get_tool)
df = df.sort_values(by="tool_and_ref")
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,tool,sample,recalls_wrt_truth_probes,tool_and_ref,ref
407,Pandora illumina no denovo,ST38,0.876717,Pandora illumina no denovo / PRG,PRG
707,Pandora illumina no denovo,Escherichia_coli_MSB1_8G,0.907464,Pandora illumina no denovo / PRG,PRG
816,Pandora illumina no denovo,Escherichia_coli_MSB1_6C,0.888480,Pandora illumina no denovo / PRG,PRG
761,Pandora illumina no denovo,Escherichia_coli_MSB1_4E,0.902893,Pandora illumina no denovo / PRG,PRG
1025,Pandora illumina no denovo,Escherichia_coli_MSB1_7A,0.888125,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
196,Snippy,Escherichia_coli_MSB1_3B,0.832279,Snippy / NZ_NG941718.1,NZ_NG941718.1
418,Snippy,Escherichia_coli_MINF_7C,0.853479,Snippy / NZ_NG941718.1,NZ_NG941718.1
376,Snippy,Escherichia_coli_MSB1_7C,0.842836,Snippy / NZ_NG941718.1,NZ_NG941718.1
579,Snippy,Escherichia_coli_MSB1_6C,0.833029,Snippy / NZ_NG941718.1,NZ_NG941718.1


In [5]:
# fix some ref names, because snippy script replace non-ACGT chars to U, even in ref name

def fix_ref_name(ref):
    if ref=="CN928163.2":
        return "CU928163.2"
    if ref=="NZ_LN995446.1":
        return "NZ_LM995446.1"
    if ref=="NZ_NG941718.1":
        return "NZ_HG941718.1"
    return ref


def fix_tool_and_ref_name(tool_and_ref):
    if tool_and_ref.endswith("CN928163.2"):
        return tool_and_ref.replace("CN928163.2", "CU928163.2")
    if tool_and_ref.endswith("NZ_LN995446.1"):
        return tool_and_ref.replace("NZ_LN995446.1", "NZ_LM995446.1")
    if tool_and_ref.endswith("NZ_NG941718.1"):
        return tool_and_ref.replace("NZ_NG941718.1", "NZ_HG941718.1")
    return tool_and_ref

df["ref"] = df["ref"].apply(fix_ref_name)
df["tool_and_ref"] = df["tool_and_ref"].apply(fix_tool_and_ref_name)
df

Unnamed: 0,tool,sample,recalls_wrt_truth_probes,tool_and_ref,ref
407,Pandora illumina no denovo,ST38,0.876717,Pandora illumina no denovo / PRG,PRG
707,Pandora illumina no denovo,Escherichia_coli_MSB1_8G,0.907464,Pandora illumina no denovo / PRG,PRG
816,Pandora illumina no denovo,Escherichia_coli_MSB1_6C,0.888480,Pandora illumina no denovo / PRG,PRG
761,Pandora illumina no denovo,Escherichia_coli_MSB1_4E,0.902893,Pandora illumina no denovo / PRG,PRG
1025,Pandora illumina no denovo,Escherichia_coli_MSB1_7A,0.888125,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
196,Snippy,Escherichia_coli_MSB1_3B,0.832279,Snippy / NZ_HG941718.1,NZ_HG941718.1
418,Snippy,Escherichia_coli_MINF_7C,0.853479,Snippy / NZ_HG941718.1,NZ_HG941718.1
376,Snippy,Escherichia_coli_MSB1_7C,0.842836,Snippy / NZ_HG941718.1,NZ_HG941718.1
579,Snippy,Escherichia_coli_MSB1_6C,0.833029,Snippy / NZ_HG941718.1,NZ_HG941718.1


In [6]:
fig = px.bar(df, x="sample", y="recalls_wrt_truth_probes", facet_col="tool_and_ref", facet_col_wrap=9, width=4000, height=1000)
fig.write_image("fig.png")

In [7]:
df.to_csv("recall_per_ref_per_clade.csv", index=False)

In [9]:
df_temp = pd.read_csv("recall_per_sample.tsv", sep="\t")
df_temp.sort_values(by="recalls_wrt_truth_probes")

Unnamed: 0,GT,step_GT,recalls_wrt_truth_probes,nbs_of_truth_probes_found,nbs_of_truth_probes_in_total,recalls_wrt_variants_where_all_allele_seqs_were_found,recalls_wrt_variants_found_wrt_alleles,nbs_variants_where_all_allele_seqs_were_found,nbs_variants_found_wrt_alleles,nbs_variants_total,tool,coverage,coverage_threshold,strand_bias_threshold,gaps_threshold,sample
218,0,0,0.771194,327405,424543,0.0,0.049979,0,21218.137013,424543,snippy_NC_011993.1,100x,0,Not_App,Not_App,ST38
825,0,0,0.778830,330647,424543,0.0,0.050627,0,21493.435861,424543,snippy_CP010226.1,100x,0,Not_App,Not_App,ST38
170,0,0,0.779709,322599,413743,0.0,0.053724,0,22227.897655,413743,snippy_CP010226.1,100x,0,Not_App,Not_App,Escherichia_coli_MSB1_4I
570,0,0,0.781278,331686,424543,0.0,0.052252,0,22183.260197,424543,snippy_NZ_CP016007.1,100x,0,Not_App,Not_App,ST38
251,0,0,0.784858,333206,424543,0.0,0.053418,0,22678.325185,424543,snippy_NC_004431.1,100x,0,Not_App,Not_App,ST38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,0,0,0.986128,407320,413050,0.0,0.096135,0,39708.713822,413050,snippy_NZ_NG941718.1,100x,0,Not_App,Not_App,Escherichia_coli_MSB1_4E
461,0,0,0.986159,407333,413050,0.0,0.096128,0,39705.697709,413050,samtools_NZ_LT632320.1,100x,0,Not_App,Not_App,Escherichia_coli_MSB1_4E
757,0,0,0.987050,407701,413050,0.0,0.096336,0,39791.413358,413050,samtools_NZ_HG941718.1,100x,0,Not_App,Not_App,Escherichia_coli_MSB1_4E
741,0,0,0.999810,279344,279397,0.0,0.114067,0,31870.071257,279397,snippy_NC_004431.1,100x,0,Not_App,Not_App,CFT073
