In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# input
df = pd.read_csv(snakemake.input.aggregated_recall_per_sample_per_nb_of_samples, sep="\t")
df = df[["tool", "sample", "nb_of_samples", "recalls_wrt_truth_probes"]]
tools_to_keep = snakemake.wildcards.tools_to_keep
tools_to_keep = tools_to_keep.split("_")
list_with_number_of_samples = snakemake.params.list_with_number_of_samples
output_file_string_format = snakemake.params.output_file_string_format
df

Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,2,0.775038
1,pandora_illumina_nodenovo_global_genotyping,063_STEC,3,0.821852
2,pandora_illumina_nodenovo_global_genotyping,063_STEC,4,0.831538
3,pandora_illumina_nodenovo_global_genotyping,063_STEC,5,0.804246
4,pandora_illumina_nodenovo_global_genotyping,063_STEC,6,0.914239
...,...,...,...,...
19755,samtools_CP010170.1,ST38,16,0.957952
19756,samtools_CP010170.1,ST38,17,0.977306
19757,samtools_CP010170.1,ST38,18,0.989121
19758,samtools_CP010170.1,ST38,19,0.994864


In [3]:
# filtering for tools to keep
def tool_is_inside_tools_to_keep(tool):
    for tool_to_keep in tools_to_keep:
        if tool.startswith(tool_to_keep):
            return True
    return False

df = df[[tool_is_inside_tools_to_keep(tool) for tool in df.tool]]
df

Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,2,0.775038
1,pandora_illumina_nodenovo_global_genotyping,063_STEC,3,0.821852
2,pandora_illumina_nodenovo_global_genotyping,063_STEC,4,0.831538
3,pandora_illumina_nodenovo_global_genotyping,063_STEC,5,0.804246
4,pandora_illumina_nodenovo_global_genotyping,063_STEC,6,0.914239
...,...,...,...,...
19280,snippy_NZ_NG941718.1,ST38,16,0.955742
19281,snippy_NZ_NG941718.1,ST38,17,0.963785
19282,snippy_NZ_NG941718.1,ST38,18,0.960307
19283,snippy_NZ_NG941718.1,ST38,19,0.965412


In [4]:
# create ref and tool columns


# get ref out of tool column
def get_ref(tool):
    if tool.startswith("pandora"):
        return "PRG"
    else:
        return tool[tool.index("_")+1:]

    
def get_tool(tool):
    if "pandora" in tool:
        if "nodenovo" in tool or "no_denovo" in tool:
            if "illumina" in tool:
                return "Pandora illumina no denovo"
            elif "nanopore" in tool:
                return "Pandora nanopore no denovo"
            else:
                raise RuntimeError(f"Unknown tool: {tool}")
        elif "withdenovo" in tool or "with_denovo" in tool:
            if "illumina" in tool:
                return "Pandora illumina with denovo"
            elif "nanopore" in tool:
                return "Pandora nanopore with denovo"
            else:
                raise RuntimeError(f"Unknown tool: {tool}")
    elif "snippy" in tool:
        return "Snippy"
    elif "samtools" in tool:
        return "Samtools"
    elif "medaka" in tool:
        return "Medaka"
    elif "nanopolish" in tool:
        return "Nanopolish"
    else:
        raise RuntimeError(f"Unknown tool: {tool}")


df["ref"] = df["tool"].apply(get_ref)
df["tool"] = df["tool"].apply(get_tool)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes,ref
0,Pandora illumina no denovo,063_STEC,2,0.775038,PRG
1,Pandora illumina no denovo,063_STEC,3,0.821852,PRG
2,Pandora illumina no denovo,063_STEC,4,0.831538,PRG
3,Pandora illumina no denovo,063_STEC,5,0.804246,PRG
4,Pandora illumina no denovo,063_STEC,6,0.914239,PRG
...,...,...,...,...,...
19280,Snippy,ST38,16,0.955742,NZ_HG941718.1
19281,Snippy,ST38,17,0.963785,NZ_HG941718.1
19282,Snippy,ST38,18,0.960307,NZ_HG941718.1
19283,Snippy,ST38,19,0.965412,NZ_HG941718.1


In [5]:
# add canonical names to refs

def get_canonical_ref_names(ref):
    if ref=="PRG":
        return "PRG"
    
    ref_to_canonical = {
        "CP010116.1": "C1",
        "CP010121.1": "C4",
        "CP010170.1": "H6",
        "CP010171.1": "H7",
        "CP010226.1": "S1",
        "CP010230.1": "S21",
        "CP018206.1": "MRSN346647",
        "CU928163.2": "UMN026",
        "NC_004431.1": "CFT073",
        "NC_007779.1": "W3110",
        "NC_010498.1": "SMS-3-5",
        "NC_011742.1": "S88",
        "NC_011993.1": "LF82",
        "NC_017646.1": "CE10",
        "NC_022648.1": "JJ1886",
        "NZ_CP008697.1": "ST648",
        "NZ_CP009859.1": "ECONIH1",
        "NZ_CP011134.1": "VR50",
        "NZ_CP013483.1": "Y5",
        "NZ_CP015228.1": "09-00049",
        "NZ_CP016007.1": "NGF1",
        "NZ_CP018109.1": "MRSN346595",
        "NZ_HG941718.1": "EC958",
        "NZ_LM995446.1": "EcRV308Chr",
        "NZ_LT632320.1": "NCTC_13441"
    }
    return f"{ref_to_canonical[ref]} ({ref})"

df["ref"] = df["ref"].apply(get_canonical_ref_names)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes,ref
0,Pandora illumina no denovo,063_STEC,2,0.775038,PRG
1,Pandora illumina no denovo,063_STEC,3,0.821852,PRG
2,Pandora illumina no denovo,063_STEC,4,0.831538,PRG
3,Pandora illumina no denovo,063_STEC,5,0.804246,PRG
4,Pandora illumina no denovo,063_STEC,6,0.914239,PRG
...,...,...,...,...,...
19280,Snippy,ST38,16,0.955742,EC958 (NZ_HG941718.1)
19281,Snippy,ST38,17,0.963785,EC958 (NZ_HG941718.1)
19282,Snippy,ST38,18,0.960307,EC958 (NZ_HG941718.1)
19283,Snippy,ST38,19,0.965412,EC958 (NZ_HG941718.1)


In [6]:
# add tool_and_ref column
def get_tool_and_ref(df):
    return f"{df['tool']} / {df['ref']}"
    

df["tool_and_ref"] = df.apply(get_tool_and_ref, axis=1)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes,ref,tool_and_ref
0,Pandora illumina no denovo,063_STEC,2,0.775038,PRG,Pandora illumina no denovo / PRG
1,Pandora illumina no denovo,063_STEC,3,0.821852,PRG,Pandora illumina no denovo / PRG
2,Pandora illumina no denovo,063_STEC,4,0.831538,PRG,Pandora illumina no denovo / PRG
3,Pandora illumina no denovo,063_STEC,5,0.804246,PRG,Pandora illumina no denovo / PRG
4,Pandora illumina no denovo,063_STEC,6,0.914239,PRG,Pandora illumina no denovo / PRG
...,...,...,...,...,...,...
19280,Snippy,ST38,16,0.955742,EC958 (NZ_HG941718.1),Snippy / EC958 (NZ_HG941718.1)
19281,Snippy,ST38,17,0.963785,EC958 (NZ_HG941718.1),Snippy / EC958 (NZ_HG941718.1)
19282,Snippy,ST38,18,0.960307,EC958 (NZ_HG941718.1),Snippy / EC958 (NZ_HG941718.1)
19283,Snippy,ST38,19,0.965412,EC958 (NZ_HG941718.1),Snippy / EC958 (NZ_HG941718.1)


In [7]:
# generate the several csvs
for nb_of_samples in list_with_number_of_samples:
    restricted_df = df[df.nb_of_samples == nb_of_samples]
    restricted_df.to_csv(output_file_string_format.format(nb_of_samples=nb_of_samples), index=False)