In [1]:
import pandas as pd
import plotly.express as px

In [3]:
# input
df = pd.read_csv(snakemake.input.precision_per_sample, sep="\t")
df = df[["tool", "sample", "precision"]]
tools_to_keep = snakemake.wildcards.tools_to_keep
df

Unnamed: 0,tool,sample,precision
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,0.994285
1,pandora_illumina_nodenovo_global_genotyping,CFT073,0.993584
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1A,0.994114
3,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.993481
4,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_7C,0.994334
...,...,...,...
1035,samtools_CP010170.1,Escherichia_coli_MSB1_8G,0.983412
1036,samtools_CP010170.1,Escherichia_coli_MSB1_9D,0.907875
1037,samtools_CP010170.1,Escherichia_coli_MSB2_1A,0.989890
1038,samtools_CP010170.1,H131800734,0.983826


In [4]:
# filtering for tools to keep
def tool_is_inside_tools_to_keep(tool):
    for tool_to_keep in tools_to_keep:
        if tool.startswith(tool_to_keep):
            return True
    return False

df = df[[tool_is_inside_tools_to_keep(tool) for tool in df.tool]]
df

Unnamed: 0,tool,sample,precision
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,0.994285
1,pandora_illumina_nodenovo_global_genotyping,CFT073,0.993584
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1A,0.994114
3,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.993481
4,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_7C,0.994334
...,...,...,...
1035,samtools_CP010170.1,Escherichia_coli_MSB1_8G,0.983412
1036,samtools_CP010170.1,Escherichia_coli_MSB1_9D,0.907875
1037,samtools_CP010170.1,Escherichia_coli_MSB2_1A,0.989890
1038,samtools_CP010170.1,H131800734,0.983826


In [5]:
# create ref and tool columns


# get ref out of tool column
def get_ref(tool):
    if tool.startswith("pandora"):
        return "PRG"
    else:
        return tool[tool.index("_")+1:]

    
def get_tool(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo"
        else:
            return "Pandora illumina with denovo"
    elif "snippy" in tool:
        return "Snippy"
    elif "samtools" in tool:
        return "Samtools"
    else:
        assert False, "We should not be here"


df["ref"] = df["tool"].apply(get_ref)
df["tool"] = df["tool"].apply(get_tool)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,precision,ref
0,Pandora illumina no denovo,063_STEC,0.994285,PRG
1,Pandora illumina no denovo,CFT073,0.993584,PRG
2,Pandora illumina no denovo,Escherichia_coli_MINF_1A,0.994114,PRG
3,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.993481,PRG
4,Pandora illumina no denovo,Escherichia_coli_MINF_7C,0.994334,PRG
...,...,...,...,...
1035,Samtools,Escherichia_coli_MSB1_8G,0.983412,CP010170.1
1036,Samtools,Escherichia_coli_MSB1_9D,0.907875,CP010170.1
1037,Samtools,Escherichia_coli_MSB2_1A,0.989890,CP010170.1
1038,Samtools,H131800734,0.983826,CP010170.1


In [6]:
# add canonical names to refs

def get_canonical_ref_names(ref):
    if ref=="PRG":
        return "PRG"
    
    ref_to_canonical = {
        "CP010116.1": "C1",
        "CP010121.1": "C4",
        "CP010170.1": "H6",
        "CP010171.1": "H7",
        "CP010226.1": "S1",
        "CP010230.1": "S21",
        "CP018206.1": "MRSN346647",
        "CU928163.2": "UMN026",
        "NC_004431.1": "CFT073",
        "NC_007779.1": "W3110",
        "NC_010498.1": "SMS-3-5",
        "NC_011742.1": "S88",
        "NC_011993.1": "LF82",
        "NC_017646.1": "CE10",
        "NC_022648.1": "JJ1886",
        "NZ_CP008697.1": "ST648",
        "NZ_CP009859.1": "ECONIH1",
        "NZ_CP011134.1": "VR50",
        "NZ_CP013483.1": "Y5",
        "NZ_CP015228.1": "09-00049",
        "NZ_CP016007.1": "NGF1",
        "NZ_CP018109.1": "MRSN346595",
        "NZ_HG941718.1": "EC958",
        "NZ_LM995446.1": "EcRV308Chr",
        "NZ_LT632320.1": "NCTC_13441"
    }
    return f"{ref_to_canonical[ref]} ({ref})"

df["ref"] = df["ref"].apply(get_canonical_ref_names)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,precision,ref
0,Pandora illumina no denovo,063_STEC,0.994285,PRG
1,Pandora illumina no denovo,CFT073,0.993584,PRG
2,Pandora illumina no denovo,Escherichia_coli_MINF_1A,0.994114,PRG
3,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.993481,PRG
4,Pandora illumina no denovo,Escherichia_coli_MINF_7C,0.994334,PRG
...,...,...,...,...
1035,Samtools,Escherichia_coli_MSB1_8G,0.983412,H6 (CP010170.1)
1036,Samtools,Escherichia_coli_MSB1_9D,0.907875,H6 (CP010170.1)
1037,Samtools,Escherichia_coli_MSB2_1A,0.989890,H6 (CP010170.1)
1038,Samtools,H131800734,0.983826,H6 (CP010170.1)


In [7]:
# add tool_and_ref column
def get_tool_and_ref(df):
    return f"{df['tool']} / {df['ref']}"
    

df["tool_and_ref"] = df.apply(get_tool_and_ref, axis=1)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,precision,ref,tool_and_ref
0,Pandora illumina no denovo,063_STEC,0.994285,PRG,Pandora illumina no denovo / PRG
1,Pandora illumina no denovo,CFT073,0.993584,PRG,Pandora illumina no denovo / PRG
2,Pandora illumina no denovo,Escherichia_coli_MINF_1A,0.994114,PRG,Pandora illumina no denovo / PRG
3,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.993481,PRG,Pandora illumina no denovo / PRG
4,Pandora illumina no denovo,Escherichia_coli_MINF_7C,0.994334,PRG,Pandora illumina no denovo / PRG
...,...,...,...,...,...
1035,Samtools,Escherichia_coli_MSB1_8G,0.983412,H6 (CP010170.1),Samtools / H6 (CP010170.1)
1036,Samtools,Escherichia_coli_MSB1_9D,0.907875,H6 (CP010170.1),Samtools / H6 (CP010170.1)
1037,Samtools,Escherichia_coli_MSB2_1A,0.989890,H6 (CP010170.1),Samtools / H6 (CP010170.1)
1038,Samtools,H131800734,0.983826,H6 (CP010170.1),Samtools / H6 (CP010170.1)


In [8]:
# save csv
df.to_csv(snakemake.output.csv_data, index=False)