In [1]:
import pandas as pd
import plotly.express as px

In [2]:
tools_to_keep = ["samtools", "pandora"]

In [3]:
df = pd.read_csv("precision_per_sample.tsv", sep="\t")
df = df[["tool", "sample", "precision"]]
df

Unnamed: 0,tool,sample,precision
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,0.994285
1,pandora_illumina_nodenovo_global_genotyping,CFT073,0.993584
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1A,0.994114
3,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.993481
4,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_7C,0.994334
...,...,...,...
1035,samtools_CP010170.1,Escherichia_coli_MSB1_8G,0.983412
1036,samtools_CP010170.1,Escherichia_coli_MSB1_9D,0.907875
1037,samtools_CP010170.1,Escherichia_coli_MSB2_1A,0.989890
1038,samtools_CP010170.1,H131800734,0.983826


In [4]:
def tool_is_inside_tools_to_keep(tool):
    for tool_to_keep in tools_to_keep:
        if tool.startswith(tool_to_keep):
            return True
    return False

# filtering for tools to keep
df = df[[tool_is_inside_tools_to_keep(tool) for tool in df.tool]]
df

Unnamed: 0,tool,sample,precision
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,0.994285
1,pandora_illumina_nodenovo_global_genotyping,CFT073,0.993584
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1A,0.994114
3,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.993481
4,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_7C,0.994334
...,...,...,...
1035,samtools_CP010170.1,Escherichia_coli_MSB1_8G,0.983412
1036,samtools_CP010170.1,Escherichia_coli_MSB1_9D,0.907875
1037,samtools_CP010170.1,Escherichia_coli_MSB2_1A,0.989890
1038,samtools_CP010170.1,H131800734,0.983826


In [5]:
# making things look nicer

def get_tool_and_ref(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo / PRG"
        else:
            return "Pandora illumina with denovo / PRG"
    elif "snippy" in tool:
        return "Snippy / " + tool[tool.index("_")+1:]
    elif "samtools" in tool:
        return "Samtools / " + tool[tool.index("_")+1:]
    else:
        assert False, "We should not be here"
    
def get_ref(tool):
    if tool.startswith("pandora"):
        return "PRG"
    else:
        return tool[tool.index("_")+1:]

def get_tool(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo"
        else:
            return "Pandora illumina with denovo"
    elif "snippy" in tool:
        return "Snippy"
    elif "samtools" in tool:
        return "Samtools"
    else:
        assert False, "We should not be here"

df["tool_and_ref"] = df["tool"].apply(get_tool_and_ref)
df["ref"] = df["tool"].apply(get_ref)
df["tool"] = df["tool"].apply(get_tool)
df = df.sort_values(by="tool_and_ref")
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,precision,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,0.994285,Pandora illumina no denovo / PRG,PRG
19,Pandora illumina no denovo,ST38,0.994986,Pandora illumina no denovo / PRG,PRG
18,Pandora illumina no denovo,H131800734,0.994247,Pandora illumina no denovo / PRG,PRG
17,Pandora illumina no denovo,Escherichia_coli_MSB2_1A,0.994054,Pandora illumina no denovo / PRG,PRG
16,Pandora illumina no denovo,Escherichia_coli_MSB1_9D,0.993678,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
624,Samtools,Escherichia_coli_MINF_7C,0.988864,Samtools / NZ_LT632320.1,NZ_LT632320.1
623,Samtools,Escherichia_coli_MINF_1D,0.979990,Samtools / NZ_LT632320.1,NZ_LT632320.1
622,Samtools,Escherichia_coli_MINF_1A,0.991615,Samtools / NZ_LT632320.1,NZ_LT632320.1
620,Samtools,063_STEC,0.987444,Samtools / NZ_LT632320.1,NZ_LT632320.1


In [6]:
# fix some ref names, because snippy script replace non-ACGT chars to U, even in ref name

def fix_ref_name(ref):
    if ref=="CN928163.2":
        return "CU928163.2"
    if ref=="NZ_LN995446.1":
        return "NZ_LM995446.1"
    if ref=="NZ_NG941718.1":
        return "NZ_HG941718.1"
    return ref


def fix_tool_and_ref_name(tool_and_ref):
    if tool_and_ref.endswith("CN928163.2"):
        return tool_and_ref.replace("CN928163.2", "CU928163.2")
    if tool_and_ref.endswith("NZ_LN995446.1"):
        return tool_and_ref.replace("NZ_LN995446.1", "NZ_LM995446.1")
    if tool_and_ref.endswith("NZ_NG941718.1"):
        return tool_and_ref.replace("NZ_NG941718.1", "NZ_HG941718.1")
    return tool_and_ref

df["ref"] = df["ref"].apply(fix_ref_name)
df["tool_and_ref"] = df["tool_and_ref"].apply(fix_tool_and_ref_name)
df

Unnamed: 0,tool,sample,precision,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,0.994285,Pandora illumina no denovo / PRG,PRG
19,Pandora illumina no denovo,ST38,0.994986,Pandora illumina no denovo / PRG,PRG
18,Pandora illumina no denovo,H131800734,0.994247,Pandora illumina no denovo / PRG,PRG
17,Pandora illumina no denovo,Escherichia_coli_MSB2_1A,0.994054,Pandora illumina no denovo / PRG,PRG
16,Pandora illumina no denovo,Escherichia_coli_MSB1_9D,0.993678,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...
624,Samtools,Escherichia_coli_MINF_7C,0.988864,Samtools / NZ_LT632320.1,NZ_LT632320.1
623,Samtools,Escherichia_coli_MINF_1D,0.979990,Samtools / NZ_LT632320.1,NZ_LT632320.1
622,Samtools,Escherichia_coli_MINF_1A,0.991615,Samtools / NZ_LT632320.1,NZ_LT632320.1
620,Samtools,063_STEC,0.987444,Samtools / NZ_LT632320.1,NZ_LT632320.1


In [7]:
df.to_csv(f"precision_per_ref_per_clade.{'_'.join(tools_to_keep)}.csv", index=False)

In [8]:
!Rscript clade_plots.R

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mcombine()[39m masks [34mgridExtra[39m::combine()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
null device 
          1 
