In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# input
df = pd.read_csv("recall_per_sample.tsv", sep="\t")
df = df[["tool", "sample", "recalls_wrt_truth_probes"]]
df

Unnamed: 0,tool,sample,recalls_wrt_truth_probes
0,snippy_NC_004431.1,Escherichia_coli_MSB1_6C,0.821731
1,snippy_NC_010498.1,Escherichia_coli_MINF_7C,0.889135
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MSB1_7C,0.887915
3,snippy_NZ_CP008697.1,Escherichia_coli_MSB2_1A,0.885606
4,samtools_NZ_CP013483.1,Escherichia_coli_MSB1_4I,0.804207
...,...,...,...
1035,snippy_CP018206.1,Escherichia_coli_MSB1_3B,0.927632
1036,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.895750
1037,samtools_NZ_CP008697.1,Escherichia_coli_MSB1_1A,0.912384
1038,snippy_CP010116.1,CFT073,0.813402


In [3]:
# removing samtools
df = df[["samtools" not in tool for tool in df.tool]]
df

Unnamed: 0,tool,sample,recalls_wrt_truth_probes
0,snippy_NC_004431.1,Escherichia_coli_MSB1_6C,0.821731
1,snippy_NC_010498.1,Escherichia_coli_MINF_7C,0.889135
2,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MSB1_7C,0.887915
3,snippy_NZ_CP008697.1,Escherichia_coli_MSB2_1A,0.885606
11,snippy_CP010121.1,CFT073,0.844107
...,...,...,...
1031,snippy_NZ_CP008697.1,Escherichia_coli_MSB1_3B,0.894887
1035,snippy_CP018206.1,Escherichia_coli_MSB1_3B,0.927632
1036,pandora_illumina_nodenovo_global_genotyping,Escherichia_coli_MINF_1D,0.895750
1038,snippy_CP010116.1,CFT073,0.813402


In [4]:
# create ref and tool columns


# get ref out of tool column
def get_ref(tool):
    if tool.startswith("pandora"):
        return "PRG"
    else:
        return tool[tool.index("_")+1:]

    
def get_tool(tool):
    if "pandora" in tool:
        if "nodenovo" in tool:
            return "Pandora illumina no denovo"
        else:
            return "Pandora illumina with denovo"
    elif "snippy" in tool:
        return "Snippy"
    elif "samtools" in tool:
        return "Samtools"
    else:
        assert False, "We should not be here"


# fix some ref names, because snippy script replace non-ACGT chars to U, even in ref name
def fix_ref_name(ref):
    if ref=="CN928163.2":
        return "CU928163.2"
    if ref=="NZ_LN995446.1":
        return "NZ_LM995446.1"
    if ref=="NZ_NG941718.1":
        return "NZ_HG941718.1"
    return ref

df["ref"] = df["tool"].apply(get_ref)
df["ref"] = df["ref"].apply(fix_ref_name)
df["tool"] = df["tool"].apply(get_tool)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,recalls_wrt_truth_probes,ref
0,Snippy,Escherichia_coli_MSB1_6C,0.821731,NC_004431.1
1,Snippy,Escherichia_coli_MINF_7C,0.889135,NC_010498.1
2,Pandora illumina no denovo,Escherichia_coli_MSB1_7C,0.887915,PRG
3,Snippy,Escherichia_coli_MSB2_1A,0.885606,NZ_CP008697.1
11,Snippy,CFT073,0.844107,CP010121.1
...,...,...,...,...
1031,Snippy,Escherichia_coli_MSB1_3B,0.894887,NZ_CP008697.1
1035,Snippy,Escherichia_coli_MSB1_3B,0.927632,CP018206.1
1036,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.895750,PRG
1038,Snippy,CFT073,0.813402,CP010116.1


In [5]:
# add canonical names to refs

def get_canonical_ref_names(ref):
    if ref=="PRG":
        return "PRG"
    
    ref_to_canonical = {
        "CP010116.1": "C1",
        "CP010121.1": "C4",
        "CP010170.1": "H6",
        "CP010171.1": "H7",
        "CP010226.1": "S1",
        "CP010230.1": "S21",
        "CP018206.1": "MRSN346647",
        "CU928163.2": "UMN026",
        "NC_004431.1": "CFT073",
        "NC_007779.1": "W3110",
        "NC_010498.1": "SMS-3-5",
        "NC_011742.1": "S88",
        "NC_011993.1": "LF82",
        "NC_017646.1": "CE10",
        "NC_022648.1": "JJ1886",
        "NZ_CP008697.1": "ST648",
        "NZ_CP009859.1": "ECONIH1",
        "NZ_CP011134.1": "VR50",
        "NZ_CP013483.1": "Y5",
        "NZ_CP015228.1": "09-00049",
        "NZ_CP016007.1": "NGF1",
        "NZ_CP018109.1": "MRSN346595",
        "NZ_HG941718.1": "EC958",
        "NZ_LM995446.1": "EcRV308Chr",
        "NZ_LT632320.1": "NCTC_13441"
    }
    return f"{ref_to_canonical[ref]} ({ref})"

df["ref"] = df["ref"].apply(get_canonical_ref_names)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,recalls_wrt_truth_probes,ref
0,Snippy,Escherichia_coli_MSB1_6C,0.821731,CFT073 (NC_004431.1)
1,Snippy,Escherichia_coli_MINF_7C,0.889135,SMS-3-5 (NC_010498.1)
2,Pandora illumina no denovo,Escherichia_coli_MSB1_7C,0.887915,PRG
3,Snippy,Escherichia_coli_MSB2_1A,0.885606,ST648 (NZ_CP008697.1)
11,Snippy,CFT073,0.844107,C4 (CP010121.1)
...,...,...,...,...
1031,Snippy,Escherichia_coli_MSB1_3B,0.894887,ST648 (NZ_CP008697.1)
1035,Snippy,Escherichia_coli_MSB1_3B,0.927632,MRSN346647 (CP018206.1)
1036,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.895750,PRG
1038,Snippy,CFT073,0.813402,C1 (CP010116.1)


In [6]:
# add tool_and_ref column
def get_tool_and_ref(df):
    return f"{df['tool']} / {df['ref']}"
    

df["tool_and_ref"] = df.apply(get_tool_and_ref, axis=1)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,recalls_wrt_truth_probes,ref,tool_and_ref
0,Snippy,Escherichia_coli_MSB1_6C,0.821731,CFT073 (NC_004431.1),Snippy / CFT073 (NC_004431.1)
1,Snippy,Escherichia_coli_MINF_7C,0.889135,SMS-3-5 (NC_010498.1),Snippy / SMS-3-5 (NC_010498.1)
2,Pandora illumina no denovo,Escherichia_coli_MSB1_7C,0.887915,PRG,Pandora illumina no denovo / PRG
3,Snippy,Escherichia_coli_MSB2_1A,0.885606,ST648 (NZ_CP008697.1),Snippy / ST648 (NZ_CP008697.1)
11,Snippy,CFT073,0.844107,C4 (CP010121.1),Snippy / C4 (CP010121.1)
...,...,...,...,...,...
1031,Snippy,Escherichia_coli_MSB1_3B,0.894887,ST648 (NZ_CP008697.1),Snippy / ST648 (NZ_CP008697.1)
1035,Snippy,Escherichia_coli_MSB1_3B,0.927632,MRSN346647 (CP018206.1),Snippy / MRSN346647 (CP018206.1)
1036,Pandora illumina no denovo,Escherichia_coli_MINF_1D,0.895750,PRG,Pandora illumina no denovo / PRG
1038,Snippy,CFT073,0.813402,C1 (CP010116.1),Snippy / C1 (CP010116.1)


In [7]:
# save csv
df.to_csv("recall_per_ref_per_clade.csv", index=False)

In [8]:
# generate plot
!Rscript clade_plots.R recall_per_ref_per_clade.csv 0.75 Recall_per_ref_per_clade recall_per_ref_per_clade.png

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mcombine()[39m masks [34mgridExtra[39m::combine()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
null device 
          1 
