In [1]:
import pandas as pd
import plotly.express as px
import imageio

In [2]:
# Load the per-sample recall table and keep only the tool, sample,
# sample-count and recall columns.
df = pd.read_csv("recall_per_sample_per_number_of_samples.csv")[
    ["tool", "sample", "nb_of_samples", "recalls_wrt_truth_probes"]
]
df

Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,2,0.775038
1,pandora_illumina_nodenovo_global_genotyping,063_STEC,3,0.821852
2,pandora_illumina_nodenovo_global_genotyping,063_STEC,4,0.831538
3,pandora_illumina_nodenovo_global_genotyping,063_STEC,5,0.804246
4,pandora_illumina_nodenovo_global_genotyping,063_STEC,6,0.914239
...,...,...,...,...
19755,samtools_CP010170.1,ST38,16,0.957952
19756,samtools_CP010170.1,ST38,17,0.977306
19757,samtools_CP010170.1,ST38,18,0.989121
19758,samtools_CP010170.1,ST38,19,0.994864


In [3]:
# Remove every row whose tool identifier mentions samtools.
# The boolean-mask selection is followed by .copy() so that `df` is an
# independent frame: later cells can add or overwrite columns on it without
# pandas emitting SettingWithCopyWarning.
df = df[["samtools" not in tool for tool in df["tool"]]].copy()
df

Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes
0,pandora_illumina_nodenovo_global_genotyping,063_STEC,2,0.775038
1,pandora_illumina_nodenovo_global_genotyping,063_STEC,3,0.821852
2,pandora_illumina_nodenovo_global_genotyping,063_STEC,4,0.831538
3,pandora_illumina_nodenovo_global_genotyping,063_STEC,5,0.804246
4,pandora_illumina_nodenovo_global_genotyping,063_STEC,6,0.914239
...,...,...,...,...
19280,snippy_NZ_NG941718.1,ST38,16,0.955742
19281,snippy_NZ_NG941718.1,ST38,17,0.963785
19282,snippy_NZ_NG941718.1,ST38,18,0.960307
19283,snippy_NZ_NG941718.1,ST38,19,0.965412


In [4]:
# making things look nicer

def get_tool_and_ref(tool):
    """Build the "<tool> / <reference>" display label for a raw tool identifier.

    Identifiers containing "pandora" get one of two fixed Pandora labels with
    reference "PRG", chosen by whether "nodenovo" occurs in the identifier.
    Every other identifier is labelled as Snippy, and the text after its first
    underscore is used as the reference name.

    Raises:
        ValueError: if a non-pandora identifier contains no underscore.
    """
    if "pandora" not in tool:
        # Everything after the first "_" is treated as the reference name.
        first_underscore = tool.index("_")
        return "Snippy / " + tool[first_underscore + 1:]
    if "nodenovo" in tool:
        return "Pandora illumina no denovo / PRG"
    return "Pandora illumina with denovo / PRG"
    
def get_ref(tool):
    """Return the reference name encoded in a raw tool identifier.

    Identifiers that start with "pandora" map to the fixed reference "PRG".
    For any other identifier the reference is the text after the first
    underscore.

    Raises:
        ValueError: if a non-pandora identifier contains no underscore.
    """
    if not tool.startswith("pandora"):
        first_underscore = tool.index("_")
        return tool[first_underscore + 1:]
    return "PRG"

def get_tool(tool):
    """Return the human-readable tool name for a raw tool identifier.

    Identifiers containing "pandora" become one of the two Pandora labels,
    chosen by whether "nodenovo" occurs in the identifier; every other
    identifier is reported as "Snippy".
    """
    if "pandora" not in tool:
        return "Snippy"
    return (
        "Pandora illumina no denovo"
        if "nodenovo" in tool
        else "Pandora illumina with denovo"
    )

# Derive the display columns from the raw `tool` identifier, then order the
# rows by label, sample and sample count.
#
# All three derived columns are built in one `assign` call, which returns a
# new DataFrame instead of writing into a frame that may be a slice of
# another one; this avoids pandas' SettingWithCopyWarning. The keyword
# arguments are all evaluated from the raw `tool` values before `assign`
# overwrites that column, and the resulting column order is unchanged:
# `tool` stays in place, `tool_and_ref` and `ref` are appended.
#
# NOTE: this cell is not idempotent. Once `tool` holds the display names,
# re-running it fails because "Snippy" contains no underscore.
df = (
    df.assign(
        tool_and_ref=df["tool"].apply(get_tool_and_ref),
        ref=df["tool"].apply(get_ref),
        tool=df["tool"].apply(get_tool),
    )
    .sort_values(by=["tool_and_ref", "sample", "nb_of_samples"])
)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,2,0.775038,Pandora illumina no denovo / PRG,PRG
1,Pandora illumina no denovo,063_STEC,3,0.821852,Pandora illumina no denovo / PRG,PRG
2,Pandora illumina no denovo,063_STEC,4,0.831538,Pandora illumina no denovo / PRG,PRG
3,Pandora illumina no denovo,063_STEC,5,0.804246,Pandora illumina no denovo / PRG,PRG
4,Pandora illumina no denovo,063_STEC,6,0.914239,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...,...
19280,Snippy,ST38,16,0.955742,Snippy / NZ_NG941718.1,NZ_NG941718.1
19281,Snippy,ST38,17,0.963785,Snippy / NZ_NG941718.1,NZ_NG941718.1
19282,Snippy,ST38,18,0.960307,Snippy / NZ_NG941718.1,NZ_NG941718.1
19283,Snippy,ST38,19,0.965412,Snippy / NZ_NG941718.1,NZ_NG941718.1


In [5]:
# fix some ref names: the snippy script replaced some non-ACGT chars with N, even in the ref name (e.g. CU928163.2 became CN928163.2)

def fix_ref_name(ref):
    """Map a mangled reference accession back to its correct spelling.

    Three known mangled accessions are translated; any other value is
    returned unchanged.
    """
    # (mangled accession, correct accession)
    known_fixes = (
        ("CN928163.2", "CU928163.2"),
        ("NZ_LN995446.1", "NZ_LM995446.1"),
        ("NZ_NG941718.1", "NZ_HG941718.1"),
    )
    for mangled, correct in known_fixes:
        if ref == mangled:
            return correct
    return ref


def fix_tool_and_ref_name(tool_and_ref):
    """Correct a mangled reference accession inside a "<tool> / <ref>" label.

    The first known mangled accession that the label ends with is replaced by
    its correct spelling (every occurrence in the label is substituted).
    Labels that end with none of them are returned unchanged.
    """
    # (mangled accession, correct accession), checked in this order
    known_fixes = (
        ("CN928163.2", "CU928163.2"),
        ("NZ_LN995446.1", "NZ_LM995446.1"),
        ("NZ_NG941718.1", "NZ_HG941718.1"),
    )
    for mangled, correct in known_fixes:
        if tool_and_ref.endswith(mangled):
            return tool_and_ref.replace(mangled, correct)
    return tool_and_ref

# Rewrite the mangled accessions in both columns that carry a reference name.
for column, fixer in (("ref", fix_ref_name), ("tool_and_ref", fix_tool_and_ref_name)):
    df[column] = df[column].apply(fixer)
df

Unnamed: 0,tool,sample,nb_of_samples,recalls_wrt_truth_probes,tool_and_ref,ref
0,Pandora illumina no denovo,063_STEC,2,0.775038,Pandora illumina no denovo / PRG,PRG
1,Pandora illumina no denovo,063_STEC,3,0.821852,Pandora illumina no denovo / PRG,PRG
2,Pandora illumina no denovo,063_STEC,4,0.831538,Pandora illumina no denovo / PRG,PRG
3,Pandora illumina no denovo,063_STEC,5,0.804246,Pandora illumina no denovo / PRG,PRG
4,Pandora illumina no denovo,063_STEC,6,0.914239,Pandora illumina no denovo / PRG,PRG
...,...,...,...,...,...,...
19280,Snippy,ST38,16,0.955742,Snippy / NZ_HG941718.1,NZ_HG941718.1
19281,Snippy,ST38,17,0.963785,Snippy / NZ_HG941718.1,NZ_HG941718.1
19282,Snippy,ST38,18,0.960307,Snippy / NZ_HG941718.1,NZ_HG941718.1
19283,Snippy,ST38,19,0.965412,Snippy / NZ_HG941718.1,NZ_HG941718.1


In [6]:
# Write one CSV per distinct sample count, containing only the rows for that
# count. The row index is not written.
for nb_of_samples in df["nb_of_samples"].unique():
    out_csv = f"recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_{nb_of_samples}.csv"
    restricted_df = df[df["nb_of_samples"] == nb_of_samples]
    restricted_df.to_csv(out_csv, index=False)

In [7]:
# Run the external R script clade_plots.R once per distinct sample count.
for nb_of_samples in df["nb_of_samples"].unique():
    # Three space-separated arguments: the per-count CSV file name, the string
    # "Number_of_samples==<n>", and a PNG file name.
    # NOTE(review): presumably the script reads the CSV, uses the second
    # argument as a label, and writes the PNG - confirm in clade_plots.R.
    args = f"recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_{nb_of_samples}.csv Number_of_samples=={nb_of_samples} recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_{nb_of_samples}.png"
    # Echo the command so the cell output shows which invocation each log belongs to.
    print(f"Rscript clade_plots.R {args}")
    # IPython shell escape: {args} is interpolated from the Python variable above.
    !Rscript clade_plots.R {args}

Rscript clade_plots.R recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_2.csv Number_of_samples==2 recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_2.png
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mcombine()[39m masks [34mgridExtra[39m::combine()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
null device 
          1 
Rscript clade_plots.R recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_3.csv Number_of_samples==3 recal

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mcombine()[39m masks [34mgridExtra[39m::combine()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
null device 
          1 
Rscript clade_plots.R recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_12.csv Number_of_samples==12 recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_12.png
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34md

null device 
          1 


In [8]:
# Assemble the per-sample-count PNGs into a single animated GIF.
#
# Series.unique() returns values in order of appearance, so on its own the
# frame order would depend on the current row order of `df`. Sorting makes
# the animation always play in ascending sample count.
images = [
    imageio.imread(f"recall_per_ref_per_nb_of_samples_per_clade.nb_of_samples_{nb_of_samples}.png")
    for nb_of_samples in sorted(df["nb_of_samples"].unique())
]
# `duration` is forwarded to imageio's GIF writer.
# NOTE(review): its unit depends on the installed imageio version - confirm.
imageio.mimsave('recall_per_ref_per_nb_of_samples_per_clade.gif', images, duration=2)