In [1]:
import pandas as pd
import numpy as np

# Data reading

In [2]:
df_illumina = pd.read_csv("../df_illumina.R_data.csv")
df_illumina

Unnamed: 0,tool_long_description,tool,NUMBER_OF_SAMPLES,nb_of_found_PanVar,recall_PVR,recall_AvgAR
0,pandora_illumina_withdenovo,pandora with denovo,2,30647.0,0.483765,0.670439
1,snippy_NC_011993.1,snippy,2,19445.0,0.306941,0.349300
2,snippy_NC_022648.1,snippy,2,21229.0,0.335101,0.413537
3,snippy_CP010121.1,snippy,2,18914.0,0.298559,0.355685
4,snippy_NZ_LM995446.1,snippy,2,15115.0,0.238591,0.278512
...,...,...,...,...,...,...
926,samtools_NZ_CP016007.1,samtools,20,10137.0,0.998227,0.997937
927,samtools_NZ_CP013483.1,samtools,20,10123.0,0.996849,0.996470
928,samtools_CP010116.1,samtools,20,9865.0,0.971443,0.971251
929,samtools_CP010226.1,samtools,20,10013.0,0.986017,0.985810


In [3]:
df_nanopore = pd.read_csv("../df_nanopore.R_data.csv")
df_nanopore

Unnamed: 0,tool_long_description,tool,NUMBER_OF_SAMPLES,nb_of_found_PanVar,recall_PVR,recall_AvgAR
0,pandora_nanopore_withdenovo,pandora with denovo,2,31054.0,0.490190,0.671789
1,medaka_NC_011993.1,medaka,2,22573.0,0.356316,0.394351
2,medaka_NC_022648.1,medaka,2,25630.0,0.404571,0.470119
3,medaka_CP010121.1,medaka,2,23017.0,0.363325,0.410538
4,medaka_NZ_LM995446.1,medaka,2,17542.0,0.276902,0.314518
...,...,...,...,...,...,...
926,nanopolish_NZ_CP016007.1,nanopolish,20,10122.0,0.996750,0.995815
927,nanopolish_NZ_CP013483.1,nanopolish,20,10093.0,0.993895,0.995047
928,nanopolish_CP010116.1,nanopolish,20,9847.0,0.969670,0.970044
929,nanopolish_CP010226.1,nanopolish,20,9982.0,0.982964,0.984003


# Restricting to core genes

In [4]:
df_illumina_core_genes = df_illumina[df_illumina.NUMBER_OF_SAMPLES >= 16]
df_illumina_core_genes

Unnamed: 0,tool_long_description,tool,NUMBER_OF_SAMPLES,nb_of_found_PanVar,recall_PVR,recall_AvgAR
686,pandora_illumina_withdenovo,pandora with denovo,16,30983.0,0.897928,0.924857
687,snippy_NC_011993.1,snippy,16,33634.0,0.974757,0.975819
688,snippy_NC_022648.1,snippy,16,33435.0,0.968990,0.970195
689,snippy_CP010121.1,snippy,16,33783.0,0.979075,0.979979
690,snippy_NZ_LM995446.1,snippy,16,33028.0,0.957195,0.959243
...,...,...,...,...,...,...
926,samtools_NZ_CP016007.1,samtools,20,10137.0,0.998227,0.997937
927,samtools_NZ_CP013483.1,samtools,20,10123.0,0.996849,0.996470
928,samtools_CP010116.1,samtools,20,9865.0,0.971443,0.971251
929,samtools_CP010226.1,samtools,20,10013.0,0.986017,0.985810


In [5]:
df_nanopore_core_genes = df_nanopore[df_nanopore.NUMBER_OF_SAMPLES >= 16]
df_nanopore_core_genes

Unnamed: 0,tool_long_description,tool,NUMBER_OF_SAMPLES,nb_of_found_PanVar,recall_PVR,recall_AvgAR
686,pandora_nanopore_withdenovo,pandora with denovo,16,30752.0,0.891233,0.916472
687,medaka_NC_011993.1,medaka,16,34026.0,0.986118,0.981447
688,medaka_NC_022648.1,medaka,16,33856.0,0.981191,0.976094
689,medaka_CP010121.1,medaka,16,34123.0,0.988929,0.984482
690,medaka_NZ_LM995446.1,medaka,16,33373.0,0.967193,0.963241
...,...,...,...,...,...,...
926,nanopolish_NZ_CP016007.1,nanopolish,20,10122.0,0.996750,0.995815
927,nanopolish_NZ_CP013483.1,nanopolish,20,10093.0,0.993895,0.995047
928,nanopolish_CP010116.1,nanopolish,20,9847.0,0.969670,0.970044
929,nanopolish_CP010226.1,nanopolish,20,9982.0,0.982964,0.984003


# Summing the number of found panvars

In [6]:
df_illumina_core_genes_panvars_summed = \
    df_illumina_core_genes.groupby(by=["tool_long_description", "tool"])\
    .agg({"nb_of_found_PanVar": np.sum})\
    .reset_index()
df_illumina_core_genes_panvars_summed

Unnamed: 0,tool_long_description,tool,nb_of_found_PanVar
0,pandora_illumina_withdenovo,pandora with denovo,212545.0
1,samtools_CP010116.1,samtools,223347.0
2,samtools_CP010121.1,samtools,229188.0
3,samtools_CP010170.1,samtools,229184.0
4,samtools_CP010171.1,samtools,227638.0
5,samtools_CP010226.1,samtools,224917.0
6,samtools_CP010230.1,samtools,228538.0
7,samtools_CP018206.1,samtools,229818.0
8,samtools_CU928163.2,samtools,229653.0
9,samtools_NC_007779.1,samtools,228800.0


In [7]:
df_nanopore_core_genes_panvars_summed = \
    df_nanopore_core_genes.groupby(by=["tool_long_description", "tool"])\
    .agg({"nb_of_found_PanVar": np.sum})\
    .reset_index()
df_nanopore_core_genes_panvars_summed

Unnamed: 0,tool_long_description,tool,nb_of_found_PanVar
0,medaka_CP010116.1,medaka,223971.0
1,medaka_CP010121.1,medaka,229946.0
2,medaka_CP010170.1,medaka,229966.0
3,medaka_CP010171.1,medaka,228337.0
4,medaka_CP010226.1,medaka,225622.0
5,medaka_CP010230.1,medaka,229440.0
6,medaka_CP018206.1,medaka,230440.0
7,medaka_CU928163.2,medaka,230367.0
8,medaka_NC_007779.1,medaka,229484.0
9,medaka_NC_010498.1,medaka,230443.0


# Select the best values for each tool

In [8]:
df_illumina_best_values = \
    df_illumina_core_genes_panvars_summed.sort_values(by=["nb_of_found_PanVar"], ascending=False).\
    groupby(by=["tool"]).first()\
    .reset_index()
df_illumina_best_values

Unnamed: 0,tool,tool_long_description,nb_of_found_PanVar
0,pandora with denovo,pandora_illumina_withdenovo,212545.0
1,samtools,samtools_CP018206.1,229818.0
2,snippy,snippy_CP018206.1,229259.0


In [9]:
df_nanopore_best_values = \
    df_nanopore_core_genes_panvars_summed.sort_values(by=["nb_of_found_PanVar"], ascending=False).\
    groupby(by=["tool"]).first()\
    .reset_index()
df_nanopore_best_values

Unnamed: 0,tool,tool_long_description,nb_of_found_PanVar
0,medaka,medaka_NC_010498.1,230443.0
1,nanopolish,nanopolish_CP018206.1,229338.0
2,pandora with denovo,pandora_nanopore_withdenovo,211663.0


# Get how many panvars we have in total in core genes, to be able to compute the recall

In [10]:
pangenome_variations_per_nb_of_samples = pd.read_csv("../../pandora1_paper_analysis_output_20_way/technology_independent_analysis/pangenome_variants_vs_samples/pangenome_variations_per_nb_of_samples.csv")
pangenome_variations_per_nb_of_samples

Unnamed: 0,PVID,NUMBER_OF_SAMPLES
0,0,2
1,1,4
2,2,2
3,3,20
4,4,19
...,...,...
618300,681407,2
618301,681408,2
618302,681409,2
618303,681410,2


In [11]:
pangenome_variations_per_nb_of_samples_in_core_genes = pangenome_variations_per_nb_of_samples[pangenome_variations_per_nb_of_samples.NUMBER_OF_SAMPLES <= 5]
pangenome_variations_per_nb_of_samples_in_core_genes

Unnamed: 0,PVID,NUMBER_OF_SAMPLES
0,0,2
1,1,4
2,2,2
10,10,2
12,12,2
...,...,...
618300,681407,2
618301,681408,2
618302,681409,2
618303,681410,2


In [12]:
number_of_panvars_in_core_genes = len(pangenome_variations_per_nb_of_samples_in_core_genes)
display(number_of_panvars_in_core_genes)

160197

# Compute recall

In [13]:
df_illumina_best_values["recall"] = df_illumina_best_values["nb_of_found_PanVar"] / number_of_panvars_in_core_genes
df_illumina_best_values

Unnamed: 0,tool,tool_long_description,nb_of_found_PanVar,recall
0,pandora with denovo,pandora_illumina_withdenovo,212545.0,1.326773
1,samtools,samtools_CP018206.1,229818.0,1.434596
2,snippy,snippy_CP018206.1,229259.0,1.431107


In [14]:
df_nanopore_best_values["recall"] = df_nanopore_best_values["nb_of_found_PanVar"] / number_of_panvars_in_core_genes
df_nanopore_best_values

Unnamed: 0,tool,tool_long_description,nb_of_found_PanVar,recall
0,medaka,medaka_NC_010498.1,230443.0,1.438498
1,nanopolish,nanopolish_CP018206.1,229338.0,1.4316
2,pandora with denovo,pandora_nanopore_withdenovo,211663.0,1.321267


# Create report

In [15]:
def extract_value(df, queried_tool, column):
    return df[df.tool==queried_tool][column].to_list()[0]

from collections import defaultdict
tech_to_tool_to_measure = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))

for measure in ["nb_of_found_PanVar", "recall"]:
    for tool in ["pandora with denovo", "samtools", "snippy"]:
        tech_to_tool_to_measure["illumina"][tool][measure] = extract_value(df_illumina_best_values, tool, measure)
    for tool in ["pandora with denovo", "medaka", "nanopolish"]:
        tech_to_tool_to_measure["nanopore"][tool][measure] = extract_value(df_nanopore_best_values, tool, measure)

tech_to_tool_to_measure

defaultdict(<function __main__.<lambda>()>,
            {'illumina': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'pandora with denovo': defaultdict(float,
                                      {'nb_of_found_PanVar': 212545.0,
                                       'recall': 1.3267726611609456}),
                          'samtools': defaultdict(float,
                                      {'nb_of_found_PanVar': 229818.0,
                                       'recall': 1.4345961534860203}),
                          'snippy': defaultdict(float,
                                      {'nb_of_found_PanVar': 229259.0,
                                       'recall': 1.431106699875778})}),
             'nanopore': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'pandora with denovo': defaultdict(float,
                                      {'nb_of_found_PanVar': 211663.0,
                                    

In [16]:
print("For illumina:")
for tool in ["samtools", "snippy"]:
    print(f"Pandora identified {int(tech_to_tool_to_measure['illumina'][tool]['nb_of_found_PanVar'] - tech_to_tool_to_measure['illumina']['pandora with denovo']['nb_of_found_PanVar'])} less pan vars than {tool}")
    print(f"Pandora had a recall {round((tech_to_tool_to_measure['illumina'][tool]['recall'] - tech_to_tool_to_measure['illumina']['pandora with denovo']['recall'])*100, 1)}% lower than {tool}")

print("For nanopore:")
for tool in ["medaka", "nanopolish"]:
    print(f"Pandora identified {int(tech_to_tool_to_measure['nanopore'][tool]['nb_of_found_PanVar'] - tech_to_tool_to_measure['nanopore']['pandora with denovo']['nb_of_found_PanVar'])} less pan vars than {tool}")
    print(f"Pandora had a recall {round((tech_to_tool_to_measure['nanopore'][tool]['recall'] - tech_to_tool_to_measure['nanopore']['pandora with denovo']['recall'])*100, 1)}% lower than {tool}")

For illumina:
Pandora identified 17273 less pan vars than samtools
Pandora had a recall 10.8% lower than samtools
Pandora identified 16714 less pan vars than snippy
Pandora had a recall 10.4% lower than snippy
For nanopore:
Pandora identified 18780 less pan vars than medaka
Pandora had a recall 11.7% lower than medaka
Pandora identified 17674 less pan vars than nanopolish
Pandora had a recall 11.0% lower than nanopolish
