In [16]:
import pandas as pd
from pdbio.vcfdataframe import VcfDataFrame
import plotly.express as px

# CONFIGS

In [18]:
# configs
pandora_ROC_directory = "analysis_output_20_way_pandora_and_snippy_illumina_100x"
gene_distance_directory = "gene_distance_20_way_pandora_and_snippy_illumina_100x"
samples=["063_STEC", "Escherichia_coli_MSB1_1A", "Escherichia_coli_MSB1_8B", "CFT073",
         "Escherichia_coli_MSB1_3B", "Escherichia_coli_MSB1_8G", "Escherichia_coli_MINF_1A",
         "Escherichia_coli_MSB1_4E", "Escherichia_coli_MSB1_9D", "Escherichia_coli_MINF_1D",
         "Escherichia_coli_MSB1_4I", "Escherichia_coli_MSB2_1A", "Escherichia_coli_MINF_7C", 
         "Escherichia_coli_MSB1_6C", "H131800734", "Escherichia_coli_MINF_8D", "Escherichia_coli_MSB1_7A", 
         "ST38", "Escherichia_coli_MINF_9A", "Escherichia_coli_MSB1_7C"]
coverage="100x"
subsampling="random"
technology="illumina"
tool="pandora_illumina_withdenovo_global_genotyping"
coverage_filter="0"
strand_bias_filter="0.0"
gaps_filter="1.0"

# INPUT

In [9]:
all_variant_calls_probeset_reports = [pd.read_csv(f"{pandora_ROC_directory}/precision/reports_from_probe_mappings/{sample}/{coverage}/{tool}/coverage_filter_{coverage_filter}/strand_bias_filter_{strand_bias_filter}/gaps_filter_{gaps_filter}/variant_calls_probeset_report.tsv", sep="\t") for sample in samples]
all_variant_calls_probeset_reports

[          sample                                 query_probe_header  \
 0       063_STEC  >CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...   
 1       063_STEC  >CHROM=Cluster_5590;SAMPLE=063_STEC;POS=20;REF...   
 2       063_STEC  >CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...   
 3       063_STEC  >CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...   
 4       063_STEC  >CHROM=Cluster_5590;SAMPLE=063_STEC;POS=71;REF...   
 ...          ...                                                ...   
 177302  063_STEC  >CHROM=GC00003121;SAMPLE=063_STEC;POS=1140;REF...   
 177303  063_STEC  >CHROM=Cluster_3436;SAMPLE=063_STEC;POS=1;REF_...   
 177304  063_STEC  >CHROM=Cluster_3436;SAMPLE=063_STEC;POS=34;REF...   
 177305  063_STEC  >CHROM=Cluster_3436;SAMPLE=063_STEC;POS=71;REF...   
 177306  063_STEC  >CHROM=Cluster_3436;SAMPLE=063_STEC;POS=92;REF...   
 
        ref_probe_header  classification  
 0             >CHROM=0;             1.0  
 1             >CHROM=0;             1.0  
 2   

In [10]:
all_variant_calls_probeset_reports = pd.concat(all_variant_calls_probeset_reports, ignore_index=True)
all_variant_calls_probeset_reports

Unnamed: 0,sample,query_probe_header,ref_probe_header,classification
0,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...,>CHROM=0;,1.000000
1,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=20;REF...,>CHROM=0;,1.000000
2,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.000000
3,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.000000
4,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=71;REF...,>CHROM=0;,1.000000
...,...,...,...,...
3922137,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,1.000000
3922138,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,1.000000
3922139,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,0.833333
3922140,Escherichia_coli_MSB1_7C,>CHROM=Cluster_5219;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,0.000000


In [11]:
classification_df_with_gene_length = pd.read_csv(f"{gene_distance_directory}/classification_df_with_gene_length.csv")
classification_df_with_gene_length

Unnamed: 0,gene_name,063_STEC_classification,CFT073_classification,Escherichia_coli_MINF_1A_classification,Escherichia_coli_MINF_1D_classification,Escherichia_coli_MINF_7C_classification,Escherichia_coli_MINF_8D_classification,Escherichia_coli_MINF_9A_classification,Escherichia_coli_MSB1_1A_classification,Escherichia_coli_MSB1_3B_classification,...,Escherichia_coli_MSB1_7A_classification,Escherichia_coli_MSB1_7C_classification,Escherichia_coli_MSB1_8B_classification,Escherichia_coli_MSB1_8G_classification,Escherichia_coli_MSB1_9D_classification,Escherichia_coli_MSB2_1A_classification,H131800734_classification,ST38_classification,gene_length,gene_length_category
0,GC00005414,TN,TN,TN,TN,TN,TN,TN,TN,TN,...,TN,TN,TN,TN,TN,TN,TN,TP,186,200
1,Cluster_4119,TN,TN,TN,TN,TN,TN,TN,TN,TN,...,TN,TN,TN,TN,TN,TN,TN,TP,243,300
2,GC00000244_37,TN,TN,TN,TN,TN,TN,TN,TN,TN,...,TN,TN,TN,TN,TN,TN,TN,FP,858,900
3,Cluster_11810,FN,FN,FN,FN,FN,FN,FN,FN,FN,...,FN,FN,FN,FN,FN,FN,FN,TP,63,100
4,GC00010971,TN,TN,TN,TN,TN,TN,TN,TN,TN,...,TN,TN,TN,TN,TN,TN,TN,TP,525,600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15136,GC00006719,TN,TN,TN,TN,TN,TN,TN,TN,TP,...,TN,TN,TN,TN,TN,TN,TN,TP,213,300
15137,GC00000831_6,TN,TN,TN,TN,TN,TN,TN,TN,TP,...,TN,TN,TN,TN,TN,TN,TN,TN,1206,1300
15138,Cluster_10085,FN,TN,FN,FN,FN,FN,FN,FN,TP,...,TP,FN,FN,FN,TN,FN,FN,TP,75,100
15139,Cluster_5219,TN,TN,TN,TN,TN,TN,TN,TN,FP,...,FP,FP,TN,TN,TN,TN,TN,FP,203,300


In [13]:
vcfdf = VcfDataFrame(path=f"{gene_distance_directory}/pandora_multisample_genotyped_global.vcf")
vcfdf

<pdbio.vcfdataframe.VcfDataFrame at 0x7fe68e99bb38>

# INPUT FIXING: vcfdf

In [19]:
# fix #CHROM column name
vcfdf.df.rename(columns={"#CHROM": "CHROM"}, inplace=True)
vcfdf.df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,063_STEC.100x.random.illumina,...,Escherichia_coli_MSB1_6C.100x.random.illumina,Escherichia_coli_MSB1_7A.100x.random.illumina,Escherichia_coli_MSB1_7C.100x.random.illumina,Escherichia_coli_MSB1_8B.100x.random.illumina,Escherichia_coli_MSB1_8G.100x.random.illumina,Escherichia_coli_MSB1_9D.100x.random.illumina,Escherichia_coli_MSB2_1A.100x.random.illumina,H131800734.100x.random.illumina,ST38.100x.random.illumina,REF_LENGTH
0,GC00000007_16,1,.,ATGAAGATTCGAAGAATAGTTTCAACAATAGCTATAGCATTAAGTG...,A,.,.,SVTYPE=INDEL;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:5,0:15,0:7,0:21,0:107,0:319,0:0.3,1:-18.9839...",130
1,GC00005414,1,.,ATGGCACGAACAATGCTC,ATGTCTCAAATCATGCTT,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:2,0:1,0:2,0:1,0:2,0:1,1:-66.879,-89.8155...",18
2,GC00005414,27,.,GGGCAAC,"AGGCAAC,GGGATCG,GGGGAAC,TGGCAAC",.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...","0:1,0,0,0,0:5,1,0,1,0:1,0,0,0,0:5,1,0,1,0:3,1,...",7
3,GC00005414,45,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:7,0:2,0:8,0:5,0:22,0:0.333333,1:-32.1706...",1
4,GC00005414,45,.,TGGTCAGTTTGCTGTCGATTACATCATT,CGGCCCCCACTCGCTGGATTACATCATC,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:2,0:9,0:3,0:14,0:14,0:66,0:0.285714,1:-26.34...",28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361179,GC00004221,20698,.,G,A,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:26,0:25,0:38,0:37,0:78,0:77,0:0.333333,1:-32...","0:20,0:27,0:29,0:39,0:62,0:82,0:0.333333,1:-36...","0:28,0:23,0:43,0:35,0:86,0:71,0:0.333333,1:-35...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:8,0:19,0:12,0:43,0:25,0:0.333333,1:-19....",1
361180,GC00004221,20706,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:24,0:23,0:34,0:32,0:72,0:69,0:0.333333,1:-34...","0:20,0:24,0:27,0:32,0:60,0:72,0:0.333333,1:-37...","0:27,0:23,0:39,0:34,0:82,0:69,0:0.333333,1:-36...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:7,0:19,0:9,0:43,0:21,0:0.333333,1:-19.6...",1
361181,GC00004221,20716,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:35,0:32,0:35,0:32,0:71,0:65,0:0,1:-3.5778,-4...","0:28,0:34,0:28,0:34,0:56,0:68,0:0,1:-4.75023,-...","0:42,0:37,0:42,0:37,0:84,0:74,0:0,1:-3.11099,-...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:21,0:11,0:21,0:11,0:43,0:23,0:0,1:-3.1552,-2...",1
361182,GC00004221,20730,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:35,9:31,8:35,0:31,0:71,37:63,33:0,0.75:-81.9...","0:27,7:33,9:27,0:33,0:55,29:67,36:0,0.75:-78.9...","0:41,11:35,9:41,0:35,0:82,45:70,39:0,0.75:-95....",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:23,6:13,3:23,0:13,0:46,24:27,13:0,0.75:-44.2...",1


In [20]:
# add REF_LENGTH to the columns
vcfdf.df["REF_LENGTH"] = vcfdf.df["REF"].apply(lambda ref: len(ref))
vcfdf.df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,063_STEC.100x.random.illumina,...,Escherichia_coli_MSB1_6C.100x.random.illumina,Escherichia_coli_MSB1_7A.100x.random.illumina,Escherichia_coli_MSB1_7C.100x.random.illumina,Escherichia_coli_MSB1_8B.100x.random.illumina,Escherichia_coli_MSB1_8G.100x.random.illumina,Escherichia_coli_MSB1_9D.100x.random.illumina,Escherichia_coli_MSB2_1A.100x.random.illumina,H131800734.100x.random.illumina,ST38.100x.random.illumina,REF_LENGTH
0,GC00000007_16,1,.,ATGAAGATTCGAAGAATAGTTTCAACAATAGCTATAGCATTAAGTG...,A,.,.,SVTYPE=INDEL;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:5,0:15,0:7,0:21,0:107,0:319,0:0.3,1:-18.9839...",130
1,GC00005414,1,.,ATGGCACGAACAATGCTC,ATGTCTCAAATCATGCTT,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:2,0:1,0:2,0:1,0:2,0:1,1:-66.879,-89.8155...",18
2,GC00005414,27,.,GGGCAAC,"AGGCAAC,GGGATCG,GGGGAAC,TGGCAAC",.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...","0:1,0,0,0,0:5,1,0,1,0:1,0,0,0,0:5,1,0,1,0:3,1,...",7
3,GC00005414,45,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:7,0:2,0:8,0:5,0:22,0:0.333333,1:-32.1706...",1
4,GC00005414,45,.,TGGTCAGTTTGCTGTCGATTACATCATT,CGGCCCCCACTCGCTGGATTACATCATC,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:2,0:9,0:3,0:14,0:14,0:66,0:0.285714,1:-26.34...",28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361179,GC00004221,20698,.,G,A,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:26,0:25,0:38,0:37,0:78,0:77,0:0.333333,1:-32...","0:20,0:27,0:29,0:39,0:62,0:82,0:0.333333,1:-36...","0:28,0:23,0:43,0:35,0:86,0:71,0:0.333333,1:-35...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:8,0:19,0:12,0:43,0:25,0:0.333333,1:-19....",1
361180,GC00004221,20706,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:24,0:23,0:34,0:32,0:72,0:69,0:0.333333,1:-34...","0:20,0:24,0:27,0:32,0:60,0:72,0:0.333333,1:-37...","0:27,0:23,0:39,0:34,0:82,0:69,0:0.333333,1:-36...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:7,0:19,0:9,0:43,0:21,0:0.333333,1:-19.6...",1
361181,GC00004221,20716,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:35,0:32,0:35,0:32,0:71,0:65,0:0,1:-3.5778,-4...","0:28,0:34,0:28,0:34,0:56,0:68,0:0,1:-4.75023,-...","0:42,0:37,0:42,0:37,0:84,0:74,0:0,1:-3.11099,-...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:21,0:11,0:21,0:11,0:43,0:23,0:0,1:-3.1552,-2...",1
361182,GC00004221,20730,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",...,"0:35,9:31,8:35,0:31,0:71,37:63,33:0,0.75:-81.9...","0:27,7:33,9:27,0:33,0:55,29:67,36:0,0.75:-78.9...","0:41,11:35,9:41,0:35,0:82,45:70,39:0,0.75:-95....",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:23,6:13,3:23,0:13,0:46,24:27,13:0,0.75:-44.2...",1


In [21]:
# set the index to CHROM, POS, REF_LENGTH
vcfdf_indexed_by_chrom_pos_reflength = vcfdf.df
vcfdf_indexed_by_chrom_pos_reflength.set_index(["CHROM", "POS", "REF_LENGTH"], inplace=True)
vcfdf_indexed_by_chrom_pos_reflength

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,063_STEC.100x.random.illumina,CFT073.100x.random.illumina,Escherichia_coli_MINF_1A.100x.random.illumina,...,Escherichia_coli_MSB1_4I.100x.random.illumina,Escherichia_coli_MSB1_6C.100x.random.illumina,Escherichia_coli_MSB1_7A.100x.random.illumina,Escherichia_coli_MSB1_7C.100x.random.illumina,Escherichia_coli_MSB1_8B.100x.random.illumina,Escherichia_coli_MSB1_8G.100x.random.illumina,Escherichia_coli_MSB1_9D.100x.random.illumina,Escherichia_coli_MSB2_1A.100x.random.illumina,H131800734.100x.random.illumina,ST38.100x.random.illumina
CHROM,POS,REF_LENGTH,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
GC00000007_16,1,130,.,ATGAAGATTCGAAGAATAGTTTCAACAATAGCTATAGCATTAAGTG...,A,.,.,SVTYPE=INDEL;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:5,0:15,0:7,0:21,0:107,0:319,0:0.3,1:-18.9839..."
GC00005414,1,18,.,ATGGCACGAACAATGCTC,ATGTCTCAAATCATGCTT,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:2,0:1,0:2,0:1,0:2,0:1,1:-66.879,-89.8155..."
GC00005414,27,7,.,GGGCAAC,"AGGCAAC,GGGATCG,GGGGAAC,TGGCAAC",.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...","0:1,0,0,0,0:5,1,0,1,0:1,0,0,0,0:5,1,0,1,0:3,1,..."
GC00005414,45,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:1,0:7,0:2,0:8,0:5,0:22,0:0.333333,1:-32.1706..."
GC00005414,45,28,.,TGGTCAGTTTGCTGTCGATTACATCATT,CGGCCCCCACTCGCTGGATTACATCATC,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:2,0:9,0:3,0:14,0:14,0:66,0:0.285714,1:-26.34..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC00004221,20698,1,.,G,A,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0","0:26,0:25,0:38,0:37,0:78,0:77,0:0.333333,1:-32...","0:20,0:27,0:29,0:39,0:62,0:82,0:0.333333,1:-36...","0:28,0:23,0:43,0:35,0:86,0:71,0:0.333333,1:-35...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:8,0:19,0:12,0:43,0:25,0:0.333333,1:-19...."
GC00004221,20706,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-144,-144:0","0:24,0:23,0:34,0:32,0:72,0:69,0:0.333333,1:-34...","0:20,0:24,0:27,0:32,0:60,0:72,0:0.333333,1:-37...","0:27,0:23,0:39,0:34,0:82,0:69,0:0.333333,1:-36...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:14,0:7,0:19,0:9,0:43,0:21,0:0.333333,1:-19.6..."
GC00004221,20716,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,"0:21,0:22,0:21,0:22,0:42,0:44,0:0,1:-9.63644,-...","0:35,0:32,0:35,0:32,0:71,0:65,0:0,1:-3.5778,-4...","0:28,0:34,0:28,0:34,0:56,0:68,0:0,1:-4.75023,-...","0:42,0:37,0:42,0:37,0:84,0:74,0:0,1:-3.11099,-...",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:21,0:11,0:21,0:11,0:43,0:23,0:0,1:-3.1552,-2..."
GC00004221,20730,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,"1:10,15:11,16:10,20:11,22:21,62:22,66:0.5,0.25...","0:35,9:31,8:35,0:31,0:71,37:63,33:0,0.75:-81.9...","0:27,7:33,9:27,0:33,0:55,29:67,36:0,0.75:-78.9...","0:41,11:35,9:41,0:35,0:82,45:70,39:0,0.75:-95....",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-156,-156:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-160,-160:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-152,-152:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-118,-118:0","0:23,6:13,3:23,0:13,0:46,24:27,13:0,0.75:-44.2..."


In [22]:
#  checks if CHROM, POS, REF_LENGTH really identify uniquely every VCF record
nb_of_duplicated_vcf_records = sum(vcfdf_indexed_by_chrom_pos_reflength.index.duplicated())
no_duplicated_records = nb_of_duplicated_vcf_records==0
assert no_duplicated_records
no_duplicated_records

True

In [23]:
# add gt_conf columns
def add_gt_conf_columns(vcfdf_indexed_by_chrom_pos_reflength):
    for sample in samples:
        vcfdf_indexed_by_chrom_pos_reflength[f"{sample}_gt_conf"] = \
            vcfdf_indexed_by_chrom_pos_reflength[f"{sample}.{coverage}.{subsampling}.{technology}"].apply(
                lambda info: float(info.split(":")[-1]))
        
add_gt_conf_columns(vcfdf_indexed_by_chrom_pos_reflength)
vcfdf_indexed_by_chrom_pos_reflength

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,063_STEC.100x.random.illumina,CFT073.100x.random.illumina,Escherichia_coli_MINF_1A.100x.random.illumina,...,Escherichia_coli_MSB1_4I_gt_conf,Escherichia_coli_MSB2_1A_gt_conf,Escherichia_coli_MINF_7C_gt_conf,Escherichia_coli_MSB1_6C_gt_conf,H131800734_gt_conf,Escherichia_coli_MINF_8D_gt_conf,Escherichia_coli_MSB1_7A_gt_conf,ST38_gt_conf,Escherichia_coli_MINF_9A_gt_conf,Escherichia_coli_MSB1_7C_gt_conf
CHROM,POS,REF_LENGTH,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
GC00000007_16,1,130,.,ATGAAGATTCGAAGAATAGTTTCAACAATAGCTATAGCATTAAGTG...,A,.,.,SVTYPE=INDEL;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,149.1200,0.0,0.000
GC00005414,1,18,.,ATGGCACGAACAATGCTC,ATGTCTCAAATCATGCTT,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,22.9365,0.0,0.000
GC00005414,27,7,.,GGGCAAC,"AGGCAAC,GGGATCG,GGGGAAC,TGGCAAC",.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",".:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,0,0,0:0,0,...",...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,53.6345,0.0,0.000
GC00005414,45,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,80.6708,0.0,0.000
GC00005414,45,28,.,TGGTCAGTTTGCTGTCGATTACATCATT,CGGCCCCCACTCGCTGGATTACATCATC,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,100.3110,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC00004221,20698,1,.,G,A,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,397.635,353.988,0.0,0.0,336.406,158.2030,0.0,359.271
GC00004221,20706,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,0.0000,0.0,387.824,333.851,0.0,0.0,321.005,153.0510,0.0,354.215
GC00004221,20716,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,332.3860,0.0,550.596,456.969,0.0,0.0,436.770,220.2100,0.0,520.697
GC00004221,20730,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-138,-138:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-148,-148:0",".:0,0:0,0:0,0:0,0:0,0:0,0:1,1:-154,-154:0",...,74.1063,0.0,377.356,314.832,0.0,0.0,294.866,168.1370,0.0,349.398


In [24]:
# put the index as columns
vcfdf_indexed_by_chrom_pos_reflength.reset_index(inplace=True)
vcfdf_indexed_by_chrom_pos_reflength

Unnamed: 0,CHROM,POS,REF_LENGTH,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,...,Escherichia_coli_MSB1_4I_gt_conf,Escherichia_coli_MSB2_1A_gt_conf,Escherichia_coli_MINF_7C_gt_conf,Escherichia_coli_MSB1_6C_gt_conf,H131800734_gt_conf,Escherichia_coli_MINF_8D_gt_conf,Escherichia_coli_MSB1_7A_gt_conf,ST38_gt_conf,Escherichia_coli_MINF_9A_gt_conf,Escherichia_coli_MSB1_7C_gt_conf
0,GC00000007_16,1,130,.,ATGAAGATTCGAAGAATAGTTTCAACAATAGCTATAGCATTAAGTG...,A,.,.,SVTYPE=INDEL;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,149.1200,0.0,0.000
1,GC00005414,1,18,.,ATGGCACGAACAATGCTC,ATGTCTCAAATCATGCTT,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,22.9365,0.0,0.000
2,GC00005414,27,7,.,GGGCAAC,"AGGCAAC,GGGATCG,GGGGAAC,TGGCAAC",.,.,SVTYPE=PH_SNPs;GRAPHTYPE=SIMPLE,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,53.6345,0.0,0.000
3,GC00005414,45,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,80.6708,0.0,0.000
4,GC00005414,45,28,.,TGGTCAGTTTGCTGTCGATTACATCATT,CGGCCCCCACTCGCTGGATTACATCATC,.,.,SVTYPE=PH_SNPs;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,0.000,0.000,0.0,0.0,0.000,100.3110,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361179,GC00004221,20698,1,.,G,A,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,397.635,353.988,0.0,0.0,336.406,158.2030,0.0,359.271
361180,GC00004221,20706,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,0.0000,0.0,387.824,333.851,0.0,0.0,321.005,153.0510,0.0,354.215
361181,GC00004221,20716,1,.,C,T,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,332.3860,0.0,550.596,456.969,0.0,0.0,436.770,220.2100,0.0,520.697
361182,GC00004221,20730,1,.,T,C,.,.,SVTYPE=SNP;GRAPHTYPE=NESTED,GT:MEAN_FWD_COVG:MEAN_REV_COVG:MED_FWD_COVG:ME...,...,74.1063,0.0,377.356,314.832,0.0,0.0,294.866,168.1370,0.0,349.398


# INPUT FIXING: all_variant_calls_probeset_reports

In [25]:
def extract_value_from_query_probe_header(query_probe_header, field, return_type):
    string_with_field = query_probe_header[query_probe_header.index(f"{field}=") + (len(field)+1):]
    value = string_with_field[:string_with_field.index(";")]
    return return_type(value)

all_variant_calls_probeset_reports["gene_name"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field=">CHROM", return_type=str)
all_variant_calls_probeset_reports["SVTYPE"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field="SVTYPE", return_type=str)
all_variant_calls_probeset_reports["POS"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field="POS", return_type=int)
all_variant_calls_probeset_reports["LOG_NORMALISED_GTCONF"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field="GT_CONF", return_type=float)
all_variant_calls_probeset_reports["REF_LENGTH"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field="REF_LENGTH", return_type=int)
all_variant_calls_probeset_reports["COVERAGE"] = all_variant_calls_probeset_reports["query_probe_header"].apply(
extract_value_from_query_probe_header, field="COVERAGE", return_type=float)
all_variant_calls_probeset_reports

Unnamed: 0,sample,query_probe_header,ref_probe_header,classification,gene_name,SVTYPE,POS,LOG_NORMALISED_GTCONF,REF_LENGTH,COVERAGE
0,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...,>CHROM=0;,1.000000,Cluster_5590,PH_SNPs,1,58.099998,4,16.0
1,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=20;REF...,>CHROM=0;,1.000000,Cluster_5590,SNP,20,59.700001,1,20.0
2,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.000000,Cluster_5590,PH_SNPs,40,50.799999,4,21.0
3,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.000000,Cluster_5590,COMPLEX,40,56.799999,20,16.0
4,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=71;REF...,>CHROM=0;,1.000000,Cluster_5590,PH_SNPs,71,50.200001,8,14.0
...,...,...,...,...,...,...,...,...,...,...
3922137,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,1.000000,Cluster_11410,PH_SNPs,43,57.700001,4,49.0
3922138,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,1.000000,Cluster_11410,SNP,59,67.400002,1,48.0
3922139,Escherichia_coli_MSB1_7C,>CHROM=Cluster_11410;SAMPLE=Escherichia_coli_M...,>CHROM=0;,0.833333,Cluster_11410,PH_SNPs,59,57.400002,12,48.0
3922140,Escherichia_coli_MSB1_7C,>CHROM=Cluster_5219;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,0.000000,Cluster_5219,SNP,159,67.400002,1,50.0


# PROCESSING

In [27]:
# merging variant_calls_probeset_report and classification_df_with_gene_length
all_variant_calls_probeset_reports_with_gene_classification = \
all_variant_calls_probeset_reports.merge(classification_df_with_gene_length, on="gene_name", how="inner")
all_collumns_were_merged_successfully = len(all_variant_calls_probeset_reports_with_gene_classification) == len(all_variant_calls_probeset_reports)
assert all_collumns_were_merged_successfully
all_variant_calls_probeset_reports_with_gene_classification

Unnamed: 0,sample,query_probe_header,ref_probe_header,classification,gene_name,SVTYPE,POS,LOG_NORMALISED_GTCONF,REF_LENGTH,COVERAGE,...,Escherichia_coli_MSB1_7A_classification,Escherichia_coli_MSB1_7C_classification,Escherichia_coli_MSB1_8B_classification,Escherichia_coli_MSB1_8G_classification,Escherichia_coli_MSB1_9D_classification,Escherichia_coli_MSB2_1A_classification,H131800734_classification,ST38_classification,gene_length,gene_length_category
0,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,58.099998,4,16.0,...,TP,TP,TP,TP,TP,TP,TP,TP,126,200
1,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=20;REF...,>CHROM=0;,1.0,Cluster_5590,SNP,20,59.700001,1,20.0,...,TP,TP,TP,TP,TP,TP,TP,TP,126,200
2,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,40,50.799999,4,21.0,...,TP,TP,TP,TP,TP,TP,TP,TP,126,200
3,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.0,Cluster_5590,COMPLEX,40,56.799999,20,16.0,...,TP,TP,TP,TP,TP,TP,TP,TP,126,200
4,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=71;REF...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,71,50.200001,8,14.0,...,TP,TP,TP,TP,TP,TP,TP,TP,126,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3922137,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,PH_SNPs,149,74.400002,4,106.0,...,TN,TP,TN,TN,TN,TN,TN,TN,363,400
3922138,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,232,73.300003,1,94.0,...,TN,TP,TN,TN,TN,TN,TN,TN,363,400
3922139,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,310,67.900002,1,51.0,...,TN,TP,TN,TN,TN,TN,TN,TN,363,400
3922140,Escherichia_coli_MSB1_7C,>CHROM=GC00004008_4;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,GC00004008_4,SNP,168,64.500000,1,34.0,...,TN,TP,TN,TN,TN,TN,TN,TN,426,500


In [28]:
# get the gene classification given the sample
def get_gene_classification(row):
    sample=row["sample"]
    return row[f"{sample}_classification"]

all_variant_calls_probeset_reports_with_gene_classification["gene_classification"] = \
all_variant_calls_probeset_reports_with_gene_classification.apply(get_gene_classification, axis=1)
all_variant_calls_probeset_reports_with_gene_classification

Unnamed: 0,sample,query_probe_header,ref_probe_header,classification,gene_name,SVTYPE,POS,LOG_NORMALISED_GTCONF,REF_LENGTH,COVERAGE,...,Escherichia_coli_MSB1_7C_classification,Escherichia_coli_MSB1_8B_classification,Escherichia_coli_MSB1_8G_classification,Escherichia_coli_MSB1_9D_classification,Escherichia_coli_MSB2_1A_classification,H131800734_classification,ST38_classification,gene_length,gene_length_category,gene_classification
0,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,58.099998,4,16.0,...,TP,TP,TP,TP,TP,TP,TP,126,200,TP
1,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=20;REF...,>CHROM=0;,1.0,Cluster_5590,SNP,20,59.700001,1,20.0,...,TP,TP,TP,TP,TP,TP,TP,126,200,TP
2,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,40,50.799999,4,21.0,...,TP,TP,TP,TP,TP,TP,TP,126,200,TP
3,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=40;REF...,>CHROM=0;,1.0,Cluster_5590,COMPLEX,40,56.799999,20,16.0,...,TP,TP,TP,TP,TP,TP,TP,126,200,TP
4,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=71;REF...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,71,50.200001,8,14.0,...,TP,TP,TP,TP,TP,TP,TP,126,200,TP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3922137,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,PH_SNPs,149,74.400002,4,106.0,...,TP,TN,TN,TN,TN,TN,TN,363,400,TP
3922138,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,232,73.300003,1,94.0,...,TP,TN,TN,TN,TN,TN,TN,363,400,TP
3922139,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,310,67.900002,1,51.0,...,TP,TN,TN,TN,TN,TN,TN,363,400,TP
3922140,Escherichia_coli_MSB1_7C,>CHROM=GC00004008_4;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,GC00004008_4,SNP,168,64.500000,1,34.0,...,TP,TN,TN,TN,TN,TN,TN,426,500,TP


In [29]:
# merge all_variant_calls_probeset_reports_with_gene_classification and vcfdf_indexed_by_chrom_pos_reflength to get several info
# e.g. true gt_conf and etc

all_merged_info = all_variant_calls_probeset_reports_with_gene_classification.merge(vcfdf_indexed_by_chrom_pos_reflength,
                                                                 left_on=["gene_name", "POS", "REF_LENGTH"],
                                                                 right_on=["CHROM", "POS", "REF_LENGTH"],
                                                                 how="inner")

all_collumns_were_merged_successfully = len(all_variant_calls_probeset_reports_with_gene_classification) == len(all_merged_info)
assert all_collumns_were_merged_successfully
all_merged_info

Unnamed: 0,sample,query_probe_header,ref_probe_header,classification,gene_name,SVTYPE,POS,LOG_NORMALISED_GTCONF,REF_LENGTH,COVERAGE,...,Escherichia_coli_MSB1_4I_gt_conf,Escherichia_coli_MSB2_1A_gt_conf,Escherichia_coli_MINF_7C_gt_conf,Escherichia_coli_MSB1_6C_gt_conf,H131800734_gt_conf,Escherichia_coli_MINF_8D_gt_conf,Escherichia_coli_MSB1_7A_gt_conf,ST38_gt_conf,Escherichia_coli_MINF_9A_gt_conf,Escherichia_coli_MSB1_7C_gt_conf
0,063_STEC,>CHROM=Cluster_5590;SAMPLE=063_STEC;POS=1;REF_...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,58.099998,4,16.0,...,0.00,0.0,0.0,269.4190,239.541,0.0,255.413,15.7924,318.505,390.973
1,Escherichia_coli_MSB1_8B,>CHROM=Cluster_5590;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,64.400002,4,34.0,...,0.00,0.0,0.0,269.4190,239.541,0.0,255.413,15.7924,318.505,390.973
2,Escherichia_coli_MSB1_3B,>CHROM=Cluster_5590;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,62.799999,4,28.0,...,0.00,0.0,0.0,269.4190,239.541,0.0,255.413,15.7924,318.505,390.973
3,Escherichia_coli_MINF_1A,>CHROM=Cluster_5590;SAMPLE=Escherichia_coli_MI...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,67.599998,4,50.0,...,0.00,0.0,0.0,269.4190,239.541,0.0,255.413,15.7924,318.505,390.973
4,Escherichia_coli_MSB1_4E,>CHROM=Cluster_5590;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_5590,PH_SNPs,1,63.799999,4,32.0,...,0.00,0.0,0.0,269.4190,239.541,0.0,255.413,15.7924,318.505,390.973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3922137,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,PH_SNPs,149,74.400002,4,106.0,...,0.00,0.0,0.0,0.0000,0.000,0.0,0.000,0.0000,0.000,641.067
3922138,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,232,73.300003,1,94.0,...,0.00,0.0,0.0,0.0000,0.000,0.0,0.000,0.0000,0.000,588.535
3922139,Escherichia_coli_MSB1_7C,>CHROM=Cluster_1698;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,Cluster_1698,SNP,310,67.900002,1,51.0,...,0.00,0.0,0.0,0.0000,0.000,0.0,0.000,0.0000,0.000,385.937
3922140,Escherichia_coli_MSB1_7C,>CHROM=GC00004008_4;SAMPLE=Escherichia_coli_MS...,>CHROM=0;,1.0,GC00004008_4,SNP,168,64.500000,1,34.0,...,0.00,0.0,0.0,0.0000,0.000,0.0,0.000,0.0000,0.000,296.984


In [30]:
# get the correct gt_conf classification given the sample
def get_non_normalised_gt_conf(row):
    sample=row["sample"]
    return row[f"{sample}_gt_conf"]

def get_sample_data(row):
    sample=row["sample"]
    return row[f"{sample}.{coverage}.{subsampling}.{technology}"]

all_merged_info["GT_CONF"] = all_merged_info.apply(get_non_normalised_gt_conf, axis=1)
all_merged_info["SAMPLE_DATA"] = all_merged_info.apply(get_sample_data, axis=1)
all_merged_info

MemoryError: Unable to allocate 1.58 GiB for an array with shape (54, 3922142) and data type object

In [None]:
# get gt from SAMPLE_DATA
def get_gt_from_sample_data(row):
    sample_data=row["SAMPLE_DATA"]
    gt=sample_data.split(":")[0]
    return gt

all_merged_info["GT"] = all_merged_info.apply(get_gt_from_sample_data, axis=1)
all_merged_info

In [None]:
all_merged_info.columns

In [None]:
# clean the huge df before plotting stuff
cleaned_df = all_merged_info[[
    # uniquely identifying a record
    "sample",
    "gene_name",
    "POS",
    "REF_LENGTH",
    
    # plot-relevant info
    "COVERAGE",
    "GT_CONF",
    "SVTYPE",
    "gene_length",
    "gene_length_category",
    "gene_classification",
    "classification",
]]
cleaned_df.rename(columns={"classification": "variant_precision"}, inplace=True)
cleaned_df

In [None]:
fig_gtconf_classification = px.scatter(cleaned_df, x="GT_CONF", y="variant_precision", color="gene_classification",
                           symbol="SVTYPE", opacity=0.5, render_mode="webgl",
                           title=f"Calls breakdown on {directory} - GT_CONF vs variant_precision")
fig_gtconf_classification.write_html(f"{directory}/calls_breakdown_{directory}_gtconf_classif.html")
fig_gtconf_classification

In [None]:
fig_gtconf_coverage = px.scatter(cleaned_df, x="GT_CONF", y="COVERAGE", color="variant_precision",
                                 symbol="gene_classification", size="gene_length",
                           opacity=0.5, render_mode="webgl",
                           title=f"Calls breakdown on {directory} - GT_CONF vs COVERAGE (on called allele)")
fig_gtconf_coverage.write_html(f"{directory}/calls_breakdown_{directory}_gtconf_coverage.html")

In [None]:
# adds a discretised_precision to the df
cleaned_df["discretised_precision"] = cleaned_df["variant_precision"].apply(lambda precision: ">=0.5" if precision>=0.5 else "<0.5")
cleaned_df

In [None]:
fig_gtconf_coverage_discretised_precision = px.scatter(
    cleaned_df, x="GT_CONF", y="COVERAGE", color="discretised_precision",
    symbol="gene_classification", size="gene_length",
    opacity=0.5, render_mode="webgl",
    title=f"Calls breakdown on {directory} - GT_CONF vs COVERAGE (on called allele) - discretised precision")
fig_gtconf_coverage_discretised_precision.write_html(f"{directory}/calls_breakdown_{directory}_gtconf_coverage_discretised_precision.html")

In [None]:
fig = px.scatter(
    cleaned_df, x="GT_CONF", y="COVERAGE",
    facet_row="sample", facet_col="discretised_precision",
    opacity=0.1, render_mode="webgl",
    title=f"Calls breakdown on {directory} - GT_CONF vs COVERAGE (on called allele) - discretised precision")
fig.write_html(f"{directory}/fig.html")

In [None]:
vcf_cleaned = all_merged_info[[
    "sample",
    "CHROM",
    "gene_classification",
    "POS",
    "REF",
    "ALT",
    "GT_CONF",
    "classification",
    "INFO",
    "FORMAT",
    "SAMPLE_DATA",
    "GT",
    "COVERAGE",
    "SVTYPE"
]]
vcf_cleaned.rename(columns={"classification": "variant_precision"}, inplace=True)
vcf_cleaned

In [None]:
FPs_sorted_by_gt_conf = vcf_cleaned[vcf_cleaned.variant_precision<=0.0].sort_values(by="GT_CONF", ascending=False)
FPs_sorted_by_gt_conf.to_csv(f"{directory}/FPs_sorted_by_gt_conf.tsv", sep="\t", index=False)
FPs_sorted_by_gt_conf