In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated pandora matrix from the working directory and display it.
pandora_matrix_path = "pandora_multisample.matrix"
pandora_gene_presence_df = pd.read_csv(pandora_matrix_path, delimiter="\t")
pandora_gene_presence_df

Unnamed: 0,gene_name,063_STEC,CFT073,H131800734,ST38
0,GC00006917,0,0,0,1
1,Cluster_6872,0,0,0,1
2,GC00000927_2,0,0,0,1
3,Cluster_808,0,0,0,1
4,GC00008595,0,0,0,1
...,...,...,...,...,...
11152,GC00010372,1,1,1,1
11153,GC00009412,1,0,0,0
11154,Cluster_5558,1,0,0,1
11155,GC00004186_r1_1,1,0,1,0


In [3]:
# Read the tab-separated bowtie2-based matrix from the working directory and display it.
bowtie2_matrix_path = "gene_presence_matrix_based_on_bowtie2"
bowtie2_gene_presence_df = pd.read_csv(bowtie2_matrix_path, delimiter="\t")
bowtie2_gene_presence_df

Unnamed: 0,gene_name,CFT073,H131800734,ST38,063_STEC
0,GC00006917,0,0,1,0
1,Cluster_588,0,0,1,0
2,GC00004143,0,0,1,0
3,GC00005267_1,0,0,1,0
4,GC00003169_2,0,0,1,0
...,...,...,...,...,...
10427,GC00001998,1,1,1,1
10428,GC00002254,1,1,1,1
10429,Cluster_4479,1,1,1,1
10430,GC00005134,0,0,0,1


In [4]:
# Outer-join both matrices on the gene name so that genes present in only one
# of them are kept; the sample columns of each input get a distinguishing suffix.
merged_dfs = pd.merge(
    pandora_gene_presence_df,
    bowtie2_gene_presence_df,
    how="outer",
    on="gene_name",
    suffixes=("_pandora", "_bowtie"),
)
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie
0,GC00006917,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,Cluster_6872,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,GC00000927_2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Cluster_808,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,GC00008595,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,,,,,0.0,0.0,1.0,0.0
11574,GC00004416,,,,,0.0,0.0,0.0,1.0
11575,GC00003424_1,,,,,0.0,0.0,1.0,0.0
11576,Cluster_2012,,,,,0.0,0.0,0.0,1.0


In [5]:
# The outer merge leaves NaN where a gene is missing from one of the inputs
# (which also turned the columns into floats); replace those gaps with 0.
merged_dfs = merged_dfs.fillna(0)

# Cast every column except the gene name back to integers in one call.
# `np.int` was only an alias of the builtin `int` and no longer exists in
# NumPy >= 1.24, so the builtin is used directly.
merged_dfs = merged_dfs.astype(
    {column: int for column in merged_dfs.columns if column != "gene_name"}
)
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie
0,GC00006917,0,0,0,1,0,0,1,0
1,Cluster_6872,0,0,0,1,0,0,1,0
2,GC00000927_2,0,0,0,1,0,0,1,0
3,Cluster_808,0,0,0,1,0,0,1,0
4,GC00008595,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,0,0,0,0,0,0,1,0
11574,GC00004416,0,0,0,0,0,0,0,1
11575,GC00003424_1,0,0,0,0,0,0,1,0
11576,Cluster_2012,0,0,0,0,0,0,0,1


In [6]:
def get_classification(row, sample):
    """Label pandora's call for one sample relative to the bowtie2 call.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a DataFrame row) that holds
        the ``"{sample}_bowtie"`` and ``"{sample}_pandora"`` values.
    sample : str
        Sample name used to build the two column names.

    Returns
    -------
    str
        ``"TP"`` when both calls are 1, ``"FP"`` when only pandora is 1,
        ``"FN"`` when only bowtie is 1 and ``"TN"`` when both are 0.

    Raises
    ------
    ValueError
        If either call is not exactly 0 or 1 (NaN included). Without this the
        function would silently return ``None`` for such rows.
    """
    bowtie_call = float(row[f"{sample}_bowtie"])
    pandora_call = float(row[f"{sample}_pandora"])

    # Keyed by (bowtie call, pandora call).
    labels = {
        (1.0, 1.0): "TP",
        (0.0, 1.0): "FP",
        (1.0, 0.0): "FN",
        (0.0, 0.0): "TN",
    }
    try:
        return labels[(bowtie_call, pandora_call)]
    except KeyError:
        raise ValueError(
            f"Expected 0/1 calls for sample {sample!r}, got "
            f"bowtie={bowtie_call!r}, pandora={pandora_call!r}"
        ) from None
    

# Add one "<sample>_classification" column per sample by running
# get_classification over every row of the merged frame.
samples = ["063_STEC", "CFT073", "H131800734", "ST38"]
for sample in samples:
    classification_column = f"{sample}_classification"
    merged_dfs[classification_column] = merged_dfs.apply(
        get_classification, axis=1, args=(sample,)
    )
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification
0,GC00006917,0,0,0,1,0,0,1,0,TN,TN,TN,TP
1,Cluster_6872,0,0,0,1,0,0,1,0,TN,TN,TN,TP
2,GC00000927_2,0,0,0,1,0,0,1,0,TN,TN,TN,TP
3,Cluster_808,0,0,0,1,0,0,1,0,TN,TN,TN,TP
4,GC00008595,0,0,0,1,0,0,1,0,TN,TN,TN,TP
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,0,0,0,0,0,0,1,0,TN,TN,TN,FN
11574,GC00004416,0,0,0,0,0,0,0,1,FN,TN,TN,TN
11575,GC00003424_1,0,0,0,0,0,0,1,0,TN,TN,TN,FN
11576,Cluster_2012,0,0,0,0,0,0,0,1,FN,TN,TN,TN


In [7]:
# Keep only the columns whose name contains "classification".
classification_cols = list(
    filter(lambda column_name: "classification" in column_name, merged_dfs.columns)
)
classification_df = merged_dfs.loc[:, classification_cols]
classification_df

Unnamed: 0,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification
0,TN,TN,TN,TP
1,TN,TN,TN,TP
2,TN,TN,TN,TP
3,TN,TN,TN,TP
4,TN,TN,TN,TP
...,...,...,...,...
11573,TN,TN,TN,FN
11574,FN,TN,TN,TN
11575,TN,TN,TN,FN
11576,FN,TN,TN,TN


In [10]:
def print_classification(classification_frame=None):
    """Print absolute and relative counts of the classification labels.

    The labels of all columns are first pooled and counted together, then
    each column is counted on its own. Every section ends with a separator.

    Parameters
    ----------
    classification_frame : pandas.DataFrame, optional
        Frame whose columns hold the labels to count. When omitted, the
        notebook-level ``classification_df`` is used.
    """
    if classification_frame is None:
        classification_frame = classification_df

    separator = "\n===============================================\n"

    # Pool all columns with a single concat: Series.append was removed in
    # pandas 2.0, and growing a Series inside a loop copies it every time.
    per_column = [classification_frame[col] for col in classification_frame.columns]
    if per_column:
        all_classifications = pd.concat(per_column)
    else:
        # pd.concat rejects an empty list; fall back to an empty Series.
        all_classifications = pd.Series(dtype=object)
    all_classifications.name = "all_classifications"
    print(all_classifications.value_counts(normalize=False))
    print(all_classifications.value_counts(normalize=True))
    print(separator)

    for col in classification_frame.columns:
        print(classification_frame[col].value_counts(normalize=False))
        print(classification_frame[col].value_counts(normalize=True))
        print(separator)

# Print the label counts for the notebook-level classification_df.
print_classification()

TP    25672
TN    15787
FP     3205
FN     1648
Name: all_classifications, dtype: int64
TP    0.554327
TN    0.340884
FP    0.069205
FN    0.035585
Name: all_classifications, dtype: float64


TP    5934
TN    4429
FP     655
FN     560
Name: 063_STEC_classification, dtype: int64
TP    0.512524
TN    0.382536
FP    0.056573
FN    0.048368
Name: 063_STEC_classification, dtype: float64


TP    6744
TN    3716
FP     889
FN     229
Name: CFT073_classification, dtype: int64
TP    0.582484
TN    0.320954
FP    0.076784
FN    0.019779
Name: CFT073_classification, dtype: float64


TP    6044
TN    4439
FP     923
FN     172
Name: H131800734_classification, dtype: int64
TP    0.522025
TN    0.383400
FP    0.079720
FN    0.014856
Name: H131800734_classification, dtype: float64


TP    6950
TN    3203
FP     738
FN     687
Name: ST38_classification, dtype: int64
TP    0.600276
TN    0.276645
FP    0.063742
FN    0.059337
Name: ST38_classification, dtype: float64


