In [1]:
import pandas as pd
import numpy as np

In [2]:
# pandora's gene presence/absence calls: tab-separated, a gene_name column
# plus one 0/1 column per sample.
pandora_gene_presence_df = pd.read_csv("pandora_multisample.matrix", delimiter="\t")
pandora_gene_presence_df

Unnamed: 0,gene_name,063_STEC,CFT073,H131800734,ST38
0,GC00006917,0,0,0,1
1,Cluster_6872,0,0,0,1
2,GC00000927_2,0,0,0,1
3,Cluster_808,0,0,0,1
4,GC00008595,0,0,0,1
...,...,...,...,...,...
11152,GC00010372,1,1,1,1
11153,GC00009412,1,0,0,0
11154,Cluster_5558,1,0,0,1
11155,GC00004186_r1_1,1,0,1,0


In [3]:
# bowtie2-based gene presence/absence calls: tab-separated, a gene_name column
# plus one 0/1 column per sample.
bowtie2_gene_presence_df = pd.read_csv("gene_presence_matrix_based_on_bowtie2", delimiter="\t")
bowtie2_gene_presence_df

Unnamed: 0,gene_name,CFT073,H131800734,ST38,063_STEC
0,GC00006917,0,0,1,0
1,Cluster_588,0,0,1,0
2,GC00004143,0,0,1,0
3,GC00005267_1,0,0,1,0
4,GC00003169_2,0,0,1,0
...,...,...,...,...,...
10427,GC00001998,1,1,1,1
10428,GC00002254,1,1,1,1
10429,Cluster_4479,1,1,1,1
10430,GC00005134,0,0,0,1


In [14]:
# Per-gene lengths: tab-separated, with gene_name and gene_length columns.
gene_length_df = pd.read_csv("gene_length_matrix", delimiter="\t")
gene_length_df

Unnamed: 0,gene_name,gene_length
0,GC00005502_3,459
1,Cluster_588,418
2,GC00008078_2,252
3,GC00004685_15,303
4,Cluster_1684,364
...,...,...
10427,GC00001998,279
10428,Cluster_6011,138
10429,GC00002082,327
10430,GC00005883,177


In [4]:
# Outer join on gene_name keeps genes that appear in only one of the two
# matrices; their calls in the other tool's columns come out as NaN.
merged_dfs = pd.merge(
    pandora_gene_presence_df,
    bowtie2_gene_presence_df,
    how="outer",
    on="gene_name",
    suffixes=("_pandora", "_bowtie"),
)
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie
0,GC00006917,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,Cluster_6872,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,GC00000927_2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Cluster_808,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,GC00008595,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,,,,,0.0,0.0,1.0,0.0
11574,GC00004416,,,,,0.0,0.0,0.0,1.0
11575,GC00003424_1,,,,,0.0,0.0,1.0,0.0
11576,Cluster_2012,,,,,0.0,0.0,0.0,1.0


In [5]:
# Genes missing from one tool's matrix are NaN after the outer merge; treat
# them as "absent" (0) so every call column can be cast back to integers.
merged_dfs = merged_dfs.fillna(0)
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so cast with `int` directly. A single
# astype() with a column->dtype mapping replaces the per-column loop.
merged_dfs = merged_dfs.astype(
    {column: int for column in merged_dfs.columns if column != "gene_name"}
)
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie
0,GC00006917,0,0,0,1,0,0,1,0
1,Cluster_6872,0,0,0,1,0,0,1,0
2,GC00000927_2,0,0,0,1,0,0,1,0
3,Cluster_808,0,0,0,1,0,0,1,0
4,GC00008595,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,0,0,0,0,0,0,1,0
11574,GC00004416,0,0,0,0,0,0,0,1
11575,GC00003424_1,0,0,0,0,0,0,1,0
11576,Cluster_2012,0,0,0,0,0,0,0,1


In [6]:
def get_classification(row, sample):
    """Classify pandora's call for one gene in one sample against bowtie2's call.

    The bowtie2 call acts as the reference: a gene pandora reports as present
    is a TP when bowtie2 agrees and an FP otherwise; a gene pandora reports as
    absent is an FN when bowtie2 has it present and a TN otherwise.

    Args:
        row: mapping-like object (e.g. a DataFrame row) holding the
            "<sample>_bowtie" and "<sample>_pandora" values.
        sample: sample name used as the column prefix.

    Returns:
        "TP", "FP", "FN" or "TN"; None when either value is neither 0 nor 1.
    """
    reference_call = float(row[f"{sample}_bowtie"])
    pandora_call = float(row[f"{sample}_pandora"])

    # Keyed by (bowtie2 call, pandora call).
    label_by_calls = {
        (1.0, 1.0): "TP",
        (0.0, 1.0): "FP",
        (1.0, 0.0): "FN",
        (0.0, 0.0): "TN",
    }
    return label_by_calls.get((reference_call, pandora_call))
    

# Add one "<sample>_classification" column per sample by classifying every
# row (gene) with get_classification.
samples = ["063_STEC", "CFT073", "H131800734", "ST38"]
for sample in samples:
    merged_dfs[f"{sample}_classification"] = merged_dfs.apply(
        get_classification, axis="columns", args=(sample,)
    )
merged_dfs

Unnamed: 0,gene_name,063_STEC_pandora,CFT073_pandora,H131800734_pandora,ST38_pandora,CFT073_bowtie,H131800734_bowtie,ST38_bowtie,063_STEC_bowtie,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification
0,GC00006917,0,0,0,1,0,0,1,0,TN,TN,TN,TP
1,Cluster_6872,0,0,0,1,0,0,1,0,TN,TN,TN,TP
2,GC00000927_2,0,0,0,1,0,0,1,0,TN,TN,TN,TP
3,Cluster_808,0,0,0,1,0,0,1,0,TN,TN,TN,TP
4,GC00008595,0,0,0,1,0,0,1,0,TN,TN,TN,TP
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11573,Cluster_6971,0,0,0,0,0,0,1,0,TN,TN,TN,FN
11574,GC00004416,0,0,0,0,0,0,0,1,FN,TN,TN,TN
11575,GC00003424_1,0,0,0,0,0,0,1,0,TN,TN,TN,FN
11576,Cluster_2012,0,0,0,0,0,0,0,1,FN,TN,TN,TN


In [15]:
# Keep only the gene name and the per-sample classification columns.
classification_cols = [
    col for col in merged_dfs.columns
    if col == "gene_name" or "classification" in col
]
classification_df = merged_dfs.loc[:, classification_cols]
classification_df

Unnamed: 0,gene_name,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification
0,GC00006917,TN,TN,TN,TP
1,Cluster_6872,TN,TN,TN,TP
2,GC00000927_2,TN,TN,TN,TP
3,Cluster_808,TN,TN,TN,TP
4,GC00008595,TN,TN,TN,TP
...,...,...,...,...,...
11573,Cluster_6971,TN,TN,TN,FN
11574,GC00004416,FN,TN,TN,TN
11575,GC00003424_1,TN,TN,TN,FN
11576,Cluster_2012,FN,TN,TN,TN


In [16]:
def print_classification(df=None):
    """Print TP/TN/FP/FN counts and proportions, pooled and per sample.

    First prints the absolute and normalised value counts over all
    "*_classification" columns pooled together, then the same two tables for
    each of those columns separately, each followed by a separator line.

    Args:
        df: DataFrame whose columns ending in "_classification" are
            summarised. Defaults to the notebook-level ``classification_df``.
    """
    if df is None:
        df = classification_df
    sample_cols = [col for col in df.columns if col.endswith("_classification")]

    # Series.append was removed in pandas 2.0; concatenate every column in a
    # single call instead. pd.concat rejects an empty list, so fall back to an
    # explicitly typed empty Series when there is nothing to pool.
    if sample_cols:
        all_samples = pd.concat([df[col] for col in sample_cols])
    else:
        all_samples = pd.Series(dtype=object)
    all_samples.name = "all_samples"
    print(all_samples.value_counts(normalize=False))
    print(all_samples.value_counts(normalize=True))
    print("\n===============================================\n")

    for col in sample_cols:
        print(df[col].value_counts(normalize=False))
        print(df[col].value_counts(normalize=True))
        print("\n===============================================\n")

# Report the TP/TN/FP/FN counts and proportions, pooled and per sample.
print_classification()

TP    25672
TN    15787
FP     3205
FN     1648
Name: all_samples, dtype: int64
TP    0.554327
TN    0.340884
FP    0.069205
FN    0.035585
Name: all_samples, dtype: float64


TP    5934
TN    4429
FP     655
FN     560
Name: 063_STEC_classification, dtype: int64
TP    0.512524
TN    0.382536
FP    0.056573
FN    0.048368
Name: 063_STEC_classification, dtype: float64


TP    6744
TN    3716
FP     889
FN     229
Name: CFT073_classification, dtype: int64
TP    0.582484
TN    0.320954
FP    0.076784
FN    0.019779
Name: CFT073_classification, dtype: float64


TP    6044
TN    4439
FP     923
FN     172
Name: H131800734_classification, dtype: int64
TP    0.522025
TN    0.383400
FP    0.079720
FN    0.014856
Name: H131800734_classification, dtype: float64


TP    6950
TN    3203
FP     738
FN     687
Name: ST38_classification, dtype: int64
TP    0.600276
TN    0.276645
FP    0.063742
FN    0.059337
Name: ST38_classification, dtype: float64




In [18]:
# Attach each gene's length. This is an inner join (merge's default), so genes
# without an entry in gene_length_df are dropped from the result.
classification_df_with_gene_length = pd.merge(
    classification_df,
    gene_length_df,
    how="inner",
    on="gene_name",
)
classification_df_with_gene_length

Unnamed: 0,gene_name,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification,gene_length
0,GC00006917,TN,TN,TN,TP,489
1,Cluster_6872,TN,TN,TN,TP,157
2,GC00000927_2,TN,TN,TN,TP,249
3,Cluster_808,TN,TN,TN,TP,428
4,GC00008595,TN,TN,TN,TP,273
...,...,...,...,...,...,...
10427,Cluster_6971,TN,TN,TN,FN,155
10428,GC00004416,FN,TN,TN,TN,453
10429,GC00003424_1,TN,TN,TN,FN,393
10430,Cluster_2012,FN,TN,TN,TN,344


In [48]:
def get_gene_length_category(value, bin_width=100, max_length=4000):
    """Bin a gene length into a fixed-width length category.

    The category label is the exclusive upper bound of the bin: with the
    defaults, lengths 300-399 map to 400 and 400-499 map to 500. Every length
    greater than or equal to ``max_length`` shares a single overflow category
    labelled ``max_length + bin_width`` (4100 with the defaults).

    Args:
        value: gene length to categorise.
        bin_width: width of each length bin (default 100).
        max_length: lengths at or above this go to the overflow category
            (default 4000).

    Returns:
        The category label for ``value``.
    """
    if value >= max_length:
        return max_length + bin_width
    # int() truncates the quotient, giving the zero-based bin number.
    return (int(value / bin_width) + 1) * bin_width

# Derive the length category of every gene from its gene_length.
classification_df_with_gene_length["gene_length_category"] = (
    classification_df_with_gene_length["gene_length"].apply(get_gene_length_category)
)
classification_df_with_gene_length

Unnamed: 0,gene_name,063_STEC_classification,CFT073_classification,H131800734_classification,ST38_classification,gene_length,gene_length_category
0,GC00006917,TN,TN,TN,TP,489,500
1,Cluster_6872,TN,TN,TN,TP,157,200
2,GC00000927_2,TN,TN,TN,TP,249,300
3,Cluster_808,TN,TN,TN,TP,428,500
4,GC00008595,TN,TN,TN,TP,273,300
...,...,...,...,...,...,...,...
10427,Cluster_6971,TN,TN,TN,FN,155,200
10428,GC00004416,FN,TN,TN,TN,453,500
10429,GC00003424_1,TN,TN,TN,FN,393,400
10430,Cluster_2012,FN,TN,TN,TN,344,400


In [49]:
# Reshape to long format: one row per (gene, sample) classification, carrying
# the gene's length category along. This is a vectorised replacement for the
# row-by-row iterrows() loop; the row order is unchanged (all sample
# classifications of the first gene, then all of the second gene, ...).
sample_classification_cols = [
    col for col in classification_df_with_gene_length.columns
    if col.endswith("_classification")
]
n_sample_cols = len(sample_classification_cols)
classification_all = pd.DataFrame(data={
    # Each gene-level value is repeated once per sample column.
    "gene_name": np.repeat(
        classification_df_with_gene_length["gene_name"].to_numpy(), n_sample_cols
    ),
    # ravel() reads the (genes x samples) array row by row, matching the repeats.
    "classification": (
        classification_df_with_gene_length[sample_classification_cols].to_numpy().ravel()
    ),
    "gene_length_category": np.repeat(
        classification_df_with_gene_length["gene_length_category"].to_numpy(), n_sample_cols
    ),
})
classification_all

Unnamed: 0,gene_name,classification,gene_length_category
0,GC00006917,TN,500
1,GC00006917,TN,500
2,GC00006917,TN,500
3,GC00006917,TP,500
4,Cluster_6872,TN,200
...,...,...,...
41723,Cluster_2012,TN,400
41724,GC00001036_3,TN,800
41725,GC00001036_3,TN,800
41726,GC00001036_3,TN,800


In [50]:
# Count rows per (classification, gene_length_category), then pivot the length
# categories into columns; combinations that never occur are left as NaN.
classification_all_grouped_and_counted = (
    classification_all
    .groupby(["classification", "gene_length_category"])
    .count()
    .unstack()
)
classification_all_grouped_and_counted

Unnamed: 0_level_0,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name,gene_name
gene_length_category,100,200,300,400,500,600,700,800,900,1000,...,3200,3300,3400,3500,3600,3700,3800,3900,4000,4100
classification,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
FN,60.0,552.0,582.0,268.0,155.0,8.0,1.0,3.0,3.0,2.0,...,,,1.0,,,,,,,
FP,20.0,177.0,301.0,199.0,187.0,44.0,21.0,22.0,40.0,34.0,...,3.0,2.0,,1.0,1.0,,3.0,,,15.0
TN,126.0,1792.0,2747.0,1934.0,1474.0,644.0,460.0,482.0,511.0,452.0,...,15.0,29.0,6.0,3.0,6.0,8.0,8.0,2.0,3.0,67.0
TP,378.0,3727.0,3822.0,2599.0,1948.0,1224.0,1290.0,1369.0,1286.0,1352.0,...,38.0,17.0,13.0,16.0,9.0,12.0,25.0,10.0,9.0,62.0


In [52]:
import plotly.graph_objects as go

# The columns form a two-level index; level 1 holds the gene length categories.
gene_length_categories = classification_all_grouped_and_counted.columns.get_level_values(1)

# One bar trace per classification, stacked on top of each other per category.
fig = go.Figure(
    data=[
        go.Bar(
            name=classification,
            x=gene_length_categories,
            y=classification_all_grouped_and_counted.xs(classification),
        )
        for classification in ["TP", "FP", "TN", "FN"]
    ]
)

fig.update_layout(barmode="stack")
fig