In [1]:
import os

import pandas as pd
from Bio import SeqIO
from common.env_config import config

In [22]:
performance_data = {
    "100_400": {
        "fold": {
            1: {"Sensitivity": 86.15, "Specificity": 97.72, "Accuracy": 92.45},
            2: {"Sensitivity": 92.13, "Specificity": 97.33, "Accuracy": 94.92},
            3: {"Sensitivity": 88.96, "Specificity": 93.48, "Accuracy": 91.31},
            4: {"Sensitivity": 93.16, "Specificity": 93.62, "Accuracy": 93.41},
            5: {"Sensitivity": 92.85, "Specificity": 93.55, "Accuracy": 93.22}
        }
    },
    "400_800": {
        "fold": {
            1: {"Sensitivity": 88.00, "Specificity": 94.66, "Accuracy": 91.35},
            2: {"Sensitivity": 93.45, "Specificity": 92.44, "Accuracy": 92.94},
            3: {"Sensitivity": 94.09, "Specificity": 93.17, "Accuracy": 93.63},
            4: {"Sensitivity": 94.54, "Specificity": 93.53, "Accuracy": 94.04},
            5: {"Sensitivity": 90.54, "Specificity": 93.69, "Accuracy": 92.12}
        }
    },
    "800_1200": {
        "fold": {
            1: {"Sensitivity": 88.19, "Specificity": 95.50, "Accuracy": 91.85},
            2: {"Sensitivity": 93.23, "Specificity": 93.83, "Accuracy": 93.53},
            3: {"Sensitivity": 94.50, "Specificity": 93.71, "Accuracy": 94.11},
            4: {"Sensitivity": 94.52, "Specificity": 94.14, "Accuracy": 94.33},
            5: {"Sensitivity": 91.17, "Specificity": 93.94, "Accuracy": 92.56}
        }
    },
    "1200_1800": {
        "fold": {
            1: {"Sensitivity": 88.94, "Specificity": 96.27, "Accuracy": 92.61},
            2: {"Sensitivity": 94.47, "Specificity": 93.86, "Accuracy": 94.17},
            3: {"Sensitivity": 95.06, "Specificity": 95.11, "Accuracy": 95.08},
            4: {"Sensitivity": 95.47, "Specificity": 94.91, "Accuracy": 95.19},
            5: {"Sensitivity": 90.94, "Specificity": 94.79, "Accuracy": 92.86}
        }
    }
}

rows = []
for range_key, range_data in performance_data.items():
    for fold_num, metrics in range_data["fold"].items():
        row = {
            'group': range_key,
            'fold': fold_num,
            'sens': metrics['Sensitivity'],
            'spec': metrics['Specificity'],
            'acc': metrics['Accuracy']
        }
        rows.append(row)

phatyp_result_df = pd.DataFrame(rows)
phatyp_result_df.head()

Unnamed: 0,group,fold,sens,spec,acc
0,100_400,1,86.15,97.72,92.45
1,100_400,2,92.13,97.33,94.92
2,100_400,3,88.96,93.48,91.31
3,100_400,4,93.16,93.62,93.41
4,100_400,5,92.85,93.55,93.22


In [31]:
phatyp_result_df[(phatyp_result_df['group']=='100_400') & (phatyp_result_df['fold']==1)]['spec'].values[0]

97.72

In [36]:
phatyp_data_dir = os.path.join(config.MY_DATA_DIR, "convert2phatyp")
fasta_data_dir = os.path.join(config.MY_DATA_DIR, "fasta")

columns = ["fold", "group", "false_temperate", "false_virulent", "spec", "sens", "acc"]
df = pd.DataFrame(columns=columns)
for group_index in range(4):
    if group_index == 0:
        group = '100_400'
    elif group_index == 1:
        group = '400_800'
    elif group_index == 2:
        group = '800_1200'
    else:
        group = '1200_1800'

    data_type = "train"
    for fold in range(1, 6):
        print("=" * 200)
        print(f"Processing fold {fold}, group {group}, data_type {data_type}")

        phatyp_df = pd.read_csv(f"{phatyp_data_dir}/{group}/{fold}/{data_type}/ds.csv")

        sequence = []
        label = []
        id = []
        with open(f"{fasta_data_dir}/{group}/{fold}/{data_type}/data.fa", "r") as f:
            for record in SeqIO.parse(f, "fasta"):
                sequence.append(str(record.seq))
                id.append(record.id.split("_")[0] + record.id.split("_")[1])
                if record.id.split("_")[2] == 'temperate':
                    label.append(0)
                else:
                    label.append(1)
        fasta_df = pd.DataFrame({"sequence": sequence, "name": id, "label": label})

        print(f"fasta_df.shape: {fasta_df.shape}")
        print(f"phatyp_df.shape: {phatyp_df.shape}")

        in_fasta_not_in_phatyp_df = fasta_df[~fasta_df['name'].isin(phatyp_df['name'])]
        count_false_virulent = in_fasta_not_in_phatyp_df['label'].value_counts()[0]
        count_false_temperate = in_fasta_not_in_phatyp_df['label'].value_counts()[1]
        print(f"false virulent: {count_false_virulent}")
        print(f"false temperate: {count_false_temperate}")

        predicted_virulent = phatyp_df['label'].value_counts()[0]
        spec = (predicted_virulent * phatyp_result_df[(phatyp_result_df['group']==group) & (phatyp_result_df['fold']==fold)]['spec'].values[0]) / (predicted_virulent + count_false_virulent)/100

        predicted_temperate = phatyp_df['label'].value_counts()[1]
        sens = (predicted_temperate * phatyp_result_df[(phatyp_result_df['group']==group) & (phatyp_result_df['fold']==fold)]['sens'].values[0]) / (predicted_temperate + count_false_temperate)/100

        P = fasta_df['label'].value_counts()[1]/fasta_df.shape[0]
        acc = (sens*P)+(spec*(1-P))

        temp = pd.DataFrame([[fold, group, count_false_virulent, count_false_temperate, spec, sens, acc]], columns=columns)
        df = pd.concat([df, temp], axis=0, ignore_index=True)


Processing fold 1, group 100_400, data_type train
fasta_df.shape: (1283624, 3)
phatyp_df.shape: (595796, 3)
false virulent: 19260
false temperate: 12870
Processing fold 2, group 100_400, data_type train


  df = pd.concat([df, temp], axis=0, ignore_index=True)


fasta_df.shape: (1315684, 3)
phatyp_df.shape: (606112, 3)
false virulent: 21186
false temperate: 12717
Processing fold 3, group 100_400, data_type train
fasta_df.shape: (1263496, 3)
phatyp_df.shape: (605500, 3)
false virulent: 19828
false temperate: 17873
Processing fold 4, group 100_400, data_type train
fasta_df.shape: (690504, 3)
phatyp_df.shape: (532532, 3)
false virulent: 39483
false temperate: 43220
Processing fold 5, group 100_400, data_type train
fasta_df.shape: (705600, 3)
phatyp_df.shape: (538753, 3)
false virulent: 35266
false temperate: 42637
Processing fold 1, group 400_800, data_type train
fasta_df.shape: (242128, 3)
phatyp_df.shape: (223222, 3)
false virulent: 8473
false temperate: 10021
Processing fold 2, group 400_800, data_type train
fasta_df.shape: (245676, 3)
phatyp_df.shape: (227223, 3)
false virulent: 8579
false temperate: 9874
Processing fold 3, group 400_800, data_type train
fasta_df.shape: (246556, 3)
phatyp_df.shape: (227275, 3)
false virulent: 9426
false tempe

In [37]:
df.head(20)

Unnamed: 0,fold,group,false_temperate,false_virulent,spec,sens,acc
0,1,100_400,19260,12870,0.920927,0.823719,0.872323
1,2,100_400,21186,12717,0.91319,0.881848,0.897519
2,3,100_400,19828,17873,0.880391,0.837045,0.85957
3,4,100_400,39483,43220,0.820865,0.794994,0.807929
4,5,100_400,35266,42637,0.831598,0.796155,0.813877
5,1,400_800,8473,10021,0.879485,0.807842,0.843664
6,2,400_800,8579,9874,0.859152,0.860166,0.859659
7,3,400_800,9426,9855,0.860212,0.865944,0.863078
8,4,400_800,9165,9825,0.864689,0.86971,0.867199
9,5,400_800,8262,9559,0.873291,0.835764,0.854527


In [38]:
df.to_csv("phatyp_result.csv", index=False)