In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from copy import deepcopy
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
# Record the interpreter version this notebook was executed with.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.8.6


## Load popANI matrix

In [3]:
# Wide-format popANI table; the first CSV column is used as the row index.
df_ani = pd.read_csv(
    filepath_or_buffer="data/popANI_wide_format.csv",
    index_col=0,
)
df_ani.head(n=5)

Unnamed: 0_level_0,CDC335,CDC336,CDC337,CDC338,CDC339,CDC340,CDC342,CDC343,CDC344,E1,...,MSK925,UWM1195,UWM1196,UWM1197,UWM1198,UWM1199,UWM1204,UWM1206,UWM1207,UWM1208
Isolate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CDC335,1.0,0.999982,0.999982,0.999767,0.999758,0.999833,0.99982,0.999757,0.999882,0.999746,...,0.999942,0.999829,0.999829,0.999797,0.9998,0.999746,0.99984,0.999763,0.999993,0.999914
CDC336,0.999982,1.0,1.0,0.999769,0.999761,0.999835,0.999823,0.99976,0.999885,0.999749,...,0.999943,0.999831,0.999831,0.9998,0.999803,0.999749,0.999842,0.999766,0.999984,0.999917
CDC337,0.999982,1.0,1.0,0.999768,0.99976,0.999835,0.999822,0.99976,0.999885,0.999749,...,0.999943,0.999831,0.999831,0.999799,0.999803,0.999749,0.999842,0.999766,0.999984,0.999916
CDC338,0.999767,0.999769,0.999768,1.0,0.999759,0.99977,0.999758,0.999759,0.999768,0.999748,...,0.999765,0.999766,0.999766,0.999867,0.999867,0.999748,0.999776,0.999764,0.999769,0.99976
CDC339,0.999758,0.999761,0.99976,0.999759,1.0,0.999767,0.999758,0.999985,0.999764,0.999874,...,0.999757,0.999763,0.999764,0.999787,0.99979,0.999958,0.999773,0.999988,0.99976,0.99975


## Load heteroresistance phenotype

In [4]:
# Keep only the 'HR' column of the metadata, cast it to int, and select/reorder
# its rows by the row index of the ANI matrix.
df_y = (
    pd.read_csv("data/cpara_metadata.csv", index_col=0)
    .loc[:, ['HR']]
    .astype(int)
    .loc[df_ani.index]
)
df_y.head(n=5)

Unnamed: 0_level_0,HR
Isolate,Unnamed: 1_level_1
CDC335,0
CDC336,0
CDC337,0
CDC338,0
CDC339,0


## Load repeated data split

In [5]:
# One row per repeated split; 'Train' and 'Test' hold comma-separated isolate names.
df_split_50 = pd.read_csv(
    filepath_or_buffer="data/train_test_splits_50.csv",
)
df_split_50.head(n=5)

Unnamed: 0,Train,Test
0,"MSK2386,MSK67,E61,MSK1298,MSK848,UWM1196,MSK26...","E36,MSK1258,E69,E31,E63,CDC336,MSK2162,UWM1207..."
1,"MSK1090,UWM1207,E71,E51,E39,MSK2124,MSK806,MSK...","E34,E70,E9,E20,E50,MSK814,FM14,E7,E63,E78,MSK8..."
2,"E22,E5,MSK247,MSK2386,E41,E56,MSK624,E49,UWM12...","UWM1199,E4,E9,E3,E12,MSK844,MSK2448,MSK67,MSK2..."
3,"MSK250,FM14,E22,E79,E75,E19,UWM1199,MSK923,E80...","CDC344,UWM1204,E25,E53,MSK2425,UWM1195,MSK802,..."
4,"MSK1258,E67,UWM1195,E40,CDC339,E70,MSK2107,E42...","MSK808,E69,E12,E64,MSK811,UWM1208,MSK2448,CDC3..."


## Run ANI-based model

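Each test isolate is assigned the heteroresistance phenotype of its closest training isolate, i.e. the training isolate with the highest popANI to it. If several training isolates are equally close, the phenotype is 1 only when more than half of them are heteroresistant; otherwise it is 0.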

In [6]:
# Nearest-neighbour baseline: every test isolate inherits the heteroresistance (HR)
# label of the training isolate(s) with which it shares the highest popANI.
model_performance_summary = []
# Iterate over whatever splits the file actually contains instead of assuming 50 rows.
for iteration in tqdm(df_split_50.index):
    # isolates of a split are stored as comma-separated names
    train_isolates = df_split_50.loc[iteration, 'Train'].split(',')
    test_isolates = df_split_50.loc[iteration, 'Test'].split(',')
    y_test = df_y.loc[test_isolates, 'HR']

    # predict each test isolate from its closest training isolate(s)
    y_pred = []
    for isolate in test_isolates:
        ani_to_train = df_ani.loc[train_isolates, isolate]
        max_ani = ani_to_train.values.max()
        closest_isolates = ani_to_train[ani_to_train == max_ani].index

        # A NaN popANI makes the maximum NaN, so nothing matches it. Fail loudly here
        # rather than silently skipping the isolate and misaligning y_pred with y_test.
        if len(closest_isolates) == 0:
            raise ValueError(
                f"No valid popANI between test isolate {isolate!r} and the "
                f"training isolates of split {iteration}"
            )

        # if multiple training isolates are equally close to the test isolate,
        # the phenotype is 1 only when more than half of them are heteroresistant
        mean_y = df_y.loc[closest_isolates, 'HR'].mean()
        y_pred.append(int(mean_y > 0.5))

    # compute test scores
    accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred)
    precision_test = precision_score(y_true=y_test, y_pred=y_pred)
    recall_test = recall_score(y_true=y_test, y_pred=y_pred)
    f1_test = f1_score(y_true=y_test, y_pred=y_pred)

    # misclassified isolates: false negative = predicted 0 but truly 1,
    # false positive = predicted 1 but truly 0
    FN_test = [
        name for name, predicted, actual in zip(test_isolates, y_pred, y_test)
        if predicted == 0 and actual == 1
    ]
    FP_test = [
        name for name, predicted, actual in zip(test_isolates, y_pred, y_test)
        if predicted == 1 and actual == 0
    ]

    # save to results
    model_performance_summary.append(
        [iteration, accuracy_test, precision_test, recall_test, f1_test,
         ','.join(FN_test), ','.join(FP_test)]
    )

df_model_performance_summary = pd.DataFrame(
    model_performance_summary,
    columns=['iteration', 'accuracy_test', 'precision_test', 'recall_test', 'f1_test',
             'false_negative_test', 'false_positive_test'],
)
df_model_performance_summary.to_csv("output/model_eval_summary_ani_model.csv", index=False)
df_model_performance_summary.head()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.37it/s]


Unnamed: 0,iteration,accuracy_test,precision_test,recall_test,f1_test,false_negative_test,false_positive_test
0,0,0.75,0.75,0.3,0.428571,"E36,UWM1207,MSK2384,GL122,E7,MSK2191,MSK807",E27
1,1,0.75,0.666667,0.4,0.5,"E34,FM14,E7,MSK810,E32,MSK2199","MSK814,E1"
2,2,0.84375,0.727273,0.8,0.761905,"E38,MSK808","E72,MSK1666,E52"
3,3,0.8125,0.833333,0.5,0.625,"E38,MSK2191,MSK2448,UWM1207,MSK810",MSK2406
4,4,0.75,0.6,0.6,0.6,"MSK808,E7,E38,E30","UWM1206,CDC338,E52,CDC340"
