In [1]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report

import numpy as np
import pandas as pd

import itertools as it

In [3]:
# code taken from 11July2022-ASMabstract_Oct2022ASMPoster.ipynb in Manuscript1_Oct2023 folder

#### Data Encoding for Plasmid Clusters from MOB-suite

In [6]:
meta = pd.read_csv('/home/jee/enterococcus_sample_reconciliation/all_combined_enterococcus_metadata.tsv',
                  sep='\t').rename({'Quinolone':'Quin'},axis=1)[['Run_accession', 'Species',
                  'Ampicillin', 'Vancomycin', 'Teicoplanin', 'Doxycycline',
                  'Erythromycin', 'Nitrofurantoin', 'Gentamicin', 'Linezolid',
                  'Levofloxacin', 'Quin', 'Streptomycin',
                  'Tigecycline']]

mob = pd.read_csv('/home/jee/final_genome_ls_enterococci_feb2022/mob_all.csv')\
                    [['sample_id', 'primary_cluster_id', 'secondary_cluster_id']].\
                    rename(columns={"sample_id":"Run_accession"})

mob_all = pd.read_csv('/home/jee/final_genome_ls_enterococci_feb2022/mob_all.csv').rename(columns={"sample_id":"Run_accession"})
# mob_primary = mob[['Run_accession','primary_cluster_id']]

In [7]:
mob_all[ (mob_all['primary_cluster_id']!='-') & (mob_all['secondary_cluster_id']!='-') ].mash_nearest_neighbor
print(len(mob_all.primary_cluster_id.unique()), mob_all.primary_cluster_id.unique()) #108
print(len(mob_all.secondary_cluster_id.unique()), mob_all.secondary_cluster_id.unique()) #98 

109 ['-' 'AC727' 'AB530' 'AE312' 'AC733' 'AH273' 'AB759' 'AB614' 'AE672'
 'AE577' 'AE315' 'AD058' 'AA894' 'AB173' 'AC537' 'AE807' 'AD059' 'AD268'
 'AB756' 'AE309' 'AE314' 'AB528' 'AB172' 'AD907' 'AD909' 'AC731' 'AB171'
 'AA893' 'AF632' 'AB976' 'AG970' 'AD288' 'AG184' 'AD569' 'AB527' 'AB616'
 'AD908' 'AC536' 'AE310' 'AC751' 'AC670' 'AC732' 'AB529' 'AA769' 'AH188'
 'AC833' 'AB117' 'AB918' 'AD055' 'AA890' 'AB915' 'AD582' 'AE311' 'AC728'
 'AD265' 'AD910' 'AB760' 'AB615' 'AC754' 'AD100' 'AA366' 'AC345' 'AF663'
 'AC534' 'AB174' 'AC729' 'novel_9198ae084b830436dc78b2c36a55d225' 'AF662'
 'AE313' 'AC726' 'AC607' 'AF717' 'AB369' 'AC630' 'AB977' 'AD710' 'AF568'
 'AB917' 'AE633' 'AA892' 'AC535' 'AB758' 'AA896' 'AG002' 'AB020' 'AD269'
 'AF761' 'AC544' 'AB973' 'AB642' 'AA891' 'AA215' 'AD906' 'AG003' 'AC631'
 'AD297' 'AD415' 'AC468' 'AB922' 'AB920' 'AB617' 'AD414' 'AD709'
 'novel_1a033aeec961423a98d1e337f28b31b5'
 'novel_3c59533158572f3977e5c5e69c597db0' 'AC753' 'AD296'
 'novel_6ecbc972f5351ab231e301f

In [8]:
mob['Run_accession']

0        SRR13999941
1        SRR13999941
2        SRR13999941
3        SRR13999941
4        SRR13999941
            ...     
69961    SRR14026473
69962    SRR14026473
69963    SRR14026473
69964    SRR14026473
69965    SRR14026473
Name: Run_accession, Length: 69966, dtype: object

In [10]:
mob_genomes = mob['Run_accession'].value_counts().to_frame().rename(columns={'Run_accession':'count'})
mob_genomes_ls = list(mob_genomes.index)
display(mob_genomes)
mob_genomes_ls

Unnamed: 0,count
SRR14000009,439
SRR13999966,302
SRR14026545,292
SRR14010950,281
SRR14011034,278
...,...
SRR13726574,26
SRR13727006,25
SRR13999944,25
SRR13999926,24


['SRR14000009',
 'SRR13999966',
 'SRR14026545',
 'SRR14010950',
 'SRR14011034',
 'SRR13725705',
 'SRR14026530',
 'SRR14010971',
 'SRR14011033',
 'SRR14010961',
 'SRR14011037',
 'SRR14010949',
 'SRR14011018',
 'SRR14010935',
 'SRR14010942',
 'SRR13726551',
 'SRR14010994',
 'SRR14010981',
 'SRR14010975',
 'SRR14011039',
 'SRR14011026',
 'SRR14010946',
 'SRR14011001',
 'SRR14011002',
 'SRR14011003',
 'SRR14011025',
 'SRR14011010',
 'SRR14010974',
 'SRR14026552',
 'SRR14010987',
 'SRR14010991',
 'SRR14010934',
 'SRR14010970',
 'SRR14010938',
 'SRR14026535',
 'SRR14010951',
 'SRR14010993',
 'SRR14011043',
 'SRR14011029',
 'SRR14011007',
 'SRR14010960',
 'SRR14026534',
 'SRR14011041',
 'SRR14010978',
 'SRR14011015',
 'SRR14010984',
 'SRR14011006',
 'SRR14010944',
 'SRR14010947',
 'SRR14026531',
 'SRR14010995',
 'SRR14011035',
 'SRR14026519',
 'SRR14026550',
 'SRR14010940',
 'SRR14011012',
 'SRR14010980',
 'SRR14010990',
 'SRR13725724',
 'SRR14010976',
 'SRR14010959',
 'SRR14026510',
 'SRR140

In [11]:
mob_cluster = mob.groupby('Run_accession')


count_ls = []
for i in mob_genomes_ls:
    print(i)
    temp = mob_cluster.get_group(i)[['primary_cluster_id']].value_counts()
    print(temp)
    count_ls.append(temp)
    print('%%%%%%%%%%%%%%%%%%%%%%%%')
    print()

SRR14000009
primary_cluster_id
-                     421
AB530                   7
AE312                   7
AB915                   4
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%

SRR13999966
primary_cluster_id
-                     273
AE315                  15
AB528                   3
AB529                   3
AE310                   3
AB530                   2
AB759                   2
AF632                   1
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%

SRR14026545
primary_cluster_id
-                     223
AC732                  52
AB977                  13
AH273                   2
AA890                   1
AA891                   1
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%

SRR14010950
primary_cluster_id
-                     231
AC733                  38
AB976                   8
AE807                   2
AB614                   1
AE672                   1
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%

SRR14011034
primary_cluster_id
-                     197
AC731                  52
AB760        

In [12]:
primary_cluster = pd.concat(count_ls,axis=1).drop(labels=['-'],axis=0)
primary_cluster.rename(
    columns={i:j for i,j in zip(primary_cluster.columns,mob_genomes_ls)}, inplace=True
)
prim = primary_cluster.reset_index().rename(columns={'primary_cluster_id':'cluster_id'}).fillna(0)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [13]:
prim_only = prim.copy().set_index('cluster_id').fillna(0)
prim_only = prim_only.T.reset_index().rename(columns={'index':'Run_accession'})
prim_xy_pre= pd.merge(prim_only, meta, on="Run_accession")
prim_xy_pre

Unnamed: 0,Run_accession,AA215,AA366,AA769,AA890,AA891,AA892,AA893,AA894,AA896,...,Teicoplanin,Doxycycline,Erythromycin,Nitrofurantoin,Gentamicin,Linezolid,Levofloxacin,Quin,Streptomycin,Tigecycline
0,SRR14000009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,I,R,S,S,S,S,R,S,S
1,SRR13999966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,S,I,S,S,S,S,I,S,S
2,SRR14026545,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,S,S,S,S,R,S,R,R,S,S
3,SRR14010950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,I,S,R,S,S,S,R,S,S,S
4,SRR14011034,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,R,R,R,S,S,S,R,S,I,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,SRR13726574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,S,S,S,S,S,R,S,S
643,SRR13727006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,S,I,S,S,S,S,R,S,S
644,SRR13999944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,I,S,S,S,S,R,S,S
645,SRR13999926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,I,S,S,S,S,I,S,S


In [14]:
prim_Xy_Vancomycin = prim_xy_pre[prim_xy_pre['Vancomycin'].isin(['R','S','S '])].replace(['S'], 0).replace(['S '], 0).replace(['R'],1).set_index('Run_accession')
prim_Xy_Doxycycline = prim_xy_pre[prim_xy_pre['Doxycycline'].isin(['R','S'])].replace(['S'], 0).replace(['R'],1).set_index('Run_accession')
prim_Xy_Erythromycin = prim_xy_pre[prim_xy_pre['Erythromycin'].isin(['R','S'])].replace(['S'], 0).replace(['R'],1).set_index('Run_accession')

In [15]:
prim_xy_pre_copy = prim_xy_pre.copy()
for column in prim_xy_pre_copy.iloc[:,1:109]:
    prim_xy_pre_copy[column].values[prim_xy_pre_copy[column] > 1] = 1


In [16]:
for column in prim_xy_pre_copy:
    print(column)
    print(prim_xy_pre_copy[column].unique())

Run_accession
['SRR14000009' 'SRR13999966' 'SRR14026545' 'SRR14010950' 'SRR14011034'
 'SRR13725705' 'SRR14026530' 'SRR14010971' 'SRR14011033' 'SRR14010961'
 'SRR14011037' 'SRR14010949' 'SRR14011018' 'SRR14010935' 'SRR14010942'
 'SRR13726551' 'SRR14010994' 'SRR14010981' 'SRR14010975' 'SRR14011039'
 'SRR14011026' 'SRR14010946' 'SRR14011001' 'SRR14011002' 'SRR14011003'
 'SRR14011025' 'SRR14011010' 'SRR14010974' 'SRR14026552' 'SRR14010987'
 'SRR14010991' 'SRR14010934' 'SRR14010970' 'SRR14010938' 'SRR14026535'
 'SRR14010951' 'SRR14010993' 'SRR14011043' 'SRR14011029' 'SRR14011007'
 'SRR14010960' 'SRR14026534' 'SRR14011041' 'SRR14010978' 'SRR14011015'
 'SRR14010984' 'SRR14011006' 'SRR14010944' 'SRR14010947' 'SRR14026531'
 'SRR14010995' 'SRR14011035' 'SRR14026519' 'SRR14026550' 'SRR14010940'
 'SRR14011012' 'SRR14010980' 'SRR14010990' 'SRR13725724' 'SRR14010976'
 'SRR14010959' 'SRR14026510' 'SRR14010996' 'SRR14010985' 'SRR14011024'
 'SRR14010967' 'SRR14010957' 'SRR14010979' 'SRR14010952' 'SRR14

In [17]:
prim_xy_pre_copy

Unnamed: 0,Run_accession,AA215,AA366,AA769,AA890,AA891,AA892,AA893,AA894,AA896,...,Teicoplanin,Doxycycline,Erythromycin,Nitrofurantoin,Gentamicin,Linezolid,Levofloxacin,Quin,Streptomycin,Tigecycline
0,SRR14000009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,I,R,S,S,S,S,R,S,S
1,SRR13999966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,S,I,S,S,S,S,I,S,S
2,SRR14026545,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,S,S,S,S,R,S,R,R,S,S
3,SRR14010950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,I,S,R,S,S,S,R,S,S,S
4,SRR14011034,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,R,R,R,S,S,S,R,S,I,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,SRR13726574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,S,S,S,S,S,R,S,S
643,SRR13727006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,S,I,S,S,S,S,R,S,S
644,SRR13999944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,I,S,S,S,S,R,S,S
645,SRR13999926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,S,R,I,S,S,S,S,I,S,S


In [18]:
prim_binary_Xy_Vancomycin = prim_xy_pre_copy[prim_xy_pre_copy['Vancomycin'].isin(['R','S','S '])].replace(['S'], 0).replace(['S '], 0).replace(['R'],1).set_index('Run_accession')
prim_binary_Xy_Doxycycline = prim_xy_pre_copy[prim_xy_pre_copy['Doxycycline'].isin(['R','S'])].replace(['S'], 0).replace(['R'],1).set_index('Run_accession')
prim_binary_Xy_Erythromycin = prim_xy_pre_copy[prim_xy_pre_copy['Erythromycin'].isin(['R','S'])].replace(['S'], 0).replace(['R'],1).set_index('Run_accession')

#### Functions

In [22]:
rf_clf = RandomForestClassifier(criterion = 'gini', n_estimators = 100)
gb_clf = GradientBoostingClassifier(n_estimators = 100, max_features = 'auto', criterion = 'friedman_mse', 
                                     loss = 'exponential', learning_rate = 0.1)
lr_clf = LogisticRegression(penalty='l2',solver='lbfgs')


In [21]:
def ML_model(X, y, clf):
    
    ###############################################
    #Get feature names first:
    features=[]
    for columns in X.columns:
        features.append(columns)
    ###############################################
    
    #init a kfold object. I like to call it "skf".
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    prediction_outcome_ls = []
    
    #So that we can plot again, we'll make a nice data structure for our results.
    records = []
    confusion_matrices = []
    
    tree_feat_ls = []
    lr_feat_ls = []
    
    #Now we can loop.
    #looping on the skf.split() provides train/test indices.
    for train_index, test_index in skf.split(X.values,y.values):
        clf.fit(X.values[train_index], y.values[train_index])
        
        ###############################################
        #figure out the feature importance 
        if clf != lr_clf:
            tree_model = clf.fit(X.values[train_index], y.values[train_index])
            tree_imp_features = tree_model.feature_importances_
            df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":tree_imp_features}))
            df_imp_features_final = df_imp_features.sort_values(by=['weights'], ascending=False)
            df_imp_features_final['Antibiotic'] = y.columns[0]
            display(df_imp_features_final)
            tree_feat_ls.append(df_imp_features_final)
        
        else:
            lr_model = clf.fit(X.values[train_index], y.values[train_index])
            lr_coef = lr_model.coef_[0]
            column_ls = X.columns.to_list()
            d = {'Features': column_ls, 'lr_coef': lr_coef, 'Antibiotic':y.columns[0]}
            lr_coef_df = pd.DataFrame(data=d).sort_values(by='lr_coef',ascending=False)
            lr_feat_ls.append(lr_coef_df)
        ###############################################
        
                
        y_true_index = y.index[test_index]
        y_true = y.values[test_index]
        y_pred = clf.predict(X.values[test_index]) #get the predicted labels of the test set.
        
        y_true_index_ls = pd.DataFrame(y_true_index.to_list())
        y_true_df = pd.DataFrame(y_true)
        y_pred_df = pd.DataFrame(y_pred)

        #concatenating the two dataframes
        y_lists = [y_true_index_ls, y_true_df, y_pred_df]
        df_y_outcomes = pd.concat(y_lists, axis=1)
        df_y_outcomes.columns = ['GenomeID','y_true','y_pred']
        df_y_outcomes

        prediction_outcome_ls.append(df_y_outcomes)

  
   
        #We calculate our metrics individually now. 
        accuracy = metrics.accuracy_score(y_true, y_pred)
        f1 = metrics.f1_score(y_true, y_pred, average='weighted') #F1 behaves different for binary classification problems! be careful
        precision = metrics.precision_score(y_true, y_pred, average='weighted')
        recall = metrics.recall_score(y_true, y_pred, average='weighted')

        cm = metrics.confusion_matrix(y_true, y_pred) # pass your labels here if you want them.

        #Keeping the scores as a list of dicts that all have the same keys lets us do some nice pandas stuff.
        records.append({'Accuracy': accuracy,
                        'F1': f1,
                        'Precision': precision,
                        'Recall': recall})

        confusion_matrices.append(cm)

    #Now we can get our data summary.

    #metrics is the same as before.
    df = pd.DataFrame.from_records(records)
    #Now we can sum our confusion matrix too.
    cm = np.asarray(confusion_matrices).sum(axis=0)

    prediction_outcome = pd.concat([prediction_outcome_ls[0], prediction_outcome_ls[1], prediction_outcome_ls[2],
                                   prediction_outcome_ls[3], prediction_outcome_ls[4]], axis=0).set_index('GenomeID')
    
    misclassified = prediction_outcome[prediction_outcome.apply(lambda x: x['y_true'] != x['y_pred'], axis = 1)].index
    
    print('{antibiotic} resistance prediction,'.format(antibiotic=y.columns[0]))
    print(df)
    print()
    print("The misclassified isolates for {ab} resistance prediction is stated in a below list:".format(ab=y.columns[0]))
    print(misclassified)   
#     for col in df.columns:
#         metric_name = col#.split('test_')[1]
#         print("="*40)
#         #print each metric average and standard deviation to three decimal places.
#         print("{0}: {1:.3f} +/- {2:.3f}".format(metric_name, df[col].mean(), df[col].std()))
    
    
    return df, tree_feat_ls if len(lr_feat_ls)==0 else lr_feat_ls, misclassified
            #[0] = the dataframe , [1] = feature list, [2] = clf.ft... (the trained model!), [3] = misclassified


def true_pred(X,y,clf):
    
    #init a kfold object. I like to call it "skf".
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    prediction_outcome_ls = []
    #Now we can loop.
    #looping on the skf.split() provides train/test indices.
    for train_index, test_index in skf.split(X.values,y.values):
        clf.fit(X.values[train_index], y.values[train_index])

        y_true_index = y.index[test_index]
        y_true = y.values[test_index]
        y_pred = clf.predict(X.values[test_index]) #get the predicted labels of the test set.

        y_true_index_ls = pd.DataFrame(y_true_index.to_list())
        y_true_df = pd.DataFrame(y_true)
        y_pred_df = pd.DataFrame(y_pred)

        #concatenating the two dataframes
        y_lists = [y_true_index_ls, y_true_df, y_pred_df]
        df_y_outcomes = pd.concat(y_lists, axis=1)
        df_y_outcomes.columns = ['GenomeID','y_true','y_pred']
        df_y_outcomes

        prediction_outcome_ls.append(df_y_outcomes)

  
    prediction_outcome = pd.concat([prediction_outcome_ls[0], prediction_outcome_ls[1], prediction_outcome_ls[2],
                                   prediction_outcome_ls[3], prediction_outcome_ls[4]], axis=0).set_index('GenomeID')
    
    print("The misclassified isolates for {ab} resistance prediction is stated in a below list:".format(ab=y.columns[0]))
    misclassified = prediction_outcome[prediction_outcome.apply(lambda x: x['y_true'] != x['y_pred'], axis = 1)].index
    print(misclassified)
    print()
    print()
    print("The prediction and true labels for {ab} is:".format(ab=y.columns[0]))
    return prediction_outcome
    

#### ML analysis

In [23]:
both_vnco_ML_plasmids_binary = ML_model(prim_binary_Xy_Vancomycin.iloc[:,0:108] ,prim_binary_Xy_Vancomycin[['Vancomycin']], gb_clf)
both_doxy_ML_plasmids_binary = ML_model(prim_binary_Xy_Doxycycline.iloc[:,0:108] ,prim_binary_Xy_Doxycycline[['Doxycycline']], gb_clf)
both_eryth_ML_plasmids_binary = ML_model(prim_binary_Xy_Erythromycin.iloc[:,0:108] ,prim_binary_Xy_Erythromycin[['Erythromycin']], rf_clf)

  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
54,AC733,0.547611,Vancomycin
88,AE577,0.205818,Vancomycin
90,AE672,0.045353,Vancomycin
13,AB173,0.043538,Vancomycin
28,AB760,0.042600,Vancomycin
...,...,...,...
1,AA366,0.000000,Vancomycin
51,AC729,0.000000,Vancomycin
50,AC728,0.000000,Vancomycin
48,AC726,0.000000,Vancomycin


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
54,AC733,0.478410,Vancomycin
88,AE577,0.207974,Vancomycin
13,AB173,0.116347,Vancomycin
28,AB760,0.062703,Vancomycin
6,AA893,0.023450,Vancomycin
...,...,...,...
1,AA366,0.000000,Vancomycin
51,AC729,0.000000,Vancomycin
50,AC728,0.000000,Vancomycin
46,AC631,0.000000,Vancomycin


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
88,AE577,0.368800,Vancomycin
54,AC733,0.325964,Vancomycin
13,AB173,0.098352,Vancomycin
28,AB760,0.042587,Vancomycin
6,AA893,0.034325,Vancomycin
...,...,...,...
51,AC729,0.000000,Vancomycin
50,AC728,0.000000,Vancomycin
48,AC726,0.000000,Vancomycin
46,AC631,0.000000,Vancomycin


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
54,AC733,0.544504,Vancomycin
13,AB173,0.131884,Vancomycin
88,AE577,0.108313,Vancomycin
28,AB760,0.072531,Vancomycin
23,AB617,0.034970,Vancomycin
...,...,...,...
48,AC726,0.000000,Vancomycin
46,AC631,0.000000,Vancomycin
44,AC607,0.000000,Vancomycin
43,AC544,0.000000,Vancomycin


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
54,AC733,0.657284,Vancomycin
53,AC732,0.100276,Vancomycin
88,AE577,0.045668,Vancomycin
20,AB614,0.039171,Vancomycin
13,AB173,0.038231,Vancomycin
...,...,...,...
66,AD269,0.000000,Vancomycin
68,AD296,0.000000,Vancomycin
69,AD297,0.000000,Vancomycin
70,AD414,0.000000,Vancomycin


Vancomycin resistance prediction,
   Accuracy        F1  Precision    Recall
0  0.950820  0.948560   0.950317  0.950820
1  0.975410  0.976044   0.978484  0.975410
2  0.983607  0.983897   0.985032  0.983607
3  0.958678  0.958250   0.958051  0.958678
4  0.950413  0.950413   0.950413  0.950413

The misclassified isolates for Vancomycin resistance prediction is stated in a below list:
Index(['SRR14010961', 'SRR14010945', 'SRR14011027', 'SRR14010955',
       'SRR14022769', 'SRR14000612', 'SRR14026521', 'SRR14026523',
       'SRR14011017', 'SRR14026519', 'SRR14026554', 'SRR14010949',
       'SRR14026552', 'SRR14011012', 'SRR14010952', 'SRR14026490',
       'SRR14010969', 'SRR14010998', 'SRR14010988', 'SRR14026555',
       'SRR14026517', 'SRR14000586'],
      dtype='object', name='GenomeID')


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
25,AB756,0.228461,Doxycycline
19,AB530,0.224424,Doxycycline
54,AC733,0.074967,Doxycycline
77,AD907,0.051403,Doxycycline
84,AE312,0.046427,Doxycycline
...,...,...,...
64,AD265,0.000000,Doxycycline
66,AD269,0.000000,Doxycycline
69,AD297,0.000000,Doxycycline
70,AD414,0.000000,Doxycycline


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
25,AB756,0.233573,Doxycycline
84,AE312,0.118095,Doxycycline
77,AD907,0.080029,Doxycycline
19,AB530,0.066008,Doxycycline
88,AE577,0.061028,Doxycycline
...,...,...,...
68,AD296,0.000000,Doxycycline
69,AD297,0.000000,Doxycycline
70,AD414,0.000000,Doxycycline
36,AB977,0.000000,Doxycycline


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
25,AB756,0.228143,Doxycycline
84,AE312,0.137488,Doxycycline
19,AB530,0.071639,Doxycycline
54,AC733,0.071626,Doxycycline
90,AE672,0.051524,Doxycycline
...,...,...,...
18,AB529,0.000000,Doxycycline
66,AD269,0.000000,Doxycycline
69,AD297,0.000000,Doxycycline
70,AD414,0.000000,Doxycycline


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
25,AB756,0.197027,Doxycycline
19,AB530,0.108689,Doxycycline
54,AC733,0.091310,Doxycycline
84,AE312,0.059897,Doxycycline
77,AD907,0.058943,Doxycycline
...,...,...,...
63,AD156,0.000000,Doxycycline
69,AD297,0.000000,Doxycycline
70,AD414,0.000000,Doxycycline
75,AD710,0.000000,Doxycycline


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Unnamed: 0,features,weights,Antibiotic
54,AC733,0.164899,Doxycycline
25,AB756,0.158840,Doxycycline
19,AB530,0.123537,Doxycycline
84,AE312,0.076985,Doxycycline
73,AD582,0.071048,Doxycycline
...,...,...,...
69,AD297,0.000000,Doxycycline
70,AD414,0.000000,Doxycycline
74,AD709,0.000000,Doxycycline
75,AD710,0.000000,Doxycycline




Doxycycline resistance prediction,
   Accuracy        F1  Precision    Recall
0  0.858491  0.858298   0.858260  0.858491
1  0.801887  0.798418   0.807210  0.801887
2  0.828571  0.827486   0.828874  0.828571
3  0.847619  0.846654   0.848224  0.847619
4  0.828571  0.828095   0.828231  0.828571

The misclassified isolates for Doxycycline resistance prediction is stated in a below list:
Index(['SRR14011038', 'SRR14010933', 'SRR13999992', 'SRR14024964',
       'SRR13726510', 'SRR13999989', 'SRR14000587', 'SRR13727021',
       'SRR13999997', 'SRR13712374', 'SRR14000007', 'SRR14026508',
       'SRR14000016', 'SRR13727027', 'SRR14025003', 'SRR14010994',
       'SRR14026531', 'SRR14010992', 'SRR14026516', 'SRR14000614',
       'SRR14026485', 'SRR14022782', 'SRR13725695', 'SRR14026533',
       'SRR13712385', 'SRR14024995', 'SRR14024999', 'SRR14026500',
       'SRR13727032', 'SRR14024979', 'SRR13712376', 'SRR14026512',
       'SRR14000027', 'SRR13712363', 'SRR14000020', 'SRR14000005',
       'SRR



Unnamed: 0,features,weights,Antibiotic
19,AB530,0.065878,Erythromycin
25,AB756,0.063912,Erythromycin
60,AD058,0.053601,Erythromycin
84,AE312,0.049820,Erythromycin
93,AF632,0.043059,Erythromycin
...,...,...,...
75,AD710,0.000000,Erythromycin
74,AD709,0.000000,Erythromycin
69,AD297,0.000000,Erythromycin
68,AD296,0.000000,Erythromycin




Unnamed: 0,features,weights,Antibiotic
60,AD058,0.055609,Erythromycin
19,AB530,0.052241,Erythromycin
25,AB756,0.043738,Erythromycin
77,AD907,0.042241,Erythromycin
17,AB528,0.035123,Erythromycin
...,...,...,...
38,AC468,0.000000,Erythromycin
75,AD710,0.000000,Erythromycin
55,AC751,0.000000,Erythromycin
46,AC631,0.000000,Erythromycin




Unnamed: 0,features,weights,Antibiotic
25,AB756,0.066149,Erythromycin
19,AB530,0.063285,Erythromycin
84,AE312,0.050286,Erythromycin
17,AB528,0.046040,Erythromycin
60,AD058,0.045467,Erythromycin
...,...,...,...
74,AD709,0.000000,Erythromycin
69,AD297,0.000000,Erythromycin
68,AD296,0.000000,Erythromycin
56,AC753,0.000000,Erythromycin




Unnamed: 0,features,weights,Antibiotic
19,AB530,0.056217,Erythromycin
60,AD058,0.055662,Erythromycin
25,AB756,0.053317,Erythromycin
93,AF632,0.042346,Erythromycin
84,AE312,0.039215,Erythromycin
...,...,...,...
75,AD710,0.000000,Erythromycin
74,AD709,0.000000,Erythromycin
56,AC753,0.000000,Erythromycin
63,AD156,0.000000,Erythromycin




Unnamed: 0,features,weights,Antibiotic
19,AB530,0.061379,Erythromycin
25,AB756,0.052266,Erythromycin
77,AD907,0.046255,Erythromycin
60,AD058,0.045384,Erythromycin
17,AB528,0.042921,Erythromycin
...,...,...,...
57,AC754,0.000000,Erythromycin
56,AC753,0.000000,Erythromycin
44,AC607,0.000000,Erythromycin
43,AC544,0.000000,Erythromycin


Erythromycin resistance prediction,
   Accuracy        F1  Precision    Recall
0  0.905263  0.903288   0.901672  0.905263
1  0.947368  0.941213   0.950359  0.947368
2  0.778947  0.774804   0.770756  0.778947
3  0.905263  0.906871   0.908798  0.905263
4  0.884211  0.876968   0.872746  0.884211

The misclassified isolates for Erythromycin resistance prediction is stated in a below list:
Index(['SRR14026538', 'SRR14022776', 'SRR14022738', 'SRR14022782',
       'SRR14024952', 'SRR14024995', 'SRR13712392', 'SRR14026501',
       'SRR14022774', 'SRR14000003', 'SRR14024973', 'SRR14026445',
       'SRR13726527', 'SRR13712364', 'SRR14026477', 'SRR14022771',
       'SRR14026475', 'SRR13999995', 'SRR14026460', 'SRR14022746',
       'SRR14026454', 'SRR13712368', 'SRR14026526', 'SRR14026448',
       'SRR13712494', 'SRR13999967', 'SRR13712374', 'SRR13727001',
       'SRR13712362', 'SRR13727013', 'SRR13725723', 'SRR13712366',
       'SRR14026512', 'SRR13726535', 'SRR14000027', 'SRR14026545',
       'S

In [50]:
# both_vnco_ML_plasmids_binary_df = both_vnco_ML_plasmids_binary[0]
# both_vnco_ML_plasmids_binary_df.loc['Mean score'] = both_vnco_ML_plasmids_binary_df.mean()
# both_vnco_ML_plasmids_binary_df.loc['std'] = both_vnco_ML_plasmids_binary_df.std()
# both_vnco_ML_plasmids_binary_df['Classifier'] = 'Gradient boosting'
# both_vnco_ML_plasmids_binary_df['Antibiotic'] = 'Vancomycin'
# both_vnco_ML_plasmids_binary_df['Features'] = 'Plasmid clusters binary'

# both_doxy_ML_plasmids_binary_df = both_doxy_ML_plasmids_binary[0]
# both_doxy_ML_plasmids_binary_df.loc['Mean score'] = both_doxy_ML_plasmids_binary_df.mean()
# both_doxy_ML_plasmids_binary_df.loc['std'] = both_doxy_ML_plasmids_binary_df.std()
# both_doxy_ML_plasmids_binary_df['Classifier'] = 'Gradient boosting'
# both_doxy_ML_plasmids_binary_df['Antibiotic'] = 'Doxycycline'
# both_doxy_ML_plasmids_binary_df['Features'] = 'Plasmid clusters binary'

# both_eryth_ML_plasmids_binary_df = both_eryth_ML_plasmids_binary[0]
# both_eryth_ML_plasmids_binary_df.loc['Mean score'] = both_eryth_ML_plasmids_binary_df.mean()
# both_eryth_ML_plasmids_binary_df.loc['std'] = both_eryth_ML_plasmids_binary_df.std()
# both_eryth_ML_plasmids_binary_df['Classifier'] = 'Random forest'
# both_eryth_ML_plasmids_binary_df['Antibiotic'] = 'Erythromycin'
# both_eryth_ML_plasmids_binary_df['Features'] = 'Plasmid clusters binary'

display(both_vnco_ML_plasmids_binary_df)
display(both_doxy_ML_plasmids_binary_df)
display(both_eryth_ML_plasmids_binary_df)



Unnamed: 0,Accuracy,F1,Precision,Recall,Classifier,Antibiotic,Features
0,0.959016,0.956579,0.960932,0.959016,Gradient boosting,Vancomycin,Plasmid clusters binary
1,0.97541,0.976044,0.978484,0.97541,Gradient boosting,Vancomycin,Plasmid clusters binary
2,0.983607,0.983897,0.985032,0.983607,Gradient boosting,Vancomycin,Plasmid clusters binary
3,0.958678,0.95825,0.958051,0.958678,Gradient boosting,Vancomycin,Plasmid clusters binary
4,0.950413,0.950413,0.950413,0.950413,Gradient boosting,Vancomycin,Plasmid clusters binary
Mean score,0.965425,0.965037,0.966582,0.965425,Gradient boosting,Vancomycin,Plasmid clusters binary
std,0.012184,0.012715,0.013025,0.012184,Gradient boosting,Vancomycin,Plasmid clusters binary


Unnamed: 0,Accuracy,F1,Precision,Recall,Classifier,Antibiotic,Features
0,0.839623,0.839404,0.839337,0.839623,Gradient boosting,Doxycycline,Plasmid clusters binary
1,0.801887,0.798418,0.80721,0.801887,Gradient boosting,Doxycycline,Plasmid clusters binary
2,0.838095,0.837374,0.838052,0.838095,Gradient boosting,Doxycycline,Plasmid clusters binary
3,0.857143,0.855941,0.858776,0.857143,Gradient boosting,Doxycycline,Plasmid clusters binary
4,0.819048,0.818814,0.81873,0.819048,Gradient boosting,Doxycycline,Plasmid clusters binary
Mean score,0.831159,0.82999,0.832421,0.831159,Gradient boosting,Doxycycline,Plasmid clusters binary
std,0.018966,0.019689,0.017873,0.018966,Gradient boosting,Doxycycline,Plasmid clusters binary


Unnamed: 0,Accuracy,F1,Precision,Recall,Classifier,Antibiotic,Features
0,0.926316,0.921145,0.920236,0.926316,Random forest,Erythromycin,Plasmid clusters binary
1,0.936842,0.927417,0.9411,0.936842,Random forest,Erythromycin,Plasmid clusters binary
2,0.789474,0.789474,0.789474,0.789474,Random forest,Erythromycin,Plasmid clusters binary
3,0.894737,0.898144,0.90272,0.894737,Random forest,Erythromycin,Plasmid clusters binary
4,0.884211,0.876968,0.872746,0.884211,Random forest,Erythromycin,Plasmid clusters binary
Mean score,0.886316,0.882629,0.885255,0.886316,Random forest,Erythromycin,Plasmid clusters binary
std,0.052166,0.04988,0.052873,0.052166,Random forest,Erythromycin,Plasmid clusters binary


In [68]:
plasmid_vnco_fs = both_vnco_ML_plasmids_binary[1]
plasmid_doxy_fs = both_doxy_ML_plasmids_binary[1]
plasmid_eryth_fs = both_eryth_ML_plasmids_binary[1]

In [92]:
vnco_features = plasmid_vnco_fs[0].drop(columns=['Antibiotic'])\
                .merge(plasmid_vnco_fs[1].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_vnco_fs[2].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_vnco_fs[3].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_vnco_fs[4],on='features')
doxy_features = plasmid_doxy_fs[0].drop(columns=['Antibiotic'])\
                .merge(plasmid_doxy_fs[1].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_doxy_fs[2].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_doxy_fs[3].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_doxy_fs[4],on='features')
eryth_features = plasmid_eryth_fs[0].drop(columns=['Antibiotic'])\
                .merge(plasmid_eryth_fs[1].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_eryth_fs[2].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_eryth_fs[3].drop(columns=['Antibiotic']) ,on='features')\
                .merge(plasmid_eryth_fs[4],on='features')

  validate=validate,


In [95]:
vnco_features.to_csv('vnco_plasmid_features_June2023.csv', index=None)
doxy_features.to_csv('doxy_plasmid_features_June2023.csv', index=None)
eryth_features.to_csv('eryth_plasmid_features_June2023.csv', index=None)

In [24]:
plasmid_van_ml_misclassified = true_pred(prim_binary_Xy_Vancomycin.iloc[:,0:108] ,prim_binary_Xy_Vancomycin[['Vancomycin']], gb_clf)
plasmid_dox_ml_misclassified = true_pred(prim_binary_Xy_Doxycycline.iloc[:,0:108] ,prim_binary_Xy_Doxycycline[['Doxycycline']], gb_clf) 
plasmid_erth_ml_misclassified = true_pred(prim_binary_Xy_Erythromycin.iloc[:,0:108] ,prim_binary_Xy_Erythromycin[['Erythromycin']], rf_clf) 

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


The misclassified isolates for Vancomycin resistance prediction is stated in a below list:
Index(['SRR14010961', 'SRR14010945', 'SRR14011027', 'SRR14010955',
       'SRR14000612', 'SRR14026521', 'SRR14011017', 'SRR14026519',
       'SRR14026554', 'SRR14010949', 'SRR14026552', 'SRR14011012',
       'SRR14010952', 'SRR14026490', 'SRR14010998', 'SRR14026555',
       'SRR14026517', 'SRR14000586'],
      dtype='object', name='GenomeID')


The prediction and true labels for Vancomycin is:


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


The misclassified isolates for Doxycycline resistance prediction is stated in a below list:
Index(['SRR14011038', 'SRR14010933', 'SRR13999992', 'SRR14024964',
       'SRR13726510', 'SRR13999989', 'SRR14000587', 'SRR13999997',
       'SRR13712374', 'SRR13726594', 'SRR14000007', 'SRR13712391',
       'SRR14026508', 'SRR13712384', 'SRR14000016', 'SRR13727027',
       'SRR14000018', 'SRR14025003', 'SRR13727008', 'SRR14010994',
       'SRR14026531', 'SRR14010992', 'SRR14026516', 'SRR14000614',
       'SRR14026485', 'SRR14022782', 'SRR13725695', 'SRR14026533',
       'SRR13712385', 'SRR14024995', 'SRR14024999', 'SRR14026500',
       'SRR13727032', 'SRR14024979', 'SRR13712376', 'SRR14026512',
       'SRR14000027', 'SRR13712363', 'SRR14000020', 'SRR14000005',
       'SRR14022769', 'SRR13999978', 'SRR14022771', 'SRR14000595',
       'SRR14026461', 'SRR14026456', 'SRR14026451', 'SRR14000025',
       'SRR14026484', 'SRR13999986', 'SRR13712383', 'SRR13999990',
       'SRR13712366', 'SRR14026513', 



The misclassified isolates for Erythromycin resistance prediction is stated in a below list:
Index(['SRR14026538', 'SRR14022776', 'SRR14024949', 'SRR14022782',
       'SRR14024952', 'SRR14022754', 'SRR14024995', 'SRR13712392',
       'SRR14026501', 'SRR13712373', 'SRR14000003', 'SRR14024973',
       'SRR14026445', 'SRR13726527', 'SRR13712364', 'SRR14022771',
       'SRR13999995', 'SRR14026460', 'SRR14026454', 'SRR13712368',
       'SRR14026526', 'SRR14026448', 'SRR13712494', 'SRR13999967',
       'SRR13712374', 'SRR13727001', 'SRR13712362', 'SRR13727013',
       'SRR13725723', 'SRR13712366', 'SRR14026512', 'SRR13726535',
       'SRR14000027', 'SRR14026545', 'SRR14026514', 'SRR14024985',
       'SRR14026492', 'SRR14024958', 'SRR13999974', 'SRR14026500',
       'SRR14026508', 'SRR13727015', 'SRR14026546', 'SRR14026447',
       'SRR14024959', 'SRR13712361', 'SRR13999930', 'SRR13727021',
       'SRR14022765', 'SRR14026472', 'SRR13999960', 'SRR14026513',
       'SRR14000018', 'SRR14000005']



In [28]:
plasmid_van_prediction_all = plasmid_van_ml_misclassified.reset_index().rename(columns={'GenomeID':'Run_accession'})
plasmid_dox_prediction_all = plasmid_dox_ml_misclassified.reset_index().rename(columns={'GenomeID':'Run_accession'})
plasmid_erth_prediction_all = plasmid_erth_ml_misclassified.reset_index().rename(columns={'GenomeID':'Run_accession'})

In [34]:
plasmid_van_prediction_all_species_ab = pd.merge(plasmid_van_prediction_all, meta[['Run_accession','Species']], on="Run_accession")
plasmid_van_prediction_all_species_ab['antibiotic']= 'Vancomycin' 

plasmid_dox_prediction_all_species_ab = pd.merge(plasmid_dox_prediction_all, meta[['Run_accession','Species']], on="Run_accession")
plasmid_dox_prediction_all_species_ab['antibiotic']= 'Doxycycline' 

plasmid_erth_prediction_all_species_ab = pd.merge(plasmid_erth_prediction_all, meta[['Run_accession','Species']], on="Run_accession")
plasmid_erth_prediction_all_species_ab['antibiotic']= 'Erythromycin' 

In [39]:
pd.concat([plasmid_van_prediction_all_species_ab, plasmid_dox_prediction_all_species_ab, 
           plasmid_erth_prediction_all_species_ab]).to_csv('plasmid_clusters_prediction_all.csv',index=None)


---

In [55]:
plasmid_van_ml_misclassified_df = plasmid_van_ml_misclassified.loc[~(plasmid_van_ml_misclassified['y_true'] == plasmid_van_ml_misclassified['y_pred'])]
plasmid_dox_ml_misclassified_df = plasmid_dox_ml_misclassified.loc[~(plasmid_dox_ml_misclassified['y_true'] == plasmid_dox_ml_misclassified['y_pred'])]
plasmid_erth_ml_misclassified_df = plasmid_erth_ml_misclassified.loc[~(plasmid_erth_ml_misclassified['y_true'] == plasmid_erth_ml_misclassified['y_pred'])]

In [58]:
plasmid_van_ml_misclassified_df['ab'] = 'vancomycin'
plasmid_dox_ml_misclassified_df['ab'] = 'doxycycline'
plasmid_erth_ml_misclassified_df['ab'] = 'erythromycin'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [61]:
plasmid_misclassified_df = pd.concat([plasmid_van_ml_misclassified_df, plasmid_dox_ml_misclassified_df, plasmid_erth_ml_misclassified_df])
# plasmid_misclassified_df.to_csv('plasmid_clusters_misclassified_df.csv')
plasmid_misclassified_df

Unnamed: 0_level_0,y_true,y_pred,ab
GenomeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRR14010961,1,0,vancomycin
SRR14010945,1,0,vancomycin
SRR14011027,1,0,vancomycin
SRR14010955,1,0,vancomycin
SRR14000612,1,0,vancomycin
...,...,...,...
SRR14026472,0,1,erythromycin
SRR13999960,1,0,erythromycin
SRR14026513,1,0,erythromycin
SRR14000018,0,1,erythromycin


In [65]:
mob_all[mob_all['Run_accession'].isin(plasmid_misclassified_df.index)]#.to_csv('plasmid_annot_misclassified_cases.csv',index=None)

Unnamed: 0,Run_accession,molecule_type,primary_cluster_id,secondary_cluster_id,contig_id,size,gc,md5,circularity_status,rep_type(s),...,mpf_type_accession(s),orit_type(s),orit_accession(s),predicted_mobility,mash_nearest_neighbor,mash_neighbor_distance,mash_neighbor_identification,repetitive_dna_id,repetitive_dna_type,filtering_reason
1404,SRR14000003,chromosome,-,-,1_length=392667_depth=1.00x,392667,37.061938,f98cb87350501d5c8649990c19ef1c3f,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
1405,SRR14000003,chromosome,-,-,2_length=284894_depth=0.95x,284894,36.014447,e5cc961378dab62e2929ef99e3374834,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
1406,SRR14000003,chromosome,-,-,3_length=270251_depth=1.10x,270251,38.543798,0a776cac048e7b45136eb155c099da97,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
1407,SRR14000003,chromosome,-,-,4_length=223660_depth=0.88x,223660,37.600376,fa13d04659f671947174b00532fb4220,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
1408,SRR14000003,chromosome,-,-,5_length=185026_depth=0.88x,185026,37.572017,99af63663ce95aa3b997b3a75ea35655,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69342,SRR13999960,chromosome,-,-,43_length=293_depth=4.37x,293,39.931741,f0b18d99b03e94d0e2db114b653fd6ed,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
69343,SRR13999960,chromosome,-,-,44_length=232_depth=2.07x,232,31.465517,a5bb3f544408b0b59a7e8a3c4d82e87e,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
69344,SRR13999960,chromosome,-,-,45_length=137_depth=3.26x,137,51.094891,76575ecc7fe119d9adbf384e03a7bc16,incomplete,-,...,-,-,-,-,-,-,-,-,-,-
69345,SRR13999960,chromosome,-,-,46_length=124_depth=3.83x,124,43.548387,90d1820d2597529375bdd009653a0b10,incomplete,-,...,-,-,-,-,-,-,-,-,-,-


In [16]:
mobtyper647 = pd.read_csv('/home/jee/conjugation_study/short_read_mobtyper647_df.csv')
# display(mobtyper647.columns)
mobtyper647[mobtyper647['primary_cluster_id'].str.contains('AC733', na=None, case=None)]['mash_nearest_neighbor'].unique()
# [['sample_id', 'num_contigs', 'size', 'gc', 'predicted_mobility','mash_nearest_neighbor', 'mash_neighbor_distance',
#   'mash_neighbor_identification', 'primary_cluster_id', 'secondary_cluster_id', 'Run_accession']]

array(['CP003586', 'CP019993', 'CP019209', 'CP027498', 'CP044275',
       'CP018066'], dtype=object)