In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict

In [2]:
# Create a dataframe for the output that have all the paths
master_df= pd.read_csv('CAMIH1_Results/CAMIH1Categories.tsv', sep='\t')

In [3]:
master_df.columns

Index(['Unnamed: 0', 'Query', 'Path_Bandage', 'Start_Bandage', 'End_Bandage',
       'Extracted_Path', 'Start_SPAligner', 'End_SPAligner',
       'Length_SPAligner', 'Path_SPAligner', 'Path_GraphAligner',
       'Start_GraphAligner', 'End_GraphAligner'],
      dtype='object')

In [4]:
master_df.dtypes

Unnamed: 0              int64
Query                  object
Path_Bandage           object
Start_Bandage          object
End_Bandage            object
Extracted_Path         object
Start_SPAligner        object
End_SPAligner          object
Length_SPAligner       object
Path_SPAligner         object
Path_GraphAligner      object
Start_GraphAligner    float64
End_GraphAligner      float64
dtype: object

In [5]:
#Fill the empty column with 0
master_df= master_df.fillna(000)

In [6]:
 #List of column names that are float to convert to integers
#columns_to_convert = ['Start_SPAligner', 'End_SPAligner', 'Start_GraphAligner','End_GraphAligner']
columns_to_convert = ['Start_GraphAligner','End_GraphAligner']
# Convert the selected columns from float to integer data type
master_df[columns_to_convert] = master_df[columns_to_convert].astype(int)

In [7]:
master_df.dtypes

Unnamed: 0             int64
Query                 object
Path_Bandage          object
Start_Bandage         object
End_Bandage           object
Extracted_Path        object
Start_SPAligner       object
End_SPAligner         object
Length_SPAligner      object
Path_SPAligner        object
Path_GraphAligner     object
Start_GraphAligner     int64
End_GraphAligner       int64
dtype: object

In [8]:
master_df[columns_to_convert] = master_df[columns_to_convert].astype(str)

In [9]:
master_df.dtypes

Unnamed: 0             int64
Query                 object
Path_Bandage          object
Start_Bandage         object
End_Bandage           object
Extracted_Path        object
Start_SPAligner       object
End_SPAligner         object
Length_SPAligner      object
Path_SPAligner        object
Path_GraphAligner     object
Start_GraphAligner    object
End_GraphAligner      object
dtype: object

In [10]:
# Remove the signs "-" and "+" from the values in the Path_SPAligner
master_df['Path_SPAligner'] = master_df['Path_SPAligner'].str.replace(r'[+-]', '', regex=True)


In [11]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner
0,0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877
1,1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,"(47) 3478183-, 50639855- (37) , (47) 3478183-,...","47, 47","37, 37","3478183, 50639855,3478183, 36393793",46,852,806,3478183,3478183,47,907
2,2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,(8) 54829473- (481),8,481,54829473,7,426,419,54829473,54829473,8,481
3,3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877
4,4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877
...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0,0,82,812,730,3478183,0,0,0
703,703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,0,870,870,3478183,0,0,0
704,704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0,0,82,812,730,3478183,0,0,0
705,705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0,0,82,812,730,3478183,0,0,0


## This section compares the output between Bandage and SPAligner

In [12]:
def compare_values_Bandage_SPAligner(row):
    if row['Start_Bandage'] == row['Start_SPAligner'] and row['End_Bandage'] == row['End_SPAligner'] and row['Extracted_Path'] == row['Path_SPAligner']:
        return 'Full'
    elif row['Extracted_Path'] == row['Path_SPAligner']and row['Start_Bandage'] != row['Start_SPAligner'] and row['End_Bandage'] != row['End_SPAligner']:
        return 'MatchOnPath'
    elif row['Extracted_Path'] == row['Path_SPAligner'] and row['Start_Bandage'] == row['Start_SPAligner'] and row['End_Bandage'] != row['End_SPAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Extracted_Path'] == row['Path_SPAligner'] and row['Start_Bandage'] != row['Start_SPAligner'] and row['End_Bandage'] == row['End_SPAligner'] :
        return 'MatchOnPathAndEnd'
    else:
        return 'Different'

In [13]:
master_df['BandageVSSPAligner'] = master_df.apply(compare_values_Bandage_SPAligner, axis=1)

In [14]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [15]:
output_list = []

for index, row in master_df.iterrows():
    path_values = str(row['Extracted_Path']).split(',')
    start_values = str(row['Start_Bandage']).split(',')
    end_values = str(row['End_Bandage']).split(',')
    
    path_value = row['Path_SPAligner']
    start_value= str(row['Start_SPAligner']).split(',')
    end_value= str(row['End_SPAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list.append('SingleStartMatch')
    else:
         output_list.append('Different')

In [16]:

master_df['ResultsBandageVSSPAligner'] = output_list

In [17]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,BandageVSSPAligner,ResultsBandageVSSPAligner
0,0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch
1,1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,"(47) 3478183-, 50639855- (37) , (47) 3478183-,...","47, 47","37, 37","3478183, 50639855,3478183, 36393793",46,852,806,3478183,3478183,47,907,Different,SinglePathMatch
2,2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,(8) 54829473- (481),8,481,54829473,7,426,419,54829473,54829473,8,481,MatchOnPath,SinglePathMatch
3,3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch
4,4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different
703,703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,0,870,870,3478183,0,0,0,Different,Different
704,704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different
705,705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different


In [18]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

## Compare the results between SPAligner and GraphAligner

In [19]:
def compare_values_GraphAligner_SPAligner(row):
    if row['Start_SPAligner'] == row['Start_GraphAligner'] and row['End_SPAligner'] == row['End_GraphAligner'] and row['Path_SPAligner'] == row['Path_GraphAligner']:
        return 'Full'
    elif row['Path_SPAligner'] == row['Path_GraphAligner']and row['Start_SPAligner'] != row['Start_GraphAligner']and row['End_SPAligner'] == row['End_GraphAligner'] :
        return 'MatchOnPathAndEnd'
    elif row['Path_SPAligner'] == row['Path_GraphAligner'] and row['Start_SPAligner'] == row['Start_GraphAligner'] and row['End_SPAligner'] != row['End_GraphAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Path_SPAligner'] == row['Path_GraphAligner'] and row['Start_SPAligner'] != row['Start_GraphAligner'] and row['End_SPAligner'] != row['End_GraphAligner'] :
        return 'MatchOnPath'
    else:
        return 'Different'

In [20]:
master_df['SPAlignerVSGraphAligner'] = master_df.apply(compare_values_GraphAligner_SPAligner, axis=1)

In [21]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [22]:
output_list2 = []

for index, row in master_df.iterrows():
    path_values = str(row['Path_GraphAligner']).split(',')
    start_values = str(row['Start_GraphAligner']).split(',')
    end_values = str(row['End_GraphAligner']).split(',')
    
    path_value = row['Path_SPAligner']
    start_value= str(row['Start_SPAligner']).split(',')
    end_value= str(row['End_SPAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list2.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list2.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list2.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list2.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list2.append('SingleStartMatch')
    else:
         output_list2.append('Different')



In [23]:
master_df['ResultsSPAlignerVSGraphAligner'] = output_list2

In [24]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,BandageVSSPAligner,ResultsBandageVSSPAligner,SPAlignerVSGraphAligner,ResultsSPAlignerVSGraphAligner
0,0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch,MatchOnPath,SinglePathMatch
1,1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,"(47) 3478183-, 50639855- (37) , (47) 3478183-,...","47, 47","37, 37","3478183, 50639855,3478183, 36393793",46,852,806,3478183,3478183,47,907,Different,SinglePathMatch,MatchOnPath,SinglePathMatch
2,2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,(8) 54829473- (481),8,481,54829473,7,426,419,54829473,54829473,8,481,MatchOnPath,SinglePathMatch,MatchOnPath,SinglePathMatch
3,3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch,MatchOnPath,SinglePathMatch
4,4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,(17) 1612535+ (877),17,877,1612535,16,822,806,1612535,1612535,17,877,MatchOnPath,SinglePathMatch,MatchOnPath,SinglePathMatch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different,Different,Different
703,703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,0,870,870,3478183,0,0,0,Different,Different,Different,Different
704,704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different,Different,Different
705,705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0,0,82,812,730,3478183,0,0,0,Different,Different,Different,Different


In [25]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [26]:
def compare_values_Bandage_GraphAligner(row):
    if row['Start_Bandage'] == row['Start_GraphAligner'] and row['End_Bandage'] == row['End_GraphAligner'] and row['Extracted_Path'] == row['Path_GraphAligner']:
        return 'Full'
    elif row['Extracted_Path'] == row['Path_GraphAligner']  and row['Start_Bandage'] != row['Start_GraphAligner'] and row['End_Bandage'] != row['End_GraphAligner'] :
        return 'MatchOnPath'
    elif row['Extracted_Path'] == row['Path_GraphAligner'] and row['Start_Bandage'] == row['Start_GraphAligner'] and row['End_Bandage'] != row['End_GraphAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Extracted_Path'] == row['Path_GraphAligner'] and row['Start_Bandage'] != row['Start_GraphAligner'] and row['End_Bandage'] == row['End_GraphAligner'] :
        return 'MatchOnPathAndEnd'
    else:
        return 'Different'

In [27]:
master_df['BandageVSGraphAligner'] = master_df.apply(compare_values_Bandage_GraphAligner, axis=1)

In [28]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [29]:
output_list3 = []

for index, row in master_df.iterrows():
    path_values = str(row['Extracted_Path']).split(',')
    start_values = str(row['Start_Bandage']).split(',')
    end_values = str(row['End_Bandage']).split(',')
    
    path_value = row['Path_GraphAligner']
    start_value= str(row['Start_GraphAligner']).split(',')
    end_value= str(row['End_GraphAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list3.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list3.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list3.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list3.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list3.append('SingleStartMatch')
    else:
         output_list3.append('Different')

In [30]:
master_df['ResultsBandageVSGraphAligner'] = output_list3

In [31]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [32]:
def determine_final_result_1(row):
    if row['BandageVSGraphAligner'] == 'Full':
        return 'Full'
    else:
        return row['ResultsBandageVSGraphAligner']

In [33]:
master_df['FinalResultBandageVSGraphAligner'] = master_df.apply(determine_final_result_1, axis=1)

In [34]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)