## This notebook extracts the unique sequences from the GraphAligner results.

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict
import gfapy

In [2]:
# Open the GFA file
# The graph is loaded with gfapy from a path relative to the notebook's working directory.
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segments found in the loaded graph.
num_segments = len(gfa.segments)

In [3]:
# Per-segment records ({"Name": ..., "Sequence": ...} dicts); populated by the next cell.
data_graph = []

In [4]:
# Store the name and sequence for each node (segment) of the graph.
# The list is rebuilt from scratch in this cell instead of appending to a list
# created in another cell, so re-running the cell does not duplicate the rows.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame. Naming the columns explicitly keeps the
# "Name" and "Sequence" columns present even when the graph has no segments.
df_graph = pd.DataFrame(data_graph, columns=["Name", "Sequence"])

# Print the DataFrame
print(df_graph)

       Name                                           Sequence
0      1321  CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1      1323  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2      1325  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3     32989  CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4       565  GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...     ...                                                ...
2524  37173  GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525  24893  TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526  36779  TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527   6673  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
2528  37823  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...

[2529 rows x 2 columns]


In [5]:
# Inspect the sequence column of the graph table.
df_graph['Sequence']

0       CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1       GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2       GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3       CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4       GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
                              ...                        
2524    GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525    TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526    TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527    CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
2528    CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
Name: Sequence, Length: 2529, dtype: object

In [6]:
# Keep only one copy of each row. With the default arguments, rows count as
# duplicates only when every column (Name and Sequence) is identical.
df_graph=df_graph.drop_duplicates()

In [7]:
# Load the tab-separated Bandage / GraphAligner comparison table.
# NOTE(review): the original comment said "GraphAlignerNotEqual.tsv is from another
# script: GraphAlignerBandageScript.ipynb", but the file read here is
# SequencesBandageGraphAligner.tsv — confirm which file that notebook produces.
df_GraphAligner_Paths= pd.read_csv('111_GraphResults/SequencesBandageGraphAligner.tsv', sep='\t')

In [8]:
# Display the loaded comparison table.
df_GraphAligner_Paths

Unnamed: 0,Query,Path_Bandage_x,Start_Bandage_x,End_Bandage_x,Extracted_Path_x,Start_GraphAligner,End_GraphAligner,Path_GraphAligne,FinalResultBandageVSGraphAligner,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,FinalResultBandageVSGraphAligner.1,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,SPAlignerVSGraphAligner
0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,(24) 5151- (884),24,884,5151,24,884,5151,Full,,...,,,Full,,,,,,,Different
1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,(37) 5967+ (897),37,897,5967,37,897,5967,Full,,...,,,Full,,,,,,,Full
2,gb|AB302939|+|8-869|ARO:3001115|SHV-60,(24) 5151- (884),24,884,5151,24,884,5151,Full,,...,,,Full,,,,,,,Full
3,gb|AB372881|+|8-869|ARO:3001160|SHV-111,(24) 5151- (884),24,884,5151,24,884,5151,Full,,...,,,Full,,,,,,,Full
4,gb|AB551737|+|14-875|ARO:3001177|SHV-133,(24) 5151- (884),24,884,5151,24,884,5151,Full,,...,,,Full,,,,,,,Full
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,gb|Y14574|+|0-861|ARO:3000888|TEM-17,(37) 5967+ (897),37,897,5967,37,897,5967,Full,,...,,,Full,,,,,,,Full
374,gb|Y17581|+|78-936|ARO:3000891|TEM-20,(37) 5967+ (894),37,894,5967,37,894,5967,Full,,...,,,Full,,,,,,,Full
375,gb|Y17582|+|0-858|ARO:3000892|TEM-21,(37) 5967+ (894),37,894,5967,37,894,5967,Full,,...,,,Full,,,,,,,Full
376,gb|Y17583|+|213-1071|ARO:3000893|TEM-22,(37) 5967+ (894),37,894,5967,37,894,5967,Full,,...,,,Full,,,,,,,Full


In [9]:
# Keep only the queries whose Bandage-vs-GraphAligner result is not 'Full',
# then list the columns of the filtered table.
selected_rows_path = df_GraphAligner_Paths.loc[
    df_GraphAligner_Paths['FinalResultBandageVSGraphAligner'].ne('Full')
]
selected_rows_path.columns

Index(['Query', 'Path_Bandage_x', 'Start_Bandage_x', 'End_Bandage_x',
       'Extracted_Path_x', 'Start_GraphAligner', 'End_GraphAligner',
       'Path_GraphAligne', 'FinalResultBandageVSGraphAligner', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'FinalResultBandageVSGraphAligner.1',
       'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Unnamed: 21', 'Unnamed: 22', 'SPAlignerVSGraphAligner'],
      dtype='object')

In [10]:
# Copy of the filtered table without the 'Unnamed: N' columns and without the
# duplicated 'FinalResultBandageVSGraphAligner.1' column.
selected_rows = selected_rows_path.drop(
    columns=[
        'Unnamed: 9',
        'Unnamed: 10',
        'Unnamed: 11',
        'Unnamed: 12',
        'Unnamed: 13',
        'Unnamed: 14',
        'Unnamed: 15',
        'FinalResultBandageVSGraphAligner.1',
        'Unnamed: 17',
        'Unnamed: 18',
        'Unnamed: 19',
        'Unnamed: 20',
        'Unnamed: 21',
        'Unnamed: 22',
    ]
)


In [24]:
# Display the filtered table.
# NOTE(review): this cell's execution count (24) is out of order. The recorded
# output has no 'Unnamed: N' columns and already includes 'Path_GraphAligner',
# i.e. it reflects cells that appear further down the notebook.
selected_rows_path


Unnamed: 0,Query,Path_Bandage_x,Start_Bandage_x,End_Bandage_x,Extracted_Path_x,Start_GraphAligner,End_GraphAligner,Path_GraphAligne,FinalResultBandageVSGraphAligner,SPAlignerVSGraphAligner,Path_GraphAligner
10,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591
26,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,"(1700) 7591+ (2479) , (732) 7593+ (1498)","1700, 732","2479, 1498",75917593,1700,2479,7591,SinglePathFull,Full,7591
57,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7,(73) 5967+ (857),73,857,5967,0,0,0,Different,MatchOnPathAndEnd,0
63,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,"(712) 7593+ (1500) , (1688) 7591+ (2477)","712, 1688","1500, 2477",75937591,712,1500,7593,SinglePathFull,Full,7593
77,gb|AJ809407|+|118-898|ARO:3002620|aadA23,"(732) 7593+ (1500) , (1700) 7591+ (2477)","732, 1700","1500, 2477",75937591,732,1500,7593,SinglePathFull,MatchOnPathAndEnd,7593
84,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591
102,gb|AM261837|+|73-865|ARO:3002619|aadA22,"(712) 7593+ (1500) , (1688) 7591+ (2464)","712, 1688","1500, 2464",75937591,712,1500,7593,SinglePathFull,Full,7593
126,gb|AY130282|+|0-764|ARO:3000980|TEM-117,(73) 5967+ (836),73,836,5967,0,0,0,Different,MatchOnPathAndEnd,0
127,gb|AY130284|+|0-785|ARO:3000941|TEM-75,(73) 5967+ (857),73,857,5967,0,0,0,Different,MatchOnPathAndEnd,0
128,gb|AY130285|+|0-785|ARO:3000981|TEM-118,(73) 5967+ (857),73,857,5967,0,0,0,Different,MatchOnPathAndEnd,0


In [13]:
# Remove the 'Unnamed: N' columns and the duplicated
# 'FinalResultBandageVSGraphAligner.1' column from the filtered table.
# errors='ignore' makes the cell safe to re-run: the frame is reassigned to the
# same name, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
selected_rows_path = selected_rows_path.drop(
    columns=[
        'Unnamed: 9',
        'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
        'Unnamed: 14', 'Unnamed: 15', 'FinalResultBandageVSGraphAligner.1',
        'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
        'Unnamed: 21', 'Unnamed: 22',
    ],
    errors='ignore',
)

In [14]:
# Check the column dtypes of the filtered table.
selected_rows_path.dtypes

Query                               object
Path_Bandage_x                      object
Start_Bandage_x                     object
End_Bandage_x                       object
Extracted_Path_x                    object
Start_GraphAligner                   int64
End_GraphAligner                     int64
Path_GraphAligne                     int64
FinalResultBandageVSGraphAligner    object
SPAlignerVSGraphAligner             object
dtype: object

In [17]:
# Add a string version of the numeric 'Path_GraphAligne' column; it is the key
# used to match rows against the graph segment names in the merge below.
# `assign` returns a new frame instead of writing into the existing one, which
# avoids pandas' SettingWithCopyWarning on a frame that came from a boolean filter.
selected_rows_path = selected_rows_path.assign(
    Path_GraphAligner=selected_rows_path['Path_GraphAligne'].astype(str)
)

In [18]:
# Attach the node sequence to every row by matching the GraphAligner path
# (as a string) against the graph segment name. This is an inner join, so rows
# whose path has no matching segment name are not kept.
merged_df_Path1 = selected_rows_path.merge(
    df_graph,
    how='inner',
    left_on='Path_GraphAligner',
    right_on='Name',
)
merged_df_Path1

Unnamed: 0,Query,Path_Bandage_x,Start_Bandage_x,End_Bandage_x,Extracted_Path_x,Start_GraphAligner,End_GraphAligner,Path_GraphAligne,FinalResultBandageVSGraphAligner,SPAlignerVSGraphAligner,Path_GraphAligner,Name,Sequence
0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,"(1700) 7591+ (2479) , (732) 7593+ (1498)","1700, 732","2479, 1498",75917593,1700,2479,7591,SinglePathFull,Full,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
2,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
3,gb|AY139603|+|106-898|ARO:3002608|aadA8,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
4,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
5,gb|FJ460181|+|1790-2582|ARO:3002617|aadA17,"(1688) 7591+ (2479) , (712) 7593+ (1498)","1688, 712","2479, 1498",75917593,1688,2479,7591,SinglePathFull,Full,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
6,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,"(712) 7593+ (1500) , (1688) 7591+ (2477)","712, 1688","1500, 2477",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
7,gb|AJ809407|+|118-898|ARO:3002620|aadA23,"(732) 7593+ (1500) , (1700) 7591+ (2477)","732, 1700","1500, 2477",75937591,732,1500,7593,SinglePathFull,MatchOnPathAndEnd,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
8,gb|AM261837|+|73-865|ARO:3002619|aadA22,"(712) 7593+ (1500) , (1688) 7591+ (2464)","712, 1688","1500, 2464",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
9,gb|AY171244|+|46-838|ARO:3002618|aadA21,"(712) 7593+ (1500) , (1688) 7591+ (2477)","712, 1688","1500, 2477",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [19]:
def extract_sequences_Path1(dataframe):
    """Cut the aligned region out of each row's node sequence.

    For every row, the substring of ``Sequence`` from ``Start_GraphAligner``
    (treated as 1-based, inclusive) to ``End_GraphAligner`` (inclusive) is
    stored in a ``Result_node`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Start_GraphAligner``, ``End_GraphAligner``
        and ``Sequence``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``Result_node`` column is
        added (or overwritten) in place.
    """
    extracted = []
    for start_pos, end_pos, sequence in zip(
        dataframe['Start_GraphAligner'],
        dataframe['End_GraphAligner'],
        dataframe['Sequence'],
    ):
        # Clamp at 0: a start of 0 would otherwise give a slice start of -1,
        # which Python interprets as "last character" rather than "beginning".
        begin = max(int(start_pos) - 1, 0)
        extracted.append(sequence[begin:int(end_pos)])

    # Assign positionally as a whole column: this creates 'Result_node' even for
    # an empty frame and stays correct when index labels are not unique
    # (label-based .loc assignment would overwrite every row sharing a label).
    dataframe['Result_node'] = extracted
    return dataframe

In [22]:
# Add the 'Result_node' column (the aligned part of each node sequence).
# The function writes into the frame it receives and returns that same frame,
# so merged_df_Path1 and extracted_sequences_Path1_df refer to the same object.
extracted_sequences_Path1_df = extract_sequences_Path1(merged_df_Path1)
extracted_sequences_Path1_df


Unnamed: 0,Query,Path_Bandage_x,Start_Bandage_x,End_Bandage_x,Extracted_Path_x,Start_GraphAligner,End_GraphAligner,Path_GraphAligne,FinalResultBandageVSGraphAligner,SPAlignerVSGraphAligner,Path_GraphAligner,Name,Sequence,Result_node
0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,ATGAGGGTAGCGGTGACCATCGAAATTTCGAACCAACTATCAGAGG...
1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,"(1700) 7591+ (2479) , (732) 7593+ (1498)","1700, 732","2479, 1498",75917593,1700,2479,7591,SinglePathFull,Full,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,GTGACCATCGAAATTTCGAACCAACTATCAGAGGTGCTAAGCGTCA...
2,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,ATGAGGGTAGCGGTGACCATCGAAATTTCGAACCAACTATCAGAGG...
3,gb|AY139603|+|106-898|ARO:3002608|aadA8,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,ATGAGGGTAGCGGTGACCATCGAAATTTCGAACCAACTATCAGAGG...
4,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,"(1688) 7591+ (2477) , (712) 7593+ (1500)","1688, 712","2477, 1500",75917593,1688,2477,7591,SinglePathFull,MatchOnPathAndStart,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,ATGAGGGTAGCGGTGACCATCGAAATTTCGAACCAACTATCAGAGG...
5,gb|FJ460181|+|1790-2582|ARO:3002617|aadA17,"(1688) 7591+ (2479) , (712) 7593+ (1498)","1688, 712","2479, 1498",75917593,1688,2479,7591,SinglePathFull,Full,7591,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...,ATGAGGGTAGCGGTGACCATCGAAATTTCGAACCAACTATCAGAGG...
6,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,"(712) 7593+ (1500) , (1688) 7591+ (2477)","712, 1688","1500, 2477",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...
7,gb|AJ809407|+|118-898|ARO:3002620|aadA23,"(732) 7593+ (1500) , (1700) 7591+ (2477)","732, 1700","1500, 2477",75937591,732,1500,7593,SinglePathFull,MatchOnPathAndEnd,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,CGAAGTATCGACTCAACTATCAGAGGTAGTTGGCGTCATCGAGCGC...
8,gb|AM261837|+|73-865|ARO:3002619|aadA22,"(712) 7593+ (1500) , (1688) 7591+ (2464)","712, 1688","1500, 2464",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...
9,gb|AY171244|+|46-838|ARO:3002618|aadA21,"(712) 7593+ (1500) , (1688) 7591+ (2477)","712, 1688","1500, 2477",75937591,712,1500,7593,SinglePathFull,Full,7593,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...


In [23]:
# Save the dataframe to a tab-separated file (TSV); the index is written as the first column.
extracted_sequences_Path1_df.to_csv("Path1_TestSequences.tsv", sep="\t")