# Load

In [1]:
import numpy as np
import pandas as pd
import ecocyc_parse as ecops
from Bio import SeqIO

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [15]:
# reading ecocyc .dat file
gene_dat_file = "DataFiles/Ecocyc/genes.dat"
gene_dat = ecops.parse_dat(gene_dat_file)
gene_dat.head(3)

Unnamed: 0,UNIQUE-ID,TYPES,COMMON-NAME,ACCESSION-1,CENTISOME-POSITION,COMPONENT-OF,DBLINKS,LEFT-END-POSITION,PRODUCT,RIGHT-END-POSITION,SYNONYMS,TRANSCRIPTION-DIRECTION
0,GCQD-1687,Unclassified-Genes,marR,ECB_01489,34.27539,CHROMOSOME-1-29 -- TUCQD-7237,"(ECOCYC ""EG11435"" ORTHOLOG |ong| 3588961783 NI...",1586886,GCQD-1687-MONOMER,1587320,cfxB -- inaR -- soxQ,+
1,GCQD-3219,Unclassified-Genes,garD,ECB_02993,69.294624,CHROMOSOME-1-60 -- TUCQD-8061,"(ECOCYC ""EG12522"" ORTHOLOG |ong| 3588961773 NI...",3208211,GCQD-3219-MONOMER,3209782,yhaG,+
2,GCQD-2362,ORFs,yfaS,ECB_02154,49.170418,CHROMOSOME-1-42 -- TUCQD-7620,,2276498,GCQD-2362-MONOMER,2281102,,-


In [16]:
# reading ecocyc .col file for gene
gene_col_file = "DataFiles/Ecocyc/genes.col"
gene_col = ecops.parse_col(gene_col_file)
gene_col.head(3)

Unnamed: 0,UNIQUE-ID,NAME,PRODUCT-NAME,SWISS-PROT-ID,REPLICON,START-BASE,END-BASE,SYNONYMS,SYNONYMS.1,SYNONYMS.2,SYNONYMS.3,GENE-CLASS,GENE-CLASS.1,GENE-CLASS.2,GENE-CLASS.3
0,GCQD-1687,marR,MarR transcriptional repressor,C6UHT4,CHROMOSOME-1,1586886,1587320,cfxB,inaR,soxQ,,UNCLASSIFIED,,,
1,GCQD-3219,garD,(D)-galactarate dehydrogenase,C6UEH4,CHROMOSOME-1,3208211,3209782,yhaG,,,,UNCLASSIFIED,,,
2,GCQD-2362,yfaS,orf,C6ULN9,CHROMOSOME-1,2281102,2276498,,,,,,,,


In [17]:
gene_col.shape

(4310, 15)

In [18]:
# merge tow dataframe to get the ECB# 
gene_col = pd.merge(
    left = gene_col, 
    right = gene_dat[["UNIQUE-ID", "ACCESSION-1","TRANSCRIPTION-DIRECTION"]],
    on = "UNIQUE-ID",
    how = "left"
)
gene_col.head(3)

Unnamed: 0,UNIQUE-ID,NAME,PRODUCT-NAME,SWISS-PROT-ID,REPLICON,START-BASE,END-BASE,SYNONYMS,SYNONYMS.1,SYNONYMS.2,SYNONYMS.3,GENE-CLASS,GENE-CLASS.1,GENE-CLASS.2,GENE-CLASS.3,ACCESSION-1,TRANSCRIPTION-DIRECTION
0,GCQD-1687,marR,MarR transcriptional repressor,C6UHT4,CHROMOSOME-1,1586886,1587320,cfxB,inaR,soxQ,,UNCLASSIFIED,,,,ECB_01489,+
1,GCQD-3219,garD,(D)-galactarate dehydrogenase,C6UEH4,CHROMOSOME-1,3208211,3209782,yhaG,,,,UNCLASSIFIED,,,,ECB_02993,+
2,GCQD-2362,yfaS,orf,C6ULN9,CHROMOSOME-1,2281102,2276498,,,,,,,,,ECB_02154,-


In [19]:
gene_col.shape

(4310, 17)

In [2]:
gb_file = "DataFiles/REL606-NC_012967.1.gb"
# read genbank file
gb_record = SeqIO.read(gb_file, "genbank")
# create a table with all ECB # - location - gene name

gene_location_list = []          # .location
gene_name_list = []              # .qualifiers["gene"]
gene_locus_tag_list = []         # .qualifiers["locus_tag"]
gene_old_locus_tag_list = []     # .qualifiers["old_locus_tag"]

cds_location_list = []           # .location
cds_name_list = []               # .qualifiers["gene"]
cds_locus_tag_list = []          # .qualifiers["locus_tag"]
cds_old_locus_tag_list = []      # .qualifiers["old_locus_tag"]
cds_product_list = []            # .qualifiers["product"]
cds_note_list = []               # .qualifiers["note"]

for feature in gb_record.features:
    #print(feature.location)
    
    if feature.type == "gene":
        gene_location = str(feature.location)
        keys = feature.qualifiers.keys()
        if "gene" in keys:
            gene_name = feature.qualifiers["gene"][0]
        else:
            gene_name = None
        if "locus_tag" in keys:
            gene_locus_tag = feature.qualifiers["locus_tag"][0]
        else:
            gene_locus_tag = None
        if "old_locus_tag" in keys:
            gene_old_locus_tag = feature.qualifiers["old_locus_tag"][0]
        else:
            gene_old_locus_tag = None
            
        gene_location_list.append(gene_location)
        gene_name_list.append(gene_name)
        gene_locus_tag_list.append(gene_locus_tag)
        gene_old_locus_tag_list.append(gene_old_locus_tag)
        
    elif feature.type == "CDS":
        cds_location = str(feature.location)
        keys = feature.qualifiers.keys()

        if "gene" in keys:
            cds_name = feature.qualifiers["gene"][0]
        else:
            cds_name = None
        if "locus_tag" in keys:
            cds_locus_tag = feature.qualifiers["locus_tag"][0]
        else:
            cds_locus_tag = None
        if "old_locus_tag" in keys:
            cds_old_locus_tag = feature.qualifiers["old_locus_tag"][0]
        else:
            cds_old_locus_tag = None
        if "product" in keys:
            cds_product = feature.qualifiers["product"][0]
        else:
            cds_product = None
        if "note" in keys:
            cds_note = feature.qualifiers["note"][0]
        else:
            cds_note = None
            
        cds_location_list.append(cds_location)
        cds_name_list.append(cds_name)
        cds_locus_tag_list.append(cds_locus_tag)
        cds_old_locus_tag_list.append(cds_old_locus_tag)
        cds_product_list.append(cds_product)
        cds_note_list.append(cds_note)

gene_df = pd.DataFrame(
    {
    "location":gene_location_list,
    "name":gene_name_list,
    "locus_tag":gene_locus_tag_list,
    "old_locus_tag":gene_old_locus_tag_list
    }
)

cds_df = pd.DataFrame(
    {
    "location":cds_location_list,
    "name":cds_name_list,
    "locus_tag":cds_locus_tag_list,
    "old_locus_tag":cds_old_locus_tag_list,
    "product":cds_product_list,
    "note":cds_note_list
    }
)

In [3]:
genbank_df = pd.merge(
    gene_df,
    cds_df,
    how = "left",
    on = "locus_tag",
    suffixes = ("-gene", "-cds")
)
genbank_df = genbank_df[["locus_tag", "name-gene", "location-gene", "old_locus_tag-gene", "product", "note"]]
genbank_df.columns = ["locus_tag", "name", "location", "old_locus_tag", "product", "note"]
genbank_df.head()

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note
0,ECB_RS00005,thrL,[189:255](+),ECB_00001,thr operon leader peptide,Derived by automated computational analysis us...
1,ECB_RS00010,thrA,[335:2798](+),ECB_00002,bifunctional aspartate kinase/homoserine dehyd...,Derived by automated computational analysis us...
2,ECB_RS00015,thrB,[2799:3732](+),ECB_00003,homoserine kinase,Derived by automated computational analysis us...
3,ECB_RS00020,thrC,[3732:5019](+),ECB_00004,threonine synthase,Derived by automated computational analysis us...
4,ECB_RS00030,,[5231:5528](+),ECB_00005,DUF2502 domain-containing protein,Derived by automated computational analysis us...


In [4]:
start_list = []
end_list = []

for item in genbank_df.location:
    
    if "+" in item:
        item = item[1:-4].split(":")
        #print(item)
        start = item[0]
        end = item[1]
        
    elif "-" in item:
        item = item[1:-4].split(":")
        #print(item)
        start = item[1]
        end = item[0]
    start_list.append(start)
    end_list.append(end)

In [5]:
genbank_detail_df = genbank_df.copy()
genbank_detail_df["start"] = start_list
genbank_detail_df["end"]   = end_list
genbank_detail_df.head()

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note,start,end
0,ECB_RS00005,thrL,[189:255](+),ECB_00001,thr operon leader peptide,Derived by automated computational analysis us...,189,255
1,ECB_RS00010,thrA,[335:2798](+),ECB_00002,bifunctional aspartate kinase/homoserine dehyd...,Derived by automated computational analysis us...,335,2798
2,ECB_RS00015,thrB,[2799:3732](+),ECB_00003,homoserine kinase,Derived by automated computational analysis us...,2799,3732
3,ECB_RS00020,thrC,[3732:5019](+),ECB_00004,threonine synthase,Derived by automated computational analysis us...,3732,5019
4,ECB_RS00030,,[5231:5528](+),ECB_00005,DUF2502 domain-containing protein,Derived by automated computational analysis us...,5231,5528


In [6]:
# replace "<" and ">" in the star/end bases.
genbank_detail_df["start"] = genbank_detail_df["start"].str.replace(">", "")
genbank_detail_df["start"] = genbank_detail_df["start"].str.replace("<", "")
genbank_detail_df["end"] = genbank_detail_df["end"].str.replace(">", "")
genbank_detail_df["end"] = genbank_detail_df["end"].str.replace("<", "")

In [7]:
# Cast cols to int32 using a dictionary
genbank_detail_df = genbank_detail_df.astype({'start': 'int', 'end': 'int'})
genbank_detail_df.dtypes


locus_tag        object
name             object
location         object
old_locus_tag    object
product          object
note             object
start             int64
end               int64
dtype: object

In [26]:
# Cast cols to int32 using a dictionary
gene_col = gene_col.astype({'START-BASE': 'int', 'END-BASE': 'int'})
gene_col.dtypes

UNIQUE-ID                   object
NAME                        object
PRODUCT-NAME                object
SWISS-PROT-ID               object
REPLICON                    object
START-BASE                   int64
END-BASE                     int64
SYNONYMS                    object
SYNONYMS.1                  object
SYNONYMS.2                  object
SYNONYMS.3                  object
GENE-CLASS                  object
GENE-CLASS.1                object
GENE-CLASS.2               float64
GENE-CLASS.3               float64
ACCESSION-1                 object
TRANSCRIPTION-DIRECTION     object
dtype: object

In [27]:
genbank_detail_df.to_csv("NCBI_rel606_annotation.csv")


In [28]:
# if the starting and ending is within 50 bp
# and the overlaping is more than 95%, then merge them

seek_ecocyc = pd.DataFrame(columns = gene_col.columns)
seek_genbank = pd.DataFrame(columns = genbank_detail_df.columns)

i = 0
while i< len(gene_col):
    j = 0
    distance_list = []
    while j<len(genbank_detail_df):
        
        eco_start = gene_col.iloc[i]["START-BASE"]
        eco_end = gene_col.iloc[i]["END-BASE"]
        gen_start = genbank_detail_df.iloc[j]["start"]
        gen_end = genbank_detail_df.iloc[j]["end"]
        
        dist = abs(eco_start-gen_start)+abs(eco_end-gen_end)
        distance_list.append(dist)
        j = j+1

    ind = np.argmin(distance_list)
    print(i, gene_col.iloc[i]["UNIQUE-ID"], genbank_detail_df.iloc[ind]["locus_tag"])
    seek_ecocyc = seek_ecocyc.append(gene_col.iloc[i], ignore_index=True)
    seek_genbank = seek_genbank.append(genbank_detail_df.iloc[ind],ignore_index=True)
    i = i+1

0 GCQD-1687 ECB_RS07880
1 GCQD-3219 ECB_RS15820
2 GCQD-2362 ECB_RS11350
3 GCQD-2361 ECB_RS11345
4 GCQD-4285 ECB_RS21410
5 GCQD-3613 ECB_RS17885
6 GCQD-2360 ECB_RS11340
7 GCQD-2324 ECB_RS11155
8 GCQD-2164 ECB_RS10365
9 GCQD-1415 ECB_RS06445
10 GCQD-2318 ECB_RS11125
11 GCQD-2313 ECB_RS11100
12 GCQD-3003 ECB_RS14685
13 GCQD-405 ECB_RS01160
14 GCQD-2310 ECB_RS11085
15 GCQD-2307 ECB_RS11070
16 GCQD-3425 ECB_RS16880
17 GCQD-1904 ECB_RS08990
18 GCQD-2302 ECB_RS11045
19 GCQD-2301 ECB_RS11040
20 GCQD-1181 ECB_RS05185
21 GCQD-4470 ECB_RS22430
22 GCQD-2296 ECB_RS11015
23 GCQD-2288 ECB_RS10970
24 GCQD-1667 ECB_RS07780
25 GCQD-2357 ECB_RS11325
26 GCQD-2280 ECB_RS10925
27 GCQD-2278 ECB_RS10915
28 GCQD-4268 ECB_RS21325
29 GCQD-835 ECB_RS03395
30 GCQD-2277 ECB_RS10910
31 GCQD-2264 ECB_RS10840
32 GCQD-2148 ECB_RS10285
33 GCQD-2775 ECB_RS13500
34 GCQD-2260 ECB_RS10820
35 GCQD-2259 ECB_RS10815
36 GCQD-607 ECB_RS02205
37 GCQD-320 ECB_RS00740
38 GCQD-2258 ECB_RS10810
39 GCQD-2257 ECB_RS10805
40 GCQD-2582 E

321 GCQD-640 ECB_RS02370
322 GCQD-1734 ECB_RS08120
323 GCQD-1733 ECB_RS08115
324 GCQD-2822 ECB_RS13740
325 GCQD-2178 ECB_RS10435
326 GCQD-1732 ECB_RS08105
327 GCQD-1731 ECB_RS25450
328 GCQD-3251 ECB_RS15980
329 GCQD-2611 ECB_RS12635
330 GCQD-1729 ECB_RS08090
331 GCQD-1728 ECB_RS08085
332 GCQD-3652 ECB_RS18095
333 GCQD-3022 ECB_RS14785
334 GCQD-1727 ECB_RS08080
335 GCQD-1718 ECB_RS25000
336 GCQD-1455 ECB_RS06670
337 GCQD-3439 ECB_RS16955
338 GCQD-1716 ECB_RS08030
339 GCQD-1713 ECB_RS08020
340 GCQD-4502 ECB_RS22590
341 GCQD-1198 ECB_RS05275
342 GCQD-1712 ECB_RS08015
343 GCQD-1711 ECB_RS08010
344 GCQD-2396 ECB_RS11525
345 GCQD-3859 ECB_RS19160
346 GCQD-1707 ECB_RS07990
347 GCQD-1705 ECB_RS07980
348 GCQD-880 ECB_RS03635
349 GCQD-1684 ECB_RS07865
350 GCQD-1704 ECB_RS07975
351 GCQD-1703 ECB_RS07970
352 GCQD-2805 ECB_RS13655
353 GCQD-4283 ECB_RS21400
354 GCQD-1698 ECB_RS07940
355 GCQD-1693 ECB_RS07915
356 GCQD-3233 ECB_RS15890
357 GCQD-621 ECB_RS02275
358 GCQD-1692 ECB_RS07910
359 GCQD-1689 E

638 GCQD-4047 ECB_RS20140
639 GCQD-1900 ECB_RS08970
640 GCQD-2354 ECB_RS11310
641 GCQD-831 ECB_RS03365
642 GCQD-2772 ECB_RS13485
643 GCQD-3197 ECB_RS15710
644 GCQD-1389 ECB_RS06315
645 GCQD-403 ECB_RS01150
646 GCQD-1879 ECB_RS08865
647 GCQD-4450 ECB_RS22305
648 GCQD-234 ECB_RS00290
649 GCQD-3179 ECB_RS15620
650 GCQD-3580 ECB_RS17700
651 GCQD-1372 ECB_RS06225
652 GCQD-4013 ECB_RS19950
653 GCQD-2323 ECB_RS11150
654 GCQD-786 ECB_RS03130
655 GCQD-2735 ECB_RS13290
656 GCQD-3160 ECB_RS15520
657 GCQD-1349 ECB_RS06105
658 GCQD-3998 ECB_RS19865
659 GCQD-3994 ECB_RS19840
660 GCQD-1828 ECB_RS08600
661 GCQD-4409 ECB_RS25660
662 GCQD-766 ECB_RS03030
663 GCQD-2713 ECB_RS13180
664 GCQD-3141 ECB_RS15410
665 GCQD-1317 ECB_RS05915
666 GCQD-3972 ECB_RS19730
667 GCQD-1811 ECB_RS08515
668 GCQD-4388 ECB_RS21945
669 GCQD-269 ECB_RS00475
670 GCQD-3120 ECB_RS15300
671 GCQD-1296 ECB_RS05810
672 GCQD-1797 ECB_RS08440
673 GCQD-4369 ECB_RS21845
674 GCQD-727 ECB_RS02830
675 GCQD-2675 ECB_RS12975
676 GCQD-3504 ECB_R

956 GCQD-3701 ECB_RS18355
957 GCQD-1500 ECB_RS06910
958 GCQD-413 ECB_RS01200
959 GCQD-199 ECB_RS00105
960 GCQD-460 ECB_RS01450
961 GCQD-943 ECB_RS03975
962 GCQD-2847 ECB_RS13875
963 GCQD-3277 ECB_RS16115
964 GCQD-1002 ECB_RS04285
965 GCQD-3680 ECB_RS18245
966 GCQD-1481 ECB_RS06815
967 GCQD-4115 ECB_RS20500
968 GCQD-1969 ECB_RS09340
969 GCQD-2422 ECB_RS11655
970 GCQD-283 ECB_RS00550
971 GCQD-326 ECB_RS00775
972 GCQD-3662 ECB_RS18150
973 GCQD-1463 ECB_RS06720
974 GCQD-4091 ECB_RS20375
975 GCQD-1945 ECB_RS09210
976 GCQD-4514 ECB_RS22650
977 GCQD-2408 ECB_RS25075
978 GCQD-890 ECB_RS03690
979 GCQD-2814 ECB_RS13700
980 GCQD-3242 ECB_RS15935
981 GCQD-3642 ECB_RS18045
982 GCQD-1445 ECB_RS06600
983 GCQD-4073 ECB_RS20275
984 GCQD-1928 ECB_RS09115
985 GCQD-4493 ECB_RS22545
986 GCQD-2385 ECB_RS11470
987 GCQD-872 ECB_RS03590
988 GCQD-2796 ECB_RS13605
989 GCQD-3623 ECB_RS25605
990 GCQD-1423 ECB_RS06485
991 GCQD-4058 ECB_RS20200
992 GCQD-1910 ECB_RS09025
993 GCQD-4478 ECB_RS22465
994 GCQD-2368 ECB_RS

1264 GCQD-2135 ECB_RS10225
1265 GCQD-589 ECB_RS02110
1266 GCQD-2564 ECB_RS12385
1267 GCQD-297 ECB_RS00625
1268 GCQD-3397 ECB_RS16740
1269 GCQD-1143 ECB_RS04985
1270 GCQD-3812 ECB_RS18920
1271 GCQD-1630 ECB_RS07580
1272 GCQD-4231 ECB_RS21120
1273 GCQD-212 ECB_RS00165
1274 GCQD-255 ECB_RS00400
1275 GCQD-2949 ECB_RS14415
1276 GCQD-3382 ECB_RS16665
1277 GCQD-1125 ECB_RS04890
1278 GCQD-3792 ECB_RS18815
1279 GCQD-1601 ECB_RS07440
1280 GCQD-4216 ECB_RS21045
1281 GCQD-2089 ECB_RS09975
1282 GCQD-552 ECB_RS01920
1283 GCQD-293 ECB_RS00605
1284 GCQD-3770 ECB_RS18685
1285 GCQD-1579 ECB_RS07315
1286 GCQD-533 ECB_RS01820
1287 GCQD-3340 ECB_RS16435
1288 GCQD-1085 ECB_RS04690
1289 GCQD-3742 ECB_RS18565
1290 GCQD-1560 ECB_RS07225
1291 GCQD-4182 ECB_RS20860
1292 GCQD-2051 ECB_RS09785
1293 GCQD-2491 ECB_RS12010
1294 GCQD-2893 ECB_RS14115
1295 GCQD-3322 ECB_RS16340
1296 GCQD-3727 ECB_RS18490
1297 GCQD-1534 ECB_RS07090
1298 GCQD-491 ECB_RS01610
1299 GCQD-2474 ECB_RS11920
1300 GCQD-2873 ECB_RS14015
1301 GCQD

1570 GCQD-1454 ECB_RS06665
1571 GCQD-1197 ECB_RS05270
1572 GCQD-4081 ECB_RS20325
1573 GCQD-3857 ECB_RS19150
1574 GCQD-1937 ECB_RS09170
1575 GCQD-4282 ECB_RS21395
1576 GCQD-4501 ECB_RS22585
1577 GCQD-620 ECB_RS02270
1578 GCQD-2395 ECB_RS11520
1579 GCQD-2161 ECB_RS10350
1580 GCQD-879 ECB_RS03630
1581 GCQD-2595 ECB_RS12550
1582 GCQD-2803 ECB_RS13645
1583 GCQD-300 ECB_RS00640
1584 GCQD-3232 ECB_RS15885
1585 GCQD-3422 ECB_RS16865
1586 GCQD-3631 ECB_RS17990
1587 GCQD-1178 ECB_RS05170
1588 GCQD-1431 ECB_RS06530
1589 GCQD-3842 ECB_RS19075
1590 GCQD-4065 ECB_RS20235
1591 GCQD-1661 ECB_RS07745
1592 GCQD-1918 ECB_RS09065
1593 GCQD-2145 ECB_RS10270
1594 GCQD-4485 ECB_RS22505
1595 GCQD-2579 ECB_RS12465
1596 GCQD-2376 ECB_RS11425
1597 GCQD-2981 ECB_RS14575
1598 GCQD-861 ECB_RS03530
1599 GCQD-3408 ECB_RS16795
1600 GCQD-2788 ECB_RS13565
1601 GCQD-1156 ECB_RS05050
1602 GCQD-3217 ECB_RS15810
1603 GCQD-3826 ECB_RS18995
1604 GCQD-3611 ECB_RS17875
1605 GCQD-4245 ECB_RS21195
1606 GCQD-1413 ECB_RS06435
1607 

1876 GCQD-965 ECB_RS04090
1877 GCQD-697 ECB_RS02680
1878 GCQD-2867 ECB_RS13980
1879 GCQD-2658 ECB_RS12870
1880 GCQD-3297 ECB_RS16215
1881 GCQD-308 ECB_RS00680
1882 GCQD-1037 ECB_RS04440
1883 GCQD-3488 ECB_RS17210
1884 GCQD-3705 ECB_RS18375
1885 GCQD-3906 ECB_RS19395
1886 GCQD-1505 ECB_RS06935
1887 GCQD-1262 ECB_RS05640
1888 GCQD-4134 ECB_RS20595
1889 GCQD-1762 ECB_RS08260
1890 GCQD-466 ECB_RS01485
1891 GCQD-4332 ECB_RS21650
1892 GCQD-2448 ECB_RS11790
1893 GCQD-2220 ECB_RS10640
1894 GCQD-947 ECB_RS04000
1895 GCQD-673 ECB_RS02530
1896 GCQD-2850 ECB_RS13890
1897 GCQD-2641 ECB_RS12785
1898 GCQD-3280 ECB_RS16130
1899 GCQD-3060 ECB_RS14975
1900 GCQD-1006 ECB_RS04305
1901 GCQD-3470 ECB_RS17120
1902 GCQD-3685 ECB_RS18270
1903 GCQD-389 ECB_RS01090
1904 GCQD-1485 ECB_RS06835
1905 GCQD-1238 ECB_RS05500
1906 GCQD-4119 ECB_RS20520
1907 GCQD-1730 ECB_RS08095
1908 GCQD-1972 ECB_RS09355
1909 GCQD-4314 ECB_RS21555
1910 GCQD-4536 ECB_RS22770
1911 GCQD-653 ECB_RS02430
1912 GCQD-2429 ECB_RS11695
1913 GCQD

2182 GCQD-3543 ECB_RS17505
2183 GCQD-3610 ECB_RS17870
2184 GCQD-3452 ECB_RS17025
2185 GCQD-2816 ECB_RS13710
2186 GCQD-3315 ECB_RS16305
2187 GCQD-3453 ECB_RS17030
2188 GCQD-3868 ECB_RS19205
2189 GCQD-3245 ECB_RS15950
2190 GCQD-3227 ECB_RS15860
2191 GCQD-994 ECB_RS04240
2192 GCQD-1215 ECB_RS05365
2193 GCQD-3644 ECB_RS25190
2194 GCQD-306 ECB_RS00670
2195 GCQD-903 ECB_RS03755
2196 GCQD-4294 ECB_RS21455
2197 GCQD-1447 ECB_RS06610
2198 GCQD-2944 ECB_RS14385
2199 GCQD-742 ECB_RS02910
2200 GCQD-2172 ECB_RS10405
2201 GCQD-4075 ECB_RS20285
2202 GCQD-2757 ECB_RS13400
2203 GCQD-616 ECB_RS02250
2204 GCQD-2606 ECB_RS12605
2205 GCQD-1930 ECB_RS09125
2206 GCQD-2543 ECB_RS12280
2207 GCQD-501 ECB_RS01655
2208 GCQD-3017 ECB_RS14760
2209 GCQD-2388 ECB_RS11485
2210 GCQD-248 ECB_RS00360
2211 GCQD-4433 ECB_RS22195
2212 GCQD-3433 ECB_RS16920
2213 GCQD-874 ECB_RS03600
2214 GCQD-2334 ECB_RS11210
2215 GCQD-4345 ECB_RS21715
2216 GCQD-1193 ECB_RS05250
2217 GCQD-2798 ECB_RS13620
2218 GCQD-228 ECB_RS00260
2219 GCQD-

2488 GCQD-1992 ECB_RS09470
2489 GCQD-4310 ECB_RS21535
2490 GCQD-1067 ECB_RS04595
2491 GCQD-1218 ECB_RS05385
2492 GCQD-462 ECB_RS01460
2493 GCQD-650 ECB_RS02415
2494 GCQD-3624 ECB_RS17945
2495 GCQD-374 ECB_RS01015
2496 GCQD-2444 ECB_RS11770
2497 GCQD-2622 ECB_RS12690
2498 GCQD-3456 ECB_RS17045
2499 GCQD-3552 ECB_RS17550
2500 GCQD-944 ECB_RS03980
2501 GCQD-3042 ECB_RS14890
2502 GCQD-995 ECB_RS04245
2503 GCQD-332 ECB_RS00805
2504 GCQD-2848 ECB_RS13880
2505 GCQD-345 ECB_RS00870
2506 GCQD-911 ECB_RS03800
2507 GCQD-3244 ECB_RS15945
2508 GCQD-3278 ECB_RS16120
2509 GCQD-3866 ECB_RS25215
2510 GCQD-743 ECB_RS02915
2511 GCQD-3068 ECB_RS15020
2512 GCQD-1003 ECB_RS04290
2513 GCQD-1210 ECB_RS05340
2514 GCQD-623 ECB_RS02285
2515 GCQD-2973 ECB_RS14535
2516 GCQD-3681 ECB_RS18250
2517 GCQD-1697 ECB_RS07935
2518 GCQD-502 ECB_RS01660
2519 GCQD-2758 ECB_RS13405
2520 GCQD-1482 ECB_RS06820
2521 GCQD-4292 ECB_RS21445
2522 GCQD-4441 ECB_RS22245
2523 GCQD-2575 ECB_RS12440
2524 GCQD-4116 ECB_RS20505
2525 GCQD-63

2794 GCQD-1783 ECB_RS08370
2795 GCQD-3808 ECB_RS18900
2796 GCQD-1792 ECB_RS08415
2797 GCQD-4129 ECB_RS20570
2798 GCQD-1507 ECB_RS06950
2799 GCQD-1644 ECB_RS07655
2800 GCQD-2261 ECB_RS10825
2801 GCQD-459 ECB_RS01445
2802 GCQD-1250 ECB_RS05580
2803 GCQD-1419 ECB_RS06465
2804 GCQD-2670 ECB_RS12950
2805 GCQD-942 ECB_RS03970
2806 GCQD-3745 ECB_RS18580
2807 GCQD-1077 ECB_RS04645
2808 GCQD-3097 ECB_RS15165
2809 GCQD-2846 ECB_RS13870
2810 GCQD-3556 ECB_RS17570
2811 GCQD-3625 ECB_RS17950
2812 GCQD-350 ECB_RS00895
2813 GCQD-3276 ECB_RS16110
2814 GCQD-3328 ECB_RS16370
2815 GCQD-3494 ECB_RS17245
2816 GCQD-1272 ECB_RS05690
2817 GCQD-1001 ECB_RS04280
2818 GCQD-3254 ECB_RS15995
2819 GCQD-996 ECB_RS04250
2820 GCQD-3920 ECB_RS19465
2821 GCQD-368 ECB_RS00985
2822 GCQD-3088 ECB_RS15120
2823 GCQD-914 ECB_RS03815
2824 GCQD-1775 ECB_RS08330
2825 GCQD-1480 ECB_RS06810
2826 GCQD-2978 ECB_RS14560
2827 GCQD-744 ECB_RS02920
2828 GCQD-4346 ECB_RS21720
2829 GCQD-4114 ECB_RS20495
2830 GCQD-2774 ECB_RS13495
2831 GCQ

3100 GCQD-4344 ECB_RS21710
3101 GCQD-1515 ECB_RS06995
3102 GCQD-2231 ECB_RS10695
3103 GCQD-4151 ECB_RS20685
3104 GCQD-690 ECB_RS02630
3105 GCQD-4144 ECB_RS20645
3106 GCQD-2015 ECB_RS09595
3107 GCQD-2651 ECB_RS12835
3108 GCQD-4105 ECB_RS20445
3109 GCQD-3361 ECB_RS16560
3110 GCQD-479 ECB_RS01550
3111 GCQD-3071 ECB_RS15035
3112 GCQD-3990 ECB_RS19820
3113 GCQD-2698 ECB_RS13105
3114 GCQD-2461 ECB_RS11855
3115 GCQD-3482 ECB_RS17180
3116 GCQD-382 ECB_RS01055
3117 GCQD-3879 ECB_RS19260
3118 GCQD-2860 ECB_RS13945
3119 GCQD-3900 ECB_RS19360
3120 GCQD-4141 ECB_RS20630
3121 GCQD-3364 ECB_RS16575
3122 GCQD-3290 ECB_RS16180
3123 GCQD-1254 ECB_RS05600
3124 GCQD-4102 ECB_RS20430
3125 GCQD-3358 ECB_RS16545
3126 GCQD-1030 ECB_RS04410
3127 GCQD-1756 ECB_RS08230
3128 GCQD-3986 ECB_RS19800
3129 GCQD-2695 ECB_RS13090
3130 GCQD-3699 ECB_RS18345
3131 GCQD-4326 ECB_RS21615
3132 GCQD-378 ECB_RS01035
3133 GCQD-3876 ECB_RS19245
3134 GCQD-1496 ECB_RS06890
3135 GCQD-666 ECB_RS02495
3136 GCQD-3360 ECB_RS16555
3137 G

3407 GCQD-2477 ECB_RS23940
3408 GCQD-837 ECB_RS03405
3409 GCQD-834 ECB_RS03390
3410 GCQD-758 ECB_RS02985
3411 GCQD-2875 ECB_RS14025
3412 GCQD-812 ECB_RS03270
3413 GCQD-808 ECB_RS03250
3414 GCQD-2703 ECB_RS13135
3415 GCQD-3306 ECB_RS16260
3416 GCQD-799 ECB_RS03195
3417 GCQD-797 ECB_RS03185
3418 GCQD-3129 ECB_RS15345
3419 GCQD-1049 ECB_RS04500
3420 GCQD-791 ECB_RS03155
3421 GCQD-785 ECB_RS03125
3422 GCQD-1306 ECB_RS05860
3423 GCQD-3713 ECB_RS18415
3424 GCQD-755 ECB_RS02975
3425 GCQD-754 ECB_RS02970
3426 GCQD-3956 ECB_RS19645
3427 GCQD-4148 ECB_RS20665
3428 GCQD-752 ECB_RS02960
3429 GCQD-751 ECB_RS02955
3430 GCQD-1804 ECB_RS08480
3431 GCQD-2011 ECB_RS09575
3432 GCQD-749 ECB_RS02945
3433 GCQD-737 ECB_RS02885
3434 GCQD-4377 ECB_RS21885
3435 GCQD-477 ECB_RS01540
3436 GCQD-726 ECB_RS02825
3437 GCQD-716 ECB_RS02775
3438 GCQD-2274 ECB_RS10890
3439 GCQD-2457 ECB_RS11835
3440 GCQD-715 ECB_RS02770
3441 GCQD-714 ECB_RS02765
3442 GCQD-735 ECB_RS02875
3443 GCQD-955 ECB_RS04040
3444 GCQD-713 ECB_RS027

3715 GCQD-3547 ECB_RS17525
3716 GCQD-3971 ECB_RS19725
3717 GCQD-397 ECB_RS25305
3718 GCQD-1602 ECB_RS07450
3719 GCQD-3978 ECB_RS19760
3720 GCQD-3967 ECB_RS19705
3721 GCQD-3965 ECB_RS19695
3722 GCQD-4217 ECB_RS21050
3723 GCQD-1818 ECB_RS08550
3724 GCQD-3964 ECB_RS19690
3725 GCQD-3963 ECB_RS19685
3726 GCQD-209 ECB_RS00150
3727 GCQD-4393 ECB_RS21970
3728 GCQD-3960 ECB_RS19665
3729 GCQD-3959 ECB_RS19660
3730 GCQD-553 ECB_RS01925
3731 GCQD-2290 ECB_RS10985
3732 GCQD-3958 ECB_RS19655
3733 GCQD-3957 ECB_RS19650
3734 GCQD-2531 ECB_RS12220
3735 GCQD-756 ECB_RS02980
3736 GCQD-3955 ECB_RS19640
3737 GCQD-3953 ECB_RS19630
3738 GCQD-2930 ECB_RS14310
3739 GCQD-2701 ECB_RS13120
3740 GCQD-3943 ECB_RS19580
3741 GCQD-3938 ECB_RS19555
3742 GCQD-336 ECB_RS00825
3743 GCQD-3127 ECB_RS15335
3744 GCQD-3937 ECB_RS19550
3745 GCQD-3935 ECB_RS19535
3746 GCQD-1103 ECB_RS04780
3747 GCQD-3530 ECB_RS17430
3748 GCQD-3934 ECB_RS19530
3749 GCQD-3930 ECB_RS19510
3750 GCQD-3773 ECB_RS18700
3751 GCQD-1303 ECB_RS05845
3752 G

4020 GCQD-3111 ECB_RS15255
4021 GCQD-3102 ECB_RS15200
4022 GCQD-1883 ECB_RS08885
4023 GCQD-2119 ECB_RS10150
4024 GCQD-3099 ECB_RS15185
4025 GCQD-3087 ECB_RS15115
4026 GCQD-4455 ECB_RS22340
4027 GCQD-569 ECB_RS02005
4028 GCQD-3086 ECB_RS15110
4029 GCQD-3085 ECB_RS15105
4030 GCQD-2344 ECB_RS11265
4031 GCQD-2549 ECB_RS12310
4032 GCQD-3083 ECB_RS15095
4033 GCQD-3082 ECB_RS15090
4034 GCQD-813 ECB_RS03275
4035 GCQD-2948 ECB_RS14410
4036 GCQD-3081 ECB_RS15085
4037 GCQD-3080 ECB_RS15080
4038 GCQD-3183 ECB_RS15640
4039 GCQD-3381 ECB_RS16660
4040 GCQD-3079 ECB_RS15075
4041 GCQD-3078 ECB_RS15070
4042 GCQD-1377 ECB_RS06250
4043 GCQD-1124 ECB_RS04885
4044 GCQD-3077 ECB_RS15065
4045 GCQD-3070 ECB_RS15030
4046 GCQD-4018 ECB_RS19975
4047 GCQD-3791 ECB_RS18810
4048 GCQD-3065 ECB_RS15000
4049 GCQD-3051 ECB_RS14930
4050 GCQD-1864 ECB_RS08780
4051 GCQD-4215 ECB_RS21035
4052 GCQD-3046 ECB_RS14910
4053 GCQD-3045 ECB_RS14905
4054 GCQD-444 ECB_RS01365
4055 GCQD-2087 ECB_RS09965
4056 GCQD-3039 ECB_RS14875
4057

In [29]:
seek_ecocyc

Unnamed: 0,UNIQUE-ID,NAME,PRODUCT-NAME,SWISS-PROT-ID,REPLICON,START-BASE,END-BASE,SYNONYMS,SYNONYMS.1,SYNONYMS.2,SYNONYMS.3,GENE-CLASS,GENE-CLASS.1,GENE-CLASS.2,GENE-CLASS.3,ACCESSION-1,TRANSCRIPTION-DIRECTION
0,GCQD-1687,marR,MarR transcriptional repressor,C6UHT4,CHROMOSOME-1,1586886,1587320,cfxB,inaR,soxQ,,UNCLASSIFIED,,,,ECB_01489,+
1,GCQD-3219,garD,(D)-galactarate dehydrogenase,C6UEH4,CHROMOSOME-1,3208211,3209782,yhaG,,,,UNCLASSIFIED,,,,ECB_02993,+
2,GCQD-2362,yfaS,orf,C6ULN9,CHROMOSOME-1,2281102,2276498,,,,,,,,,ECB_02154,-
3,GCQD-2361,yfaQ,predicted protein,C6ULN8,CHROMOSOME-1,2276497,2274848,,,,,,,,,ECB_02153,-
4,GCQD-4285,frdD,fumarate reductase membrane protein,C6UKE8,CHROMOSOME-1,4358077,4357718,,,,,UNCLASSIFIED,,,,ECB_04023,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4305,GCQD-2371,inaA,pH-inducible protein involved in stress response,C6ULP8,CHROMOSOME-1,2296180,2295530,yfaG,,,,,,,,ECB_02163,-
4306,GCQD-1200,torA,"trimethylamine N-oxide reductase, catalytic su...",C6UFA8,CHROMOSOME-1,1075546,1078092,,,,,UNCLASSIFIED,,,,ECB_01000,+
4307,GCQD-279,yacG,DNA gyrase inhibitor YacG,C6UM63,CHROMOSOME-1,114650,114453,,,,,UNCLASSIFIED,,,,ECB_00101,-
4308,GCQD-2364,yfaA,predicted protein,C6ULP1,CHROMOSOME-1,2283344,2281656,pufY,,,,,,,,ECB_02156,-


In [30]:
seek_genbank

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note,start,end
0,ECB_RS07880,marR,[1586885:1587320](+),ECB_01489,multiple antibiotic resistance transcriptional...,Derived by automated computational analysis us...,1586885,1587320
1,ECB_RS15820,garD,[3208210:3209782](+),ECB_02993,galactarate dehydratase,Derived by automated computational analysis us...,3208210,3209782
2,ECB_RS11350,,[2276497:2280892](-),ECB_02154,alpha-2-macroglobulin family protein,Derived by automated computational analysis us...,2280892,2276497
3,ECB_RS11345,,[2274847:2276497](-),ECB_02153,DUF2300 domain-containing protein,Derived by automated computational analysis us...,2276497,2274847
4,ECB_RS21410,frdD,[4357717:4358077](-),ECB_04023,fumarate reductase subunit FrdD,Derived by automated computational analysis us...,4358077,4357717
...,...,...,...,...,...,...,...,...
4305,ECB_RS11400,inaA,[2295529:2296180](-),ECB_02163,lipopolysaccharide kinase InaA,Derived by automated computational analysis us...,2296180,2295529
4306,ECB_RS05285,torA,[1075545:1078092](+),ECB_01000,trimethylamine-N-oxide reductase TorA,Derived by automated computational analysis us...,1075545,1078092
4307,ECB_RS00530,yacG,[114452:114650](-),ECB_00101,DNA gyrase inhibitor YacG,Derived by automated computational analysis us...,114650,114452
4308,ECB_RS11360,,[2281655:2283344](-),ECB_02156,DUF2138 domain-containing protein,Derived by automated computational analysis us...,2283344,2281655


In [31]:
seek_ecocyc.reset_index(drop=True, inplace=True)
seek_genbank.reset_index(drop=True, inplace=True)

final_df = pd.concat( [seek_ecocyc, seek_genbank], axis=1) 
final_df

Unnamed: 0,UNIQUE-ID,NAME,PRODUCT-NAME,SWISS-PROT-ID,REPLICON,START-BASE,END-BASE,SYNONYMS,SYNONYMS.1,SYNONYMS.2,SYNONYMS.3,GENE-CLASS,GENE-CLASS.1,GENE-CLASS.2,GENE-CLASS.3,ACCESSION-1,TRANSCRIPTION-DIRECTION,locus_tag,name,location,old_locus_tag,product,note,start,end
0,GCQD-1687,marR,MarR transcriptional repressor,C6UHT4,CHROMOSOME-1,1586886,1587320,cfxB,inaR,soxQ,,UNCLASSIFIED,,,,ECB_01489,+,ECB_RS07880,marR,[1586885:1587320](+),ECB_01489,multiple antibiotic resistance transcriptional...,Derived by automated computational analysis us...,1586885,1587320
1,GCQD-3219,garD,(D)-galactarate dehydrogenase,C6UEH4,CHROMOSOME-1,3208211,3209782,yhaG,,,,UNCLASSIFIED,,,,ECB_02993,+,ECB_RS15820,garD,[3208210:3209782](+),ECB_02993,galactarate dehydratase,Derived by automated computational analysis us...,3208210,3209782
2,GCQD-2362,yfaS,orf,C6ULN9,CHROMOSOME-1,2281102,2276498,,,,,,,,,ECB_02154,-,ECB_RS11350,,[2276497:2280892](-),ECB_02154,alpha-2-macroglobulin family protein,Derived by automated computational analysis us...,2280892,2276497
3,GCQD-2361,yfaQ,predicted protein,C6ULN8,CHROMOSOME-1,2276497,2274848,,,,,,,,,ECB_02153,-,ECB_RS11345,,[2274847:2276497](-),ECB_02153,DUF2300 domain-containing protein,Derived by automated computational analysis us...,2276497,2274847
4,GCQD-4285,frdD,fumarate reductase membrane protein,C6UKE8,CHROMOSOME-1,4358077,4357718,,,,,UNCLASSIFIED,,,,ECB_04023,-,ECB_RS21410,frdD,[4357717:4358077](-),ECB_04023,fumarate reductase subunit FrdD,Derived by automated computational analysis us...,4358077,4357717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4305,GCQD-2371,inaA,pH-inducible protein involved in stress response,C6ULP8,CHROMOSOME-1,2296180,2295530,yfaG,,,,,,,,ECB_02163,-,ECB_RS11400,inaA,[2295529:2296180](-),ECB_02163,lipopolysaccharide kinase InaA,Derived by automated computational analysis us...,2296180,2295529
4306,GCQD-1200,torA,"trimethylamine N-oxide reductase, catalytic su...",C6UFA8,CHROMOSOME-1,1075546,1078092,,,,,UNCLASSIFIED,,,,ECB_01000,+,ECB_RS05285,torA,[1075545:1078092](+),ECB_01000,trimethylamine-N-oxide reductase TorA,Derived by automated computational analysis us...,1075545,1078092
4307,GCQD-279,yacG,DNA gyrase inhibitor YacG,C6UM63,CHROMOSOME-1,114650,114453,,,,,UNCLASSIFIED,,,,ECB_00101,-,ECB_RS00530,yacG,[114452:114650](-),ECB_00101,DNA gyrase inhibitor YacG,Derived by automated computational analysis us...,114650,114452
4308,GCQD-2364,yfaA,predicted protein,C6ULP1,CHROMOSOME-1,2283344,2281656,pufY,,,,,,,,ECB_02156,-,ECB_RS11360,,[2281655:2283344](-),ECB_02156,DUF2138 domain-containing protein,Derived by automated computational analysis us...,2283344,2281655


In [32]:
final_df.to_csv("Annotation_NCBI_Ecocyc.csv")