In [192]:
import pandas as pd
import gzip
from functools import partial
from os import listdir
from os.path import isfile

# Complement lookup for IUPAC nucleotide codes, upper and lower case.
# Ambiguity codes map to the code of the complementary base set
# (e.g. R = A/G  <->  Y = C/T); N, S and W are their own complements.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
    "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
)


def Return_Rev_Complement(dna):
    """Return the reverse complement of a nucleotide string.

    Each base is replaced by its complement and the result is reversed.
    For input made only of upper-case A/C/G/T the result is identical to
    the previous hand-written implementation.

    Unlike the previous implementation, characters other than A/C/G/T are
    no longer silently dropped: IUPAC ambiguity codes (such as ``N``) are
    complemented, and any character not in the lookup table is passed
    through unchanged. The output therefore always has the same length as
    the input, so downstream length/frame computations stay consistent.

    Args:
        dna: Nucleotide sequence as a string.

    Returns:
        The reverse-complemented sequence, same length as ``dna``.
    """
    # One C-level pass for the complement, then a slice to reverse.
    return dna.translate(_COMPLEMENT_TABLE)[::-1]

def Get_Nucleotide_Sequence(row, dna):
    """Extract the nucleotide sequence of one feature from ``dna``.

    Args:
        row: Mapping (e.g. a DataFrame row or dict) providing ``'Start'``
            and ``'End'`` as 1-based, inclusive coordinates and
            ``'Orientation'`` as ``'+'`` or any other value for the
            reverse strand.
        dna: The sequence the coordinates refer to.

    Returns:
        The sub-sequence ``Start..End`` (inclusive). When the orientation
        is not ``'+'`` the reverse complement of that sub-sequence is
        returned instead.
    """
    # GenBank locations are 1-based and inclusive at both ends, so the
    # equivalent Python half-open slice is [Start - 1, End). Using
    # End - 1 as the stop would drop the last base of the feature.
    segment = dna[row['Start'] - 1:row['End']]
    if row['Orientation'] == '+':
        return segment
    return Return_Rev_Complement(segment)
        
def Parse_GBFF_File(filepath):
    """Parse a gzip-compressed GenBank flat file into a table of CDS features.

    The file is scanned line by line with a small state machine:

    * A ``CDS`` feature line starts a new entry holding ``Start``, ``End``
      and ``Orientation`` (``'-'`` when the location starts with
      ``complement``, otherwise ``'+'``). Locations containing ``join``
      are counted and skipped.
    * While inside a CDS, the ``/gene``, ``/locus_tag`` and
      ``/gene_synonym`` qualifiers are copied into the entry, and the
      (possibly multi-line) ``/translation`` qualifier is accumulated.
      The entry is emitted once its translation is complete, so a CDS
      without a translation is never emitted.
    * A ``gene`` feature line ends the current CDS.
    * Lines between ``ORIGIN`` and ``//`` are collected as sequence.

    A summary line (path, skipped ``join`` count, rows, parsed entries,
    completed translations) is printed for every file.

    Args:
        filepath: Path to a ``.gbff.gz`` file.

    Returns:
        A ``pandas.DataFrame`` with one row per emitted CDS and the columns
        ``Start``, ``End``, ``Orientation``, whichever of ``gene`` /
        ``locus_tag`` / ``gene_synonym`` were seen, ``translation``,
        ``DNA`` (from ``Get_Nucleotide_Sequence``), ``Nuc_Len`` and
        ``Prot_Len``. If no CDS was emitted, an empty frame with the fixed
        columns is returned instead of raising ``KeyError``.
    """
    found_cds = False
    found_translation = False
    found_nuc = False
    translation = ""
    op = []
    # Sequence pieces are collected in a list and joined once at the end.
    seq_chunks = []
    # Running length of all sequence read so far. A flat file can contain
    # several records, each with its own ORIGIN block and with feature
    # coordinates relative to that record. Because a record's features
    # come before its ORIGIN, the running length seen at a CDS line is the
    # offset of that record inside the concatenated sequence.
    seq_len = 0
    ctr = 0
    temp = 0
    quotes = ("\"", "\'")

    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(filepath, 'rt') as handle:
        for l in handle:
            l = l.strip()
            if l.startswith("CDS "):
                t = l.replace("CDS", "").lstrip()
                orientation = '+'
                if t.startswith("complement"):
                    orientation = '-'
                if 'join' in t:
                    ctr += 1
                    # Leave CDS mode so the skipped feature's qualifiers
                    # cannot be written into the previously parsed entry.
                    found_cds = False
                    continue
                t = (t.replace("complement", "").replace("(", "")
                      .replace(")", "").replace("<", "").replace(">", ""))
                splits = t.split("..")
                start, end = int(splits[0]), int(splits[1])
                d = {'Start': start, 'End': end, 'Orientation': orientation,
                     '_Seq_Offset': seq_len}
                found_cds = True

            if l.startswith("gene    "):
                found_cds = False

            if found_cds and l.startswith("/"):
                feature = l.strip("/").split("=")[0]
                if feature == "translation":
                    value = l.split("=")[1]
                    if value.endswith(quotes):
                        # Translation fits on a single line.
                        found_translation = False
                        d[feature] = value.replace("\"", "").replace("\'", "")
                        temp += 1
                        op.append(d)
                    else:
                        found_translation = True
                elif feature in ("gene", "locus_tag", "gene_synonym"):
                    # Split only on the first '=' so a value that itself
                    # contains '=' is kept whole.
                    d[feature] = (l.split("=", 1)[1]
                                  .replace("\"", "").replace("\'", ""))

            if found_translation:
                # The first line still carries the '/translation=' prefix;
                # continuation lines are pure sequence text.
                splits = l.split("=")
                if len(splits) == 1:
                    value = splits[0]
                else:
                    value = splits[1]
                translation += value
                # endswith() is safe on an empty line, unlike value[-1].
                if value.endswith(quotes):
                    found_translation = False
                    d[feature] = translation.replace("\"", "").replace("\'", "")
                    op.append(d)
                    temp += 1
                    translation = ""

            if l.startswith("ORIGIN"):
                found_nuc = True
                continue

            if found_nuc:
                if not l.startswith("//"):
                    # Drop the leading position counter, keep the bases.
                    piece = "".join(l.split(" ")[1:])
                    seq_chunks.append(piece)
                    seq_len += len(piece)
                else:
                    found_nuc = False

    nuc_seq = "".join(seq_chunks).upper()

    if not op:
        # Nothing to tabulate; keep the summary line and hand back an
        # empty table rather than failing on the missing 'Start' column.
        print(filepath, ctr, 0, len(op), temp)
        return pd.DataFrame(columns=['Start', 'End', 'Orientation',
                                     'translation', 'DNA', 'Nuc_Len',
                                     'Prot_Len'])

    df_prot = pd.DataFrame(op)
    # .copy() so the column assignments below act on an independent frame.
    df_prot = df_prot[df_prot['Start'] >= 0].copy()
    print(filepath, ctr, len(df_prot), len(op), temp)

    df_prot['Start'] = df_prot['Start'].astype(int)
    df_prot['End'] = df_prot['End'].astype(int)

    extract = partial(Get_Nucleotide_Sequence, dna=nuc_seq)

    def _sequence_for(row):
        # Shift record-relative coordinates into the concatenated sequence.
        shift = row['_Seq_Offset']
        located = {'Start': row['Start'] + shift,
                   'End': row['End'] + shift,
                   'Orientation': row['Orientation']}
        return extract(located)

    df_prot['DNA'] = df_prot.apply(_sequence_for, axis=1)
    # The offset is an internal helper and not part of the returned table.
    df_prot = df_prot.drop(columns='_Seq_Offset')
    df_prot['Nuc_Len'] = df_prot['DNA'].str.len()
    df_prot['Prot_Len'] = df_prot['translation'].str.len()

    return df_prot


In [194]:
# Convert every not-yet-processed GBFF archive in `filedir` into a
# gzip-compressed, tab-separated table in `outdir`.
filedir = 'Mount/CMSC829A/Data/gbff/'
outdir = 'Mount/CMSC829A/Data/Processed_GBFF/'
files = listdir(filedir)
for f in files:
    # Only entries whose name starts with 'GCF' are considered.
    if not f.startswith('GCF'):
        continue
    target = outdir + f.replace(".gbff.gz", ".txt.gz")
    # An existing output file means this input was handled already.
    if isfile(target):
        continue
    df_prot = Parse_GBFF_File(filedir + f)
    df_prot.to_csv(target, compression="gzip", sep="\t")

Mount/CMSC829A/Data/gbff/GCF_014905885.1_ASM1490588v1_genomic.gbff.gz 2 1555 1555 1555
Mount/CMSC829A/Data/gbff/GCF_009671165.1_ASM967116v1_genomic.gbff.gz 1 662 662 662
Mount/CMSC829A/Data/gbff/GCF_009930955.1_ASM993095v1_genomic.gbff.gz 3 1592 1592 1592
Mount/CMSC829A/Data/gbff/GCF_003004115.1_ASM300411v1_genomic.gbff.gz 1 2997 2997 2997
Mount/CMSC829A/Data/gbff/GCF_013284015.2_ASM1328401v2_genomic.gbff.gz 3 5164 5164 5164
Mount/CMSC829A/Data/gbff/GCF_013487885.1_ASM1348788v1_genomic.gbff.gz 1 1855 1855 1855
Mount/CMSC829A/Data/gbff/GCF_016728045.1_ASM1672804v1_genomic.gbff.gz 2 4796 4796 4796
Mount/CMSC829A/Data/gbff/GCF_003665415.1_ASM366541v1_genomic.gbff.gz 58 5557 5557 5557
Mount/CMSC829A/Data/gbff/GCF_016599595.1_ASM1659959v1_genomic.gbff.gz 6 5180 5180 5180
Mount/CMSC829A/Data/gbff/GCF_002094975.1_ASM209497v1_genomic.gbff.gz 3 2001 2001 2001
Mount/CMSC829A/Data/gbff/GCF_001017595.1_ASM101759v1_genomic.gbff.gz 0 554 554 554
Mount/CMSC829A/Data/gbff/GCF_009625995.1_ASM962599v1_g

Mount/CMSC829A/Data/gbff/GCF_003628595.1_ASM362859v1_genomic.gbff.gz 4 4225 4225 4225
Mount/CMSC829A/Data/gbff/GCF_020276565.1_ASM2027656v1_genomic.gbff.gz 0 673 673 673
Mount/CMSC829A/Data/gbff/GCF_002025225.1_ASM202522v1_genomic.gbff.gz 1 4768 4768 4768
Mount/CMSC829A/Data/gbff/GCF_002888715.1_ASM288871v1_genomic.gbff.gz 3 3814 3814 3814
Mount/CMSC829A/Data/gbff/GCF_000959305.1_ASM95930v1_genomic.gbff.gz 8 5871 5871 5871
Mount/CMSC829A/Data/gbff/GCF_007972745.1_ASM797274v1_genomic.gbff.gz 14 1120 1120 1120
Mount/CMSC829A/Data/gbff/GCF_001971565.1_ASM197156v1_genomic.gbff.gz 6 2950 2950 2950
Mount/CMSC829A/Data/gbff/GCF_016756375.1_ASM1675637v1_genomic.gbff.gz 1 2499 2499 2499
Mount/CMSC829A/Data/gbff/GCF_002999295.1_ASM299929v1_genomic.gbff.gz 0 686 686 686
Mount/CMSC829A/Data/gbff/GCF_003228315.1_ASM322831v1_genomic.gbff.gz 1 6068 6068 6068
Mount/CMSC829A/Data/gbff/GCF_019930685.1_ASM1993068v1_genomic.gbff.gz 2 3502 3502 3502
Mount/CMSC829A/Data/gbff/GCF_018742045.1_ASM1874204v1_gen

Mount/CMSC829A/Data/gbff/GCF_017068275.1_ASM1706827v1_genomic.gbff.gz 5 2233 2233 2233
Mount/CMSC829A/Data/gbff/GCF_001708595.1_ASM170859v1_genomic.gbff.gz 11 4113 4113 4113
Mount/CMSC829A/Data/gbff/GCF_003255835.1_ASM325583v1_genomic.gbff.gz 3 1984 1984 1984
Mount/CMSC829A/Data/gbff/GCF_018361225.1_ASM1836122v1_genomic.gbff.gz 6 4194 4194 4194
Mount/CMSC829A/Data/gbff/GCF_004103595.1_ASM410359v1_genomic.gbff.gz 1 4078 4078 4078
Mount/CMSC829A/Data/gbff/GCF_017298875.1_ASM1729887v1_genomic.gbff.gz 4 4745 4745 4745
Mount/CMSC829A/Data/gbff/GCF_003957375.1_ASM395737v1_genomic.gbff.gz 6 2420 2420 2420
Mount/CMSC829A/Data/gbff/GCF_018740945.1_ASM1874094v1_genomic.gbff.gz 1 5516 5516 5516
Mount/CMSC829A/Data/gbff/GCF_002850455.1_ASM285045v1_genomic.gbff.gz 4 4459 4459 4459
Mount/CMSC829A/Data/gbff/GCF_000286275.1_ASM28627v1_genomic.gbff.gz 1 4301 4301 4301
Mount/CMSC829A/Data/gbff/GCF_011612125.1_ASM1161212v1_genomic.gbff.gz 0 3393 3393 3393
Mount/CMSC829A/Data/gbff/GCF_004526275.1_ASM45262

Mount/CMSC829A/Data/gbff/GCF_007954705.1_ASM795470v1_genomic.gbff.gz 3 1846 1846 1846
Mount/CMSC829A/Data/gbff/GCF_001449005.1_ASM144900v1_genomic.gbff.gz 5 7740 7740 7740
Mount/CMSC829A/Data/gbff/GCF_002949175.1_ASM294917v1_genomic.gbff.gz 96 3714 3714 3714
Mount/CMSC829A/Data/gbff/GCF_002949535.1_ASM294953v1_genomic.gbff.gz 101 3698 3698 3698
Mount/CMSC829A/Data/gbff/GCF_001723525.1_ASM172352v1_genomic.gbff.gz 3 1880 1880 1880
Mount/CMSC829A/Data/gbff/GCF_016028755.1_ASM1602875v1_genomic.gbff.gz 6 2053 2053 2053
Mount/CMSC829A/Data/gbff/GCF_002786455.2_ASM278645v2_genomic.gbff.gz 4 2373 2373 2373
Mount/CMSC829A/Data/gbff/GCF_004006435.1_ASM400643v1_genomic.gbff.gz 11 4004 4004 4004
Mount/CMSC829A/Data/gbff/GCF_001931595.1_ASM193159v1_genomic.gbff.gz 6 4624 4624 4624
Mount/CMSC829A/Data/gbff/GCF_003004215.1_ASM300421v1_genomic.gbff.gz 3 3130 3130 3130
Mount/CMSC829A/Data/gbff/GCF_001663755.1_ASM166375v1_genomic.gbff.gz 7 2495 2495 2495
Mount/CMSC829A/Data/gbff/GCF_005221385.1_ASM52213

Mount/CMSC829A/Data/gbff/GCF_003932035.1_ASM393203v1_genomic.gbff.gz 7 4309 4309 4309
Mount/CMSC829A/Data/gbff/GCF_002952035.2_ASM295203v2_genomic.gbff.gz 3 4464 4464 4464
Mount/CMSC829A/Data/gbff/GCF_008931605.1_ASM893160v1_genomic.gbff.gz 10 5734 5734 5734
Mount/CMSC829A/Data/gbff/GCF_016728105.1_ASM1672810v1_genomic.gbff.gz 12 2532 2532 2532
Mount/CMSC829A/Data/gbff/GCF_003952885.1_ASM395288v1_genomic.gbff.gz 5 2759 2759 2759
Mount/CMSC829A/Data/gbff/GCF_019856295.1_ASM1985629v1_genomic.gbff.gz 29 6262 6262 6262
Mount/CMSC829A/Data/gbff/GCF_017858835.1_ASM1785883v1_genomic.gbff.gz 63 5797 5797 5797
Mount/CMSC829A/Data/gbff/GCF_006337345.1_ASM633734v1_genomic.gbff.gz 0 1404 1404 1404
Mount/CMSC829A/Data/gbff/GCF_016027775.1_ASM1602777v1_genomic.gbff.gz 10 6416 6416 6416
Mount/CMSC829A/Data/gbff/GCF_017570225.1_ASM1757022v1_genomic.gbff.gz 13 4187 4187 4187
Mount/CMSC829A/Data/gbff/GCF_020422985.1_ASM2042298v1_genomic.gbff.gz 0 886 886 886
Mount/CMSC829A/Data/gbff/GCF_003597595.1_ASM3

Mount/CMSC829A/Data/gbff/GCF_000875695.1_ASM87569v1_genomic.gbff.gz 9 2976 2976 2976
Mount/CMSC829A/Data/gbff/GCF_003925975.1_ASM392597v1_genomic.gbff.gz 0 636 636 636
Mount/CMSC829A/Data/gbff/GCF_009729915.1_ASM972991v1_genomic.gbff.gz 3 4743 4743 4743
Mount/CMSC829A/Data/gbff/GCF_020547085.1_ASM2054708v1_genomic.gbff.gz 9 3089 3089 3089
Mount/CMSC829A/Data/gbff/GCF_008728935.1_ASM872893v1_genomic.gbff.gz 0 742 742 742
Mount/CMSC829A/Data/gbff/GCF_018223785.1_ASM1822378v1_genomic.gbff.gz 4 8105 8105 8105
Mount/CMSC829A/Data/gbff/GCF_001932555.1_ASM193255v1_genomic.gbff.gz 19 3179 3179 3179
Mount/CMSC829A/Data/gbff/GCF_008124325.1_ASM812432v1_genomic.gbff.gz 17 4286 4286 4286
Mount/CMSC829A/Data/gbff/GCF_010918635.1_ASM1091863v1_genomic.gbff.gz 11 3051 3051 3051
Mount/CMSC829A/Data/gbff/GCF_010232985.1_ASM1023298v1_genomic.gbff.gz 57 1524 1524 1524
Mount/CMSC829A/Data/gbff/GCF_900475195.1_42650_C01_genomic.gbff.gz 9 2109 2109 2109
Mount/CMSC829A/Data/gbff/GCF_018741145.1_ASM1874114v1_g