In [None]:
import os
import gzip
import pickle

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [None]:
# Input locations, relative to this notebook's directory.
# bakta_files: directory whose entries are listed with os.listdir below; each
#   entry <genome> is expected to contain a file <genome>/<genome>.gff3.
#   The trailing '/' matters because paths are built by string concatenation.
bakta_files = '../../data/processed/bakta/'
# cd_hit_headers: text file read line by line; characters 1-12 of every line
#   are kept as the header (presumably FASTA-style '>' + locus tag; confirm).
cd_hit_headers = '../../data/processed/cd-hit-results/rep_headers.txt'
# header_to_allele: gzip-compressed pickle loaded with pd.read_pickle.
header_to_allele = '../../data/processed/cd-hit-results/header_to_allele_80.pickle.gz'

In [None]:
# Header -> allele-name lookup; later indexed as df_h2a[header] and the result
# is treated as a string (presumably a Series or dict keyed by header; confirm).
# NOTE(review): unpickling executes arbitrary code; only load pickles produced
# by this project's own pipeline.
df_h2a = pd.read_pickle(header_to_allele)

In [None]:
# Read the representative headers, keeping characters 1-12 of every line
# (index 0 is dropped; presumably the FASTA '>' marker; confirm).
# The context manager closes the file; the original left the handle open.
# Blank lines are skipped so they cannot produce an empty header that would
# fail the locus-tag lookup further down.
with open(cd_hit_headers) as fh:
    headers = [line[1:13] for line in fh if line.strip()]

In [None]:
# Debug helper: report every genome directory whose GFF3 mentions 'JDDBHE'
# (presumably a locus-tag prefix; confirm) within its first 10,000 characters.
for genome in tqdm(os.listdir(bakta_files)):
    # The context manager closes each handle; the original leaked one per genome.
    # A freshly opened file is already at offset 0, so no seek is needed.
    with open(bakta_files + genome + '/' + genome + '.gff3') as gff:
        text = gff.read(10000)
    if 'JDDBHE' in text:
        print('found')
        print(genome)

In [None]:
# Map the 6-character token that follows the first 'locus_tag=' in each
# genome's GFF3 to that genome's directory name.
tag_to_genome = {}
for genome in tqdm(os.listdir(bakta_files)):
    # Only the first 10,000 characters are inspected; the handle is closed
    # as soon as they are read (the original never closed it).
    with open(bakta_files + genome + '/' + genome + '.gff3') as gff:
        text = gff.read(10000)
    loc = text.find('locus_tag=')
    if loc == -1:
        # str.find returns -1 on a miss; slicing from that offset would
        # register an arbitrary 6-character string as this genome's tag.
        print('no locus_tag= in the first 10000 characters of ' + genome + '; skipped')
        continue
    # len('locus_tag=') == 10, so the tag occupies the next six characters.
    tag = text[loc+10:loc+16]
    tag_to_genome[tag] = genome

In [None]:
# Scratch cell: displays a path built from `genome`, which is only defined as
# the leftover loop variable of an earlier cell.
# NOTE(review): '.gbff3' matches no extension used elsewhere in this notebook
# (the loops read '.gff3'); looks like a typo. Confirm or delete this cell.
bakta_files + genome + '/' + genome + '.gbff3'

In [None]:
# For every representative header, find the first line of its genome's GFF3
# that contains the header text and store that line's tab-separated fields.
annotations = {}
for header in tqdm(headers):
    # The part of the header before the first '_' selects the genome.
    genome = tag_to_genome[header.split('_')[0]]
    file = bakta_files + genome + '/' + genome + '.gff3'
    with open(file, 'r') as f:
        # Iterating the handle stops at end of file. The original readline()
        # loop never terminated when the header was absent, because
        # readline() keeps returning '' once the file is exhausted.
        for line in f:
            if header in line:
                annotations[header] = line.split('\t')
                break
        else:
            # Reached only when no line matched: fail loudly instead of hanging.
            raise LookupError('header ' + header + ' not found in ' + file)

In [None]:
# One row per representative header; the nine columns follow the order of
# the tab-separated fields stored for each matched GFF3 line.
df_annot = pd.DataFrame.from_dict(annotations, orient='index',
                                  columns=['contig', 'Software', 'Region_Type', 'Start',
                                           'End', 'Score', 'Strand', 'Frame', 'Attribute'])
# GFF3 start/end are 1-based and inclusive, so the feature length is
# End - Start + 1 (the original omitted the +1 and was one base short).
# Column arithmetic replaces the row-wise apply and also works on an empty frame.
df_annot['Length(BP)'] = df_annot['End'].astype(int) - df_annot['Start'].astype(int) + 1

def get_product(x):
    """Return the value of the ``product=`` attribute in a GFF3 attribute string.

    Parameters
    ----------
    x : str
        Semicolon-separated ``key=value`` attributes, optionally ending in a
        line terminator.

    Returns
    -------
    str or None
        The product text (possibly empty), or ``None`` when the string has no
        ``product=`` attribute.

    Notes
    -----
    The attribute is located by key rather than by position. The original
    took the fourth field unconditionally, which raised ``IndexError`` on
    shorter strings and on an empty product value, and returned unrelated
    text when the attribute order differed.
    """
    for field in x.rstrip('\r\n').split(';'):
        if field.startswith('product='):
            return field[len('product='):]
    return None

# Derive the Product column by running get_product over each Attribute value.
df_annot['Product'] = df_annot['Attribute'].apply(get_product)

def get_name(x):
    """Return the value of the ``gene=`` attribute in a GFF3 attribute string.

    Parameters
    ----------
    x : str
        Semicolon-separated ``key=value`` attributes, optionally ending in a
        line terminator.

    Returns
    -------
    str or None
        The gene value, or ``None`` when the string has no ``gene=`` attribute.

    Notes
    -----
    Matching is done per attribute, so a key that merely ends in ``gene=``
    (for example ``pseudogene=``) is not mistaken for the gene symbol. The
    line terminator is stripped explicitly; the original dropped the final
    character unconditionally when ``gene=`` was the last attribute, which
    truncated the value if no newline was present.
    """
    for field in x.rstrip('\r\n').split(';'):
        if field.startswith('gene='):
            return field[len('gene='):]
    return None

# Gene symbol taken from each attribute string (None where absent).
df_annot['Name'] = df_annot['Attribute'].apply(get_name)
# Move the header index into a regular column called Ref_allele.
df_annot = df_annot.reset_index(names=['Ref_allele'])
# Gene id is whatever precedes the first 'A' in the allele name looked up for
# each header (presumably names look like <gene>A<allele number>; confirm).
df_annot['Gene'] = [df_h2a[ref].split('A')[0] for ref in df_annot['Ref_allele']]
df_annot = df_annot.set_index('Gene')

In [None]:
# Columns kept for the exported table, in output order.
return_columns = ['Ref_allele', 'Length(BP)', 'Product', 'Name', 'Attribute']
df_return = df_annot[return_columns]
df_return

In [None]:
# How often each product description occurs.
df_annot['Product'].value_counts()

In [None]:
# How often each gene symbol occurs (value_counts omits missing names by default).
df_annot['Name'].value_counts()

In [None]:
# Write the selected annotation columns (indexed by Gene) to disk as CSV.
output_csv = '../../data/processed/bakta_gene_annotations.csv'
df_return.to_csv(output_csv)