In [12]:
import pandas as pd
import numpy as np
import requests
import re
from Bio import SeqIO
import subprocess
import cobra

def ko2genes(ko):
    """Return the genes that KEGG links to one KEGG Orthology (KO) entry.

    Queries the KEGG REST ``link/genes/<ko>`` endpoint and parses the body,
    which is read as one tab-separated record per line.

    Parameters
    ----------
    ko : str
        Identifier appended verbatim to the request URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``'ko'`` and ``'gene'``, one row per record. The frame is
        empty (but still has both columns) when the body holds no records.

    Raises
    ------
    requests.HTTPError
        If KEGG answers with an HTTP error status.
    """
    url = 'http://rest.kegg.jp/link/genes/' + ko
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an error page as if it were records.
    r.raise_for_status()
    # Skip blank lines instead of blindly dropping the last row: a body with
    # no records used to make the DataFrame constructor raise, and a body
    # without a final newline used to lose its last record.
    records = [line.split('\t') for line in r.text.split('\n') if line.strip()]
    return pd.DataFrame(records, columns=['ko', 'gene'])

def download_seqs(geneList,fileName,batchSize):
    """Download amino-acid sequences for KEGG genes into a single FASTA file.

    The genes are requested in consecutive batches from the KEGG REST
    ``get/<entries>/aaseq`` endpoint and each response body is appended to
    ``fileName`` in request order. ``fileName`` is overwritten if it exists.

    Parameters
    ----------
    geneList : list of str
        KEGG gene identifiers; the entries of a batch are joined with ``'+'``.
    fileName : str
        Path of the FASTA file to write.
    batchSize : int
        Number of genes per request. Values above 10 are capped at 10 because
        the KEGG API manual limits ``get`` to 10 entries per request; larger
        batches would come back truncated.

    Raises
    ------
    ValueError
        If ``batchSize`` is smaller than 1.
    requests.HTTPError
        If KEGG answers a batch with an HTTP error status.
    """
    if batchSize < 1:
        raise ValueError('batchSize must be a positive integer')
    # KEGG `get` accepts at most 10 entries per call.
    step = min(batchSize, 10)

    with open(fileName, 'w') as fastafile:
        for start in range(0, len(geneList), step):
            batch = geneList[start:start + step]
            url = 'http://rest.kegg.jp/get/' + "+".join(batch) + '/aaseq'
            response = requests.get(url, timeout=60)
            # Never write an error page into the FASTA file.
            response.raise_for_status()
            fseqs = response.text
            fastafile.write(fseqs)
            # Keep the next batch's header from being glued onto this
            # batch's last sequence line.
            if fseqs and not fseqs.endswith('\n'):
                fastafile.write('\n')

            
def msaSeq2df(seq,indexStart):
    """Tabulate an aligned sequence together with ungapped residue positions.

    ``seq`` is walked one symbol at a time. Every symbol other than ``'-'``
    receives the next position, counting up from ``indexStart``; every
    ``'-'`` receives NaN and leaves the counter untouched.

    Returns a DataFrame with one row per symbol and the columns
    ``'residue'`` (the symbol) and ``'index'`` (its position or NaN).
    """
    symbols = list(seq)
    positions = []
    next_position = indexStart
    for symbol in symbols:
        if symbol == '-':
            # Gap column: no residue here, so no position is consumed.
            positions.append(np.nan)
            continue
        positions.append(next_position)
        next_position = next_position + 1

    return pd.DataFrame({'residue': symbols, 'index': positions})

def ecogenes():
    """Return the gene-to-KO links that KEGG lists for organism ``eco``.

    Queries the KEGG REST ``link/ko/eco`` endpoint and parses the body,
    which is read as one tab-separated record per line.

    Returns
    -------
    pandas.DataFrame
        Columns ``'gene'`` and ``'ko'``, one row per record. The frame is
        empty (but still has both columns) when the body holds no records.

    Raises
    ------
    requests.HTTPError
        If KEGG answers with an HTTP error status.
    """
    url = 'http://rest.kegg.jp/link/ko/eco'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an error page as if it were records.
    r.raise_for_status()
    # Skip blank lines instead of blindly dropping the last row, so an empty
    # body no longer raises and a body without a final newline keeps its
    # last record.
    records = [line.split('\t') for line in r.text.split('\n') if line.strip()]
    return pd.DataFrame(records, columns=['gene', 'ko'])


def _fetch_kegg_link(url, columns):
    """Download a KEGG REST ``link`` table and parse it into a two-column DataFrame."""
    r = requests.get(url, timeout=120)
    # Fail loudly instead of parsing an error page as if it were records.
    r.raise_for_status()
    # Blank lines (notably the one after the final newline) are not records.
    rows = [line.split('\t') for line in r.text.split('\n') if line.strip()]
    return pd.DataFrame(rows, columns=columns)


def get_cpd_kos(cpd,kind):
    """Return the KO-to-EC links whose EC number is linked to a KEGG compound.

    Two KEGG REST tables are downloaded: the EC numbers linked to ``cpd``
    (``link/ec/cpd:<cpd>``) and the complete KO-to-EC table (``link/ec/ko``).
    The KO-to-EC rows whose EC number appears in the first table are returned.

    Parameters
    ----------
    cpd : str
        Compound identifier without the ``cpd:`` prefix, e.g. ``'C00024'``.
    kind : str
        ``'unique'`` first discards every EC number that is listed for more
        than one row of the KO-to-EC table. Any other value applies no such
        filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``'ko'``, ``'ec'`` and ``'cpd'`` (``cpd`` repeated on every
        row). Row labels are those of the downloaded KO-to-EC table.

    Raises
    ------
    requests.HTTPError
        If KEGG answers either request with an HTTP error status.
    """
    # EC numbers associated with the compound.
    cpd2ec = _fetch_kegg_link('http://rest.kegg.jp/link/ec/cpd:' + cpd, ['cpd', 'ec'])
    # Every KO-to-EC link known to KEGG.
    ko2ec = _fetch_kegg_link('http://rest.kegg.jp/link/ec/ko', ['ko', 'ec'])

    if kind == 'unique':
        # keep=False flags every occurrence of a repeated EC number, so the
        # negation keeps only EC numbers that occur exactly once.
        ko2ec = ko2ec[~ko2ec.ec.duplicated(keep=False)]

    # .copy() so the new column is written to an independent frame rather
    # than to a slice of the downloaded table.
    hits = ko2ec[ko2ec.ec.isin(cpd2ec.ec.tolist())].copy()
    hits['cpd'] = cpd
    return hits
    
def get_resid(x):
    """Return the first non-digit character of ``x`` (``'A'`` for ``'A123'``).

    Raises IndexError when ``x`` contains no non-digit character.
    """
    return re.findall(r'\D', x)[0]


def get_resid_idx(x):
    """Return the first run of digits in ``x`` as an int (``123`` for ``'A123'``).

    Raises IndexError when ``x`` contains no digit.
    """
    # `\d+` never yields empty matches, so no post-filtering is needed.
    return int(re.findall(r'\d+', x)[0])


#taxonomy = pd.read_csv('../assets/kegg/KEGG_taxonomy_final.csv')



In [13]:
# KO-EC links (EC numbers listed for a single KO only) for KEGG compounds
# C00024 and C00010; the C00024 table is then restricted to the KOs that
# also occur in the C00010 table.
c24 = get_cpd_kos('C00024', 'unique')
c10 = get_cpd_kos('C00010', 'unique')
c24 = c24.loc[c24['ko'].isin(c10['ko'])]

In [21]:
# KO-EC links for the four nicotinamide cofactor compounds, without the
# single-KO filter. The requests are issued in the order listed.
nad, nadh, nadp, nadph = (
    get_cpd_kos(compound, 'all')
    for compound in ('C00003', 'C00004', 'C00006', 'C00005')
)


In [27]:
# KOs present in both the nad and nadh tables, plus KOs present in both the
# nadp and nadph tables, de-duplicated.
kos = list(set(
    nad.loc[nad['ko'].isin(nadh['ko'].tolist()), 'ko'].tolist()
    + nadp.loc[nadp['ko'].isin(nadph['ko'].tolist()), 'ko'].tolist()
))

In [32]:
# Drop every row of c24 whose KO is in the `kos` list.
c24_nrd = c24.loc[~c24['ko'].isin(kos)]

In [42]:
# Per-KO gene tables, capped at 10 randomly drawn genes per KO.
# NOTE(review): c24_nrd holds one row per KO-EC link, so a KO could in
# principle be fetched more than once -- confirm whether that can happen.
gls = []
# Only the KO column is needed; the id is the part after the "ko:" prefix.
for ko in c24_nrd.ko.str.split(":").str[1]:
    genes  = ko2genes(ko)
    if len(genes) > 10:
        # Fixed seed: without it every run draws a different subsample, so
        # the FASTA file built from it could not be reproduced.
        gl = genes.sample(10, random_state=0)
    else:
        gl = genes.copy()
    gls.append(gl)

In [54]:
# Complete (un-subsampled) gene table of every KO in c24_nrd.
# The list gets its own name on purpose: storing it in `gls` would overwrite
# the per-KO subsample kept there, and a top-to-bottom run of the notebook
# would then build `g` (and the "subsample" FASTA) from the full gene lists.
gls_all = [ko2genes(ko) for ko in c24_nrd.ko.str.split(":").str[1]]

g_all = pd.concat(gls_all,axis=0)

In [55]:
# Number of rows (KO-gene links) in the complete table.
g_all.shape[0]

79919

In [44]:
# Stack the per-KO gene tables held in `gls` into one frame.
g = pd.concat(gls, axis='index')

In [49]:
# Gene identifiers of `g` as a plain Python list.
genes = g['gene'].tolist()

In [52]:
# Number of distinct KOs represented in `g`.
len(pd.unique(g['ko']))

66

In [53]:
# Fetch the sequences of the genes in `genes`, 10 per request.
download_seqs(
    geneList=genes,
    fileName='../assets/coa_seqs/coa_binding_ko_subsample.fasta',
    batchSize=10,
)

In [56]:
# Fetch the sequences of every gene in the complete table, 10 per request.
genes = g_all['gene'].tolist()
download_seqs(
    geneList=genes,
    fileName='../assets/coa_seqs/coa_binding.fasta',
    batchSize=10,
)

In [58]:
# Persist the complete KO-gene table (row labels are written as well).
g_all.to_csv(path_or_buf='../assets/coa_seqs/coa_binding_ko_df.csv')

In [59]:
# Row count of the table that was just written out.
g_all.shape[0]

79919