In [8]:
import os
import re
import subprocess

import numpy as np
import pandas as pd
import requests
from Bio import SeqIO


def ko2genes(ko):
    """Fetch the genes linked to a KEGG orthology (KO) identifier.

    Queries the KEGG REST ``link/genes`` endpoint and parses the tab-separated
    response, one ``<ko>\\t<gene>`` pair per line.

    Parameters
    ----------
    ko : str
        KO identifier appended to the request URL (e.g. ``'K00001'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'ko'`` and ``'gene'``, one row per non-blank response line.
        Empty (but with both columns present) when the response has no entries.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = 'http://rest.kegg.jp/link/genes/' + ko
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an error page as if it were gene data.
    r.raise_for_status()
    # Skip blank lines explicitly rather than relying on slicing off a trailing
    # empty row, which breaks when the response body is empty.
    rows = [line.split('\t') for line in r.text.splitlines() if line.strip()]
    return pd.DataFrame(rows, columns=['ko', 'gene'])

def download_seqs(geneList,fileName,batchSize):
    """Download amino-acid sequences for KEGG genes into one FASTA file.

    The gene identifiers are requested from the KEGG REST ``get/.../aaseq``
    endpoint in consecutive batches of ``batchSize`` and the responses are
    written to ``fileName`` in request order. ``fileName`` is overwritten; an
    empty ``geneList`` produces an empty file without any request.

    Parameters
    ----------
    geneList : list of str
        KEGG gene identifiers; each batch is joined with ``'+'`` in the URL.
    fileName : str
        Path of the FASTA file to write.
    batchSize : int
        Number of genes per request; must be at least 1.

    Raises
    ------
    ValueError
        If ``batchSize`` is smaller than 1.
    requests.HTTPError
        If a request returns an error status. Batches written before the
        failure remain in the file.
    """
    if batchSize < 1:
        raise ValueError('batchSize must be a positive integer')

    with open(fileName,'w') as fastafile:
        for start in range(0, len(geneList), batchSize):
            batch = geneList[start:start + batchSize]
            url = 'http://rest.kegg.jp/get/' + "+".join(batch) + '/aaseq'
            response = requests.get(url, timeout=60)
            # Never write an error page into the FASTA file.
            response.raise_for_status()
            fseqs = response.text
            # Guarantee a line break between batches so the last sequence of
            # one response cannot run into the next record's header.
            if fseqs and not fseqs.endswith('\n'):
                fseqs += '\n'
            fastafile.write(fseqs)


In [51]:
# Residue-level annotation table; the first CSV column is used as the row index.
# Columns referenced later in this notebook: 'gene name', 'Coenzyme Bound Structure',
# 'KO', 'Homolog-KEGG' and 'Residues'.
structure = pd.read_csv('../assets/structure/oxidoreductases_residue_annotation.csv',index_col=0)

In [52]:
# Organism table; its `kegg_id` column is used below to keep only genes whose
# organism prefix appears in this list.
enterobacteraceae = pd.read_csv('../assets/kegg/enterobacteria.csv')

In [33]:
# Gene names used to subset the annotation table in the following cell.
genes = ['leuB','pdxB','gdhA','gatD','lgoD','fucO','mdh','icd','paaH','gapA']

In [34]:
# Restrict the annotation table to the rows whose 'gene name' is in `genes`.
s = structure.loc[structure['gene name'].isin(genes)]

In [233]:
# Work on an independent copy of the full annotation table. This reassigns `s`,
# so the gene-restricted subset built earlier is no longer used from here on.
s = structure.copy()

In [234]:
# Unique KOs that have a coenzyme-bound structure annotated.
# pandas >= 2.0 parses the CSV text "None" as a missing value by default, so a
# plain `!= 'None'` comparison would keep every row there. Excluding missing
# values as well makes the filter behave the same on old and new pandas.
KO = s.loc[
    s['Coenzyme Bound Structure'].notna() & s['Coenzyme Bound Structure'].ne('None'),
    'KO',
].unique().tolist()

In [236]:
len(KO)

76

In [237]:
# Download, for every KO, the protein sequences of its genes that belong to the
# organisms listed in `enterobacteraceae`, one FASTA file per KO.

# Organism codes are loop-invariant: build them once, as a set for fast lookup.
enterobacteria_ids = set(enterobacteraceae.kegg_id.tolist())
fasta_dir = '../assets/kegg/orthogroups_family/'
os.makedirs(fasta_dir, exist_ok=True)

for ko in KO:
    # Named `ko_genes` so the `genes` list defined in an earlier cell is not
    # overwritten with a DataFrame.
    ko_genes = ko2genes(ko)
    # KEGG gene ids look like '<organism>:<locus>'; keep the organism prefix.
    ko_genes['species'] = ko_genes['gene'].str.split(':', n=1).str[0]
    ge = ko_genes[ko_genes.species.isin(enterobacteria_ids)]
    gl = ge.gene.unique().tolist()
    # Sequences are requested in batches of 10 genes per call.
    download_seqs(gl, fasta_dir + ko + '_enterobacteriaceae.fasta', 10)
    print('finished downloading :' + ko + '; num seqs: ' + str(len(gl)))

finished downloading :K12524; num seqs: 210
finished downloading :K00215; num seqs: 214
finished downloading :K00382; num seqs: 263
finished downloading :K00099; num seqs: 212
finished downloading :K04073; num seqs: 90
finished downloading :K00121; num seqs: 207
finished downloading :K00286; num seqs: 215
finished downloading :K00077; num seqs: 319
finished downloading :K01491; num seqs: 213
finished downloading :K00384; num seqs: 213
finished downloading :K12972; num seqs: 232
finished downloading :K00059; num seqs: 463
finished downloading :K02492; num seqs: 202
finished downloading :K00208; num seqs: 215
finished downloading :K09472; num seqs: 120
finished downloading :K03778; num seqs: 203
finished downloading :K13953; num seqs: 189
finished downloading :K00027; num seqs: 223
finished downloading :K08324; num seqs: 197
finished downloading :K16066; num seqs: 201
finished downloading :K05887; num seqs: 154
finished downloading :K06447; num seqs: 229
finished downloading :K00262; num

In [246]:
# Build a multiple sequence alignment for every KO FASTA file with MUSCLE.

# The binary location can be overridden with the MUSCLE_BIN environment
# variable; the default is the path this notebook was originally run with.
muscle_bin = os.environ.get('MUSCLE_BIN', '/Users/Joshua.Goldford/opt/miniconda3/bin/muscle')
fasta_dir = '../assets/kegg/orthogroups_family/'
# The residue-conservation cell reads the alignments from the `msa/` subfolder,
# so write them there instead of next to the unaligned FASTA files.
msa_dir = fasta_dir + 'msa/'
os.makedirs(msa_dir, exist_ok=True)

for ko in KO:
    fin = fasta_dir + ko + '_enterobacteriaceae.fasta'
    fout = msa_dir + ko + '_enterobacteriaceae.MSA.fasta'
    # Pass an argument list (no shell) so file names are never re-parsed by a shell.
    completed = subprocess.run([muscle_bin, '-in', fin, '-out', fout])
    if completed.returncode != 0:
        # Report the failure and move on instead of claiming success.
        print('muscle failed for ' + ko + ' (exit code ' + str(completed.returncode) + ')')
        continue
    # Count FASTA headers in the alignment to report how many sequences it holds.
    with open(fout) as handle:
        n_aligned = sum(1 for line in handle if line.startswith('>'))
    print('finished with multiple sequence alignment: ' + ko + '; num seqs: ' + str(n_aligned))

finished with mutiple sequence alignment: K12524; num seqs: 
finished with mutiple sequence alignment: K00215; num seqs: 
finished with mutiple sequence alignment: K00382; num seqs: 
finished with mutiple sequence alignment: K00099; num seqs: 
finished with mutiple sequence alignment: K04073; num seqs: 
finished with mutiple sequence alignment: K00121; num seqs: 
finished with mutiple sequence alignment: K00286; num seqs: 
finished with mutiple sequence alignment: K00077; num seqs: 
finished with mutiple sequence alignment: K01491; num seqs: 
finished with mutiple sequence alignment: K00384; num seqs: 
finished with mutiple sequence alignment: K12972; num seqs: 
finished with mutiple sequence alignment: K00059; num seqs: 
finished with mutiple sequence alignment: K02492; num seqs: 
finished with mutiple sequence alignment: K00208; num seqs: 
finished with mutiple sequence alignment: K09472; num seqs: 
finished with mutiple sequence alignment: K03778; num seqs: 
finished with mutiple se

In [254]:
# For every KO, locate the annotated residues of the reference homolog inside
# the KO's multiple sequence alignment and measure how often the reference
# residue occurs in the same alignment column across all aligned sequences.
#
# `results` columns:
#   ko     - KO identifier
#   residue- residue of the reference sequence at the annotated position
#   idx    - position in the ungapped reference sequence (1-based)
#   midx   - 0-based column in the alignment
#   count  - sequences with the same character as the reference in that column
#   ncount - sequences with any other character (gaps '-' included)
#   frac   - count / (count + ncount)
#
# NOTE(review): the residue letter parsed from the annotation is not compared
# with the reference sequence; only the numeric position is used - confirm
# that this is intended.
records = []

for ko in KO:
    seq_dict = SeqIO.to_dict(SeqIO.parse('../assets/kegg/orthogroups_family/msa/' + ko + '_enterobacteriaceae.MSA.fasta','fasta'))

    # First annotation row for this KO. `.iloc[0]` is positional, so it works
    # whatever the index labels are (a bare `[0]` on the column is a label
    # lookup and only worked through pandas' deprecated positional fallback).
    ko_annotation = s[s.KO == ko].iloc[0]
    seq_id = ko_annotation['Homolog-KEGG']
    if seq_id not in seq_dict:
        # Reference homolog is not part of this alignment: nothing to map.
        continue

    residue_field = ko_annotation['Residues']
    if not isinstance(residue_field, str):
        # Missing annotation (NaN) - no residues to evaluate.
        continue
    labels = [x.strip() for x in residue_field.split(';')]
    labels = [x for x in labels if x and x != 'Unclear']
    if len(labels) == 0:
        continue

    r = pd.DataFrame({'residue': [get_resid(y) for y in labels],'index':[get_resid_idx(y) for y in labels]})
    seq = str(seq_dict[seq_id].seq)
    # Map every alignment column of the reference to its ungapped position.
    seqdf = msaSeq2df(seq,1)
    sdf = seqdf[seqdf['index'].isin(r['index'].tolist())]
    for midx,row in sdf.iterrows():
        # Characters of all aligned sequences in this alignment column.
        chars = [y.seq[midx] for y in seq_dict.values()]
        column_counts = pd.Series(chars).value_counts()
        n_match = column_counts[row.residue]
        n_total = column_counts.sum()
        records.append({
            'ko': ko,
            'residue': row.residue,
            'idx': row['index'],
            'midx': midx,
            'count': n_match,
            'ncount': n_total - n_match,
            'frac': n_match / n_total,
        })

# Explicit columns keep the table shape stable even when no residue was mapped.
results = pd.DataFrame(records, columns=['ko', 'residue', 'idx', 'midx', 'count', 'ncount', 'frac'])

In [257]:
results

Unnamed: 0,ko,residue,idx,midx,count,ncount,frac
0,K00215,E,38.0,55,198,16,0.925234
1,K00099,V,20.0,49,207,5,0.976415
2,K00099,L,47.0,76,199,13,0.938679
3,K00099,E,48.0,77,198,14,0.933962
4,K00077,R,31.0,55,200,119,0.626959
5,K00384,H,176.0,223,213,0,1.0
6,K00384,R,177.0,224,212,1,0.995305
7,K00384,F,181.0,228,212,1,0.995305
8,K12972,G,174.0,194,191,41,0.823276
9,K12972,Q,176.0,196,102,130,0.439655


In [259]:
results

Unnamed: 0,ko,residue,idx,midx,count,ncount,frac
0,K00215,E,38.0,55,198,16,0.925234
1,K00099,V,20.0,49,207,5,0.976415
2,K00099,L,47.0,76,199,13,0.938679
3,K00099,E,48.0,77,198,14,0.933962
4,K00077,R,31.0,55,200,119,0.626959
5,K00384,H,176.0,223,213,0,1.0
6,K00384,R,177.0,224,212,1,0.995305
7,K00384,F,181.0,228,212,1,0.995305
8,K12972,G,174.0,194,191,41,0.823276
9,K12972,Q,176.0,196,102,130,0.439655


In [112]:
def msaSeq2df(seq,indexStart):
    """Tabulate an aligned sequence with its ungapped residue numbering.

    Parameters
    ----------
    seq : iterable of str
        Aligned sequence, one character per alignment column; ``'-'`` marks a gap.
    indexStart : int
        Number assigned to the first non-gap character.

    Returns
    -------
    pandas.DataFrame
        One row per alignment column with ``'residue'`` (the character) and
        ``'index'`` (running residue number, ``NaN`` for gap columns).
    """
    symbols = list(seq)
    numbering = []
    next_number = indexStart
    for symbol in symbols:
        if symbol != '-':
            numbering.append(next_number)
            next_number += 1
        else:
            # Gap columns do not advance the residue counter.
            numbering.append(np.nan)

    return pd.DataFrame({'residue': symbols, 'index': numbering})
    
def get_resid(x):
    """Return the first non-digit character of ``x`` (e.g. ``'D'`` for ``'D34'``).

    Raises ``IndexError`` when ``x`` contains no non-digit character.
    """
    non_digits = re.findall(r'\D', x)
    return non_digits[0]


def get_resid_idx(x):
    """Return the first run of digits in ``x`` as an int (e.g. ``34`` for ``'D34'``).

    Raises ``IndexError`` when ``x`` contains no digit.
    """
    # r'\d*' also yields empty matches between non-digits; keep real runs only.
    digit_runs = [run for run in re.findall(r'\d*', x) if run]
    return int(digit_runs[0])

In [172]:
# Debug display of `r`, which is only assigned inside the residue-conservation
# loop; this cell shows whatever the last processed KO left in the kernel.
r

Unnamed: 0,residue,index
0,D,34


In [190]:
# NOTE(review): `msa_index` is not assigned in any cell shown here; this cell
# relies on leftover kernel state and fails on a fresh Restart & Run All.
msa_index[0]

60

In [210]:
# Debug display: number of occurrences of each character in `chars`.
# `chars` is only assigned inside the residue-conservation loop, so this shows
# the alignment column left over from the last loop iteration that ran.
pd.DataFrame({'idx':chars}).reset_index().groupby('idx').count()

Unnamed: 0_level_0,index
idx,Unnamed: 1_level_1
-,5
D,392
