## Generate MetaCyc-dependent files

### protein_complexes.txt

In [33]:
import cobra
import os
import pandas as pd
import numpy as np
import urllib
import pythoncyc as pc
import re
from Bio.SeqUtils import seq3

## Input/output locations

met_output_file = 'metabolites.txt'
m_model_file = './iYO844.json'

m_model = cobra.io.load_json_model(m_model_file)

## Corrected gene reaction rules as (reaction id, rule) pairs,
## applied to the loaded model in the order listed.
corrected_gene_rules = [
    ('ACCOAC', '(BSU29200 and BSU29210) or BSU24350 or BSU24340 or BSU22440'),
    ('PDH', '(BSU14580 and BSU14590) or BSU14600 or BSU14610'),
    ('ACTD2', 'BSU08060 and BSU08070'),
    ('AIRC1', 'BSU06420 or BSU06430'),
    ('ANS', 'BSU00750 or BSU22680'),
    ('PRFGS_1', 'BSU06480 or BSU06470'),
    ('RNDR1', 'BSU17380 and BSU17390'),
    ('RNDR2', 'BSU17380 and BSU17390'),
    ('RNDR3', 'BSU17380 and BSU17390'),
    ('RNDR4', 'BSU17380 and BSU17390'),
]
for rxn_id, corrected_rule in corrected_gene_rules:
    m_model.reactions.get_by_id(rxn_id).gene_reaction_rule = corrected_rule


In [34]:
## PythonCyc handles for the 'bsub' and 'ecoli' organism databases.
bsub = pc.select_organism('bsub')
ecoli = pc.select_organism('ecoli')

In [35]:
## Proteins class of the bsub database; its .instances are iterated when writing protein_complexes.txt.
PROTEINS = bsub.proteins

In [36]:
## Write one row per protein frame of the database:
##   <frame id> TAB default_name TAB <gene(1) AND gene(1) ...> TAB M_protein_recon
## and remember, index-aligned, the frame ids (CPLX_list) and the gene lists
## (genes_all_cplxs) for the enzyme/reaction association step.
filename = 'protein_complexes.txt'
CPLX_list = []
genes_all_cplxs = []

with open(filename, 'w') as complexes_file:
    for cplx in PROTEINS.instances:
        ## Frame ids are wrapped in '|'; drop the first and last character.
        cplx_string = str(cplx.frameid)[1:-1]

        genes_of_cplx_comp = [
            str(gene)[1:-1]
            for gene in pc.PGDB.genes_of_protein(bsub, cplx)
        ]

        ## Joining (instead of appending ' AND ' and slicing it off again)
        ## keeps the 'default_name' column intact when a protein has no genes.
        stoichiometry_string = ' AND '.join(
            gene_string + '(1)' for gene_string in genes_of_cplx_comp
        )

        complexes_file.write(
            cplx_string + '\t' + 'default_name' + '\t'
            + stoichiometry_string + '\t' + 'M_protein_recon' + '\n'
        )

        CPLX_list.append(cplx_string)
        genes_all_cplxs.append(genes_of_cplx_comp)

### enzyme_reaction_association.txt

In [37]:
## Write, for every model reaction except EX_/DM_ ones, the row
##   <reaction id> TAB <complex OR complex ...>
## Complexes come from the database when a gene set of the reaction rule equals
## the gene set of a known complex; otherwise artificial complexes are created
## and appended to protein_complexes.txt.
filename = 'enzyme_reaction_association.txt'
prot_cplx_filename = 'protein_complexes.txt'

## A rule of exactly this length is treated as a single gene id.
standard_gene_length = 8

enz_rxn_assoc_list = []
artificial_cplxs = []
artificial_id = 0

with open(filename, 'w') as assoc_file, open(prot_cplx_filename, 'a') as prot_cplx_file:
    for reaction in m_model.reactions:
        if reaction.id[0:3] in ('EX_', 'DM_'):
            continue

        rule_string = str(reaction.gene_reaction_rule)

        if rule_string:
            ## Parentheses are dropped and the rule is read as OR-of-ANDs.
            rule_list = rule_string.replace('(', '').replace(')', '').split(' or ')

            reaction_cplx_list = []
            for rule in rule_list:
                rule_genes = set(rule.split(' and '))
                ## Compare against every known complex, including the last one
                ## (the former index range stopped one entry short).
                for ref_rule, rule_cplx in zip(genes_all_cplxs, CPLX_list):
                    if set(ref_rule) == rule_genes:
                        reaction_cplx_list.append(rule_cplx)

            enz_rxn_assoc_list.append(reaction_cplx_list)

            if reaction_cplx_list:
                ## NOTE(review): if only some rules match a known complex, the
                ## unmatched rules get no enzyme at all - confirm this is intended.
                enzymes = list(reaction_cplx_list)
            else:
                enzymes = []
                for rule in rule_list:
                    if len(rule) == standard_gene_length:
                        artificial_cplx = rule + '-MONOMER'
                        stoichiometry_string = rule + '(1)'
                    else:
                        artificial_id = artificial_id + 1
                        artificial_cplx = 'CPLX000-' + str(artificial_id)
                        ## join avoids the trailing blank the former 4-character
                        ## strip of ' AND ' left behind.
                        stoichiometry_string = ' AND '.join(
                            gene + '(1)' for gene in rule.split(' and ')
                        )
                    enzymes.append(artificial_cplx)

                    if artificial_cplx not in artificial_cplxs:
                        artificial_cplxs.append(artificial_cplx)
                        prot_cplx_file.write(
                            artificial_cplx + '\t' + 'default_name' + '\t'
                            + stoichiometry_string + '\t' + 'M_protein_recon' + '\n'
                        )

            enzyme_string = ' OR '.join(enzymes)
        else:
            ## TEST: reactions without a gene rule get this placeholder enzyme.
            enzyme_string = 'BSU07840-MONOMER'

        assoc_file.write(str(reaction.id) + '\t' + enzyme_string + '\n')

artificial_cplxs = list(set(artificial_cplxs))



## TUs_from_bsubcyc.txt

In [38]:
from Bio import SeqIO

## Write one row per transcription unit of the database with the columns
##   TU_id, start, stop, tss, strand, rho_dependent, sigma
## start/stop span the BSU genes listed in the unit's components; the strand is
## looked up in the GenBank record through the locus tag of one of those genes.
filename = 'TUs_from_bsubcyc.txt'

gb_filename = 'NC_000964.gb'
gb_file = SeqIO.read(gb_filename, 'gb')
full_seq = str(gb_file.seq)
element_types = {'CDS', 'rRNA', 'tRNA', 'ncRNA'}

## locus_tag -> '+'/'-' built once, instead of rescanning all features per TU.
## Pseudo features and features without a locus_tag are skipped.
strand_by_locus_tag = {}
for feature in gb_file.features:
    if feature.type not in element_types or 'pseudo' in feature.qualifiers:
        continue
    locus_tags = feature.qualifiers.get('locus_tag')
    if locus_tags:
        strand_by_locus_tag[locus_tags[0]] = '+' if feature.location.strand == 1 else '-'

head_string = '\t'.join(['TU_id', 'start', 'stop', 'tss', 'strand', 'rho_dependent', 'sigma']) + '\n'
TUs = bsub.transcription_units

with open(filename, 'w') as tu_file:
    tu_file.write(head_string)

    for tu_PFrame in TUs.instances:
        tu_data = pc.PToolsFrame.PFrame.get_frame_data(tu_PFrame)

        ## Start and stop
        positions = []
        ## Reset per TU so no gene of a previous unit is reused by accident.
        gene_id = None
        for gene_fid in tu_data.components:
            if 'BSU' in gene_fid:
                gene_PFrame = pc.PToolsFrame.PFrame(gene_fid, bsub, getFrameData=False, isClass=False)
                gene_data = pc.PToolsFrame.PFrame.get_frame_data(gene_PFrame)
                positions.append(gene_data.left_end_position)
                positions.append(gene_data.right_end_position)
                ## Only one gene is used for the strand (the last one wins); the
                ## others should have the same direction. The replace inserts
                ## '_' after 'BSU' and the slice drops the first and last character.
                gene_id = gene_fid.replace('BSU', 'BSU_')[1:-1]

        if positions:
            tu_start = min(positions)
            tu_stop = max(positions)
        else:
            tu_start = 0
            tu_stop = 0

        tu_tss = tu_stop
        tu_id = tu_data.frameid + '_from_RpoD_mono'
        tu_id = tu_id.replace('|', '').replace('-', '_')

        ## Strand
        ## Units without a BSU gene, or whose gene is not found in the GenBank
        ## record, fall back to '+' instead of inheriting the strand of the
        ## previously processed unit.
        strand = strand_by_locus_tag.get(gene_id, '+')

        row = [str(tu_id), str(tu_start), str(tu_stop), str(tu_tss), str(strand), 'True', 'RpoD_mono']
        tu_file.write('\t'.join(row) + '\n')

In [39]:
for instance in bsub.transcription_units.instances:
    if (instance.frameid == '|TU8J2-1577|') or (instance.frameid == '|TU8J2-467|'):
        tu_data = pc.PToolsFrame.PFrame.get_frame_data(tu_PFrame)
        print instance.frameid, tu_data.components

|TU8J2-1577| [u'|BS8J2-422|', u'|PM8J2-611|', u'|TERM8J2-736|', u'|BSU32470|', u'|BSU32480|', u'|BSU32490|', u'|BSU32500|', u'|BSU32510|']
|TU8J2-467| [u'|BS8J2-422|', u'|PM8J2-611|', u'|TERM8J2-736|', u'|BSU32470|', u'|BSU32480|', u'|BSU32490|', u'|BSU32500|', u'|BSU32510|']


## trna_to_codon dictionary

In [8]:

## Codon -> amino acid lookup holding three-letter codes produced by seq3
## ('*' is what gets passed to seq3 for the stop codons).
## The 64 codons are enumerated with every position cycling through T, C, A, G;
## the one-letter residue at the same running index belongs to that codon.
codon_bases = 'TCAG'
one_letter_residues = ('FFLLSSSSYY**CC*W'
                       'LLLLPPPPHHQQRRRR'
                       'IIIMTTTTNNKKSSRR'
                       'VVVVAAAADDEEGGGG')

tRNA_to_codon_table = {}
codon_index = 0
for first_base in codon_bases:
    for second_base in codon_bases:
        for third_base in codon_bases:
            codon = first_base + second_base + third_base
            tRNA_to_codon_table[codon] = seq3(one_letter_residues[codon_index])
            codon_index += 1


tRNA_to_codon_table

{'AAA': 'Lys',
 'AAC': 'Asn',
 'AAG': 'Lys',
 'AAT': 'Asn',
 'ACA': 'Thr',
 'ACC': 'Thr',
 'ACG': 'Thr',
 'ACT': 'Thr',
 'AGA': 'Arg',
 'AGC': 'Ser',
 'AGG': 'Arg',
 'AGT': 'Ser',
 'ATA': 'Ile',
 'ATC': 'Ile',
 'ATG': 'Met',
 'ATT': 'Ile',
 'CAA': 'Gln',
 'CAC': 'His',
 'CAG': 'Gln',
 'CAT': 'His',
 'CCA': 'Pro',
 'CCC': 'Pro',
 'CCG': 'Pro',
 'CCT': 'Pro',
 'CGA': 'Arg',
 'CGC': 'Arg',
 'CGG': 'Arg',
 'CGT': 'Arg',
 'CTA': 'Leu',
 'CTC': 'Leu',
 'CTG': 'Leu',
 'CTT': 'Leu',
 'GAA': 'Glu',
 'GAC': 'Asp',
 'GAG': 'Glu',
 'GAT': 'Asp',
 'GCA': 'Ala',
 'GCC': 'Ala',
 'GCG': 'Ala',
 'GCT': 'Ala',
 'GGA': 'Gly',
 'GGC': 'Gly',
 'GGG': 'Gly',
 'GGT': 'Gly',
 'GTA': 'Val',
 'GTC': 'Val',
 'GTG': 'Val',
 'GTT': 'Val',
 'TAA': 'Ter',
 'TAC': 'Tyr',
 'TAG': 'Ter',
 'TAT': 'Tyr',
 'TCA': 'Ser',
 'TCC': 'Ser',
 'TCG': 'Ser',
 'TCT': 'Ser',
 'TGA': 'Ter',
 'TGC': 'Cys',
 'TGG': 'Trp',
 'TGT': 'Cys',
 'TTA': 'Leu',
 'TTC': 'Phe',
 'TTG': 'Leu',
 'TTT': 'Phe'}

In [9]:
def get_key(my_dict, val):
    """Return a list of every key in my_dict whose value equals val.

    Keys appear in the dictionary's iteration order; the list is empty when
    no value matches.
    """
    return [key for key, value in my_dict.items() if val == value]



In [10]:
## For every tRNA frame of the database: derive an id from the frame id and
## collect the codons whose table entry equals the last three characters of
## the frame's common name.
trna_to_codon = dict()
for tRNA_PFrame in bsub.tRNAs.instances:
    tRNA_data = pc.PToolsFrame.PFrame.get_frame_data(tRNA_PFrame)

    ## Strip the '|' delimiters and the '-tRNA' part, then lower-case 'TRNA'.
    tRNA_id = str(tRNA_data.frameid)
    for old_text, new_text in (('|', ''), ('-tRNA', ''), ('TRNA', 'tRNA')):
        tRNA_id = tRNA_id.replace(old_text, new_text)

    aa_id = tRNA_data.common_name[-3:]

    trna_to_codon[tRNA_id] = get_key(tRNA_to_codon_table, aa_id)

trna_to_codon
    

{'BSU_tRNA_1': ['TTT', 'TTC'],
 'BSU_tRNA_10': ['ATG'],
 'BSU_tRNA_11': ['GAG', 'GAA'],
 'BSU_tRNA_12': ['GTA', 'GTC', 'GTG', 'GTT'],
 'BSU_tRNA_13': ['ACA', 'ACG', 'ACT', 'ACC'],
 'BSU_tRNA_14': ['AAG', 'AAA'],
 'BSU_tRNA_15': ['CTT', 'CTG', 'CTA', 'CTC', 'TTA', 'TTG'],
 'BSU_tRNA_16': ['GGT', 'GGG', 'GGA', 'GGC'],
 'BSU_tRNA_17': ['CTT', 'CTG', 'CTA', 'CTC', 'TTA', 'TTG'],
 'BSU_tRNA_18': ['AGG', 'AGA', 'CGA', 'CGG', 'CGT', 'CGC'],
 'BSU_tRNA_19': ['CCT', 'CCG', 'CCA', 'CCC'],
 'BSU_tRNA_2': ['GAT', 'GAC'],
 'BSU_tRNA_20': ['GCA', 'GCC', 'GCG', 'GCT'],
 'BSU_tRNA_21': ['ATG'],
 'BSU_tRNA_22': ['GAT', 'GAC'],
 'BSU_tRNA_23': ['AAC', 'AAT'],
 'BSU_tRNA_24': ['ACA', 'ACG', 'ACT', 'ACC'],
 'BSU_tRNA_25': ['GGT', 'GGG', 'GGA', 'GGC'],
 'BSU_tRNA_26': ['AGG', 'AGA', 'CGA', 'CGG', 'CGT', 'CGC'],
 'BSU_tRNA_27': ['CCT', 'CCG', 'CCA', 'CCC'],
 'BSU_tRNA_28': ['GCA', 'GCC', 'GCG', 'GCT'],
 'BSU_tRNA_29': ['AAC', 'AAT'],
 'BSU_tRNA_3': ['GAG', 'GAA'],
 'BSU_tRNA_30': ['AGC', 'AGT', 'TCT', 'TCG'

## protein_modification.txt

In [42]:
## Write protein_modification.txt: for every enzyme of a reaction that has one
## of the listed cofactors among its substrates, one row
##   <enzyme>_<cofactor>... TAB <enzyme> TAB <cofactor() AND ...> TAB M_protein_recon
filename = 'protein_modification.txt'

## Substrate frame id -> identifier written to the Modifications column.
## The magnesium key now carries the '|' delimiters like every other key; the
## bare 'MG+2' spelling could never equal a delimited substrate frame id.
## NOTE(review): '|NADP|' is mapped to 'nad' - confirm that 'nadp' is not intended.
cofactors = {'|ZN+2|':'zn2','|CU+2|':'cu2','|NAD|':'nad','|FAD|':'fad','|MG+2|':'mg2','|CA+2|':'ca2',
             '|Reduced-2Fe-2S-Ferredoxins|':'2fe2s','|Oxidized-ferredoxins|':'2fe2s',
             '|Reduced-ferredoxins|':'2fe2s','|NADP|':'nad','|PYRIDOXAL_PHOSPHATE|':'pydx5p',
             '|ADENOSYLCOBALAMIN|':'adocbl','|BIOTIN|':'btn'}

## Enzyme id -> cofactor identifiers, without duplicates, in order of first appearance.
cplx_cofactor_dict = dict()
RXNS = bsub.reactions
for rxn in RXNS.instances:
    rxn_data = pc.PToolsFrame.PFrame.get_frame_data(rxn)

    substrates = rxn_data.substrates
    rxn_cofactors = list(set(cofactors) & set(substrates))

    if not rxn_cofactors:
        continue

    for cplx in pc.PGDB.enzymes_of_reaction(bsub, rxn):
        ## Drop the first and last character (the '|' delimiters) of the enzyme id.
        cplx = str(cplx[1:-1])
        cplx_cofactors = cplx_cofactor_dict.setdefault(cplx, [])

        for cofactor in rxn_cofactors:
            if cofactors[cofactor] not in cplx_cofactors:
                cplx_cofactors.append(cofactors[cofactor])

with open(filename, 'w') as modification_file:
    modification_file.write('Modified_enzyme' + '\t' + 'Core_enzyme' + '\t' + 'Modifications' + '\t' + 'Source' + '\n')

    for cplx in cplx_cofactor_dict:
        cplx_cofactors = cplx_cofactor_dict[cplx]
        ## NOTE(review): the stoichiometry parentheses are written empty - confirm
        ## the consumer of this file accepts that.
        cofactor_string = ' AND '.join(cofactor + '()' for cofactor in cplx_cofactors)
        mod_string = ''.join('_' + cofactor for cofactor in cplx_cofactors)

        modification_file.write(
            cplx + mod_string + '\t' + cplx + '\t' + cofactor_string + '\t' + 'M_protein_recon' + '\n'
        )


In [41]:
## Display the enzyme id -> cofactor list mapping.
cplx_cofactor_dict

{'BSU00090-MONOMER': ['nad'],
 'BSU00810-MONOMER': ['nad'],
 'BSU02470-MONOMER': ['nad'],
 'BSU03050-MONOMER': ['nad'],
 'BSU03210-MONOMER': ['nad'],
 'BSU03270-MONOMER': ['nad'],
 'BSU03910-MONOMER': ['nad'],
 'BSU03990-MONOMER': ['nad'],
 'BSU04000-MONOMER': ['nad'],
 'BSU06150-MONOMER': ['nad'],
 'BSU06240-MONOMER': ['nad'],
 'BSU06620-MONOMER': ['nad'],
 'BSU07250-MONOMER': ['nad'],
 'BSU07830-MONOMER': ['nad'],
 'BSU08030-MONOMER': ['nad'],
 'BSU08090-MONOMER': ['nad'],
 'BSU10530-MONOMER': ['nad'],
 'BSU10850-MONOMER': ['nad'],
 'BSU11010-MONOMER': ['nad'],
 'BSU11190-MONOMER': ['nad'],
 'BSU11610-MONOMER': ['nad'],
 'BSU12290-MONOMER': ['nad'],
 'BSU12350-MONOMER': ['nad'],
 'BSU12380-MONOMER': ['nad'],
 'BSU13130-MONOMER': ['nad'],
 'BSU14060-MONOMER': ['nad'],
 'BSU14440-MONOMER': ['nad'],
 'BSU14610-MONOMER': ['nad'],
 'BSU15110-MONOMER': ['nad'],
 'BSU15230-MONOMER': ['nad'],
 'BSU15630-MONOMER': ['nad'],
 'BSU15750-MONOMER': ['2fe2s'],
 'BSU15910-MONOMER': ['nad'],
 'BSU161

In [13]:
## Display the reactions class of the bsub database.
RXNS

0
Class |Reactions| has 1405 instances


In [14]:
## Display the frame data of the gene most recently fetched in the transcription-unit loop.
gene_data

0,1
accession_1,BSU32510
centisome_position,79.22811
comment,[u'Evidence 1a: Function experimentally demonstrated in the studied strain; PubMedId: 11344136; Product type c: carrier']
common_name,pucA
component_of,"[u'|CHROM-1-70|', u'|TU8J2-467|']"
creation_date,3465237850
creator,|keseler|
dblinks,"{u'|STRING|': [u'224308.Bsubs1_010100017636', None, u'|kothari|', 3652461190, None, None], u'|DBTBS-GENES|': [u'pucA', None, u'|taltman|', 3484502256, None, None], u'|GENOLIST|': [u'BSU32510', None, u'|keseler|', 3489409638, None, None], u'|GOA|': [u'O32147', None, u'|keseler|', 3465237847, None, None], u'|INTERPRO|': [u'IPR003777', u'|IN-FAMILY|', u'|keseler|', 3465237847, None, None], u'|SUBTILISWIKI|': [u'pucA', None, u'|keseler|', 3488807606, None, None], u'|SUBTIWIKI|': [u'pucA', None, u'|keseler|', 3474754564, None, None]}"
frameid,|BSU32510|
key_slots,|COMMON-NAME|


In [16]:
## Display the complexes class of the 'ecoli' organism.
ecoli.complexes

0
Class |Complexes| has 1211 instances


In [29]:
for cplx in bsub.complexes.instances:
    if cplx.frameid == '|CPLX8J2-147|':
        ribosome = cplx
        ribosome_data = pc.PToolsFrame.PFrame.get_frame_data(ribosome)
        print ribosome.frameid

|CPLX8J2-147|


In [44]:
for gene in bsub.genes.instances:
    if gene.frameid == '|BSU01270|':
        test_gene = gene
        test_gene_data = pc.PToolsFrame.PFrame.get_frame_data(test_gene)
        print test_gene.frameid
test_gene_data

|BSU01270|


0,1
accession_1,BSU01270
centisome_position,3.3413227
comment,"[u'Evidence 1a: Function experimentally demonstrated in the studied strain; PubMedId: 11278078, 12682299; Product type s: structure']"
common_name,rplX
component_of,[u'|CHROM-1-2|']
creation_date,3465237857
creator,|keseler|
dblinks,"{u'|STRING|': [u'224308.Bsubs1_010100000655', None, u'|kothari|', 3652461192, None, None], u'|GENOLIST|': [u'BSU01270', None, u'|keseler|', 3489409637, None, None], u'|GOA|': [u'P12876', None, u'|keseler|', 3465237842, None, None], u'|INTERPRO|': [u'IPR014723', u'|IN-FAMILY|', u'|keseler|', 3465237842, None, None], u'|SUBTILISWIKI|': [u'rplX', None, u'|keseler|', 3488807606, None, None], u'|SUBTIWIKI|': [u'rplX', None, u'|keseler|', 3474754563, None, None]}"
frameid,|BSU01270|
key_slots,|COMMON-NAME|


In [53]:
for protein in bsub.proteins.instances:
    if protein.frameid == '|BSU01270-MONOMER|':
        test_protein = gene
        print test_protein.frameid
test_protein

|BSU30940|


0,1
accession_1,BSU30940
centisome_position,75.048164
comment,"[u'Evidence 1a: Function experimentally demonstrated in the studied strain; PubMedId: 15272305, 8145641; Product type e: enzyme']"
common_name,glgP
component_of,"[u'|CHROM-1-66|', u'|TU8J2-507|']"
creation_date,3465237850
creator,|keseler|
dblinks,"{u'|STRING|': [u'224308.Bsubs1_010100016826', None, u'|kothari|', 3652461192, None, None], u'|DBTBS-GENES|': [u'glgP', None, u'|taltman|', 3484502254, None, None], u'|GENOLIST|': [u'BSU30940', None, u'|keseler|', 3489409638, None, None], u'|GOA|': [u'P39123', None, u'|keseler|', 3465237847, None, None], u'|INTERPRO|': [u'IPR000811', u'|IN-FAMILY|', u'|keseler|', 3465237847, None, None], u'|SUBTILISWIKI|': [u'glgP', None, u'|keseler|', 3488807606, None, None], u'|SUBTIWIKI|': [u'glgP', None, u'|keseler|', 3474754564, None, None]}"
frameid,|BSU30940|
key_slots,|COMMON-NAME|
