## Modify inputs

In [5]:
import pandas as pd
import re
import Bio
from Bio import SeqIO
fasta_file = Bio.SeqIO.parse("../inputs/_sequences.fasta", "fasta")
contigs = [ i for i in fasta_file ]
gene_dct = pd.read_csv('../inputs/genes.txt',index_col=0,sep='\t')['Accession-1'].dropna().to_dict()
for f in contigs:
    geneid = f.name.split('|')[1]
    if geneid in gene_dct:
        f.description += '|{}'.format(gene_dct[geneid])

In [6]:
with open('../inputs/sequences.fasta', 'w') as outfile:
    for contig in contigs:
        Bio.SeqIO.write(contig, outfile, 'fasta')

## Get M-model

In [7]:
from cobrame.io.json import load_reduced_json_me_model

In [8]:
import cobra

In [9]:
model = cobra.io.load_json_model('../inputs/_m_model.json')

In [10]:
clostrime = load_reduced_json_me_model('./iJL965_ME_reduced.json')

In [11]:
cplxs = pd.read_csv('./iJL965_complexes.txt',index_col=0,sep='\t')

In [12]:
# model = cobra.core.Model('iJL680_inferred')

In [9]:
for r in clostrime.reactions.query('FWD|EX'):
    rxn = cobra.Reaction(r.id)
    for k,v in r.metabolites.items():
        if not isinstance(v,float):
            continue
        if k.id.split('_mod_')[0] in cplxs.index:
            continue
        if 'generic' in k.id:
            continue
        k.id = k.id.replace(':','')
    

## ATPM

In [116]:
for r in clostrime.metabolites.atp_c.reactions:
    if r.lower_bound > 0:
        print(r.id) 
        print(r.reaction)

growth_atp_maintenance
46.666 atp_c + 46.666 h2o_c --> 46.666 adp_c + 46.666 h_c + 46.666 pi_c


## New reactions

In [10]:
# me_rxns = list(set([(r,r.id.split('_FWD')[0]) for r in clostrime.reactions.query('FWD|EX')]))

In [11]:
exclude_list = ['translation',
     'transcription',
     'formation',
     '_dilution',
     '_demand',
     'charging',
     '_generic',
     'to_biomass',
     '_transfer_',
     '_to_']
def check_id(i):
    for j in exclude_list:
        if j in i:
            return True
    return False

In [12]:
me_rxns = set()
for r in clostrime.reactions:
    if check_id(r.id):
        continue
    me_rxns.add((r,r.id.split('_FWD')[0].split('_REV')[0]))

In [13]:
c = 0
for r,i in me_rxns:
    if model.reactions.has_id(i):
        continue
    c += 1
    print('{},{},{},{}'.format(i,'','',r.reaction,''))

BTDD-RRx,,,1.38888888888889e-5*mu CPLX-539_mod_1:zn2 + actn__R_c + h_c + nadph_c --> btd__RR_c + nadp_c
dummy_reaction,,,1.38888888888889e-5*mu CPLX_dummy --> 
DM_succ(c),,,1.38888888888889e-5*mu CPLX_dummy + succ_c --> 
growth_atp_maintenance,,,46.666 atp_c + 46.666 h2o_c --> 46.666 adp_c + 46.666 h_c + 46.666 pi_c
EX_dad__5_e,,,dad__5_e --> 
D-LACt2,,,1.38888888888889e-5*mu GlcA_mono + h_c + lac__D_c --> h_e + lac__D_e
I4FE4SR1,,,NifU_dim_mod_2:2fe2s + fmnh2_c + 2.0 h_c --> NifU_dim_mod_1:4fe4s + fmn_c
TYRt2r,,,1.38888888888889e-5*mu CLJU_c24250_mono + h_c + tyr__L_c --> h_e + tyr__L_e
ICYSDS1,,,1.38888888888889e-5*mu CPLX_dummy + cys__L_c + generic_IscS --> ala__L_c + generic_IscS_mod_1:SH
charge_ef_tu,,,1.38888888888889e-5*mu Tsf_mono + generic_Tuf + gtp_c --> generic_Tuf_gtp
EX_thr__L_e,,,thr__L_e --> 
DM_inner_membrane_protein_capacity,,,inner_membrane_protein_capacity --> 
EX_lipoate_e,,,lipoate_e <=> 
EX_pnto__R_e,,,pnto__R_e --> 
EX_dpm_c,,,dpm_c <=> 
TYRt2r,,,1.38888888888889

## New gprs

In [14]:
def update_mod_convention(s):
    base = s.split('_mod_')[0]
    infomods = s.split('_mod_')[1:]
    modstring = ''
    for i in infomods:
        if ':' in i:
            coeff,mod = i.split(':')
        else:
            coeff = 1
            mod = i
        modstring += '_mod_{}({})'.format(mod,coeff)
    return base + modstring

In [15]:
d = {}
cplxnames = {}
for (r,i) in me_rxns:
    for k,v in r.metabolites.items():
        if isinstance(v,float):
            continue
        if 'dummy' in k.id:
            continue
        if i not in d:
            d[i] = set()
        d[i].add(update_mod_convention(k.id))
for k,v in d.items():
    for i in v:
        if k not in model.reactions:
            print(k,' not in model')
            continue
        cplxnames[i] = model.reactions.get_by_id(k).name
    d[k] = ' OR '.join(v)
df = pd.DataFrame.from_dict({'Complexes':d})
df.index.name = 'Reaction'
df

BTDD-RRx  not in model
D-LACt2  not in model
TYRt2r  not in model
charge_ef_tu  not in model


Unnamed: 0_level_0,Complexes
Reaction,Unnamed: 1_level_1
3HAD100,CPLX-43
3HAD120,CPLX-43
3HAD121,CPLX-43
3HAD140,CPLX-43
3HAD141,CPLX-43
...,...
XYLabc,CPLX-136
XYLt2,CPLX-204
ZN2t,ZupT_mono
ZNabc,CPLX-303 OR CPLX-304 OR CPLX-306 OR CPLX-305


In [16]:
df.to_csv('../building_data/enzyme_reaction_association.txt',sep='\t')

## New protein_complexes

In [17]:
complexes_df = pd.read_csv('../building_data/protein_corrections.txt',sep='\t',index_col=0)
iJL965_complexes = pd.read_csv('./iJL965_complexes.txt',sep='\t',index_col=0)
iJL965_complexes = iJL965_complexes[iJL965_complexes['genes'].notna()].fillna('')

In [18]:
for c,row in iJL965_complexes.iterrows():
    iJL965_complexes.loc[c]['source'] = 'iJL965-ME'        
    if c not in cplxnames:
        print(c, ' has no name')
        continue
    iJL965_complexes.loc[c]['name'] = cplxnames[c]


CPLX-15  has no name
CPLX-18  has no name
GluS_mono  has no name
CPLX-27  has no name
Hyp_CPLX  has no name
CPLX-28  has no name
Etf3_CPLX  has no name
CPLX-31  has no name
CPLX-32  has no name
CPLX-33  has no name
CPLX-39  has no name
CPLX-40  has no name
CPLX-46  has no name
CPLX-60  has no name
CPLX-61  has no name
CPLX-62  has no name
CPLX-65  has no name
CPLX-75  has no name
CPLX-82  has no name
CPLX-83  has no name
CPLX-84  has no name
CPLX-90  has no name
FolE_CPLX  has no name
CPLX-102  has no name
CPLX-106  has no name
CPLX-115  has no name
CPLX-116  has no name
CPLX-122  has no name
CPLX-125  has no name
CPLX-126  has no name
CPLX-127  has no name
CPLX-128  has no name
CPLX-129  has no name
CPLX-130  has no name
CPLX-131  has no name
CPLX-132  has no name
CPLX-134  has no name
CPLX-138  has no name
CPLX-149  has no name
CPLX-150  has no name
CPLX-159  has no name
CPLX-172  has no name
CPLX-182  has no name
CPLX-184  has no name
CPLX-185  has no name
CPLX-186  has no name
CPLX

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# ACP should not be named ACP_c as it would be mixed with the M-model ACP_c
iJL965_complexes = iJL965_complexes.rename(index={'ACP_c': 'ACP'})

In [19]:
iJL965_complexes.to_csv('./iJL965_complexes_with_names.txt',sep='\t')

## New protein_modification

In [20]:
iJL965_mod = pd.read_csv('./iJL965_protein_modification.txt',sep='\t',index_col=0,comment='#')

In [21]:
tmp = {}
for c,row in iJL965_mod.iterrows():
    new_c = update_mod_convention(c)
    tmp[new_c] = {
        'Core_enzyme' : row['Core_enzyme'],
        'Modifications' : row['Modifications'],
        'Source' : 'iJL965'
    }
df =pd.DataFrame.from_dict(tmp).T
df.index.name = 'Modified_enzyme'
df.to_csv('../building_data/protein_modification.txt',sep='\t')

## Rho independent

In [22]:
df = pd.read_csv('../building_data/TUs_from_biocyc.txt',sep='\t',index_col=0,comment='#')

In [23]:
iJL965_tus = pd.read_csv('iJL965_TUs.txt',sep='\t',index_col=0,comment='#')
iJL965_tus = iJL965_tus[iJL965_tus['Rho_independent'] == 't']

In [24]:
l = []
for start,row in iJL965_tus.iterrows():
    tu = df[df['start'].str.contains('([,]{1}|^){}([,]{1}|$)'.replace('{}',str(start+1)),regex=True)]
    if not tu.empty:
        l.append(tu.index.values[0].split('_from_')[0])



In [25]:
df = pd.DataFrame(columns=l).T
df.index.name = 'id'
df.to_csv('../building_data/rho_independent.txt')

## New subsystems

In [26]:
model = cobra.io.load_json_model('../inputs/_m_model.json')

In [27]:
def is_same_reaction(r1,r2):
    if r1.id == r2.id:
        return True
    if len(r1.metabolites) != len(r2.metabolites):
        return False
    for m1,c1 in r1.metabolites.items():
        if not r2._model.metabolites.has_id(m1.id):
            return False
        m2 = r2._model.metabolites.get_by_id(m1.id)
        if m2 not in r2.metabolites:
            return False
        c2 = r2.metabolites[m2]
        if m1 != m2:
            return False
    return True
def get_least_connected_met(r):
    d = {k.id:len(k.reactions) for k,_ in r.metabolites.items()}
    m_id = pd.DataFrame.from_dict({'num':d}).sort_values('num').iloc[0].name
    return r._model.metabolites.get_by_id(m_id)
def get_homolog_reaction(model,r):
    m = get_least_connected_met(r)
    if not model.metabolites.has_id(m.id):
        return None
    mm = model.metabolites.get_by_id(m.id)
    for rm in mm.reactions:
        if is_same_reaction(r,rm):
            return rm
    return None
def get_compartments(r):
    return set(m.id[-1] for m in r.metabolites)
        

In [28]:
d = {}

#### From E. coli

In [30]:
eco = cobra.io.load_json_model('../../ecoli/inputs/m_model.json')

In [31]:

for r in model.reactions:
    if 'biomass' in r.id:
        continue
    hr = get_homolog_reaction(eco,r)
    if hr is None:
        continue
    d[r.id] = {'subs':hr.subsystem}


#### From gene annotation

In [32]:
iJL965_annot = pd.read_csv('./iJL965_gene_annotation.txt',sep='\t',index_col=1,comment='#')
iJL965_annot.head()

Unnamed: 0_level_0,Category
CLJU,Unnamed: 1_level_1
CLJU_c39390,"Cofactors, Vitamins, Prosthetic Groups, Pigments"
CLJU_c22770,"Cofactors, Vitamins, Prosthetic Groups, Pigments"
CLJU_c10840,"Cofactors, Vitamins, Prosthetic Groups, Pigments"
CLJU_c24470,"Cofactors, Vitamins, Prosthetic Groups, Pigments"
CLJU_c26660,"Cofactors, Vitamins, Prosthetic Groups, Pigments"


In [33]:
for r in model.reactions:
    if r.id in d:
        continue
    subs = set(iJL965_annot.loc[[g.id for g in r.genes]]['Category'])
    if not subs:
        continue
    subs = [list(subs)[0]]
    d[r.id] = {'subs' : ' // '.join(subs)}

#### From reactions

In [34]:
for r in model.reactions.query('^EX_'):
    d[r.id] = 'Exchange'
for r in model.reactions.query('^DM_'):
    d[r.id] = 'Demand'

In [35]:
for r in model.reactions:
    if r.id in d:
        continue
    if len(get_compartments(r)) > 1:
        d[r.id] = 'Transport'

#### save

In [36]:
df = pd.DataFrame.from_dict(d).T

In [37]:
df.to_csv('inferred_iJL965_compartments.txt')

In [38]:
for r,row in df.iterrows():
    model.reactions.get_by_id(r).subsystem = row['subs']

In [39]:
model.metabolites.get_by_id('fdxo__4:2_c').id = 'fdxo__42_c'
model.metabolites.get_by_id('fdxr__4:2_c').id = 'fdxr__42_c'

In [40]:
cobra.io.save_json_model(model,'../inputs/m_model.json')

### Generics

In [41]:
tmp = {}
for r in clostrime.reactions.query('generic'):
    generic = [m for m in r.products if 'generic' in m.id]
    if not generic:
#         print('Skipped {}'.format(r.id))
        continue
    generic = generic[0]
    cplx = [update_mod_convention(c.id) for c in r.reactants if c.id.split('_mod_')[0] in iJL965_complexes.index]
    if generic.id not in tmp:
        tmp[generic.id] = []
    tmp[generic.id].append(cplx[0])
#     print(r.reaction)
    

In [42]:
for k,v in tmp.items():
    print('{}\t{}'.format(k,' OR '.join(v)))

generic_Tyr_RS_dim	Tyr_RS_dim OR TyrII_RS_dim
generic_Asn_RS_dim	Asn_RS_dim OR AsnII_RS_dim
generic_CysS_mono	CysS_mono_mod_zn2(1) OR CysSII_mono_mod_zn2(1)
generic_LplA	LplA_mono OR LplAII_mono
generic_GreA	GreA_mono OR GreAII_mono OR GreAIII_mono
generic_Cca	CLJU_c11280_mono_mod_mg2(1) OR CLJU_c01520_mono_mod_mg2(1)
generic_TrmD	TrmD_dim OR TrmDII_dim
generic_QueG	QueG_mono_mod_4fe4s(2)_mod_adocbl(1) OR QueGII_mono_mod_4fe4s(2)_mod_adocbl(1)
generic_TsaC	TsaC_mono OR TsaCII_mono
generic_IscS	IscS_dim_mod_pydx5p(2) OR IscSII_dim_mod_pydx5p(2) OR NifU_dim_mod_fe2(2) OR NifU_dim_mod_2fe1s(1)
generic_SufBC	SufBC_CPLX OR SufBCII_CPLX
generic_Tuf	Tuf_mono OR TufII_mono
generic_RF	PrfA_mono OR PrfB_mono
generic_Def	Def_mono_mod_fe2(1) OR DefII_mono_mod_fe2(1) OR DefIII_mono_mod_fe2(1) OR DefIV_mono_mod_fe2(1) OR DefV_mono_mod_fe2(1)
generic_InfC	InfC_mono OR InfCII_mono
generic_DnaJ	DnaJ_dim_mod_zn2(4) OR DnaJII_dim_mod_zn2(4)
generic_PrsA	PrsA_mono OR PrsAII_mono


### rna mod targets

In [158]:
targets = pd.read_csv('./iJL965_rnamodtargets.txt',sep='\t',index_col=0).fillna('0')

##### 23S

In [159]:
rrna_23s = ['RNA_CLJU_c00090', 'RNA_CLJU_c01010', 'RNA_CLJU_c01150', 'RNA_CLJU_c01300', 'RNA_CLJU_c06260', 'RNA_CLJU_c06550', 'RNA_CLJU_c14940', 'RNA_CLJU_c28250', 'RNA_CLJU_c41430']

In [160]:
d = [
    ('Y',955),
    ('Y',1911),
    ('m3Y',1915),
    ('Y',1917),
    ('Gm',2251),
    ('m2A',2503),
    ('Y',2504),
    ('Y',2580),
    ('Y',2605)
]

In [161]:
for g in rrna_23s:
    gid = g.split('RNA_')[1]
    tmp = {}
    for k,v in d:
        tmp[gid] = {
            'position':str(v),
            'modification':k
        }
        targets = pd.concat([targets,pd.DataFrame.from_dict(tmp).T],join='outer')

##### 16S

In [162]:
rrna_16s = ['RNA_CLJU_c00080', 'RNA_CLJU_c01000', 'RNA_CLJU_c01140', 'RNA_CLJU_c01290', 'RNA_CLJU_c06230', 'RNA_CLJU_c06520', 'RNA_CLJU_c14930', 'RNA_CLJU_c28260', 'RNA_CLJU_c41440']

In [163]:
d = [
    ('Y',516),
    ('m7G',527),
    ('m2G',966),
    ('m5C',967),
    ('Cm',1195),
    ('D',1211),
    ('m4Cm',1402),
    ('m5C',1409),
    ('m3U',1498),
    ('m62A',1518),
    ('m62A',1519)
]

In [164]:
for g in rrna_16s:
    gid = g.split('RNA_')[1]
    tmp = {}
    for k,v in d:
        tmp[gid] = {
            'position':str(v),
            'modification':k
        }
        targets = pd.concat([targets,pd.DataFrame.from_dict(tmp).T],join='outer')

In [165]:
targets.index.name = 'bnum'
targets.to_csv('../building_data/post_transcriptional_modification_of_RNA.txt',sep='\t')

### Get ME-model genes

In [51]:
genes = {}
for m in clostrime.metabolites.query('RNA_CLJU'):
    gid = m.id.split('RNA_')[1]
    if len(m.reactions) < 3:
        continue
    rxns = [r.id for r in m.reactions if 'dilution' not in r.id and 'transcription' not in r.id and 'translation' not in r.id]
    if rxns:
        genes[gid] = rxns
        continue
    pid = 'protein_' + gid
    if not clostrime.metabolites.has_id(pid):
        continue
    rxns = [r.id for r in clostrime.metabolites.get_by_id(pid).reactions if 'dilution' not in r.id and 'translation' not in r.id]
    if rxns:
        genes[gid] = rxns
#     genes.append(gid)

### Some genes in published ME-model have no function

In [98]:
g = 'CLJU_c41100'
for i in clostrime.metabolites.get_by_id('RNA_{}'.format(g)).reactions:
    print(i)
print()
for i in clostrime.metabolites.get_by_id('protein_{}'.format(g)).reactions:
    print(i)

translation_CLJU_c40080
transcription_TU_4358550_4359678
CLJU_c40080_dilution

translation_CLJU_c40080
transcription_TU_9415_14506
protein_CLJU_c40080_dilution


### Number of genes in published ME-model

In [56]:
len(genes)

943

In [57]:
df = pd.DataFrame(index=genes)
df.index.name = 'id'
df.to_csv('./iJL965_genes.txt')

In [58]:
l = [m for m in clostrime.metabolites.query('RNA_')]
for i in l:
    rs = [r for r in i.reactions if 'dilution' not in r.id and 'transcription' not in r.id]
    if not rs:
        continue
    print(i, rs)

RNA_CLJU_c00100 [<Reaction formation_ribosome at 0x7f1671ce88d0>]
RNA_CLJU_c00080 [<Reaction formation_ribosome at 0x7f1671ce88d0>]
RNA_CLJU_c00090 [<Reaction formation_ribosome at 0x7f1671ce88d0>]
RNA_Polymerase [<Reaction formation_RNA_Polymerase at 0x7f1671d1d198>]
RNA_CLJU_c00110 [<Reaction charging_tRNA_CLJU_c00110 at 0x7f1671706be0>]
RNA_CLJU_c00170 [<Reaction translation_CLJU_c00170 at 0x7f1671d00748>]
RNA_CLJU_c00180 [<Reaction charging_tRNA_CLJU_c00180 at 0x7f167171e438>]
RNA_CLJU_c00190 [<Reaction charging_tRNA_CLJU_c00190 at 0x7f1671706dd8>]
RNA_CLJU_c00200 [<Reaction charging_tRNA_CLJU_c00200 at 0x7f16716c04a8>]
RNA_CLJU_c00620 [<Reaction translation_CLJU_c00620 at 0x7f1671d00828>]
RNA_CLJU_c00660 [<Reaction translation_CLJU_c00660 at 0x7f1671b9df28>]
RNA_CLJU_c00670 [<Reaction translation_CLJU_c00670 at 0x7f1671bc1eb8>]
RNA_CLJU_c00680 [<Reaction translation_CLJU_c00680 at 0x7f1671bc1550>]
RNA_CLJU_c00710 [<Reaction translation_CLJU_c00710 at 0x7f1671c92be0>]
RNA_CLJU_c007

### All sigma factors are set to RpoD

In [18]:
for i in clostrime.reactions.query('transcription_'):
    if clostrime.metabolites.get_by_id("RpoD_mono") not in i.metabolites:
        print(i, i.reaction)

transcription_dummy 0.000330396475770925*mu + 0.000129185022026432 RNA_Polymerase + 37.0 atp_c + 16.0 ctp_c + 5.0 gtp_c + 23.0 utp_c --> RNA_dummy + 80.0 ppi_c + 25.749894 rna_biomass
transcription_dummy_membrane 0.000330396475770925*mu + 0.000129185022026432 RNA_Polymerase + 37.0 atp_c + 16.0 ctp_c + 5.0 gtp_c + 23.0 utp_c --> RNA_dummy_membrane + 80.0 ppi_c + 25.749894 rna_biomass
