In [1]:
import json
import glob
import re

Import compounds and reactions

In [6]:
# Load every reaction file matched by the glob into `reactions`, keyed by the
# record's "entry_id". The first element of each parsed JSON document is taken
# as the record.
reactions = dict()
for fname in glob.glob("../kegg/2018-09-25/reaction/*"):
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # locale default, which can mis-decode or reject non-ASCII characters.
    with open(fname, encoding="utf-8") as f:
        reaction_json = json.load(f)[0]
    reactions[reaction_json["entry_id"]] = reaction_json

In [7]:
# Load every compound file matched by the glob into `compounds`, keyed by the
# record's "entry_id". The first element of each parsed JSON document is taken
# as the record.
compounds = dict()
for fname in glob.glob("../kegg/2018-09-25/compound/*"):
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # locale default, which can mis-decode or reject non-ASCII characters.
    with open(fname, encoding="utf-8") as f:
        compound_json = json.load(f)[0]
    compounds[compound_json["entry_id"]] = compound_json

Add an `"elements"` key to each entry of the `compounds` dict, holding the set of element symbols parsed from the entry's `formula`

In [8]:
# Attach to each compound record the set of symbols found in its formula:
# an uppercase letter optionally followed by one lowercase letter.
symbol_pattern = re.compile(r"([A-Z][a-z]?)")
for compound in compounds.values():
    compound["elements"] = set(symbol_pattern.findall(compound['formula']))

Identify reactions that spawn new elements

In [9]:
# Map each reaction id to the element symbols that appear on only one side
# of the reaction. Compound ids that are not present in `compounds` are
# skipped, so they contribute no elements to either side.
apparate_reactions = dict()
for reaction in reactions.values():
    side_elements = {"substrates": set(), "products": set()}

    for side, symbols in side_elements.items():
        # A reaction record may lack a side entirely; treat it as empty.
        for compound_id in reaction.get(side, ()):
            if compound_id in compounds:
                symbols.update(compounds[compound_id]['elements'])

    # Symmetric difference: symbols present on exactly one side.
    one_sided = side_elements["substrates"] ^ side_elements["products"]
    if one_sided:
        apparate_reactions[reaction["entry_id"]] = list(one_sided)
        print(reaction["entry_id"], one_sided)
        

R11305 {'P'}
R08431 {'O'}
R07547 {'R'}
R10685 {'S'}
R10390 {'S'}
R04146 {'C'}
R08535 {'O'}
R06317 {'O'}
R07443 {'C', 'O', 'H'}
R10152 {'S', 'Sn'}
R00914 {'S', 'P'}
R08427 {'O'}
R03142 {'C', 'N'}
R03841 {'H', 'P'}
R04886 {'C', 'N'}
R08519 {'P'}
R00083 {'C'}
R04613 {'R'}
R07341 {'P'}
R04644 {'R'}
R00312 {'C', 'Fe', 'O', 'N', 'H'}
R00168 {'P'}
R07838 {'S', 'O'}
R05106 {'R'}
R03656 {'R'}
R09467 {'C', 'O'}
R03986 {'C', 'N', 'P'}
R08299 {'O'}
R11470 {'S'}
R08578 {'R'}
R04808 {'S'}
R02876 {'C', 'H'}
R02460 {'C', 'O', 'H', 'N'}
R09851 {'Cl'}
R09144 {'Cl'}
R09847 {'R'}
R05188 {'R'}
R01348 {'R'}
R08885 {'N'}
R03660 {'R'}
R08779 {'R'}
R03836 {'O', 'R'}
R03135 {'C'}
R11671 {'R', 'N', 'P'}
R01734 {'O'}
R05777 {'P'}
R04977 {'C', 'Fe', 'O', 'H', 'N'}
R00172 {'P', 'Se'}
R02620 {'R'}
R08411 {'N'}
R00164 {'C', 'R', 'N', 'P'}
R08780 {'R'}
R06321 {'O', 'P'}
R01237 {'O', 'C'}
R00077 {'P'}
R04937 {'C'}
R08502 {'O', 'N'}
R06320 {'N'}
R08040 {'O', 'N'}
R00165 {'P'}
R03876 {'R'}
R03175 {'C', 'S', 'O', 'N', 'H'

In [16]:
# Preview the first ten flagged reaction ids (dict insertion order).
list(apparate_reactions)[:10]

['R11305',
 'R08431',
 'R07547',
 'R10685',
 'R10390',
 'R04146',
 'R08535',
 'R06317',
 'R07443',
 'R10152']

Write a `conserves_elements` flag to every reaction (`False` for reactions that don't conserve elements) and save each reaction to its own file

In [22]:
# Tag every reaction with a boolean "conserves_elements" flag (False when the
# reaction id is in `apparate_reactions`) and write each record to its own
# JSON file.
for r in reactions:
    reactions[r]["conserves_elements"] = r not in apparate_reactions

    ## Write to file
    fout = "../kegg/testdata/reaction/%s.json"%r
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the locale default encoding may not be able
    # to represent them and would raise UnicodeEncodeError.
    with open(fout, 'w', encoding="utf-8") as outfile:
        json.dump(reactions[r], outfile, indent = 2, ensure_ascii = False) #sort_keys = True,
               

#### Write new `reaction_edges_conserved_elements.json` that only includes reactions which conserve elements

Load `reaction_edges`

In [25]:
# Load the reaction edge table; later cells read its "products" and
# "substrates" entries.
fname = "../kegg/testdata/reaction_edges.json"
# Decode explicitly as UTF-8 rather than relying on the locale default.
with open(fname, encoding="utf-8") as f:
    reaction_edges = json.load(f)

Check size of `reaction_edges`

In [30]:
# Report, one per line: product-edge count, substrate-edge count, number of
# flagged reactions, and substrate-edge count minus flagged count.
n_products = len(reaction_edges["products"])
n_substrates = len(reaction_edges["substrates"])
n_flagged = len(apparate_reactions)
for count in (n_products, n_substrates, n_flagged, n_substrates - n_flagged):
    print(count)

10287
10287
353
9934


Delete reactions which don't conserve elements

In [31]:
# Remove every flagged reaction from both edge tables; ids that are not
# present are ignored (pop with a default).
for reaction_id in apparate_reactions:
    for side in ("products", "substrates"):
        reaction_edges[side].pop(reaction_id, None)

Ensure sizes match

In [33]:
# Print the remaining entry count of each edge table, products first.
for side in ("products", "substrates"):
    print(len(reaction_edges[side]))

9934
9934


Write the filtered `reaction_edges` to a new file

In [34]:
## Write to file
# Save the filtered edge table under a new name.
fout = "../kegg/testdata/reaction_edges_conserved_elements.json"
# ensure_ascii=False emits raw non-ASCII characters, so open the file as UTF-8
# explicitly instead of relying on the locale default encoding.
with open(fout, 'w', encoding="utf-8") as outfile:
    json.dump(reaction_edges, outfile, indent = 2, ensure_ascii = False) #sort_keys = True,