In [64]:
import json
import seaborn as sns
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import csv
import networkx as nx
import itertools

In [65]:
# IPython magic: selects matplotlib's "inline" backend for this notebook session.
%matplotlib inline

# 2019-08-28

#### Start loading files

In [66]:
# Load the master target list from the Freilich09 JSON file.
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's locale default.
with open("../links/Freilich09.json", encoding="utf-8") as f:
    master_targets = json.load(f)

In [67]:
# Load every "5w" expansion result: one parsed JSON document per file.
# glob.glob() yields paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make positional access into all_5w reproducible.
all_5w = []
for fname in sorted(glob.glob("formatted/5w/*.json")):
    # Explicit UTF-8 keeps the read independent of the locale default.
    with open(fname, encoding="utf-8") as f:
        all_5w.append(json.load(f))

In [68]:
# Load every "5nw" expansion result: one parsed JSON document per file.
# glob.glob() yields paths in arbitrary (filesystem-dependent) order; sorting
# them makes all_5nw[0], which the notebook uses as its worked example,
# refer to the same file on every run.
all_5nw = []
for fname in sorted(glob.glob("formatted/5nw/*.json")):
    # Explicit UTF-8 keeps the read independent of the locale default.
    with open(fname, encoding="utf-8") as f:
        all_5nw.append(json.load(f))

In [69]:
# Load every "10w" expansion result: one parsed JSON document per file.
# glob.glob() yields paths in arbitrary (filesystem-dependent) order; sorting
# them makes all_10w[0], which the notebook inspects as an example run,
# refer to the same file on every run.
all_10w = []
for fname in sorted(glob.glob("formatted/10w/*.json")):
    # Explicit UTF-8 keeps the read independent of the locale default.
    with open(fname, encoding="utf-8") as f:
        all_10w.append(json.load(f))

In [70]:
# Load every "10nw" expansion result: one parsed JSON document per file.
# glob.glob() yields paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make positional access into all_10nw reproducible.
all_10nw = []
for fname in sorted(glob.glob("formatted/10nw/*.json")):
    # Explicit UTF-8 keeps the read independent of the locale default.
    with open(fname, encoding="utf-8") as f:
        all_10nw.append(json.load(f))

In [71]:
# Map each expansion-set label to its list of loaded runs (label order preserved).
all_expansions = dict(zip(
    ("5w", "5nw", "10w", "10nw"),
    (all_5w, all_5nw, all_10w, all_10nw),
))

Look at disconnect between master targets and all scope targets (example)

In [72]:
# Sanity check: the first 5nw run's scope_targets contains no duplicate entries.
assert (
    len(set(all_5nw[0]['stats']['scope_targets']))
    == len(all_5nw[0]['stats']['scope_targets'])
)

In [73]:
# Master targets that do not appear in the first 5nw run's scope_targets.
missing_targets = set(master_targets).difference(
    all_5nw[0]['stats']['scope_targets']
)

I should check if it's even possible to make these compounds with the set that John used.

In [74]:
# Load the reaction edge table that is scanned below for the missing targets.
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's locale default.
with open("../links/reaction_edges.json", encoding="utf-8") as f:
    reaction_edges = json.load(f)

In [75]:
# Print every (top-level key, reaction) entry whose compound collection
# contains at least one of the missing targets; printing nothing means no
# entry in reaction_edges mentions them.
for edge_key, reactions in reaction_edges.items():
    for reaction_id in reactions:
        members = set(reactions[reaction_id])
        if not missing_targets.isdisjoint(members):
            print(edge_key, reaction_id, members)

In [76]:
# Display the master targets absent from the first 5nw run's scope_targets.
missing_targets

{'C05890', 'C05899'}

These cannot be generated from the reaction_edges used! This is why, even though they are in the master Freilich set, they are not in the individual expansions' scope_targets.

Let's check a little more robustly to see what the missing target distribution is.

In [77]:
# For each expansion set, collect the union over all runs of the master
# targets that are absent from that run's scope_targets.
all_missing = {
    label: set().union(*(
        set(master_targets).difference(run['stats']['scope_targets'])
        for run in runs
    ))
    for label, runs in all_expansions.items()
}

In [78]:
# Display, per expansion set, the master targets missing from at least one run's scope_targets.
all_missing

{'5w': {'C05890', 'C05899'},
 '5nw': {'C05890', 'C05899'},
 '10w': {'C05890', 'C05899'},
 '10nw': {'C05890', 'C05899'}}

Yep, so only those two compounds, which don't even exist in the KEGG database anymore, are missing.

#### What about the difference between the scope targets and what actually gets produced for each randomization?

Example of how to calculate for a single run:

In [79]:
# Highest generation number of the first 5nw run; the keys are strings,
# so they are converted to int before taking the maximum.
max_gen = max(map(int, all_5nw[0]['generations']))

In [80]:
# Scope targets of the first 5nw run that are not in targets_cumulative
# at its highest-numbered generation.
set(all_5nw[0]['stats']['scope_targets']).difference(
    all_5nw[0]['generations'][str(max_gen)]['targets_cumulative']
)

{'C00002',
 'C00003',
 'C00004',
 'C00005',
 'C00006',
 'C00008',
 'C00015',
 'C00016',
 'C00020',
 'C00024',
 'C00025',
 'C00035',
 'C00037',
 'C00041',
 'C00043',
 'C00044',
 'C00047',
 'C00049',
 'C00054',
 'C00055',
 'C00062',
 'C00063',
 'C00064',
 'C00065',
 'C00073',
 'C00075',
 'C00078',
 'C00079',
 'C00082',
 'C00097',
 'C00105',
 'C00112',
 'C00116',
 'C00123',
 'C00131',
 'C00135',
 'C00144',
 'C00148',
 'C00152',
 'C00183',
 'C00188',
 'C00234',
 'C00239',
 'C00249',
 'C00255',
 'C00286',
 'C00350',
 'C00360',
 'C00362',
 'C00364',
 'C00399',
 'C00407',
 'C00458',
 'C00459',
 'C00641',
 'C00748',
 'C01050',
 'C05764',
 'C05894',
 'C05980',
 'C06040',
 'C15672',
 'C16221'}

Let's get for all runs:

In [81]:
# For each expansion set, build one list per run holding the scope targets
# that are absent from targets_cumulative at the run's highest generation.
all_unreached = {}
for label, runs in all_expansions.items():
    unreached_per_run = all_unreached.setdefault(label, [])
    for run in runs:
        generations = run['generations']
        # Generation keys are strings; compare them numerically.
        max_gen = max(map(int, generations))
        reached = generations[str(max_gen)]['targets_cumulative']
        unreached_per_run.append(
            list(set(run['stats']['scope_targets']).difference(reached))
        )

In [82]:
from collections import Counter

In [83]:
# Per expansion set, print how many distinct targets were left unreached
# in at least one run.
for label, unreached_runs in all_unreached.items():
    distinct_unreached = set(itertools.chain.from_iterable(unreached_runs))
    print(label, len(distinct_unreached))

5w 63
5nw 63
10w 2
10nw 2


In [84]:
# Per expansion set, tally how many runs ended with each number of
# unreached targets.
for label, unreached_runs in all_unreached.items():
    print(label, Counter(map(len, unreached_runs)))

5w Counter({2: 54, 63: 46})
5nw Counter({63: 66, 2: 34})
10w Counter({2: 100})
10nw Counter({2: 100})


All expansions miss Heme O and Siroheme. Weird that the expansions always have one of two outcomes. Makes me slightly suspicious?

In [88]:
# For each unreached target, count the '5w' runs that failed to produce it.
Counter(itertools.chain.from_iterable(all_unreached['5w']))

Counter({'C00748': 100,
         'C15672': 100,
         'C00123': 46,
         'C00064': 46,
         'C00082': 46,
         'C01050': 46,
         'C00002': 46,
         'C00003': 46,
         'C00073': 46,
         'C00047': 46,
         'C00078': 46,
         'C00005': 46,
         'C00350': 46,
         'C00399': 46,
         'C00015': 46,
         'C00407': 46,
         'C00062': 46,
         'C00148': 46,
         'C00362': 46,
         'C00079': 46,
         'C00360': 46,
         'C00286': 46,
         'C00112': 46,
         'C00041': 46,
         'C00055': 46,
         'C00037': 46,
         'C00144': 46,
         'C00008': 46,
         'C00183': 46,
         'C00116': 46,
         'C00249': 46,
         'C00043': 46,
         'C00234': 46,
         'C00131': 46,
         'C00063': 46,
         'C00459': 46,
         'C00065': 46,
         'C00044': 46,
         'C00239': 46,
         'C00020': 46,
         'C00054': 46,
         'C00035': 46,
         'C00458': 46,
         

In [85]:
# For each unreached target, count the '5nw' runs that failed to produce it.
Counter(itertools.chain.from_iterable(all_unreached['5nw']))

Counter({'C00123': 66,
         'C00064': 66,
         'C00082': 66,
         'C01050': 66,
         'C00002': 66,
         'C00003': 66,
         'C00073': 66,
         'C00047': 66,
         'C00078': 66,
         'C00005': 66,
         'C00350': 66,
         'C00399': 66,
         'C00015': 66,
         'C00407': 66,
         'C00062': 66,
         'C00148': 66,
         'C00362': 66,
         'C00079': 66,
         'C00360': 66,
         'C00286': 66,
         'C00112': 66,
         'C00041': 66,
         'C00055': 66,
         'C00037': 66,
         'C00144': 66,
         'C00008': 66,
         'C00183': 66,
         'C00116': 66,
         'C00249': 66,
         'C00043': 66,
         'C00234': 66,
         'C15672': 100,
         'C00131': 66,
         'C00063': 66,
         'C00459': 66,
         'C00065': 66,
         'C00044': 66,
         'C00239': 66,
         'C00020': 66,
         'C00054': 66,
         'C00035': 66,
         'C00458': 66,
         'C00025': 66,
         '

In [86]:
# For each unreached target, count the '10w' runs that failed to produce it.
Counter(itertools.chain.from_iterable(all_unreached['10w']))

Counter({'C00748': 100, 'C15672': 100})

In [87]:
# For each unreached target, count the '10nw' runs that failed to produce it.
Counter(itertools.chain.from_iterable(all_unreached['10nw']))

Counter({'C00748': 100, 'C15672': 100})

#### Look at number of unique seed sets

In [92]:
# For each expansion set, record every run's scope_seeds as a frozenset so
# that seed sets are hashable and can be de-duplicated.
all_seeds = {
    label: [frozenset(run['stats']['scope_seeds']) for run in runs]
    for label, runs in all_expansions.items()
}

In [95]:
# Number of distinct seed sets among the '5w' runs (identical frozensets collapse in the set).
len(set(all_seeds['5w']))

37

In [96]:
# Total number of '5w' seed sets (one per run), for comparison with the distinct count.
len(all_seeds['5w'])

100

So this means John allows the same seed set to be drawn more than once. I think that's good...

In [102]:
# Display the scope_seeds entry recorded in the stats of the first loaded 10w run.
all_10w[0]['stats']["scope_seeds"]

['C00001',
 'C01326',
 'C00014',
 'C00011',
 'C00697',
 'C00282',
 'C06547',
 'C00237',
 'C01438',
 'C01548']

In [108]:
# Display the generation keys of the first loaded 10w run.
# The keys are strings and are shown in stored order, not sorted numerically.
all_10w[0]['generations'].keys()

dict_keys(['2', '11', '39', '25', '42', '29', '8', '20', '14', '31', '33', '18', '26', '35', '17', '44', '4', '37', '45', '13', '30', '1', '32', '40', '7', '9', '43', '34', '3', '38', '36', '12', '16', '21', '10', '19', '22', '6', '24', '28', '5', '23', '27', '41', '15'])

In [107]:
# Inspect the record stored under generation '1' of the first loaded 10w run.
all_10w[0]['generations']['1']

{'targets_new': [],
 'targets_cumulative': [],
 'compounds_cumulative': ['C00001',
  'C01326',
  'C00014',
  'C00011',
  'C00697',
  'C00282',
  'C06547',
  'C00237',
  'C01438',
  'C01548'],
 'reactions_cumulative': ['R10092',
  'R05539',
  'R09139',
  'R00067',
  'R00132',
  'R05380',
  'R09142',
  'R07316',
  'R10079',
  'R00005',
  'R09158',
  'R00131',
  'R00153',
  'R09094',
  'R09784',
  'R00152',
  'R01408'],
 'compounds_new': ['C06547',
  'C00282',
  'C01326',
  'C00014',
  'C00237',
  'C00011',
  'C01438',
  'C01548',
  'C00697',
  'C00001'],
 'reactions_new': ['R09158',
  'R00005',
  'R07316',
  'R09142',
  'R05539',
  'R10079',
  'R00067',
  'R09784',
  'R10092',
  'R00131',
  'R09139',
  'R00153',
  'R00132',
  'R09094',
  'R00152',
  'R05380',
  'R01408']}

In [101]:
# Display the full scope_compounds list of the first loaded 10w run.
# NOTE(review): this prints the entire list; consider len(...) or a slice
# if only a summary is needed.
all_10w[0]['stats']["scope_compounds"]

['C01832',
 'C00016',
 'C00422',
 'C00001',
 'C17224',
 'C00024',
 'C15778',
 'C00005',
 'C00080',
 'C00007',
 'C20518',
 'C00028',
 'C02107',
 'C12176',
 'C00506',
 'C00026',
 'C01200',
 'C00009',
 'C00121',
 'C05966',
 'C16504',
 'C05432',
 'C00030',
 'C03826',
 'C05932',
 'C00003',
 'C06098',
 'C10138',
 'C00383',
 'C02091',
 'C00048',
 'C00982',
 'C12366',
 'C03944',
 'C00999',
 'C21529',
 'C11547',
 'C03798',
 'C20864',
 'C00035',
 'C05730',
 'C19938',
 'C18261',
 'C05694',
 'C00509',
 'C19943',
 'C00002',
 'C04071',
 'C01847',
 'C00019',
 'C00108',
 'C00029',
 'C06661',
 'C00138',
 'C15881',
 'C17324',
 'C03752',
 'C04261',
 'C07272',
 'C15485',
 'C03283',
 'C01326',
 'C00957',
 'C03049',
 'C01424',
 'C00132',
 'C01330',
 'C04488',
 'C03576',
 'C00448',
 'C01282',
 'C02798',
 'C00025',
 'C15935',
 'C00187',
 'C00004',
 'C00341',
 'C21643',
 'C10193',
 'C04530',
 'C21569',
 'C10502',
 'C15525',
 'C00008',
 'C16567',
 'C00006',
 'C18357',
 'C00340',
 'C00704',
 'C00473',
 'C01885',