In [2]:
import pandas as pd
import json

import matplotlib.pyplot as mpl
import seaborn as sns

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

import csv

## Read Files

enc_exp0 - found in "results/formatted/ALL", full network expansion of all seeds found within Enceladus plumes

In [3]:
#Full expansion
exp_path = "results/formatted/ALL/enc_exp0.json"

#Brite classification
brite_path = "brite.json"

In [4]:
#Read in expansion data (including numbers of compounds)
with open(exp_path) as f:
    datajson = json.load(f)
generations = pd.DataFrame(datajson["generations"])
generations = generations.transpose()
generations.index = generations.index.astype(int)
generations = generations.sort_index()

original_columns = generations.columns
for col in original_columns:
    generations["n_"+col] = generations[col].str.len() 

generations.head()

Unnamed: 0,compounds_cumulative,compounds_new,reactions_cumulative,reactions_new,targets_cumulative,targets_new,n_compounds_cumulative,n_compounds_new,n_reactions_cumulative,n_reactions_new,n_targets_cumulative,n_targets_new
1,"[C00001, C01326, C00132, C00067, C00014, C1150...","[C06547, C20783, C01548, C00014, C00011, C0006...","[R10092, R05539, R09139, R00067, R00132, R0235...","[R10535, R10079, R10092, R00131, R09139, R0538...",[],[],18,18,23,23,0,0
2,"[C00001, C00080, C00007, C00026, C01326, C0013...","[C00080, C00007, C00026, C00288, C05359, C0002...","[R10092, R00608, R05539, R00279, R09139, R0006...","[R00608, R00279, R03546, R09144, R07803, R0478...",[],[],41,23,59,36,0,0
3,"[C00001, C00080, C00007, C00026, C00048, C0132...","[C00048, C00025, C00704, C00022, C00222, C0021...","[R00782, R10092, R00608, R00258, R05539, R0027...","[R00782, R00258, R00475, R00008, R01874, R0537...",[C00025],[C00025],67,26,147,88,1,1
4,"[C00001, C00080, C00007, C00026, C00048, C0132...","[C00036, C00546, C00097, C00037, C00065, C0006...","[R00340, R00363, R00782, R10092, R10178, R0060...","[R00340, R00363, R10178, R02914, R09648, R0156...","[C00025, C00097, C00037, C00065, C00064, C00041]","[C00097, C00037, C00065, C00064, C00041]",146,79,288,141,6,5
5,"[C00001, C00080, C00007, C02107, C00026, C0382...","[C02107, C03826, C00383, C02091, C00108, C0095...","[R00340, R09186, R10908, R03106, R03551, R0369...","[R09186, R10908, R03106, R03551, R03694, R0696...","[C00025, C00097, C00037, C00065, C00064, C0004...","[C00049, C00188]",244,98,393,105,8,2


In [5]:
#Read in brite data
with open(brite_path) as b:
    britejson = json.load(b)

#list of each classification
#1: Lipds
#2: Nucleic Acids
#3: etc... (given by classes dictionary)
cpd_class_list = []
classes = {"Organic acids": 0, "Lipids": 1, "Carbohydrates": 2, "Nucleic acids": 3, "Peptides": 4, "Vitamins and Cofactors": 5, "Steroids": 6, "Hormones and transmitters": 7, "Antibiotics": 8}
for i in range(9):
    cpd_class_list.append([])

#Append each compound to its correct list (all organic acids go in cpd_class_list[0], etc...)
for classification in britejson["children"]:
    #print("\n" + classification["name"] + "\n")
    for subclass in classification["children"]:
        for subsubclass in subclass["children"]:
            for cpd in subsubclass["children"]:
                #only append compound KEGG name
                cpd_class_list[classes[classification["name"].encode('ascii', 'ignore')]].append(cpd["name"].encode('ascii', 'ignore')[:6])

Note - don't need to read in targets, there is a "targets_cumulative" column in generations which lists all targets generated at each step.

Goal for next step - get list of all compounds at last timestep

### Compound classification (brite)

Link compounds generated by expansion to classifiction (given by brite classifiction in KEGG)

In [6]:
#final_cpds: list of all compounds at final timestep
final_cpds = generations["compounds_cumulative"].iloc[-1]


In [7]:
total = 0
for i in range(len(cpd_class_list)):
    for cpd in cpd_class_list[i]:
        if cpd in final_cpds:
            total += 1
    for c, pos in classes.items():
        if pos == i:
            print(str(c) + ": " + str(total))
    total = 0

Organic acids: 34
Lipids: 44
Carbohydrates: 63
Nucleic acids: 49
Peptides: 61
Vitamins and Cofactors: 31
Steroids: 0
Hormones and transmitters: 16
Antibiotics: 13


### Target Compounds

Find the target compounds produced by random expansion

In [14]:
#define life-specifc reactions
freilich = "links/Freilich09.json"
with open(freilich) as fr:
   datajsonfr = json.load(fr)
    
lc = datajsonfr.keys()

#Find the intersection of the compounds generated and the compounds needed for life
targets = list(set(lc) & set(final_cpds))
print("Number of compounds generated in random expansion: " + str(len(targets)))
print("Number of life compounds: " + str(len(lc)))

Number of compounds generated in random expansion: 61
Number of life compounds: 65


In [29]:
#find all compounds not produced by random expansion
not_produced = []
for c in lc:
    if c not in targets:
        not_produced.append(c)
        
print(not_produced)
#print out compounds and names
with open("../KEGG/accessible_compounds.csv") as translation:
    names_df = pd.read_csv(translation)
    names_df.set_index("Compound", inplace=True)
    
print("Not produced: \n")
not_produced[2] = not_produced[3]
for c in not_produced:
    print(names_df.loc[c, "Name"])

[u'C15672', u'C00748', u'C05899', u'C05890']
Not produced: 

Heme O
Siroheme


KeyError: u'the label [C05890] is not in the [index]'