In [1]:
import os

# Move the kernel's working directory one level up from wherever it currently is
# (presumably so the project-local `services` / `preprocessing` imports used in
# later cells resolve -- confirm against the repository layout).
# NOTE: the target is relative to the *current* directory, so running this cell
# twice in the same kernel moves up two levels.
parent_dir = os.path.join(os.getcwd(), '..')
os.chdir(parent_dir)

In [2]:
import numpy as np
import pandas as pd

# Notebook-wide display settings:
#  - never truncate columns or rows when a frame is rendered,
#  - render floats with two significant digits (e.g. 7.5e-20, 0.0023).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = '{:.2g}'.format

In [3]:
from services import DataReader

# Read the stored biomass-maximization FBA solution (non-gzipped file).
reader = DataReader()
X_biomass, y = reader.read_analyze_solution('biomass_maximization_fba_results', gz=False)

# Tag every key of every sample with a '_bio' suffix so these features can be
# told apart from features coming from other analyses.
X_biomass = [
    dict((name + '_bio', value) for name, value in sample.items())
    for sample in X_biomass
]

In [4]:
from preprocessing import DynamicPreprocessing

# Preprocessing steps applied to the biomass samples, in the order given.
biomass_steps = ['pathway-scoring', 'transport-elimination']
pre = DynamicPreprocessing(biomass_steps)
X_biomass_pathways = pre.fit_transform(X_biomass, y)

In [6]:
from services import feature_importance_anova

# Rank the biomass pathway features against the labels `y`.
# The returned frame is indexed by feature label and, per the rendered output of
# this cell, has the columns `bc`, `h`, `F` and `pval`.
# NOTE(review): `bc` / `h` look like per-class summary values and `F` / `pval`
# like the ANOVA statistic and its p-value -- confirm in `services`.
df_biomass_pathways = feature_importance_anova(X_biomass_pathways, y)
# Bare last expression: display the whole table (row/column limits are disabled
# by the display options set earlier in the notebook).
df_biomass_pathways

labels,bc,h,F,pval
Methionine and cysteine metabolism_bio,95.0,32.0,110.0,7.5e-20
CoA catabolism_bio,230.0,57.0,100.0,5.7e-18
Taurine and hypotaurine metabolism_bio,390.0,110.0,97.0,1.1e-17
Biotin metabolism_bio,380.0,170.0,83.0,1.1e-15
Glycerophospholipid metabolism_bio,-26.0,-1.4,61.0,3.1e-12
Nucleotide interconversion_bio,19.0,6.1,39.0,2.1e-08
Arginine and Proline Metabolism_bio,140.0,89.0,37.0,4.2e-08
Butanoate metabolism_bio,-320.0,-230.0,37.0,4.3e-08
Purine synthesis_bio,12.0,120.0,34.0,1.6e-07
Glutamate metabolism_bio,-170.0,-100.0,31.0,6.3e-07


In [18]:
# Load the disease-analysis solution; note this rebinds `y` (and binds `X`)
# for every cell that runs after this one.
X, y = DataReader().read_analyze_solution('bc_averaging_disease_analysis#k=0')

# Same pathway-level pipeline as for the biomass data, with a leading
# 'flux-diff' step.
metabolitics_steps = ['flux-diff', 'pathway-scoring', 'transport-elimination']
pre = DynamicPreprocessing(metabolitics_steps)
X_pathways = pre.fit_transform(X, y)

# Per-pathway importance table for the disease-analysis data.
df_pathways = feature_importance_anova(X_pathways, y)

In [22]:
# Names of the biomass pathways with p-value below 0.05.
# `p[:-4]` drops the 4-character '_bio' suffix that was appended to every
# feature name when the biomass data was loaded, so the list shows plain
# pathway names.
[p[:-4] for p in df_biomass_pathways[df_biomass_pathways.pval < 0.05].index]

['Methionine and cysteine metabolism',
 'CoA catabolism',
 'Taurine and hypotaurine metabolism',
 'Biotin metabolism',
 'Glycerophospholipid metabolism',
 'Nucleotide interconversion',
 'Arginine and Proline Metabolism',
 'Butanoate metabolism',
 'Purine synthesis',
 'Glutamate metabolism',
 'Sphingolipid metabolism',
 'Tyrosine metabolism',
 'ROS detoxification',
 'Pentose phosphate pathway',
 'Hyaluronan metabolism',
 'Aminosugar metabolism',
 'Glycolysis/gluconeogenesis',
 'Eicosanoid metabolism',
 'Purine catabolism',
 'Fatty acid synthesis',
 'Pyruvate metabolism',
 'Vitamin C metabolism',
 'Glutathione metabolism',
 'Phosphatidylinositol phosphate metabolism',
 'Alanine and aspartate metabolism',
 'CoA synthesis',
 'Galactose metabolism',
 'Glycine, serine, alanine and threonine metabolism',
 'Vitamin D metabolism',
 'Cholesterol metabolism',
 'Histidine metabolism']

In [25]:
# Pathways with p-value < 0.05 in each analysis. The last four characters of
# every index label are dropped so both sets hold comparable names: the biomass
# labels end in '_bio'; the metabolitics labels are assumed to carry a
# 4-character suffix as well -- confirm.
significant_metabolitics = df_pathways[df_pathways.pval < 0.05]
m_set = {label[:-4] for label in significant_metabolitics.index}
significant_biomass = df_biomass_pathways[df_biomass_pathways.pval < 0.05]
bio_set = {label[:-4] for label in significant_biomass.index}

# Report which pathways are exclusive to either analysis and how many are shared.
only_metabolitics = m_set - bio_set
print('Only in metabolitics: %d' % len(only_metabolitics))
print(only_metabolitics)

only_biomass = bio_set - m_set
print('Only in biomass: %d' % len(only_biomass))
print(only_biomass)

shared = bio_set & m_set
print('Both: %d' % len(shared))

Only in metabolitics: 8
{'Bile acid synthesis', 'Fatty acid oxidation', 'Inositol phosphate metabolism', 'Tetrahydrobiopterin metabolism', 'Tryptophan metabolism', 'Squalene and cholesterol synthesis', 'Glyoxylate and dicarboxylate metabolism', 'Urea cycle'}
Only in biomass: 7
{'Glutathione metabolism', 'CoA synthesis', 'Biotin metabolism', 'Cholesterol metabolism', 'Vitamin D metabolism', 'Fatty acid synthesis', 'Vitamin C metabolism'}
Both: 24


## Fisher Exact Test

In [6]:
from services import DataReader

# Load the disease-analysis solution used by the Fisher exact test cells below.
reader = DataReader()
X, y = reader.read_analyze_solution('bc_averaging_disease_analysis#k=0')

In [7]:
from services import fisher_exact_test_for_pathway
from preprocessing import DynamicPreprocessing

# Apply only the 'flux-diff' step; the result feeds the per-pathway Fisher test.
flux_diff = DynamicPreprocessing(['flux-diff'])
X_dif = flux_diff.fit_transform(X, y)

In [8]:
# Run the Fisher exact test per pathway and keep, for every pathway with a
# non-empty name, the second element of its result (presumably the p-value --
# confirm against `fisher_exact_test_for_pathway`).
fisher_results = fisher_exact_test_for_pathway(X_dif, y)
pathway_values = {
    pathway: result[1]
    for pathway, result in fisher_results.items()
    if pathway
}

# Display in ascending order.
pd.Series(pathway_values).sort_values()

Fatty acid oxidation                                2.9e-36
Nucleotide interconversion                          5.3e-15
Fatty acid synthesis                                1.2e-09
Pyrimidine catabolism                               2.4e-05
Lysine metabolism                                   7.8e-05
Bile acid synthesis                                 0.00037
Glycine, serine, alanine and threonine metabolism   0.00041
Sphingolipid metabolism                             0.00069
Oxidative phosphorylation                           0.00071
Heme synthesis                                      0.00071
Arginine and Proline Metabolism                      0.0023
Valine, leucine, and isoleucine metabolism           0.0024
Alanine and aspartate metabolism                     0.0024
Pyruvate metabolism                                  0.0046
Triacylglycerol synthesis                            0.0052
Urea cycle                                           0.0063
Eicosanoid metabolism                   

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GroupKFold

from preprocessing import PathwayReactionEnrichment

# Classification pipeline evaluated below:
#   flux-diff preprocessing -> pathway reaction enrichment -> dict vectorization
#   -> PCA (all components) -> logistic regression.
# C=0.3e-6 is the inverse regularization strength, i.e. a very strongly
# regularized classifier.
pipe = Pipeline([
    ('df', DynamicPreprocessing(['flux-diff'])),
    ('enrichment', PathwayReactionEnrichment()),
    ('vect', DictVectorizer(sparse=False)),
    ('pca', PCA()),
    ('clf', LogisticRegression(C=0.3e-6, random_state=43))
])

# 10-fold stratified CV without shuffling. `random_state` is only meaningful
# together with `shuffle=True`; passing it on its own was silently ignored by
# older scikit-learn and raises ValueError in current releases, so it is left
# out here. The folds are the same unshuffled folds as before.
kf = StratifiedKFold(n_splits=10)

# Per-fold micro-averaged F1, followed by mean and standard deviation over folds.
scores = cross_val_score(pipe, X, y, cv=kf, scoring='f1_micro')
print('kfold test: %s' % scores)
print('mean: %s' % scores.mean().round(3))
print('std: %s' % scores.std().round(3))

kfold test: [ 0.77272727  0.72727273  0.77272727  0.77272727  0.72727273  0.72727273
  0.85714286  0.85714286  0.85        0.9       ]
mean: 0.796
std: 0.061
