In [1]:
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings: never truncate columns or rows when
# rendering a frame, and print floats with two significant digits.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = '{:.2g}'.format

In [2]:
from sklearn_utils.utils import SkUtilsIO

# Load the dataset used by every later cell: X is fed to the pipelines and y
# holds the per-sample labels.
# NOTE(review): gz=True presumably means the JSON file is gzip-compressed —
# confirm against SkUtilsIO.
X, y = SkUtilsIO('../outputs/bc_sampling_hist.json', gz=True).from_json()

In [3]:
from sklearn.pipeline import Pipeline
from metabolitics.preprocessing import MetaboliticsPipeline
from metabolitics_sampling import SamplingDiffTransformer


# Two-stage pipeline: the 'sampling-diff' transformer runs first, then the
# 'pathway-score' stage, which here is a MetaboliticsPipeline made of the
# 'feature-selection' and 'pathway_transformer' steps.
pipe_f = Pipeline(
    steps=[
        ('sampling-diff', SamplingDiffTransformer()),
        (
            'pathway-score',
            MetaboliticsPipeline(['feature-selection', 'pathway_transformer']),
        ),
    ]
)

# Fit on the full dataset and keep the transformed records for the ANOVA below.
X_t_f = pipe_f.fit_transform(X, y)

In [4]:
import pandas as pd

from statsmodels.sandbox.stats.multicomp import multipletests
from sklearn.feature_selection import VarianceThreshold, f_classif

def variance_threshold_on_df(df: pd.DataFrame, threshold=0):
    '''
    Drop the low-variance columns of a DataFrame.
    :df: DataFrame whose columns are the candidate features
    :threshold: a column is kept only if its variance is strictly above this value
    :return: the columns of df that pass the threshold, in their original order
    '''
    # fit() returns the estimator itself, so construction and fitting chain.
    selector = VarianceThreshold(threshold).fit(df.values)
    keep_mask = selector.variances_ > threshold
    return df.iloc[:, keep_mask]


def feature_importance_anova(X,
                             y,
                             threshold=0.001,
                             correcting_multiple_hypotesis=True,
                             method='fdr_bh',
                             alpha=0.1,
                             sort_by='pval'):
    '''
    Rank the features of a dataset by one-way ANOVA significance, optionally
    correcting the p-values for multiple hypothesis testing.
    :X: List of dicts with feature names as keys and feature values as values
    :y: Labels, one per record of X (matched to the records by position)
    :threshold: Low-variance threshold; features whose variance is not above it are dropped
    :correcting_multiple_hypotesis: if True, report corrected p-values instead of the raw ones
    :method: correction method passed to multipletests
    :alpha: alpha passed to multipletests
    :sort_by: column to sort the output by, 'pval' or 'F'
    :return: DataFrame indexed by feature name with the per-label mean of each
             feature plus its 'F' and 'pval', most significant feature first
             (ascending for 'pval', descending for 'F')
    '''
    features = variance_threshold_on_df(
        pd.DataFrame.from_records(X), threshold=threshold)

    F, pvals = f_classif(features.values, y)

    if correcting_multiple_hypotesis:
        _, pvals, _, _ = multipletests(pvals, alpha=alpha, method=method)

    # Group by an external Series instead of writing a 'labels' column into the
    # frame: the frame is a slice of another frame (assigning to it triggers
    # SettingWithCopyWarning) and a real feature named 'labels' would be
    # overwritten. Naming the Series keeps 'labels' as the columns' name.
    labels = pd.Series(list(y), index=features.index, name='labels')
    df_mean = features.groupby(labels).mean().T

    # Rows of df_mean follow the column order of `features`, which is also the
    # order of the F / p-value arrays, so positional assignment lines up.
    df_mean['F'] = F
    df_mean['pval'] = pvals

    # A larger F means more significant, a smaller p-value means more
    # significant; pick the direction so the best features always come first.
    return df_mean.sort_values(sort_by, ascending=(sort_by != 'F'))

In [5]:
# ANOVA ranking of the features in X_t_f using the function's defaults
# (variance threshold 0.001, 'fdr_bh' correction, sorted by 'pval').
df_f = feature_importance_anova(X_t_f,y)

In [6]:
# Display the ranking table: per-label means, F statistic and p-value.
df_f

labels,bc,healthy,F,pval
Alanine and aspartate metabolism,3.7,8.9e-16,210.0,1.9e-32
Nucleotide interconversion,83.0,2.5e-14,150.0,1.2999999999999998e-24
Taurine and hypotaurine metabolism,8.9,4.1e-15,130.0,5e-23
CoA synthesis,10.0,5.6e-15,93.0,1.5e-17
Eicosanoid metabolism,-0.94,-7.6e-17,85.0,1.7e-16
"Transport, lysosomal",11.0,4.3e-15,78.0,2.4e-15
Biotin metabolism,1.5,-1.5e-16,66.0,1.4e-13
Arginine and Proline Metabolism,49.0,3.3e-14,66.0,1.4e-13
Butanoate metabolism,-63.0,2.2e-13,59.0,2.3e-12
"Transport, nuclear",17.0,2.7e-14,55.0,7.7e-12


In [25]:
%matplotlib inline

from ipywidgets import interact

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.cluster.hierarchy import linkage

# Apply the seaborn default theme; color_codes=True remaps matplotlib's
# shorthand colour codes (the 'b' / 'r' used for the column colours below)
# to the seaborn palette.
sns.set(color_codes=True)

@interact(
    top_n=(5, df_f.shape[0], 1),
    metric=['correlation', 'euclidean'],
    method=['single', 'complete', 'average'])
def visualize(top_n=15, metric='correlation', method='complete'):
    '''
    Draw a clustered heatmap of the top-ranked features across all samples.
    :top_n: number of leading rows of df_f (the ANOVA ranking) to plot
    :metric: distance metric forwarded to seaborn's clustermap
    :method: linkage method forwarded to seaborn's clustermap
    '''
    # Features become rows and samples become columns, labelled by class.
    top_features = df_f.index[:top_n]
    heatmap_data = pd.DataFrame.from_records(X_t_f)[top_features].T
    heatmap_data.columns = y

    # One colour per sample (column): 'b' for healthy, 'r' for everything else.
    sample_colors = ['b' if label == 'healthy' else 'r' for label in y]

    # z_score=0 standardises each row, so features on different scales are
    # comparable in the heatmap.
    sns.clustermap(
        heatmap_data,
        figsize=(22, 22),
        z_score=0,
        metric=metric,
        method=method,
        col_colors=sample_colors)

In [8]:
# Same two-stage layout as a sampling-diff -> pathway-score pipeline, but the
# 'pathway-score' stage runs only the 'pathway_transformer' step (there is no
# 'feature-selection' step here).
pipe = Pipeline(
    steps=[
        ('sampling-diff', SamplingDiffTransformer()),
        ('pathway-score', MetaboliticsPipeline(['pathway_transformer'])),
    ]
)

# Fit on the full dataset and keep the transformed records for the ANOVA below.
X_t = pipe.fit_transform(X, y)

In [9]:
# ANOVA ranking of the features in X_t using the function's defaults
# (variance threshold 0.001, 'fdr_bh' correction, sorted by 'pval').
df = feature_importance_anova(X_t, y)

In [10]:
# Display the ranking table: per-label means, F statistic and p-value.
df

labels,bc,healthy,F,pval
Arginine and Proline Metabolism,11.0,1.2e-14,100.0,4.5e-18
Butanoate metabolism,-23.0,6.2e-14,53.0,3.2e-10
Vitamin C metabolism,-5.4,1.2e-14,49.0,1.2e-09
Methionine and cysteine metabolism,3.1,7.7e-15,37.0,1.6e-07
"Transport, golgi apparatus",-1.0,-1.1e-15,34.0,3.5e-07
Sphingolipid metabolism,-1.1,2e-15,33.0,4.4e-07
Aminosugar metabolism,-1.5,-1.9e-15,29.0,2.3e-06
Glycerophospholipid metabolism,-2.3,-3.8e-15,29.0,2.3e-06
Histidine metabolism,-3.2,1.7e-14,29.0,2.3e-06
Glutamate metabolism,-12.0,-3.2e-14,24.0,1.8e-05
