In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import CubicSpline
import warnings
warnings.filterwarnings("ignore")
import arviz as az
%run -i '../../../miscellaneous/utils.py'

# Read data

In [5]:
# 16S abundance table; the first spreadsheet column is used as the row index.
df_bac = pd.read_excel(
    io='./16S_relative_abundance.xlsx',
    index_col=0,
)
# Sample metadata; the first CSV column is used as the row index.
df_meta = pd.read_csv(
    filepath_or_buffer='metadata.csv',
    index_col=0,
)

# Generate stan program files

In [6]:
# Call the helper loaded from utils.py (via %run in the import cell) and keep
# its return value. Later cells iterate over `bacterial_taxa` and use each
# element as a taxon name when building posterior variable names
# (alpha__<taxon>, epsilon__<taxon>, beta__<taxon1>_<taxon2>).
# NOTE(review): presumably the returned taxa are the topN taxa selected by the
# helper, and the Stan files are written under `stan_path` — confirm in utils.py.
bacterial_taxa = generate_stan_files_for_fiber_respones(
    df_bac, # 16S data (relative or absolute), rows are samples, columns are taxa
    df_meta, # meta data, rows are samples, columns are SubjectID, Day, and Dose
    prefix='healey_2018', # prefix of stan file name
    topN=20, # select the topN taxa to run bayesian regression
    stan_path='./'
) 

# Extract stan output and show significant fiber responses

In [5]:
# NOTE: this cell reads `bacterial_taxa`, which is assigned by the cell that
# generates the Stan files; on a fresh kernel that cell must be run first.
fit = az.from_cmdstan(["healey_2018_output_%d.csv"%(i) for i in range(1, 4)])


def _summarize_posterior(posterior, var, kind, taxa):
    """Build one summary row for a single posterior variable.

    Parameters
    ----------
    posterior
        Posterior group of the fitted model, indexable by variable name.
    var : str
        Name of the posterior variable to summarize.
    kind : str
        Label stored in the ``Type`` column.
    taxa
        Value stored in the ``Taxa`` column (a taxon name, or a pair of names).

    Returns
    -------
    list
        ``[kind, taxa, left, right, middle, snr, significant]`` where
        ``left``/``right`` are the bounds of the interval returned by
        ``hpd_grid``, ``middle`` is their midpoint, ``snr`` is mean/std of the
        pooled draws, and ``significant`` is True when both bounds share a
        sign (i.e. the interval excludes zero).

    Raises
    ------
    AssertionError
        If ``hpd_grid`` does not return exactly one interval.
    """
    # Pool the draws of every chain in chain order. For three chains this is
    # the same sequence as concatenating chains 0, 1 and 2 by hand, but it no
    # longer silently ignores additional chains.
    samples = list(np.ravel(posterior[var].values))
    intervals = hpd_grid(samples)[0]
    assert len(intervals) == 1, (
        "expected exactly one HPD interval for %s, got %d" % (var, len(intervals))
    )
    left, right = intervals[0]
    return [
        kind,
        taxa,
        left,
        right,
        (left + right) / 2,
        np.mean(samples) / np.std(samples),
        left * right > 0,
    ]


lines = []
# basal growth rate
for c in bacterial_taxa:
    lines.append(_summarize_posterior(fit.posterior, 'alpha__%s'%(c), 'basal_growth_rate', c))
# inulin response
for c in bacterial_taxa:
    lines.append(_summarize_posterior(fit.posterior, 'epsilon__%s'%(c), 'fiber_response', c))
# pairwise interactions
for c1 in bacterial_taxa:
    for c2 in bacterial_taxa:
        lines.append(
            _summarize_posterior(
                fit.posterior, 'beta__%s_%s'%(c1,c2), 'pairwise_interaction', (c1,c2)
            )
        )

df_stan_output_summary = pd.DataFrame(lines, columns = ['Type','Taxa','Left','Right','Middle','SNR','Significant'])
df_stan_output_summary.to_excel('bayesian_regression_summary_healey_2018.xlsx')

In [6]:
# Display only the rows whose `Significant` flag is True.
df_stan_output_summary.loc[df_stan_output_summary['Significant'].eq(True)]

Unnamed: 0,Type,Taxa,Left,Right,Middle,SNR,Significant
6,basal_growth_rate,Prevotella,-1.0,-0.03,-0.515,-2.052566,True
22,fiber_response,Bifidobacterium,0.1,0.14,0.12,12.336286,True
24,fiber_response,Anaerostipes,0.06,0.1,0.08,8.065097,True
26,fiber_response,Prevotella,0.01,0.05,0.03,2.737115,True
40,pairwise_interaction,"(Blautia, Blautia)",-3.96,-1.2,-2.58,-3.685521,True
41,pairwise_interaction,"(Blautia, Faecalibacterium)",-3.29,-0.13,-1.71,-2.181092,True
42,pairwise_interaction,"(Blautia, Bifidobacterium)",-3.48,-0.86,-2.17,-3.268102,True
82,pairwise_interaction,"(Bifidobacterium, Bifidobacterium)",1.33,3.93,2.63,3.994283,True
161,pairwise_interaction,"(Prevotella, Faecalibacterium)",0.21,3.4,1.805,2.238596,True
166,pairwise_interaction,"(Prevotella, Prevotella)",-6.46,-4.34,-5.4,-10.151601,True
