In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import CubicSpline
import warnings
warnings.filterwarnings("ignore")
import arviz as az
%run -i '../../miscellaneous/utils.py'

# Read data

In [2]:
# Load the 16S genus-level abundance table and the sample metadata.
# Both CSV files are read with their first column as the row index.
df_bac, df_meta = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('16S_relative_abundance_genus.csv', 'metadata.csv')
)

# Generate Stan program files

In [3]:
# Directory handed to the helper as `stan_path`.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted
# before the notebook can run on another machine.
stan_project_dir = '/Users/liaoc/Documents/cmdstan-2.25.0/projects/microbiome_fiber_response_LD'

# Write the Stan input files for the Bayesian regression and keep the returned
# taxa; they are reused below to look up the per-taxon posterior variables.
#   df_bac  : 16S data (relative or absolute), rows are samples, columns are taxa
#   df_meta : metadata, rows are samples, columns are SubjectID, Day, and Dose
#   prefix  : prefix of the Stan file name
#   topN    : number of top taxa selected for the Bayesian regression
selected_bacteria_taxa = generate_stan_files_for_fiber_respones(
    df_bac,
    df_meta,
    prefix='creswell_2020_genus',
    topN=20,
    stan_path=stan_project_dir,
)

# Extract Stan output and show significant effects (fiber responses and pairwise interactions)

In [4]:
# Number of leading chains whose draws are pooled for every summary row.
# NOTE(review): nine chain CSV files are loaded below, but only the first three
# chains are summarized (unchanged from the original analysis) -- confirm that
# ignoring the remaining chains is intentional.
N_CHAINS_POOLED = 3


def _summarize_posterior(posterior, var, n_chains=N_CHAINS_POOLED):
    """Pool the draws of one posterior variable and summarize them.

    Parameters
    ----------
    posterior : the `posterior` group of the fitted model; indexed as
        `posterior[var][chain_index].values`.
    var : str
        Name of the posterior variable to summarize.
    n_chains : int
        Number of leading chains whose draws are concatenated.

    Returns
    -------
    tuple
        `(left, right, middle, snr, significant)` where `left`/`right` are the
        bounds of the single interval returned by `hpd_grid` (presumably the
        highest-posterior-density interval -- see utils.py), `middle` is their
        midpoint, `snr` is mean/std of the pooled draws, and `significant` is
        True when both bounds have the same sign (the interval excludes zero).

    Raises
    ------
    AssertionError
        If `hpd_grid` does not return exactly one interval.
    """
    draws = []
    for chain_idx in range(n_chains):
        draws.extend(list(posterior[var][chain_idx].values))

    intervals, _, _, _ = hpd_grid(draws)
    assert len(intervals) == 1, (
        'expected exactly one interval for %s, got %d' % (var, len(intervals))
    )
    left, right = intervals[0]
    return left, right, (left + right) / 2, np.mean(draws) / np.std(draws), left * right > 0


# Load the CmdStan output of chains 1..9.
fit = az.from_cmdstan(["creswell_2020_genus_output_%d.csv" % (i) for i in range(1, 10)])

summary_rows = []

# Per-taxon parameters: basal growth rate (alpha) first, then fiber response
# (epsilon), so the row order matches the original three-loop layout.
for row_type, var_template in [('basal_growth_rate', 'alpha__%s'),
                               ('fiber_response', 'epsilon__%s')]:
    for taxon in selected_bacteria_taxa:
        summary_rows.append(
            [row_type, taxon, *_summarize_posterior(fit.posterior, var_template % (taxon))]
        )

# Pairwise interactions (beta), one row per ordered pair of taxa, self-pairs included.
for taxon_1 in selected_bacteria_taxa:
    for taxon_2 in selected_bacteria_taxa:
        summary_rows.append(
            ['pairwise_interaction', (taxon_1, taxon_2),
             *_summarize_posterior(fit.posterior, 'beta__%s_%s' % (taxon_1, taxon_2))]
        )

df_stan_output_summary = pd.DataFrame(
    summary_rows,
    columns=['Type', 'Taxa', 'Left', 'Right', 'Middle', 'SNR', 'Significant'],
)
df_stan_output_summary.to_excel('bayesian_regression_summary_genus.xlsx')

In [5]:
# Keep only the rows flagged as significant; as the last expression of the cell,
# the filtered frame is what gets displayed.
df_stan_output_summary.loc[lambda frame: frame['Significant'].eq(True)]

Unnamed: 0,Type,Taxa,Left,Right,Middle,SNR,Significant
23,fiber_response,Bifidobacterium,0.01,0.06,0.035,2.972498,True
25,fiber_response,Bacteroides,-0.05,-0.01,-0.03,-2.514315,True
32,fiber_response,Akkermansia,-0.06,-0.02,-0.04,-3.264742,True
112,pairwise_interaction,"(Bifidobacterium, Akkermansia)",0.45,2.1,1.275,3.041901,True
132,pairwise_interaction,"(Dorea, Akkermansia)",0.07,1.7,0.885,2.148888,True
143,pairwise_interaction,"(Bacteroides, Bifidobacterium)",0.1,1.39,0.745,2.301459,True
146,pairwise_interaction,"(Bacteroides, Roseburia)",0.01,1.83,0.92,2.012361,True
156,pairwise_interaction,"(Bacteroides, Alistipes)",-3.94,-0.4,-2.17,-2.424487,True
208,pairwise_interaction,"(Prevotella, Prevotella)",0.4,1.57,0.985,3.364401,True
250,pairwise_interaction,"(Anaerostipes, Anaerostipes)",0.03,1.98,1.005,2.050879,True
