In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import CubicSpline
import warnings
warnings.filterwarnings("ignore")
import arviz as az
%run -i '../../utils.py'

# Read data

In [5]:
df_bac = pd.read_csv('16S_relative_abundance_lowest_taxonomy.csv', index_col=0)
df_meta = pd.read_csv('metadata.csv', index_col=0)

# Generate stan program files

In [6]:
selected_bacteria_taxa = generate_stan_files_for_fiber_respones(
    df_bac, # 16S data (relative or absolute), rows are samples, columns are taxa
    df_meta, # meta data, rows are samples, columns are SubjectID, Day, and Dose
    prefix='creswell_2020_lowest_taxonomy', # prefix of stan file name
    topN=20, # select the topN taxa to run bayesian regression
    stan_path='/Users/liaoc/Documents/cmdstan-2.25.0/projects/microbiome_fiber_response_LD'
) 

# Extract stan output and show significant fiber responses

In [8]:
fit = az.from_cmdstan(["creswell_2020_lowest_taxonomy_output_%d.csv"%(i) for i in [1,2,3,4,5,6,7,8,9]])

lines = []
# basal growth rate
for c in selected_bacteria_taxa:
    var = 'alpha__%s'%(c)
    data = []
    for i in np.arange(0,3):
        data.extend(list(fit.posterior[var][i].values))
    hpd_mu, x_mu, y_mu, modes_mu = hpd_grid(data)
    assert len(hpd_mu) == 1
    (x0, x1) = hpd_mu[0]
    lines.append(['basal_growth_rate', c, x0, x1, (x0+x1)/2, np.mean(data)/np.std(data), x0*x1>0])
    
# fiber response
for c in selected_bacteria_taxa:
    var = 'epsilon__%s'%(c)
    data = []
    for i in np.arange(0,3):
        data.extend(list(fit.posterior[var][i].values))
    hpd_mu, x_mu, y_mu, modes_mu = hpd_grid(data)
    assert len(hpd_mu) == 1
    (x0, x1) = hpd_mu[0]
    lines.append(['fiber_response', c, x0, x1, (x0+x1)/2, np.mean(data)/np.std(data), x0*x1>0])
    
# pairwise interactions
for c1 in selected_bacteria_taxa:
    for c2 in selected_bacteria_taxa:
        var = 'beta__%s_%s'%(c1,c2)
        data = []
        for i in np.arange(0,3):
            data.extend(list(fit.posterior[var][i].values))
        hpd_mu, x_mu, y_mu, modes_mu = hpd_grid(data)
        assert len(hpd_mu) == 1
        (x0, x1) = hpd_mu[0]
        lines.append(['pairwise_interaction', (c1,c2), x0, x1, (x0+x1)/2, np.mean(data)/np.std(data), x0*x1>0])
    
df_stan_output_summary = pd.DataFrame(lines, columns = ['Type','Taxa','Left','Right','Middle','SNR','Significant'])
df_stan_output_summary.to_excel('bayesian_regression_summary_lowest_taxonomy.xlsx')

In [9]:
# select only significant ones
df_stan_output_summary[df_stan_output_summary.Significant==True]

Unnamed: 0,Type,Taxa,Left,Right,Middle,SNR,Significant
4,basal_growth_rate,Anaerostipes_hadrus,-0.54,-0.01,-0.275,-2.046499,True
10,basal_growth_rate,Akkermansia_muciniphila,0.18,0.72,0.45,3.271727,True
24,fiber_response,Anaerostipes_hadrus,0.01,0.06,0.035,2.616084,True
26,fiber_response,Prevotella_copri,-0.06,-0.01,-0.035,-2.55326,True
30,fiber_response,Akkermansia_muciniphila,-0.06,-0.01,-0.035,-3.069801,True
90,pairwise_interaction,"(Dorea_longicatena, Akkermansia_muciniphila)",0.07,1.57,0.82,2.139535,True
124,pairwise_interaction,"(Anaerostipes_hadrus, Anaerostipes_hadrus)",0.33,2.07,1.2,2.706596,True
130,pairwise_interaction,"(Anaerostipes_hadrus, Akkermansia_muciniphila)",2.38,3.89,3.135,8.252109,True
150,pairwise_interaction,"(Bifidobacterium_adolescentis, Akkermansia_muc...",0.16,1.66,0.91,2.423329,True
166,pairwise_interaction,"(Prevotella_copri, Prevotella_copri)",0.92,1.92,1.42,5.584872,True
