In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import CubicSpline
import warnings
warnings.filterwarnings("ignore")
import arviz as az
%run -i '../../utils.py'

# Read data

In [14]:
# Load the two input tables in one pass; the first CSV column of each file
# becomes the row index of the resulting frame.
#   df_bac  <- 16S abundance table
#   df_meta <- sample metadata
df_bac, df_meta = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('16S_relative_abundance_family.csv', 'metadata.csv')
)

# Generate Stan program files

In [15]:
# Generate the Stan program files for the fiber-response regression.
# (The helper is not defined in this notebook; presumably it comes from the
# utils.py loaded with %run in the first cell.)
#
# Arguments:
#   df_bac    - 16S data (relative or absolute); rows are samples, columns are taxa
#   df_meta   - metadata; rows are samples, columns are SubjectID, Day, and Dose
#   prefix    - prefix of the Stan file name
#   topN      - select the topN taxa to run the Bayesian regression on
#   stan_path - NOTE(review): machine-specific absolute path; confirm/adjust it
#               before running this notebook on another machine
#
# The returned value is iterated as a collection of taxa names by the
# posterior-summary cell further down.
selected_bacteria_taxa = generate_stan_files_for_fiber_respones(
    df_bac,
    df_meta,
    prefix='creswell_2020_family',
    topN=20,
    stan_path='/Users/liaoc/Documents/cmdstan-2.25.0/projects/microbiome_fiber_response_LD',
)

# Extract Stan output and show significant fiber responses

In [17]:
fit = az.from_cmdstan(["creswell_2020_family_output_%d.csv"%(i) for i in [1,2,3]])


def _summarize_posterior_row(fit, var, param_type, taxa):
    """Summarize one posterior variable as a single row of the summary table.

    Parameters
    ----------
    fit : object returned by ``az.from_cmdstan``
        Its ``posterior`` group is looked up by variable name.
    var : str
        Name of the posterior variable to summarize (e.g. ``'alpha__<taxon>'``).
    param_type : str
        Label written to the ``Type`` column.
    taxa : str or tuple of str
        Taxon (or pair of taxa) written to the ``Taxa`` column.

    Returns
    -------
    list
        ``[Type, Taxa, Left, Right, Middle, SNR, Significant]`` where
        Left/Right are the bounds of the single interval returned by
        ``hpd_grid``, Middle is their midpoint, SNR is mean/std of the pooled
        draws, and Significant is True when both bounds have the same
        non-zero sign (i.e. the interval does not contain zero).

    Raises
    ------
    AssertionError
        If ``hpd_grid`` returns anything other than exactly one interval.
    """
    # Pool the draws of every chain into one flat sample. Iterating over the
    # first axis replaces the former hard-coded `np.arange(0,3)`, so the number
    # of chains no longer has to be kept in sync with the list of output files.
    # Assumes the first axis of the variable indexes chains, exactly as the
    # previous positional `[i]` indexing did.
    data = []
    for chain_draws in fit.posterior[var].values:
        data.extend(list(chain_draws))

    # hpd_grid is defined outside this cell; only its first return value
    # (a sequence of (left, right) intervals) is used here.
    hpd_mu, _x_mu, _y_mu, _modes_mu = hpd_grid(data)
    assert len(hpd_mu) == 1, \
        "expected exactly one interval for %s, got %d" % (var, len(hpd_mu))
    (x0, x1) = hpd_mu[0]
    return [param_type, taxa, x0, x1, (x0+x1)/2, np.mean(data)/np.std(data), x0*x1>0]


# Rows are appended in the same order as before: all basal growth rates,
# then all fiber responses, then every ordered pair of taxa.
lines = []
# basal growth rate
lines.extend(
    _summarize_posterior_row(fit, 'alpha__%s'%(c), 'basal_growth_rate', c)
    for c in selected_bacteria_taxa
)
# fiber response
lines.extend(
    _summarize_posterior_row(fit, 'epsilon__%s'%(c), 'fiber_response', c)
    for c in selected_bacteria_taxa
)
# pairwise interactions
lines.extend(
    _summarize_posterior_row(fit, 'beta__%s_%s'%(c1,c2), 'pairwise_interaction', (c1,c2))
    for c1 in selected_bacteria_taxa
    for c2 in selected_bacteria_taxa
)

df_stan_output_summary = pd.DataFrame(lines, columns = ['Type','Taxa','Left','Right','Middle','SNR','Significant'])
df_stan_output_summary.to_excel('bayesian_regression_summary_family.xlsx')

In [18]:
# Display only the rows whose `Significant` flag equals True.
df_stan_output_summary.loc[df_stan_output_summary['Significant'].eq(True)]

Unnamed: 0,Type,Taxa,Left,Right,Middle,SNR,Significant
0,basal_growth_rate,Lachnospiraceae,0.06,1.14,0.6,2.170792,True
20,fiber_response,Lachnospiraceae,0.02,0.07,0.045,3.874751,True
22,fiber_response,Bifidobacteriaceae,0.01,0.05,0.03,2.555152,True
26,fiber_response,Akkermansiaceae,-0.06,-0.01,-0.035,-2.967982,True
42,pairwise_interaction,"(Lachnospiraceae, Bifidobacteriaceae)",-2.98,-1.27,-2.125,-4.881729,True
43,pairwise_interaction,"(Lachnospiraceae, Bacteroidaceae)",-2.56,-0.75,-1.655,-3.604241,True
45,pairwise_interaction,"(Lachnospiraceae, Prevotellaceae)",-2.78,-1.23,-2.005,-5.099196,True
46,pairwise_interaction,"(Lachnospiraceae, Akkermansiaceae)",4.01,6.06,5.035,9.687111,True
51,pairwise_interaction,"(Lachnospiraceae, Veillonellaceae)",-3.58,-0.26,-1.92,-2.259273,True
55,pairwise_interaction,"(Lachnospiraceae, Enterobacteriaceae)",0.13,2.3,1.215,2.227521,True
