In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import CubicSpline
import warnings
warnings.filterwarnings("ignore")
import arviz as az
%run -i '../../utils.py'

# Read data

In [11]:
# Load both input tables the same way: the first CSV column becomes the index.
# df_bac is the 16S abundance table, df_meta the per-sample metadata table.
df_bac, df_meta = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('16S_relative_abundance_lowest_taxonomy.csv', 'metadata.csv')
)

# Generate Stan program files

In [12]:
# Directory handed to the generator as `stan_path`.
# NOTE(review): presumably the CmdStan project folder the Stan files are written to -- confirm in utils.py.
# Set the STAN_PROJECT_DIR environment variable to run this notebook on another
# machine; when it is unset, the original author's local path is used.
stan_project_dir = os.environ.get(
    'STAN_PROJECT_DIR',
    '/Users/liaoc/Documents/cmdstan-2.25.0/projects/microbiome_fiber_response_LD'
)

# generate_stan_files_for_fiber_respones is provided by utils.py (loaded with %run).
# Its return value is iterated later as the list of taxa that were selected.
selected_bacteria_taxa = generate_stan_files_for_fiber_respones(
    df_bac, # 16S data (relative or absolute), rows are samples, columns are taxa
    df_meta, # meta data, rows are samples, columns are SubjectID, Day, and Dose
    prefix='creswell_2020_lowest_taxonomy', # prefix of stan file name
    topN=20, # select the topN taxa to run bayesian regression
    stan_path=stan_project_dir
)

# Extract Stan output and show significant fiber responses

In [18]:
# Stan CSV outputs to load; each file is read as one chain of the posterior.
stan_output_files = ["creswell_2020_output_%d.csv"%(i) for i in [1,2,3]]
fit = az.from_cmdstan(stan_output_files)


def summarize_posterior(fit, param_type, taxa, var):
    """Build one summary row for a single variable of the posterior.

    Parameters
    ----------
    fit : object returned by ``az.from_cmdstan``
        Only ``fit.posterior[var]`` is read.
    param_type : str
        Label stored in the 'Type' column.
    taxa : str or tuple
        Value stored in the 'Taxa' column (a taxon name, or a ``(c1, c2)``
        pair for pairwise interactions).
    var : str
        Name of the variable to look up in ``fit.posterior``.

    Returns
    -------
    list
        ``[param_type, taxa, left, right, middle, snr, significant]`` where
        ``left``/``right`` are the bounds of the single interval returned by
        ``hpd_grid``, ``middle`` is their midpoint, ``snr`` is mean/std of the
        pooled draws, and ``significant`` is True when both bounds have the
        same sign (their product is positive).

    Raises
    ------
    AssertionError
        If ``hpd_grid`` does not return exactly one interval.
    """
    # Pool the draws of every chain in chain order. Flattening the whole array
    # avoids hard-coding the number of chains and yields the same sequence as
    # concatenating chain 0, 1, 2 by hand when three chains are present.
    data = list(fit.posterior[var].values.ravel())
    # hpd_grid is provided by utils.py (loaded with %run).
    # NOTE(review): presumably returns highest-posterior-density interval(s) first -- confirm in utils.py.
    hpd_mu, x_mu, y_mu, modes_mu = hpd_grid(data)
    assert len(hpd_mu) == 1, "expected exactly one interval for %s, got %d"%(var, len(hpd_mu))
    (x0, x1) = hpd_mu[0]
    return [param_type, taxa, x0, x1, (x0+x1)/2, np.mean(data)/np.std(data), x0*x1>0]


lines = []
# basal growth rate
for c in selected_bacteria_taxa:
    lines.append(summarize_posterior(fit, 'basal_growth_rate', c, 'alpha__%s'%(c)))

# fiber response
for c in selected_bacteria_taxa:
    lines.append(summarize_posterior(fit, 'fiber_response', c, 'epsilon__%s'%(c)))

# pairwise interactions
for c1 in selected_bacteria_taxa:
    for c2 in selected_bacteria_taxa:
        lines.append(summarize_posterior(fit, 'pairwise_interaction', (c1,c2), 'beta__%s_%s'%(c1,c2)))

df_stan_output_summary = pd.DataFrame(lines, columns = ['Type','Taxa','Left','Right','Middle','SNR','Significant'])
df_stan_output_summary.to_excel('bayesian_regression_summary.xlsx')

In [19]:
# Show only the rows flagged as significant (Significant column equals True).
is_significant = df_stan_output_summary['Significant'].eq(True)
df_stan_output_summary.loc[is_significant]

Unnamed: 0,Type,Taxa,Left,Right,Middle,SNR,Significant
50,pairwise_interaction,"(Lachnospiraceae, Akkermansia_muciniphila)",0.11,1.63,0.87,2.270559,True
58,pairwise_interaction,"(Lachnospiraceae, Roseburia_intestinalis)",0.59,3.42,2.005,2.780318,True
75,pairwise_interaction,"(Faecalibacterium_prausnitzii, Gemmiger_formic...",0.2,3.54,1.87,2.234029,True
164,pairwise_interaction,"(Prevotella_copri, Anaerostipes_hadrus)",-2.41,-0.63,-1.52,-3.372511,True
165,pairwise_interaction,"(Prevotella_copri, Bifidobacterium_adolescentis)",0.28,1.87,1.075,2.674802,True
166,pairwise_interaction,"(Prevotella_copri, Prevotella_copri)",0.17,1.18,0.675,2.628095,True
229,pairwise_interaction,"(Roseburia_faecis, Roseburia_faecis)",0.11,2.22,1.165,2.202735,True
244,pairwise_interaction,"(Akkermansia_muciniphila, Anaerostipes_hadrus)",-2.08,-0.3,-1.19,-2.648681,True
245,pairwise_interaction,"(Akkermansia_muciniphila, Bifidobacterium_adol...",-1.59,-0.02,-0.805,-2.011412,True
250,pairwise_interaction,"(Akkermansia_muciniphila, Akkermansia_muciniph...",-2.49,-0.97,-1.73,-4.480156,True
