In [1]:
import os
import shutil
import subprocess
import qiime2 as q2
import pandas as pd
from biom import Table
from biom.util import biom_open


### running songbird

This is best done on a compute cluster but the scripts are provided in `seeding-songbird-run`. From these gird searches we determined the best perams. to run songbird with with a $Q^{2}$-value close to one. From these runs we can use the differentials to explore what microbes change between the birth-modes. 


In [2]:
import glob
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


In [9]:
all_grid_results = {}
input_path = '../data/diff-analysis-new'
intervals = ["0-2", "2-4", "4-17", "17-26", "26-51"]
# get all body site path(s)
partition_path = os.path.join(input_path, 'songbird-grid-search/*')
exclude_ = os.path.join(input_path, 'songbird-grid-search/Baby*.0')
partition_path = set(glob.glob(partition_path)) - set(glob.glob(exclude_))
body_sites = [bs_.split('/')[-1]
                  for bs_ in partition_path]

In [38]:
%%capture
# for each body site repeat 
all_grid_results = {}
input_path = '../data/diff-analysis-new'
intervals = ["2-4", "26-51"]
# get all body site path(s)
partition_path = os.path.join(input_path, 'songbird-grid-search/*')
exclude_ = os.path.join(input_path, 'songbird-grid-search/Baby*.0')
partition_path = set(glob.glob(partition_path)) - set(glob.glob(exclude_))
body_sites = [bs_.split('/')[-1]
                  for bs_ in partition_path] # this string already have body sites with life stage
# run for each body site
for body_site_ in body_sites:
    # for body site subsets

    # get all baseline models CV
    baseline_ls_path =  os.path.join(input_path, 'songbird-grid-search',
                                     body_site_, '1*')
    baseline_models = glob.glob(baseline_ls_path)
    baseline_models = {tuple(id_.split('/')[-1].split('-')[1:]):id_
                       for id_ in baseline_models}
    # retrieve all baseline models
    for id_, path_ in baseline_models.items():
        # get path to data
        event_acc = EventAccumulator(path_)
        event_acc.Reload()
        # get scalar perams
        w_times, step_nums, vals = zip(*event_acc.Scalars('accuracy/cv_error'))
        baseline_models[id_] = [w_times, step_nums, vals]
    # get all fomrula based models CV
    all_ls_path =  os.path.join(input_path, 'songbird-grid-search',
                                body_site_, '*')
    formula_models = glob.glob(all_ls_path)
    exclude_ = glob.glob(baseline_ls_path)
    formula_models = sorted(set(formula_models) - set(exclude_))
    formula_models = {tuple(id_.split('/')[-1].split('-')[:]):id_
                       for id_ in formula_models}
    formula_models = {('-'.join(k[:-3]),k[-3],k[-2],k[-1]):v
                      for k,v in formula_models.items()}
    for id_, path_ in formula_models.items():
        # get path to data
        event_acc = EventAccumulator(path_)
        event_acc.Reload()
        # get scalar perams
        w_times, step_nums, vals = zip(*event_acc.Scalars('accuracy/cv_error'))
        # calc q^2-value
        base_cv = np.mean(baseline_models[id_[1:]][-1][-10:])
        form_cv = np.mean(vals[-10:])
        q_squared = 1 - form_cv/base_cv
        formula_models[id_] = [form_cv, base_cv, q_squared]
    # make dataframe to save
    gird_results = pd.DataFrame(formula_models).T.reset_index()
    gird_results.columns = ['formula', 'min_features', 'batch_size',
                            'differential_prior', 'CV', 'baseline_CV',
                            'q_squared']
    # save for bodysite
    all_grid_results[body_site_] = gird_results


KeyError: 'Key accuracy/cv_error was not found in Reservoir'

In [7]:
# get best
all_grid_df = pd.concat(all_grid_results).reset_index().drop('level_2', axis=1)
all_grid_df = all_grid_df.rename({'level_0':'body_site','level_1':'life_stage'}, axis=1)
all_grid_df_allowed = all_grid_df[all_grid_df.q_squared > 0].copy()
ind_ = all_grid_df_allowed.groupby(['body_site','life_stage','formula'])[['CV']].idxmin().values
all_grid_df_best = all_grid_df_allowed.loc[ind_.flatten(), :]
all_grid_df_best.to_csv('../results/Extended-Data-Table-6.tsv', sep='\t')
all_grid_df_best


Unnamed: 0,body_site,life_stage,formula,min_features,batch_size,differential_prior,CV,baseline_CV,q_squared
50,fecal,Baby-Feces-2-4,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,28,0.25,30.389586,33.600351,0.095557
74,fecal,Baby-Feces-26-51,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,42,0.25,10.211426,12.97182,0.212799
96,oral,Baby-Mouth-2-4,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,17,0.25,35.883822,51.621852,0.304871
120,oral,Baby-Mouth-26-51,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,20,0.25,26.412946,30.652315,0.138305
25,skin,Baby-Right_Forearm-2-4,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,14,0.5,13.450201,18.099304,0.256866
0,skin,Baby-Right_Forearm-26-51,"C(birth_mode, Treatment(""CS"")) + host_subject_id",0,16,0.25,2.906172,4.222971,0.311818


In [None]:
### subset best and copy them (not used for quick run)
file_path_ = '../data/seeding-songbird-run/songbird-all-models/'
file_path_copy_ = '../data/seeding-songbird-run/songbird-optimized-models/'
org_names = {'fecal':'Baby-Feces',
             'oral':'Baby-Mouth',
             'skin':'Baby-Right_Forearm'}
paths_copy_ = [(os.path.join(file_path_, bs_,
                             k_.replace('all', org_names[bs_]),
                             '-'.join(df2_.values[0][2:6])),
                os.path.join(file_path_copy_, bs_,
                             k_.replace(org_names[bs_] + '-', ''),
                             '-'.join(df2_.values[0][2:6])),
                os.path.join(file_path_, bs_,
                             k_.replace('all', org_names[bs_]),
                             '1-' + '-'.join(df2_.values[0][3:6])),
                os.path.join(file_path_copy_, bs_,
                             k_.replace(org_names[bs_] + '-', ''),
                             '1-' + '-'.join(df2_.values[0][3:6])))
               for bs_, bsdf_ in all_grid_df_best.groupby('body_site')
               for k_, df_ in bsdf_.groupby('life_stage')
               for k2_, df2_ in df_.groupby('formula')]
# copy the paths
for copy_ in paths_copy_:
    if not os.path.exists(copy_[1]):
        shutil.copytree(copy_[0], copy_[1])
        print(copy_[1])
    if not os.path.exists(copy_[3]):
        shutil.copytree(copy_[2], copy_[3])
        print(copy_[3])
