In [1]:
import os
import shutil
import subprocess
import qiime2 as q2
import pandas as pd
from biom import Table, load_table
from biom.util import biom_open
import glob
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


## running songbird

This is best done on a compute cluster, but the scripts are provided in `seeding-songbird-run`. From these grid searches we determined the best parameters for running songbird, i.e. those giving a $Q^{2}$-value close to one. From these runs we can use the differentials to explore which microbes change between the birth modes. 


In [29]:
all_grid_results = {}
input_path = '../data/diff-analysis-new/songbird-grid-search/LS-model-no-country-subjectid'
intervals = ["0-2", "2-4", "4-17", "17-26", "26-51"]
# get all body site path(s): every entry directly under input_path except
# those whose name contains a '.' (matched by the '*.*' pattern)
partition_path = os.path.join(input_path, '*')
exclude_ = os.path.join(input_path, '*.*')
partition_path = set(glob.glob(partition_path)) - set(glob.glob(exclude_))
# basename (rather than splitting on '/') also handles Windows separators;
# sorted because the iteration order of a set is arbitrary between runs
body_sites = sorted(os.path.basename(bs_) for bs_ in partition_path)

In [31]:
%%capture
# for each body site repeat
input_path = '../data/diff-analysis-new/songbird-grid-search/LS-model-no-country-subjectid'
intervals = ["0-2", "2-4", "4-17", "17-26", "26-51"]
# number of trailing cross-validation measurements averaged per model
n_final_cv = 10
# column layout of the per-body-site result table
grid_columns = ['formula', 'min_features', 'batch_size',
                'differential_prior', 'CV', 'baseline_CV',
                'q_squared']


def _final_cv_error(model_path, n_last=n_final_cv):
    """Mean of the last `n_last` 'accuracy/cv_error' scalars logged in `model_path`.

    Parameters
    ----------
    model_path : str
        Directory holding the TensorBoard event file(s) of one model.
    n_last : int
        How many of the most recent scalar values to average.

    Returns
    -------
    float
        The averaged cross-validation error.
    """
    event_acc = EventAccumulator(model_path)
    event_acc.Reload()
    # each scalar event unpacks to (wall time, step, value); only values are used
    _, _, vals = zip(*event_acc.Scalars('accuracy/cv_error'))
    return np.mean(vals[-n_last:])


# get all body site path(s): every entry under input_path except those whose
# name contains a '.'; the names already carry the life-stage interval
partition_path = os.path.join(input_path, '*')
exclude_ = os.path.join(input_path, '*.*')
partition_path = set(glob.glob(partition_path)) - set(glob.glob(exclude_))
# sorted: set order is arbitrary; basename: independent of the path separator
body_sites = sorted(os.path.basename(bs_) for bs_ in partition_path)

all_grid_results = {}
# run for each body site
for body_site_ in body_sites:
    # baseline models live in directories named '1-<min_features>-<batch_size>-<prior>'
    baseline_ls_path = os.path.join(input_path, body_site_, '1*')
    baseline_paths = glob.glob(baseline_ls_path)
    # final baseline CV keyed by the (min_features, batch_size, prior) strings
    baseline_cv = {tuple(os.path.basename(path_).split('-')[1:]): _final_cv_error(path_)
                   for path_ in baseline_paths}

    # every other directory is a formula model:
    # '<formula>-<min_features>-<batch_size>-<prior>'
    all_ls_path = os.path.join(input_path, body_site_, '*')
    formula_paths = sorted(set(glob.glob(all_ls_path)) - set(baseline_paths))

    records = []
    for path_ in formula_paths:
        # the formula itself may contain '-', so split from the right
        (formula, min_features,
         batch_size, differential_prior) = os.path.basename(path_).rsplit('-', 3)
        params = (min_features, batch_size, differential_prior)
        if params not in baseline_cv:
            raise KeyError('no baseline ("1") model with parameters %s found for %s'
                           % ('-'.join(params), body_site_))
        form_cv = _final_cv_error(path_)
        base_cv = baseline_cv[params]
        # calc q^2-value: improvement of the formula model over its baseline
        q_squared = 1 - form_cv / base_cv
        records.append((formula, min_features, batch_size,
                        differential_prior, form_cv, base_cv, q_squared))

    # make dataframe to save (parameter columns stay strings, as parsed from
    # the directory name); an empty directory yields an empty, well-formed frame
    grid_results = pd.DataFrame(records, columns=grid_columns)
    # save for bodysite
    all_grid_results[body_site_] = grid_results

In [39]:
# get best
# stack the per-body-site grids; the dict key becomes the 'body_site_ls' column
all_grid_df = pd.concat(all_grid_results).reset_index().drop('level_1', axis=1)
all_grid_df = all_grid_df.rename({'level_0':'body_site_ls'}, axis=1)
# only models with a positive Q^2 are eligible
all_grid_df_allowed = all_grid_df[all_grid_df.q_squared > 0].copy()
# row label of the lowest-CV model within each (body site, formula) group
ind_ = all_grid_df_allowed.groupby(['body_site_ls','formula'])['CV'].idxmin().values
# create the flag column with object dtype: assigning the string 'Yes' into a
# float NaN column is deprecated in recent pandas
all_grid_df_allowed['best'] = pd.Series(np.nan,
                                        index=all_grid_df_allowed.index,
                                        dtype=object)
all_grid_df_allowed.loc[ind_, 'best'] = 'Yes'
all_grid_df_allowed.to_csv('../data/diff-analysis-new/LS-all-countries-grid-search-all-allowed-models.tsv', sep='\t')
all_grid_df_allowed.loc[all_grid_df_allowed.best=='Yes', :]

Unnamed: 0,body_site_ls,formula,min_features,batch_size,differential_prior,CV,baseline_CV,q_squared,best
0,Baby-Feces,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,139,0.25,2.876108,3.725357,0.227964,Yes
26,Baby-Feces-0-2,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,42,0.25,23.200928,32.275113,0.281151,Yes
49,Baby-Feces-17-26,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,15,0.5,21.915614,26.446064,0.171309,Yes
72,Baby-Feces-2-4,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,21,0.25,52.880609,60.824997,0.130611,Yes
98,Baby-Feces-26-51,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,34,0.25,12.517525,13.548411,0.076089,Yes
120,Baby-Feces-4-17,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,24,0.25,36.072392,43.872808,0.177796,Yes
148,Baby-Forearm,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,89,0.25,0.187514,0.264916,0.292178,Yes
170,Baby-Forearm-0-2,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,37,0.25,8.055946,9.024699,0.107345,Yes
197,Baby-Forearm-17-26,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,6,0.5,10.646206,12.832963,0.170402,Yes
210,Baby-Forearm-2-4,"C(birth_mode_ms, Treatment(""Vag"")) + host_subj...",0,16,0.25,3.345365,3.743568,0.10637,Yes


In [48]:
### subset best and copy them (not used for quick run)
file_path_ = '../data/diff-analysis-new/songbird-grid-search/LS-model-no-country-subjectid/'
file_path_copy_ = '../data/diff-analysis-new/songbird-optimized-models/LS-model-no-country-subjectid/'

all_grid_df_best = all_grid_df_allowed.loc[all_grid_df_allowed.best=='Yes', :]
# one (formula source, formula destination, baseline source, baseline
# destination) tuple per body site / formula combination
paths_copy_ = []
for bs_, bsdf_ in all_grid_df_best.groupby('body_site_ls'):
    for k_, df_ in bsdf_.groupby('formula'):
        first_row = df_.values[0]
        # directory names are rebuilt from the row: positions 1-4 give the
        # formula model name, positions 2-4 (prefixed by '1-') its baseline
        formula_dir = '-'.join(first_row[1:5])
        baseline_dir = '1-' + '-'.join(first_row[2:5])
        paths_copy_.append((os.path.join(file_path_, bs_, formula_dir),
                            os.path.join(file_path_copy_, bs_, formula_dir),
                            os.path.join(file_path_, bs_, baseline_dir),
                            os.path.join(file_path_copy_, bs_, baseline_dir)))
# copy the paths, skipping destinations that already exist
for formula_src, formula_dst, baseline_src, baseline_dst in paths_copy_:
    for src_, dst_ in ((formula_src, formula_dst),
                       (baseline_src, baseline_dst)):
        if not os.path.exists(dst_):
            shutil.copytree(src_, dst_)
            print(dst_)


../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces/C(birth_mode_ms, Treatment("Vag")) + host_subject_id-0-139-0.25
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces/1-0-139-0.25
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces-0-2/C(birth_mode_ms, Treatment("Vag")) + host_subject_id-0-42-0.25
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces-0-2/1-0-42-0.25
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces-17-26/C(birth_mode_ms, Treatment("Vag")) + host_subject_id-0-15-0.5
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces-17-26/1-0-15-0.5
../data/seeding-songbird-run/songbird-optimized-models/LS-model-no-country-subjectid/Baby-Feces-2-4/C(birth_mode_ms, Treatment("Vag")) + host_subject_id-0-21-0.25
../data/seeding-

## Summarizing results

In [2]:
import itertools
from skbio.stats.composition import (alr_inv, alr,
                                     closure, clr)
# warnings filter: silence warning output for the cells that follow
import warnings
# ignore FutureWarning specifically
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this ignores every warning category, which makes the
# FutureWarning-only filter above redundant -- confirm that blanket
# suppression is intended
warnings.filterwarnings("ignore")
# switch off pandas' chained-assignment check
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# import taxonomy: load the QIIME 2 artifact, then view it as a dataframe
taxonomy_artifact = q2.Artifact.load('../data/processed-data/taxonomy.qza')
taxdf = taxonomy_artifact.view(q2.Metadata).to_dataframe()

def split_taxonomy(taxonomy):
    """Add one column per taxonomic rank, parsed from the ``Taxon`` column.

    Parameters
    ----------
    taxonomy : pd.DataFrame
        Frame with a ``Taxon`` column of ';'-separated lineage strings in
        which each rank is introduced by its prefix (``k__``, ``p__``,
        ``c__``, ``o__``, ``f__``, ``g__``, ``s__``). The frame is modified
        in place.

    Returns
    -------
    pd.DataFrame
        The same frame with the columns ``kingdom``, ``phylum``, ``class``,
        ``order``, ``family``, ``genus`` and ``species`` added. A rank column
        that already exists is left untouched. Entries are NaN when the
        ``Taxon`` value is not a string, lacks the rank prefix, or has an
        empty name for that rank.
    """
    feat_map = dict(taxonomy.Taxon)
    taxonomy['Taxon'] = [feat_map.get(feat, np.nan)
                         for feat in taxonomy.index]

    def tax_split(tax_id, tax_level):
        # text between the rank prefix and the next ';'
        return tax_id.split(tax_level)[1].split(';')[0]

    # add taxonomic levels for grouping later (if available)
    rank_prefixes = [('k__', 'kingdom'), ('p__', 'phylum'),
                     ('c__', 'class'), ('o__', 'order'),
                     ('f__', 'family'), ('g__', 'genus'),
                     ('s__', 'species')]
    for level, lname in rank_prefixes:
        if lname in taxonomy.columns:
            continue
        taxonomy_tmp = []
        for tax in taxonomy.Taxon:
            # isinstance rather than `is not np.nan`: missing values are not
            # guaranteed to be the np.nan singleton, and `level in tax` would
            # raise TypeError on any non-string entry
            if isinstance(tax, str) and level in tax:
                rank_name = tax_split(tax, level)
            else:
                rank_name = ''
            taxonomy_tmp.append(rank_name if len(rank_name) > 0 else np.nan)
        taxonomy[lname] = taxonomy_tmp
    return taxonomy

# split the levels into columns: adds kingdom, phylum, class, order, family,
# genus and species columns parsed from the Taxon string
taxdf = split_taxonomy(taxdf)

In [13]:
def center_differentials(clr_diff):
    """Re-center a dataframe of differentials around zero.

    The mean of every column is subtracted first; the mean of every row of
    that column-centered table is subtracted afterwards.
    """
    # step 1: remove the per-column mean
    column_means = clr_diff.mean(axis=0).values
    column_centered = clr_diff - column_means
    # step 2: remove the per-row mean (as a column vector so it broadcasts
    # across the columns)
    row_means = column_centered.mean(axis=1).values
    return column_centered - row_means[:, None]

def differentials_to_probability(differentials,
                                 numerators,
                                 prefix='C(birth_mode, Treatment("CS"))',
                                 basis_name="P(CS)"):
    """Turn differentials into probability-like values via the inverse ALR.

    Parameters
    ----------
    differentials : pd.DataFrame
        Differentials, re-centered with `center_differentials` before use.
    numerators : list of str
        Columns of `differentials` passed to `alr_inv`.
    prefix : str
        Text removed from each numerator name when building output labels.
    basis_name : str
        Label of the first output column.

    Returns
    -------
    pd.DataFrame
        Indexed like `differentials`, with `basis_name` as the first column
        followed by one 'P(...)' column per numerator.
    """
    # recenter first, then apply the inverse alr to the chosen columns
    centered = center_differentials(differentials)
    probabilities = alr_inv(centered[numerators])
    # '<prefix>[T.x]' -> 'P(x)'
    labels = [basis_name]
    for col in numerators:
        labels.append(col.replace("[T.", "P(")
                         .replace("]", ")")
                         .replace(prefix, ""))
    return pd.DataFrame(probabilities, centered.index, labels)

In [19]:
# container for differentials
all_differentials = {}
all_metadata = {}
all_tables = {}
# get path info
data_path = '../data/split-data'
input_path = '../data/diff-analysis-new/songbird-optimized-models/LS-model-no-country-subjectid'
intervals = ["0-2", "2-4", "4-17", "17-26", "26-51"]
diffs_use = ['[T.CS]', '[T.CSseed]']
# text in front of each birth-mode differential column name
birth_mode_prefix = 'C(birth_mode_ms, Treatment("Vag"))'
# add a frequency filter
min_freq = 0.0
# get all body site path(s); entries whose name contains a '.' are skipped,
# as in the grid-search cell, so a stray file is never treated as a body site
partition_path = os.path.join(input_path, '*')
exclude_ = os.path.join(input_path, '*.*')
body_sites = sorted(os.path.basename(bs_)
                    for bs_ in set(glob.glob(partition_path))
                    - set(glob.glob(exclude_)))
# run for each body site
for body_site_ in body_sites:
    # for body site subsets

    # everything that is not a baseline ('1*') directory is a formula model;
    # the first one in sorted order is used
    baseline_ls_path = os.path.join(input_path, body_site_, '1*')
    all_ls_path = os.path.join(input_path, body_site_, '*')
    formula_dirs = sorted(set(glob.glob(all_ls_path))
                          - set(glob.glob(baseline_ls_path)))
    if not formula_dirs:
        raise FileNotFoundError('no formula model directory found under %s'
                                % os.path.join(input_path, body_site_))
    formula_model_path = formula_dirs[0]
    # get diffs.
    diff = pd.read_csv(os.path.join(formula_model_path, 'differentials.tsv'),
                       sep='\t', index_col=0)
    # get table and metadata for each subset
    data_split_path = os.path.join(data_path, body_site_)
    mf = pd.read_csv(os.path.join(data_split_path, 'metadata.tsv'),
                     sep='\t', index_col=0)
    bt = load_table(os.path.join(data_split_path, 'table.biom'))
    # calculate the freq. of each feature: number of non-zero entries along
    # axis 1 of the table matrix, divided by the size of that axis
    frequency = pd.DataFrame(bt.matrix_data.toarray().astype(bool).sum(1) / bt.shape[1],
                             bt.ids('observation'), ['feature-frequency'])
    # features missing from the table become NaN here and therefore fail the
    # `>= min_freq` filter below
    frequency = frequency.reindex(diff.index)
    # add to diff to reduce files saved
    diff['feature-frequency'] = frequency['feature-frequency']
    # apply the filter
    diff = diff[diff['feature-frequency'] >= min_freq]
    # drop the per-subject columns
    cols = [c for c in diff.columns if 'host_subject_id' not in c]
    diff = diff[cols]
    # taxonomy is not joined here (not written to file since it takes too
    # much space)
    # get the prob. of each microbe in each state
    differential_cols = [col_ for col_ in diff.columns
                         if any(dc in col_ for dc in diffs_use)]
    model_cols = ['Intercept'] + differential_cols
    diff_ = center_differentials(diff[model_cols])
    # '<prefix>[T.x]' -> 'centered.diff.x'; 'Intercept' -> 'centered.Intercept'
    diff_.columns = ['centered.' +
                     hh.replace("[T", "diff").replace("]", "").replace(birth_mode_prefix, "")
                     for hh in diff_.columns]

    pdiff = differentials_to_probability(diff[model_cols],
                                         differential_cols,
                                         prefix=birth_mode_prefix,
                                         basis_name="P(Vag)")
    pdiff = pdiff.reindex(diff.index)
    # add all together
    diff = pd.concat([pdiff, diff, diff_], axis=1)
    # calculate seeding effectiveness metric
    diff['seeding-effectiveness'] = (diff['P(Vag)'] * diff['P(CSseed)'] +
                                     diff['P(CS)']/4)
    # save the files
    all_differentials[body_site_] = diff.rename({'P(CSseed)':'P(CS-seeded)',
                                                 'P(Vag)':'P(Vaginal)'}, axis=1)
    all_metadata[body_site_] = mf
    all_tables[body_site_] = bt

In [20]:
# flatten the per-body-site frames into one table; the dict key becomes the
# 'body_site_ls' column
all_differentials_df = pd.concat(all_differentials).reset_index()
all_differentials_df = all_differentials_df.rename(columns={'level_0': 'body_site_ls'})
all_differentials_df.to_csv(
    '../data/diff-analysis-new/LS-all-countries-best-model-processed.tsv', sep='\t')