In [1]:
import itertools as it
import os

import biom
from matplotlib import rcParams
import matplotlib.colors as mplc
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sn
import statsmodels.api as sms
import statsmodels.formula.api as smf
import skbio

from qiime2 import Artifact, Metadata, MetadataColumn

In [2]:
# Console formatting: show floats with five decimals and never in
# scientific notation.
np.set_printoptions(suppress=True, precision=5)

# Figure fonts: prefer Helvetica and fall back to Arial for sans-serif text;
# font type 42 is used when figures are written to PDF.
rcParams['pdf.fonttype'] = 42
rcParams['font.sans-serif'] = ['Helvetica', 'Arial']

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Sample metadata: every column is read as text, rows are indexed by
# 'sample-id', and only the samples whose `set` column is the string '1'
# are kept.
meta = (
    pd.read_csv('data/output/simulation/samples/metadata.tsv', sep='\t', dtype=str)
    .set_index('sample-id')
    .loc[lambda frame: frame['set'] == '1']
)

# One plotting colour per age group. `replace` leaves any age value that is
# not listed here unchanged.
meta['color'] = meta['age'].replace({
    'infant': '#1f78b4',
    'adult': '#e31a1c',
})

In [5]:
# Names of the methods being compared; each one is also the directory name
# used when loading that method's alpha diversity artifacts, and 'reference'
# is the baseline the other entries are compared against.
methods = ['reference', 'otus', 'asvs', 'sidle']

In [6]:
alpha_metrics = ['faith_pd', 'observed_features', 'shannon', 'pielou_e']


def _load_alpha_replicates(method, metric):
    """
    Loads the five alpha diversity vectors (files 0.qza to 4.qza) stored
    for one method/metric pair and returns them as a list of pandas Series.
    """
    return [
        Artifact.load(
            f'data/output/simulation/merged/{method}/rarified-alpha/{metric}/{i}.qza'
        ).view(pd.Series)
        for i in range(5)
    ]


# alpha_values[method][metric] -> list of five Series, one per file index
alpha_values = {
    method: {
        metric: _load_alpha_replicates(method, metric)
        for metric in alpha_metrics
    }
    for method in methods
}

In [7]:
# Alpha diversity of the 'reference' method ({metric: list of Series}); the
# correlation and slope cells compare every method against these values.
reference = alpha_values['reference']

In [8]:
def _pearson_vs_reference(method, metric):
    """
    Returns the Pearson correlation coefficient for every pairing of one of
    the method's alpha vectors with one of the reference alpha vectors.

    The method's vector is reordered onto the reference vector's sample
    index before the two are compared.
    """
    pairs = it.product(alpha_values[method][metric], reference[metric])
    return np.array([
        scipy.stats.pearsonr(candidate.loc[ref.index], ref)[0]
        for candidate, ref in pairs
    ])


# correlations[method] -> DataFrame with one column per alpha metric and one
# row per (method vector, reference vector) pairing
correlations = {
    method: pd.DataFrame({
        metric: _pearson_vs_reference(method, metric)
        for metric in alpha_metrics
    })
    for method in methods
}

In [9]:
# Slope of each method's alpha diversity against the reference values.
#
# For every pairing of a method vector (x) with a reference vector (y), x is
# reordered onto y's sample index and fitted as the response with y as the
# only regressor. No constant column is added, so the fit has no intercept
# term and the single coefficient is the slope.
#
# slopes[method] -> DataFrame with one column per alpha metric and one row
# per (method vector, reference vector) pairing.
slopes = {
    method: pd.DataFrame({
        metric: np.array([
            # The fitted params are a label-indexed Series here (both inputs
            # are pandas objects), so take the single coefficient by
            # position with .iloc; an integer key in plain [] is deprecated
            # on a non-integer index and is slated to become a label lookup.
            sms.OLS(x.loc[y.index], y).fit().params.iloc[0]
            for (x, y) in it.product(alpha_values[method][metric], reference[metric])
        ])
        for metric in alpha_metrics
    })
    for method in methods
}

In [10]:
def calculate_effect(alphas, meta=meta):
    """
    Calculates a Cohen's d for the difference between age groups

    Parameters
    ----------
    alphas : list of pd.Series
        Alpha diversity vectors indexed by sample id. They are placed side
        by side and relabelled 0, 1, 2, ... in list order.
    meta : pd.DataFrame
        Sample metadata with an `age` column, indexed by sample id. The
        default is the module-level `meta` as it existed when this function
        was defined.

    Returns
    -------
    pd.Series
        One absolute effect size per input vector: the absolute difference
        between consecutive age-group means divided by
        sqrt(sum of the group variances / 2). The row labelled 'infant' is
        returned.
    """
    table = pd.concat(objs=alphas, axis=1)
    table.columns = np.arange(0, len(table.columns))
    # Index-aligned assignment: each sample picks up its own age label.
    table['age'] = meta['age']

    by_age = table.groupby('age')

    # Pooled spread per vector, halving the summed group variances.
    # NOTE(review): the division by 2 presumes exactly two age groups - confirm.
    group_variances = np.square(by_age.std())
    pooled_sd = np.sqrt(group_variances.sum() / 2).values

    # Difference between each group's mean and the previous group's mean;
    # the first group has no predecessor and is dropped.
    mean_gap = by_age.mean().diff().dropna()
    effect = np.absolute(mean_gap) / pooled_sd

    return effect.loc['infant']

In [11]:
# effects[method] -> DataFrame of effect sizes from `calculate_effect`, with
# one column per alpha metric and one row per alpha vector of that metric.
effects = {
    method: pd.DataFrame({
        metric: calculate_effect(alpha_values[method][metric])
        for metric in alpha_values[method]
    })
    for method in methods
}

In [12]:
# Sample ids belonging to each age group.
adults = meta.index[meta['age'] == 'adult'].values
infants = meta.index[meta['age'] == 'infant'].values


def _age_ttest_p(alpha):
    """
    Returns the p-value of an unequal-variance (equal_var=False) two-sample
    t-test comparing the infant samples of `alpha` with the adult samples.
    """
    outcome = scipy.stats.ttest_ind(
        alpha.loc[infants],
        alpha.loc[adults],
        equal_var=False,
    )
    return outcome[1]


# p_vals[method] -> DataFrame with one column per alpha metric and one row
# per alpha vector of that metric
p_vals = {
    method: pd.DataFrame({
        metric: np.array([
            _age_ttest_p(alpha) for alpha in alpha_values[method][metric]
        ])
        for metric in alpha_metrics
    })
    for method in methods
}

In [13]:
# Build the summary table of alpha diversity results.
#
# Each small frame below has the alpha metrics as rows and one column per
# (method, parameter, statistic) triple; `unstack` then flattens the combined
# frame into a long Series indexed by those three levels plus the metric.
#
# NOTE(review): the 'R2' entries are the mean of the first value returned by
# scipy.stats.pearsonr in the correlations cell, i.e. the correlation
# coefficient itself rather than its square - confirm the label is intended.
comb_effects = pd.concat(axis=1, objs=[
    pd.DataFrame({(c, 'R2', 'mean'): x.mean(axis=0) for c, x in correlations.items()}),
    pd.DataFrame({(c, 'm', 'mean'): x.mean(axis=0) for c, x in slopes.items()}),
    pd.DataFrame({(c, 'm', 'std'): x.std(axis=0) for c, x in slopes.items()}),
    pd.DataFrame({(c, 'd', 'mean'): x.mean(axis=0) for c, x in effects.items()}),
    pd.DataFrame({(c, 'd', 'std'): x.std(axis=0) for c, x in effects.items()}),
    pd.DataFrame({(c, 'p', 'max'): x.max(axis=0) for c, x in p_vals.items()})
]).unstack()
comb_effects.index = comb_effects.index.set_names(
    ['method', 'parameter', 'parameter_value', 'metric']
)
comb_effects = comb_effects.reset_index(name='value')

# Ordered categoricals fix the row and column order of the final table.
comb_effects['method'] = pd.Categorical(comb_effects['method'], categories=methods)
comb_effects['metric'] = pd.Categorical(comb_effects['metric'], categories=alpha_metrics)
comb_effects['parameter'] = pd.Categorical(comb_effects['parameter'],
                                           categories=['d', 'p', 'm', 'R2'])

comb_effects = comb_effects.sort_values(
    ['method', 'metric', 'parameter', 'parameter_value'],
    ascending=True,
)

# Wide layout: one row per (metric, method), one column per
# (parameter, statistic). `observed=False` spells out the behaviour the
# grouping on categorical keys has relied on so far, instead of depending on
# a default that pandas has deprecated.
comb_effects = comb_effects.pivot_table(
    index=['metric', 'method'],
    columns=['parameter', 'parameter_value'],
    values='value',
    observed=False,
)

# Rounded copy for presentation; the p-value column is left unrounded.
comb_effects_tidy = comb_effects.copy()
comb_effects_tidy['d'] = comb_effects['d'].round(2)
comb_effects_tidy['m'] = comb_effects['m'].round(3)
comb_effects_tidy['R2'] = comb_effects['R2'].round(3)

In [14]:
# Show the rounded summary table as the cell output.
comb_effects_tidy

Unnamed: 0_level_0,parameter,d,d,p,m,m,R2
Unnamed: 0_level_1,parameter_value,mean,std,max,mean,std,mean
metric,method,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
faith_pd,reference,3.92,0.04,5.965195e-21,1.0,0.002,0.995
faith_pd,otus,4.13,0.07,1.101071e-21,1.309,0.003,0.993
faith_pd,asvs,4.23,0.15,1.172583e-21,3.523,0.007,0.992
faith_pd,sidle,4.19,0.11,1.674747e-21,2.91,0.009,0.992
observed_features,reference,5.03,0.05,2.187409e-24,1.0,0.002,0.996
observed_features,otus,5.02,0.1,7.806082e-24,1.337,0.003,0.995
observed_features,asvs,4.89,0.12,4.111537e-24,1.869,0.006,0.995
observed_features,sidle,4.84,0.11,1.3165470000000001e-23,0.998,0.003,0.995
shannon,reference,4.4,0.04,5.232261999999999e-19,1.0,0.0,0.999
shannon,otus,4.21,0.04,1.035065e-18,1.102,0.0,0.997


In [15]:
# Write the rounded summary table as a tab-separated file. The destination
# folder is created first so the export also works when the output tree does
# not exist yet (an existing folder is left untouched).
os.makedirs('data/output/tables', exist_ok=True)
comb_effects_tidy.to_csv('data/output/tables/table-1-alpha.tsv', sep='\t')