#### Import relevant packages

In [5]:
#!/usr/bin/env python
# coding: utf-8

# Import NumPy and Pandas
import numpy as np
import pandas as pd

#Import relevant libraries
from skbio.stats.distance import permanova, DistanceMatrix
from skbio.stats.ordination import pcoa
from skbio.stats import subsample_counts
from skbio.stats.composition import multiplicative_replacement, closure, clr
from skbio.stats.distance import pwmantel

from scipy.spatial import procrustes
from scipy.stats import spearmanr

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.metrics import pairwise_distances, balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA

import umap as um

import seaborn as sns

import matplotlib.pyplot as plt

from baycomp import two_on_single

from LANDMark import LANDMarkClassifier
from LANDMark import PyEmbTest
from LANDMark import TreeOrdination

from deicode.preprocessing import rclr
from deicode.matrix_completion import MatrixCompletion

from numpy.random import RandomState

from umap import UMAP

from numpy import __version__
print("numpy", __version__)

from pandas import __version__
print("pandas", __version__)

from skbio import __version__
print("skbio", __version__)

from scipy import __version__
print("scipy", __version__)

from sklearn import __version__
print("sklearn", __version__)

from umap import __version__
print("umap", __version__)

from seaborn import __version__
print("seaborn", __version__)

from matplotlib import __version__
print("matplotlib", __version__)

try:
    from baycomp import __version__
    print("baycomp", __version__)
except:
    pass

from deicode import __version__
print("deicode", __version__)

numpy 1.22.4
pandas 1.4.3
skbio 0.5.7
scipy 1.8.1
sklearn 1.0.2
umap 0.5.3
seaborn 0.11.2
matplotlib 3.5.2
deicode 0.2.4


#### Functions for randomization and sub-sampling

In [6]:
#Function for rarefaction
#https://stackoverflow.com/questions/15507993/quickly-rarefy-a-matrix-in-numpy-python
def rarefaction(M, N, y1, y2, seed=0):
    prng = RandomState(seed) # reproducible results
    noccur = np.sum(M, axis=1) # number of occurrences for each sample
    nvar = M.shape[1] # number of variables
    depth = int(np.percentile(noccur, float(N))) # sampling depth

    rem = np.where(noccur < depth, False, True)
    M_ss = M[rem]
    noccur = noccur[rem]
    
    Mrarefied = np.empty_like(M_ss)
    for i in range(M_ss.shape[0]): # for each sample
        p = M_ss[i] / float(noccur[i]) # relative frequency / probability
        choice = prng.choice(nvar, depth, p=p)
        Mrarefied[i] = np.bincount(choice, minlength=nvar)

    return Mrarefied, y1[rem], y2[rem], rem

#Function for creating random data for use in unsupervised learning
def addcl2(X, y):
    
    X_perm = np.copy(X, "C").transpose()
    for col in range(X_perm.shape[0]):
        X_perm[col] = resample(X_perm[col], replace = False, n_samples = X_perm.shape[1])
        
    y_new = ["Original" for _ in range(X.shape[0])]
    y_new.extend(["Randomized" for _ in range(X.shape[0])])
    y_new = np.asarray(y_new)
    
    X_new = np.vstack((X, X_perm.transpose()))
            
    return X_new, y_new

### Code needed to create the synthetic data.

#### Code from: https://github.com/cameronmartino/deicode-benchmarking/blob/master/simulations/scripts/simulations.py

Reference:

Martino C, Morton JT, Marotz CA, Thompson LR, Tripathi A, Knight R, et al. A Novel Sparse Compositional Technique Reveals Microbial Perturbations. mSystems. 2019 Feb;4(1). 


In [7]:
# Code from: https://github.com/cameronmartino/deicode-benchmarking/blob/master/simulations/scripts/simulations.py
#Martino C, Morton JT, Marotz CA, Thompson LR, Tripathi A, Knight R, et al. A Novel Sparse Compositional Technique Reveals Microbial Perturbations. mSystems. 2019 Feb;4(1). 

from __future__ import division
# utils
import pandas as pd
import numpy as np
from collections import Counter
# blocks
from scipy.stats import norm
from numpy.random import poisson, lognormal
from skbio.stats.composition import closure
from scipy.special import kl_div
from scipy.stats import entropy
# minimize model perams
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
# Set random state
rand = np.random.RandomState(42)

def Homoscedastic(X_noise, intensity):
    """ uniform normally dist. noise """
    X_noise = np.array(X_noise)
    err = intensity * np.ones_like(X_noise.copy())
    X_noise = rand.normal(X_noise.copy(), err)

    return X_noise


def Heteroscedastic(X_noise, intensity):
    """ non-uniform normally dist. noise """
    err = intensity * np.ones_like(X_noise)
    i = rand.randint(0, err.shape[0], 5000)
    j = rand.randint(0, err.shape[1], 5000)
    err[i, j] = intensity
    X_noise = abs(rand.normal(X_noise, err))

    return X_noise


def Subsample(X_noise, spar, num_samples):
    """ yij ~ PLN( lambda_{ij}, /phi ) """
    # subsample
    mu = spar * closure(X_noise.T).T
    X_noise = np.vstack([poisson(lognormal(np.log(mu[:, i]), 1))
                         for i in range(num_samples)]).T
    # add sparsity

    return X_noise


def block_diagonal_gaus(
        ncols,
        nrows,
        nblocks,
        overlap=0,
        minval=0,
        maxval=1.0):
    """
    Generate block diagonal with Gaussian distributed values within blocks.

    Parameters
    ----------

    ncol : int
        Number of columns

    nrows : int
        Number of rows

    nblocks : int
        Number of blocks, mucst be greater than one

    overlap : int
        The Number of overlapping columns (Default = 0)

    minval : int
        The min value output of the table (Default = 0)

    maxval : int
        The max value output of the table (Default = 1)


    Returns
    -------
    np.array
        Table with a block diagonal where the rows represent samples
        and the columns represent features.  The values within the blocks
        are gaussian distributed between 0 and 1.
    Note
    ----
    The number of blocks specified by `nblocks` needs to be greater than 1.

    """

    if nblocks <= 1:
        raise ValueError('`nblocks` needs to be greater than 1.')
    mat = np.zeros((nrows, ncols))
    gradient = np.linspace(0, 10, nrows)
    mu = np.linspace(0, 10, ncols)
    sigma = 1
    xs = [norm.pdf(gradient, loc=mu[i], scale=sigma)
          for i in range(len(mu))]
    mat = np.vstack(xs).T

    block_cols = ncols // nblocks
    block_rows = nrows // nblocks
    for b in range(nblocks - 1):

        gradient = np.linspace(5, 5, block_rows)  # samples (bock_rows)
        # features (block_cols+overlap)
        mu = np.linspace(0, 10, block_cols + overlap)
        sigma = 2.0
        xs = [norm.pdf(gradient, loc=mu[i], scale=sigma)
              for i in range(len(mu))]

        B = np.vstack(xs).T * maxval
        lower_row = block_rows * b
        upper_row = min(block_rows * (b + 1), nrows)
        lower_col = block_cols * b
        upper_col = min(block_cols * (b + 1), ncols)

        if b == 0:
            mat[lower_row:upper_row, lower_col:int(upper_col + overlap)] = B
        else:
            ov_tmp = int(overlap / 2)
            if (B.shape) == (mat[lower_row:upper_row,
                             int(lower_col-ov_tmp):int(upper_col+ov_tmp+1)].shape):
                mat[lower_row:upper_row, int(
                    lower_col - ov_tmp):int(upper_col + ov_tmp + 1)] = B
            elif (B.shape) == (mat[lower_row:upper_row,
                               int(lower_col - ov_tmp):int(upper_col + ov_tmp)].shape):
                mat[lower_row:upper_row, int(
                    lower_col - ov_tmp):int(upper_col + ov_tmp)] = B
            elif (B.shape) == (mat[lower_row:upper_row, 
                               int(lower_col - ov_tmp):int(upper_col + ov_tmp - 1)].shape):
                mat[lower_row:upper_row, int(
                    lower_col - ov_tmp):int(upper_col +ov_tmp - 1)] = B

    upper_col = int(upper_col - overlap)
    # Make last block fill in the remainder
    gradient = np.linspace(5, 5, nrows - upper_row)
    mu = np.linspace(0, 10, ncols - upper_col)
    sigma = 4
    xs = [norm.pdf(gradient, loc=mu[i], scale=sigma)
          for i in range(len(mu))]
    B = np.vstack(xs).T * maxval

    mat[upper_row:, upper_col:] = B

    return mat


def build_block_model(
        rank,
        hoced,
        hsced,
        spar,
        C_,
        num_samples,
        num_features,
        overlap=0,
        mapping_on=True):
    """
    Generates hetero and homo scedastic noise on base truth block diagonal with Gaussian distributed values within blocks.

    Parameters
    ----------

    rank : int
        Number of blocks


    hoced : int
        Amount of homoscedastic noise

    hsced : int
        Amount of heteroscedastic noise

    inten : int
        Intensity of the noise

    spar : int
        Level of sparsity

    C_ : int
        Intensity of real values

    num_features : int
        Number of rows

    num_samples : int
        Number of columns

    overlap : int
        The Number of overlapping columns (Default = 0)

    mapping_on : bool
        if true will return pandas dataframe mock mapping file by block


    Returns
    -------
    Pandas Dataframes
    Table with a block diagonal where the rows represent samples
    and the columns represent features.  The values within the blocks
    are gaussian.

    Note
    ----
    The number of blocks specified by `nblocks` needs to be greater than 1.

    """

    # make a mock OTU table
    X_true = block_diagonal_gaus(
        num_samples,
        num_features,
        rank,
        overlap,
        minval=.01,
        maxval=C_)
    if mapping_on:
        # make a mock mapping data
        mappning_ = pd.DataFrame(np.array([['Cluster %s' %
                                            str(x)] *
                                           int(num_samples /
                                               rank) for x in range(1, rank +
                                                                    1)]).flatten(), columns=['example'], index=['sample_' +
                                                                                                                str(x) for x in range(0, num_samples -
                                                                                                                                      2)])

    X_noise = X_true.copy()
    X_noise = np.array(X_noise)
    # add Homoscedastic noise
    X_noise = Homoscedastic(X_noise, hoced)
    # add Heteroscedastic noise
    X_noise = Heteroscedastic(X_noise, hsced)
    # Induce low-density into the matrix
    X_noise = Subsample(X_noise, spar, num_samples)

    # return the base truth and noisy data
    if mapping_on:
        return X_true, X_noise, mappning_
    else:
        return X_true, X_noise


### Create negative and positive controls (DEICODE Code)
#### Code from: https://github.com/cameronmartino/deicode-benchmarking/blob/master/simulations/negative_control.ipynb

Martino C, Morton JT, Marotz CA, Thompson LR, Tripathi A, Knight R, et al. A Novel Sparse Compositional Technique Reveals Microbial Perturbations. mSystems. 2019 Feb;4(1). 


In [8]:
from sklearn.utils import shuffle

depth=2.5e3
overlap_=0
rank_=2
#run model with fit variables and new variants
_,X_signal=build_block_model(rank_,  depth/60, depth/60, 
                             depth, depth
                             ,200,1000,overlap=overlap_
                             ,mapping_on=False)



#run model with fit variables and new variants
_,X_signal=build_block_model(rank_,  depth/60, depth/60, 
                             depth, depth
                             ,200,1000,overlap=overlap_
                             ,mapping_on=False)
X_signal=pd.DataFrame(X_signal,
                      index=['OTU_'+str(x)
                                for x in range(X_signal.shape[0])],
                      columns=['sample_'+str(x) 
                               for x in range(X_signal.shape[1])])

#run model with fit variables and new variants
X_random=np.random.randint(0,np.mean(X_signal.values)*2.3,(1000,200))
X_random=pd.DataFrame(X_random,
                      index=['OTU_'+str(x)
                                for x in range(X_random.shape[0])],
                      columns=['sample_'+str(x) 
                               for x in range(X_random.shape[1])])
X_random.index = shuffle(X_random).index
X_random.columns = shuffle(X_random.T).index
X_random=X_random.T
X_random.sort_index(inplace=True)
X_random=X_random.T
X_random.sort_index(inplace=True)


#meta on cluster
meta = np.array([1]*int(X_signal.shape[1]/2)+[2]*int(X_signal.shape[1]/2)).T
meta = pd.DataFrame(meta,index=X_signal.columns,columns=['group'])

print('X_random mean %.2f seq/sample'%X_random.sum(axis=0).mean())
print('X_signal mean %.2f seq/sample'%X_signal.sum(axis=0).mean())

#RPCA on random
X_random = X_random.transpose()

#RPCA on very clear signal
X_signal = X_signal.transpose()

#Feature names
cluster_names = np.asarray(["ASV_%s" %str(i) for i in range(1000)])


X_random mean 4007.46 seq/sample
X_signal mean 4102.82 seq/sample


### Testing generalization performance of TreeOrdination using 3 x 5 SCV

In [None]:
X_orig = X_random.values
y_sel = meta["group"].values
y_subj_sel = meta["group"].values

splitter = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 0)

scores = []
p_stat = []

for method in ["PA", "CLR", "rCLR"]:
    counter = 0
    if method == "PA":
        metric = "jaccard"
                
    else:
        metric = "euclidean"
    
    print("Method:", method)
    
    #Remove all features with zeros in all samples
    X_transform = X_orig.transpose()
    all_zero = X_transform.sum(axis = 1)
    all_zero = np.where(all_zero > 0, True, False)
    
    X_red = X_orig
    X_red = X_red[:, all_zero]
    
    if method == "PA":         
        X_rare, y_out, y_subj_ss, _ = rarefaction(X_red, 15, y_sel, y_subj_sel, seed=0)
        
        X_trf = np.where(X_rare > 0, 1, 0)
                       
        M = pcoa(DistanceMatrix(pairwise_distances(X_trf, metric = "jaccard").astype(np.float32)), 
                 number_of_dimensions = 2)
        
        X_proj = M.samples.values
                    
    if method == "CLR":
        X_trf = clr(multiplicative_replacement(closure(X_red)))

        y_subj_ss = y_subj_sel
        y_out = y_sel
        
        M = pcoa(DistanceMatrix(pairwise_distances(X_trf, metric = "euclidean").astype(np.float32)), 
                 number_of_dimensions = 2)
        
        X_proj = M.samples.values             
        
    if method == "rCLR":
        y_subj_ss = y_subj_sel
        y_out = y_sel
        
        A = rclr(X_red.transpose()).transpose()
        M = MatrixCompletion(2, max_iterations = 1000).fit(A)
        X_trf = M.solution
        X_proj = M.U
        
    feature_names = cluster_names[all_zero]
    cluster_names = cluster_names[all_zero]
    
    for train, test in splitter.split(X_trf, y_out):
        counter += 1
        print(counter)

        X_training = X_trf[train]
        X_testing = X_trf[test]
        
        X_train_proj = X_proj[train]
        X_test_proj = X_proj[test]
        
        y_train = y_out[train]
        y_test = y_out[test]
        
        y_train_ss = y_subj_ss[train]
        y_test_ss = y_subj_ss[test]
                
        #TreeOrdination
        clf = TreeOrdination(feature_names = cluster_names, metric = metric, n_jobs = 20).fit(X_training, y_train)    
        class_label = clf.predict(X_testing)
        s = balanced_accuracy_score(y_test, class_label)
        print("TreeOrdination (Default):", s)
        scores.append((method, "TreeOrdination (Default)", s))

        clf_knn = KNeighborsClassifier(weights = "distance", metric = metric).fit(clf.R_final, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(clf.transform(X_testing)))
        print("TreeOrdination (5-KNN):", s)
        scores.append((method, "TreeOrdination (5-KNN)", s))

        clf_knn = KNeighborsClassifier(n_neighbors = 3, weights = "distance", metric = metric).fit(clf.R_final, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(clf.transform(X_testing)))
        print("TreeOrdination (3-KNN):", s)
        scores.append((method, "TreeOrdination (3-KNN)", s))

        clf_lm = LANDMarkClassifier(128, n_jobs = 10, use_nnet = False).fit(clf.R_final, y_train)
        s = balanced_accuracy_score(y_test, clf_lm.predict(clf.transform(X_testing)))
        print("TreeOrdination (LANDMark):", s)
        scores.append((method, "TreeOrdination (LANDMark)", s))

        clf_dm = DummyClassifier(strategy = "stratified").fit(clf.R_final, y_train)
        s = balanced_accuracy_score(y_test, clf_dm.predict(clf.transform(X_testing)))
        print("TreeOrdination (Random):", s)
        scores.append((method, "TreeOrdination (Random)", s))

        #Raw and Projected
        clf_knn = KNeighborsClassifier(weights = "distance", metric = metric).fit(X_training, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(X_testing))
        print("5-KNN (Raw):", s)
        scores.append((method, "5-KNN (Raw)", s))
        
        clf_knn = KNeighborsClassifier(n_neighbors = 3, weights = "distance", metric = metric).fit(X_training, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(X_testing))
        print("3-KNN (Raw):", s)
        scores.append((method, "3-KNN (Raw)", s))

        clf_et = ExtraTreesClassifier(128).fit(X_training, y_train)
        s = balanced_accuracy_score(y_test, clf_et.predict(X_testing))
        print("Extra Trees (Raw):", s)
        scores.append((method, "Extra Trees (Raw)", s))

        clf_lm = LANDMarkClassifier(128, n_jobs = 10, use_nnet = False).fit(X_training, y_train)
        s = balanced_accuracy_score(y_test, clf_lm.predict(X_testing))
        print("LANDMark (Raw):", s)
        scores.append((method, "LANDMark (Raw)", s))

        clf_dm = DummyClassifier(strategy = "stratified").fit(X_training, y_train)
        s = balanced_accuracy_score(y_test, clf_dm.predict(X_testing))
        print("Random (Raw)):", s)
        scores.append((method, "Random (Raw)", s))

        clf_knn = KNeighborsClassifier(weights = "distance", metric = "euclidean").fit(X_train_proj, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(X_test_proj))
        print("KNN (Projection):", s)
        scores.append((method, "5-KNN (Projection)", s))

        clf_knn = KNeighborsClassifier(n_neighbors = 3, weights = "distance", metric = "euclidean").fit(X_train_proj, y_train)
        s = balanced_accuracy_score(y_test, clf_knn.predict(X_test_proj))
        print("3-KNN (Projection):", s)
        scores.append((method, "3-KNN (Projection)", s))

        clf_et = ExtraTreesClassifier(128).fit(X_train_proj, y_train)
        s = balanced_accuracy_score(y_test, clf_et.predict(X_test_proj))
        print("Extra Trees (Projection):", s)
        scores.append((method, "Extra Trees (Projection)", s))

        clf_lm = LANDMarkClassifier(128, n_jobs = 10, use_nnet = False).fit(X_train_proj, y_train)
        s = balanced_accuracy_score(y_test, clf_lm.predict(X_test_proj))
        print("LANDMark (Projection):", s)
        scores.append((method, "LANDMark (Projection)", s))

        clf_dm = DummyClassifier(strategy = "stratified").fit(X_train_proj, y_train)
        s = balanced_accuracy_score(y_test, clf_dm.predict(X_test_proj))
        print("Random (Projection)):", s)
        scores.append((method, "Random (Projection)", s))

        #Get statistics for TreeOrdination - Tree Proximity
        pmanova = permanova(DistanceMatrix(pairwise_distances(clf.R_final, 
                                                              metric = "jaccard").astype(np.float32)), 
                            y_train)

        pseudo_f, pval = pmanova.values[4:6]
        R2 = 1 - 1 / (1 + pmanova.values[4] * pmanova.values[4] / (pmanova.values[2] - pmanova.values[3] - 1))
        print("TreeOrdination (Tree Proximity)", pseudo_f, pval, R2)
        p_stat.append((method, "TreeOrdination (Tree Proximity)", pseudo_f, pval, R2))

        #Get statistics for TreeOrdination - Tree Proximity Embedding in Euclidean Space
        pmanova = permanova(DistanceMatrix(pairwise_distances(clf.R_PCA_emb, 
                                                                  metric = "euclidean").astype(np.float32)), 
                            y_train)

        pseudo_f, pval = pmanova.values[4:6]
        R2 = 1 - 1 / (1 + pmanova.values[4] * pmanova.values[4] / (pmanova.values[2] - pmanova.values[3] - 1))
        print("TreeOrdination", pseudo_f, pval, R2)
        p_stat.append((method, "TreeOrdination (Embedding)", pseudo_f, pval, R2))

        #Get statistics for Transformation
        pmanova = permanova(DistanceMatrix(pairwise_distances(X_train_proj, 
                                                              metric = "euclidean").astype(np.float32)), 
                            y_train)

        pseudo_f, pval = pmanova.values[4:6]
        R2 = 1 - 1 / (1 + pmanova.values[4] * pmanova.values[4] / (pmanova.values[2] - pmanova.values[3] - 1))
        print("%s (Projection)" %method, pseudo_f, pval, R2)
        p_stat.append((method, "Projection", pseudo_f, pval, R2))
        
        #Get statistics for Raw
        pmanova = permanova(DistanceMatrix(pairwise_distances(X_training, 
                                                              metric = metric).astype(np.float32)), 
                            y_train)

        pseudo_f, pval = pmanova.values[4:6]
        R2 = 1 - 1 / (1 + pmanova.values[4] * pmanova.values[4] / (pmanova.values[2] - pmanova.values[3] - 1))
        print("%s (Raw)" %method, pseudo_f, pval, R2)
        p_stat.append((method, "Raw", pseudo_f, pval, R2))

#Save CSVs
scores = pd.DataFrame(scores, columns = ["Transform", "Model", "Balanced Accuracy Score"])
scores.to_csv("Results/BAS.csv")
    
p_stat = pd.DataFrame(p_stat, columns = ["Transform", "Method", "F-Statistic", "P-value", "R2"])
p_stat.to_csv("Results/PerMANOVA.csv")
    



Method: PA
1


Data was converted to boolean for metric jaccard
The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.0037210455629974604 and the largest is 0.11983256787061691.


### Plot Graphs of Balanced Accuracy Score, F-Statistics, P-values

In [38]:
#Plot Graphs - Balanced Accuracy
df_perm = pd.read_csv("Results/BAS.csv")

g = sns.catplot(x = "Model", y = "Balanced Accuracy Score", 
            hue = "Model", 
            col = "Transform", 
            data = df_perm, 
            kind = "boxen",
            dodge = False)

for axes in g.axes.flat:
    _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=90, fontsize = 8)

plt.tight_layout()
    
plt.savefig("Results/BAS.svg")
plt.close()

#Plot Graphs - F Statistic
df_perm = pd.read_csv("Results/PerMANOVA.csv")

g = sns.catplot(x = "Method", y = "F-Statistic", 
            hue = "Method", 
            col = "Transform", 
            data = df_perm, 
            kind = "boxen",
            dodge = False)

for axes in g.axes.flat:
    _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=90, fontsize = 8)

plt.tight_layout()
    
plt.savefig("Results/PerMANOVA_F_pc.svg")
plt.close()

#Plot Graphs - F Statistic
df_perm = pd.read_csv("Results/PerMANOVA.csv")

g = sns.catplot(x = "Method", y = "P-value", 
            hue = "Method", 
            col = "Transform", 
            data = df_perm, 
            kind = "boxen",
            dodge = False)

for axes in g.axes.flat:
    _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=90, fontsize = 8)

plt.tight_layout()
    
plt.savefig("Results/PerMANOVA_P_pc.svg")
plt.close()

### Bayesian T-Tests of BAS and F-statistics

In [36]:
#Bayesian T-tests
df_perm = pd.read_csv("Results/BAS.csv")

#Balanced accuracy data
transform_types = ["rCLR", "PA", "CLR"]
model_types = list(set(df_perm["Model"]))

for transform in transform_types:
    data = np.where(df_perm["Transform"] == transform, True, False)
    data = df_perm[data]
        
    final_df = []
        
    for i in range(0, len(model_types) - 1):
        for j in range(i + 1, len(model_types)):
            model_a = model_types[i]
            model_b = model_types[j]
            
            data_a = np.where(data["Model"] == model_a, True, False)
            data_a = data[data_a]["Balanced Accuracy Score"].values
            mu_a = np.mean(data_a)
            std_a = np.std(data_a, ddof = 1)
            
            data_b = np.where(data["Model"] == model_b, True, False)
            data_b = data[data_b]["Balanced Accuracy Score"].values
            mu_b = np.mean(data_b)
            std_b = np.std(data_b, ddof = 1)
            
            p_left, p_rope, p_right = two_on_single(data_a, data_b, rope = 0.05, runs = 3)
            print(transform, "%s-%s" %(model_a, model_b), mu_a, std_a, mu_b, std_b, p_left, p_rope, p_right)
            
            final_df.append((transform, model_a, model_b, mu_a, std_a, mu_b, std_b, p_left, p_rope, p_right))
            
    final_df = pd.DataFrame(final_df, index = None, columns = ["Transform", "Comparison A", "Comparison B", "Mean A", "Std Dev A", 
                                                               "Mean B", "Std Dev B", "Left", "ROPE", "Right"])
    final_df.to_csv("Results/%s_ttest_bas.csv" %transform)

#F-Statistics
df_perm = pd.read_csv("Results/PerMANOVA.csv")

transform_types = ["rCLR", "PA", "CLR"]
model_types = list(set(df_perm["Method"]))

for transform in transform_types:
    data = np.where(df_perm["Transform"] == transform, True, False)
    data = df_perm[data]
        
    final_df = []
        
    for i in range(0, len(model_types) - 1):
        for j in range(i + 1, len(model_types)):
            model_a = model_types[i]
            model_b = model_types[j]
            
            data_a = np.where(data["Method"] == model_a, True, False)
            data_a = data[data_a]["F-Statistic"].values
            mu_a = np.mean(data_a)
            std_a = np.std(data_a, ddof = 1)
            
            data_b = np.where(data["Method"] == model_b, True, False)
            data_b = data[data_b]["F-Statistic"].values
            mu_b = np.mean(data_b)
            std_b = np.std(data_b, ddof = 1)
            
            p_left, p_rope, p_right = two_on_single(data_a, data_b, rope = 0.05, runs = 3)
            print(transform, "%s-%s" %(model_a, model_b), p_left, p_rope, p_right)
            
            final_df.append((transform, model_a, model_b, mu_a, std_a, mu_b, std_b, p_left, p_rope, p_right))
            
    final_df = pd.DataFrame(final_df, index = None, columns = ["Transform", "Comparison A", "Comparison B", "Mean A", "Std Dev A", 
                                                               "Mean B", "Std Dev B", "Left", "ROPE", "Right"])
    final_df.to_csv("Results/%s_ttest_permanova.csv" %transform)


rCLR TreeOrdination (LANDMark)-3-KNN (Raw) 0.4600000000000001 0.09368979548370131 0.4875 0.060380736442455986 0.05885003901958352 0.6273698931008643 0.31378006787955215
rCLR TreeOrdination (LANDMark)-Random (Raw) 0.4600000000000001 0.09368979548370131 0.51 0.11130538571376002 0.08166041314147723 0.4183395868585228 0.5
rCLR TreeOrdination (LANDMark)-TreeOrdination (Random) 0.4600000000000001 0.09368979548370131 0.53 0.08399735445569434 0.010797376814516818 0.3165829214484719 0.6726197017370112
rCLR TreeOrdination (LANDMark)-TreeOrdination (Default) 0.4600000000000001 0.09368979548370131 0.4925 0.05533985905294664 0.015007997208642742 0.6857869287794345 0.29920507401192276
rCLR TreeOrdination (LANDMark)-Random (Projection) 0.4600000000000001 0.09368979548370131 0.47250000000000003 0.07587453093393358 0.13478788423334642 0.6161543314760273 0.2490577842906263
rCLR TreeOrdination (LANDMark)-Extra Trees (Projection) 0.4600000000000001 0.09368979548370131 0.475 0.07817359599705713 0.138067091