This notebook is apply balances with DS-FDR to rarefied DME dataset

In [57]:
import pandas as pd
import numpy as np
import scipy as sp
from skbio.stats import subsample_counts
from biom import load_table
from gneiss.cluster import correlation_linkage
from gneiss.util import rename_internal_nodes, match
from gneiss.balances import balance_basis
from skbio.stats.composition import ilr
import skbio
import qiime2

import dsfdr
import statistics
import transform

%matplotlib notebook
import matplotlib.pyplot as plt
import pylab

In [58]:
def convert_biom_to_pandas(table):
    otu_table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                             index=table.ids(axis='sample'),
                             columns=table.ids(axis='observation'))
    return otu_table

In [88]:
# read in biom table
table = load_table('../data/dme.biom')
otu_table = convert_biom_to_pandas(table)

In [89]:
otu_table.shape

(80, 3774)

In [90]:
# rarefy
otu_table2 = np.array(otu_table).astype(int)
otu_sum = np.sum(otu_table2, axis=1)
sp.stats.describe(otu_sum)

DescribeResult(nobs=80, minmax=(588, 3845), mean=1913.5374999999999, variance=182540.55553797467, skewness=1.2306648629128023, kurtosis=4.988620859863893)

In [131]:
labels = np.array(otu_sum >= 1500) # minmax = (1k, 10k)
otu_table3 = otu_table2[labels==True, :]
otu_table3.shape

(75, 3774)

In [132]:
otu_table_r = np.zeros(np.shape(otu_table3))
for i in range(np.shape(otu_table3)[0]):
    col_r = subsample_counts(otu_table3[i, :], 1500)
    otu_table_r[i, :] = col_r

In [133]:
# generate the tree
otu_table_r = pd.DataFrame(otu_table_r)
otu_tree = correlation_linkage(otu_table_r + 1)

In [134]:
basis, _ = balance_basis(otu_tree)
balances = ilr(otu_table_r + 1, basis)

In [135]:
balance_ids = [n.name for n in otu_tree.levelorder() if not n.is_tip()]
balances_df = pd.DataFrame(balances, index=otu_table.index[labels], columns=balance_ids)

In [136]:
# read in mapping file
mapping = pd.read_table("../data/dme.map.txt", sep='\t', header=0, index_col=0)

In [137]:
# choose intersted categories to compare with
mapping2 = mapping.loc[mapping['misc_param'].isin(['cesarean delivery','vaginal delivery'])]

In [138]:
# magic function (match SampleID in order and delete unmatched ones)
mapping2, balances_df = match(mapping2, balances_df)

In [139]:
# fix floating point error in balances output
balances_df = np.array(balances_df)
for col in range(np.shape(balances_df)[1]):
    for row in range(np.shape(balances_df)[0]):
        close = np.isclose(balances_df[row, col], balances_df[:, col])
        balances_df[close, col] = balances_df[row, col] 

In [140]:
balances_dme = np.transpose(balances_df)
labels_dme = np.array((mapping2['misc_param'] == 'cesarean delivery').astype(int))

In [141]:
balances_dme.shape

(3773, 42)

In [142]:
labels_dme.shape

(42,)

In [161]:
# apply FDR methods to balances
results1 = dsfdr.dsfdr(data = balances_dme, labels = labels_dme, fdr_method='dsfdr')
results2 = dsfdr.dsfdr(data = balances_dme, labels = labels_dme, fdr_method='bhfdr')

In [162]:
# compare results of DS-FDR and BH method
ds = np.sum(results1[0])
bh = np.sum(results2[0])
print(ds, bh, ds-bh, (ds-bh)/bh*100)

913 773 140 18.1112548512
