In [1]:
import pandas as pd
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data
import os
import numpy as np
from scipy.sparse import coo_matrix
import random
from statsmodels.stats.proportion import binom_test


def parse_genome(df):
    genome_id = df['#query'][0].split('_')[0]
    keggs = df['KEGG_ko'].replace('-', None).dropna()
    keggs = list(map(lambda x: x.split(','), keggs.values))
    keggs = sum(keggs, [])
    keggs = pd.DataFrame({'KEGG_ko': keggs})
    keggs['genome_id'] = genome_id
    return keggs


def to_sparse_matrix(func_df, genome_id='genome_id', kegg_id='KEGG_ko'):
    # create genome-specific index
    ogus = list(set(func_df[genome_id]))
    ogu_lookup = pd.Series(np.arange(0, len(ogus)), ogus)
    # create KEGG-specific index
    keggs = list(set(func_df[kegg_id]))
    kegg_lookup = pd.Series(np.arange(0, len(keggs)), keggs)
    # rename names as numbers
    ogu_id = func_df[genome_id].apply(lambda x: ogu_lookup.loc[x]).astype(np.int64)
    kegg_id = func_df[kegg_id].apply(lambda x: kegg_lookup.loc[x]).astype(np.int64)
    # assign the presence / absence of a gene
    func_df['count'] = 1
    c = func_df['count'].values
    # format into a matrix
    data = coo_matrix((c, (ogu_id, kegg_id)))
    ko_ogu = pd.DataFrame(data.todense(), index=ogus, columns=keggs)
    return ko_ogu

def btest(pa1, pa2, seed=0, return_proportions=False):
    """ Performs genome wide binomial test between two groups of taxa
    Parameters
    ----------
    df1 : pd.DataFrame
        Rows are taxa, columns are genes
    df2 : pd.DataFrame
        Rows are taxa, columns are genes
    Returns
    -------
    pd.Series : list of genes associated with df1
    pd.Series : list of genes associated with df2
    """
    np.random.seed(seed)
    random.seed(seed)
    #pa1 = df1 > 0
    #pa2 = df2 > 0
    idx = list(set(pa1.columns) | set(pa2.columns))
    idx.sort()
    pa1 = pa1.sum(axis=0).reindex(idx).fillna(0)
    pa2 = pa2.sum(axis=0).reindex(idx).fillna(0)
    n = pa1 + pa2
    obs = list(zip(list(pa1.values), list((pa2.values + 1) / (pa2 + 1).sum()), list(n.values)))
    pvals = pd.Series([binom_test(a, n, b, 'two-sided') for (a, b, n) in obs],
                      index=n.index)
    if return_proportions:
        res = pd.DataFrame({'groupA': pa1, 'groupB': pa2, 'pval': pvals})
        def relabel_f(x):
            if x['groupA'] < x['groupB']:
                return 'groupB'
            else:
                return 'groupA'
        res['side'] = res.apply(relabel_f, axis=1)
        return res

    return pvals

def _naive_mean_permutation_test(mat,cats,permutations=1000):
    """
    mat: numpy 2-d matrix
         columns: features (e.g. OTUs)
         rows: samples
         matrix of features
    cats: numpy array
         Array of categories to run group signficance on
    Note: only works on binary classes now
    Returns
    =======
    test_stats:
        List of mean test statistics
    pvalues:
        List of corrected p-values
    This module will conduct a mean permutation test using
    the naive approach
    """
    def _mean_test(values,cats):
        #calculates mean for binary categories
        return abs(values[cats==0].mean()-values[cats==1].mean())

    rows,cols = mat.shape
    pvalues = np.zeros(rows)
    test_stats = np.zeros(rows)
    for r in range(rows):
        values = mat[r,:].transpose()
        test_stat = _mean_test(values,cats)
        perm_stats = np.empty(permutations, dtype=np.float64)
        for i in range(permutations):
            perm_cats = np.random.permutation(cats)
            perm_stats[i] = _mean_test(values,perm_cats)
        p_value = ((perm_stats >= test_stat).sum() + 1.) / (permutations + 1.)
        pvalues[r] = p_value
        test_stats[r] = test_stat
    #_,pvalues,_,_ = multipletests(pvalues)
    return test_stats, pvalues

In [16]:
import pydeseq2

def pydeseq2_permutation_test(counts, group_labels, num_permutations=10):
    """Permutation test for differential expression using PyDESeq2.

    Parameters
    ----------
    counts : microbial count matrix, (n_samples, m_microbes).
    group_labels : array_like
        The group labels of shape (n_samples,).
    num_permutations : int, optional (default=1000)
        The number of permutations to perform.

    Returns
    -------
    p_values : array_like
        The p-values of the permutation test for each microbe.
    """
    # Convert counts and group labels to PyDESeq2 input format
    dds = DeseqDataSet(
    counts,
    group_labels,
    design_factors="disease",  # compare samples based on the "disease"
    # column ("B" vs "A")
    refit_cooks=True,
    n_cpus=8,
)
    dds.deseq2()
    stat_res = DeseqStats(dds, n_cpus=8)
    # Get the test statistics for each gene
    res = stat_res.summary()
    
    # Initialize the null distribution of the test statistic
    null_distribution = np.zeros((num_permutations, counts.shape[1]))

    # Perform the permutations
    for i in range(num_permutations):
        # Shuffle the group labels
#         permuted_labels = np.random.permutation(group_labels)
#         permuted_labels = pd.DataFrame(permuted_labels)
        permuted_labels = np.random.permutation(group_labels['disease'])
        permuted_labels = pd.DataFrame(permuted_labels, index=group_labels.index)
        permuted_labels = pd.DataFrame(permuted_labels)
        permuted_labels.columns = ['disease']
        # Convert the permuted group labels to PyDESeq2 input format
        # Run the DESeq2 analysis with the permuted group labels
        counts.reindex(permuted_labels.index)
        permuted_dds = DeseqDataSet(
        counts, 
        permuted_labels, 
        design_factors="disease")
        permuted_dds.deseq2()
        permuted_stat_res = DeseqStats(permuted_dds, n_cpus=8)
        # Get the test statistics for each gene with the permuted group labels
        permuted_res = permuted_stat_res.summary()

        # Add the permuted test statistic to the null distribution
        null_distribution[i, :] = permuted_stat_res.results_df['log2FoldChange']

    # Calculate the p-values for each gene
    p_values = np.zeros(counts.shape[1])
    for i in range(counts.shape[1]):
        if stat_res.results_df['log2FoldChange'].values[i] > 0:
            p_values[i] = np.mean(null_distribution[:, i] > stat_res.results_df['log2FoldChange'].values[i])
        else:
            p_values[i] = np.mean(null_distribution[:, i] < stat_res.results_df['log2FoldChange'].values[i])

    return p_values


In [17]:
pydeseq2_permutation_test(table_20microbes,metadata_CD)

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.02 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 9 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.00 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,0.206622,0.243936,0.847032,0.396978,0.443681
28L sp000177555,37.402176,-4.204268,1.151062,-3.652513,0.00026,0.000705
43-108 sp001915545,259.797831,-2.243019,0.521027,-4.304998,1.7e-05,0.000159
51-20 sp001917175,117.079188,-1.204608,1.027269,-1.172632,0.240943,0.305195
51-20 sp900539605,0.0,0.0,0.0,0.0,,
51-20 sp900542055,3.016793,-2.255008,1.5748,-1.431932,0.152163,0.206552
51-20 sp900762565,26.256248,-3.484711,1.50375,-2.317348,0.020485,0.032434
AF33-28 sp003477885,1467.801782,-1.182807,0.317764,-3.722288,0.000197,0.000625
AM07-15 sp003477405,3153.225764,-1.741027,0.361086,-4.821638,1e-06,2.7e-05
AM51-8 sp003478275,1146.681384,-0.837528,0.306216,-2.735093,0.006236,0.010772


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 13 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.17 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,0.207396,0.244724,0.847469,0.3967337,0.607486
28L sp000177555,18.420105,1.153641,1.248978,0.923668,0.355659,0.607486
43-108 sp001915545,202.972852,-0.614233,0.530829,-1.157119,0.2472236,0.549386
51-20 sp001917175,265.370709,-5.036562,1.008061,-4.996288,5.84443e-07,1.2e-05
51-20 sp900539605,0.215148,-0.833399,2.976167,-0.280024,0.7794589,0.820483
51-20 sp900542055,2.878339,-1.555427,1.593057,-0.976379,0.3288768,0.607486
51-20 sp900762565,11.966473,-1.116193,1.586272,-0.703658,0.4816458,0.607486
AF33-28 sp003477885,1467.801782,0.244834,0.332655,0.735999,0.4617311,0.607486
AM07-15 sp003477405,2286.070983,0.915333,0.335729,2.726406,0.006402815,0.064028
AM51-8 sp003478275,1146.681384,-0.04004,0.314939,-0.127134,0.8988342,0.898834


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.37 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 11 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.37 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 0.17 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,0.584688,0.239713,2.439114,0.014723,0.073617
28L sp000177555,19.076579,2.139718,1.22676,1.744202,0.081124,0.191612
43-108 sp001915545,202.972852,0.341877,0.531721,0.642964,0.520248,0.592834
51-20 sp001917175,261.255283,4.374648,1.015705,4.307008,1.7e-05,0.000331
51-20 sp900539605,0.215148,2.041105,2.982835,0.684284,0.493796,0.592834
51-20 sp900542055,2.878339,1.258796,1.574511,0.799484,0.42401,0.592834
51-20 sp900762565,26.256248,3.859203,1.502487,2.568543,0.010213,0.068085
AF33-28 sp003477885,1236.134444,0.191233,0.307155,0.622595,0.533551,0.592834
AM07-15 sp003477405,2286.070983,-0.336051,0.342919,-0.979973,0.3271,0.545166
AM51-8 sp003478275,1276.787324,-0.701986,0.3277,-2.142162,0.03218,0.102083


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.34 seconds.

Refitting 11 outlier genes.

Fitting dispersions...
... done in 0.00 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.15 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.14118,0.245215,-0.575742,0.564789,0.664458
28L sp000177555,18.420105,-0.334492,1.256968,-0.26611,0.790154,0.790154
43-108 sp001915545,202.972852,1.001365,0.526925,1.900393,0.057382,0.18878
51-20 sp001917175,160.265862,-3.677112,1.01913,-3.608091,0.000308,0.003085
51-20 sp900539605,0.215148,-1.814116,2.979115,-0.608945,0.542561,0.664458
51-20 sp900542055,3.387255,-1.793661,1.540508,-1.164331,0.24429,0.402024
51-20 sp900762565,17.868174,-0.636577,1.57908,-0.403132,0.686851,0.763168
AF33-28 sp003477885,1467.801782,-0.262325,0.332609,-0.788691,0.430293,0.614704
AM07-15 sp003477405,2286.070983,0.229184,0.343552,0.667102,0.504707,0.664458
AM51-8 sp003478275,1146.681384,-0.374991,0.313392,-1.196556,0.23148,0.402024


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.37 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.39 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 9 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.17 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.222526,0.244213,-0.911198,0.362191,0.564656
28L sp000177555,26.37702,3.287507,1.192139,2.757653,0.005822,0.019406
43-108 sp001915545,202.972852,0.451559,0.531173,0.850118,0.395259,0.564656
51-20 sp001917175,93.489061,-0.950738,1.011794,-0.939656,0.347394,0.564656
51-20 sp900539605,0.141464,1.861375,2.990038,0.622525,0.533597,0.711462
51-20 sp900542055,3.387255,0.5937,1.532147,0.387495,0.69839,0.821635
51-20 sp900762565,16.608166,1.573384,1.596868,0.985293,0.32448,0.564656
AF33-28 sp003477885,1901.10231,1.424905,0.350825,4.061586,4.9e-05,0.000487
AM07-15 sp003477405,3153.225764,1.502579,0.367599,4.087548,4.4e-05,0.000487
AM51-8 sp003478275,1146.681384,0.064177,0.314684,0.203941,0.8384,0.8384


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 13 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.14 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.101403,0.245018,-0.413859,0.678978,0.75442
28L sp000177555,19.076579,-2.638781,1.212857,-2.175674,0.02958,0.204792
43-108 sp001915545,202.972852,-0.702446,0.529202,-1.32737,0.184386,0.460966
51-20 sp001917175,93.489061,0.557867,1.015531,0.549335,0.582775,0.685618
51-20 sp900539605,0.141464,-1.850959,2.987777,-0.61951,0.53558,0.678725
51-20 sp900542055,1.999911,-0.050041,1.65072,-0.030315,0.975816,0.975816
51-20 sp900762565,11.966473,-1.65043,1.562502,-1.056274,0.290843,0.598348
AF33-28 sp003477885,1467.801782,0.262122,0.332529,0.78827,0.430539,0.678725
AM07-15 sp003477405,2286.070983,0.257756,0.343472,0.750444,0.452987,0.678725
AM51-8 sp003478275,1146.681384,-0.677784,0.309526,-2.189751,0.028542,0.204792


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 11 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.37 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Running Wald tests...
... done in 0.10 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,0.024858,0.245079,0.10143,0.919209,0.919209
28L sp000177555,29.445261,3.166746,1.211275,2.614391,0.008939,0.035753
43-108 sp001915545,259.797831,1.6362,0.538,3.041266,0.002356,0.011779
51-20 sp001917175,93.489061,1.093592,1.009653,1.083136,0.278748,0.46458
51-20 sp900539605,0.141464,1.568497,2.991235,0.524364,0.600025,0.705912
51-20 sp900542055,2.508828,1.786648,1.540864,1.159511,0.246248,0.447724
51-20 sp900762565,11.966473,0.847623,1.578673,0.536921,0.591322,0.705912
AF33-28 sp003477885,1467.801782,0.228326,0.332664,0.686358,0.492487,0.65665
AM07-15 sp003477405,2286.070983,1.117156,0.331687,3.368108,0.000757,0.007569
AM51-8 sp003478275,1378.088431,1.243562,0.325904,3.815734,0.000136,0.002716


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 12 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.44 seconds.

Fitting LFCs...
... done in 0.43 seconds.

Running Wald tests...
... done in 0.16 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.329393,0.243401,-1.353295,0.175961,0.440128
28L sp000177555,19.076579,-2.143078,1.228274,-1.744788,0.081022,0.324087
43-108 sp001915545,202.972852,0.131246,0.532631,0.246411,0.805364,0.847752
51-20 sp001917175,93.489061,-0.656695,1.014128,-0.647546,0.517278,0.795813
51-20 sp900539605,0.141464,-1.395492,2.987842,-0.467057,0.640459,0.847752
51-20 sp900542055,3.387255,-0.452618,1.533416,-0.29517,0.767864,0.847752
51-20 sp900762565,16.608166,-2.288943,1.577754,-1.450761,0.146846,0.440128
AF33-28 sp003477885,1467.801782,0.447957,0.331081,1.353014,0.176051,0.440128
AM07-15 sp003477405,3153.225764,1.5628,0.366204,4.267564,2e-05,0.000198
AM51-8 sp003478275,1146.681384,-0.084292,0.314627,-0.26791,0.788769,0.847752


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.38 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Refitting 12 outlier genes.

Fitting dispersions...
... done in 0.00 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.35 seconds.

Running Wald tests...
... done in 0.10 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.206718,0.244365,-0.845938,0.397587,0.795175
28L sp000177555,26.37702,-1.259846,1.252184,-1.006119,0.314358,0.71648
43-108 sp001915545,259.797831,1.960655,0.530474,3.696042,0.000219,0.00438
51-20 sp001917175,210.460117,-2.573725,1.041999,-2.469987,0.013512,0.135118
51-20 sp900539605,0.215148,-1.686359,2.979747,-0.56594,0.571434,0.806634
51-20 sp900542055,2.508828,0.754538,1.576297,0.478678,0.632168,0.806634
51-20 sp900762565,11.966473,-0.536063,1.580295,-0.339217,0.734446,0.816052
AF33-28 sp003477885,1236.134444,0.13288,0.307494,0.43214,0.66564,0.806634
AM07-15 sp003477405,2286.070983,-0.237578,0.343558,-0.691522,0.489237,0.806634
AM51-8 sp003478275,1146.681384,-0.02082,0.314777,-0.066141,0.947265,0.947265


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.01 seconds.

Refitting 11 outlier genes.

Fitting dispersions...
... done in 0.00 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.16 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.78328,0.235837,-3.321278,0.000896,0.01691
28L sp000177555,26.37702,1.679776,1.248729,1.345189,0.178564,0.430172
43-108 sp001915545,162.447912,-0.12999,0.511952,-0.253911,0.799564,0.888405
51-20 sp001917175,117.079188,1.389633,1.027433,1.352529,0.176206,0.430172
51-20 sp900539605,0.141464,-1.054347,2.987337,-0.352939,0.724134,0.888405
51-20 sp900542055,2.508828,0.214396,1.608711,0.133272,0.893978,0.918455
51-20 sp900762565,17.868174,1.113992,1.569122,0.709946,0.477738,0.796229
AF33-28 sp003477885,1236.134444,0.092624,0.307809,0.300913,0.763481,0.888405
AM07-15 sp003477405,2286.070983,0.25937,0.343443,0.755206,0.450126,0.796229
AM51-8 sp003478275,1146.681384,-0.897738,0.305706,-2.936604,0.003318,0.01691


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.37 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.37 seconds.

Fitting LFCs...
... done in 0.02 seconds.

Refitting 11 outlier genes.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.46 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,-0.107656,0.245022,-0.439372,0.660392,0.960581
28L sp000177555,18.420105,0.016571,1.251029,0.013246,0.989432,0.989432
43-108 sp001915545,259.797831,-1.042248,0.548477,-1.90026,0.057399,0.430783
51-20 sp001917175,117.079188,1.076446,1.030785,1.044298,0.296348,0.74062
51-20 sp900539605,0.215148,2.040931,2.986253,0.683442,0.494327,0.960581
51-20 sp900542055,1.999911,0.715092,1.654577,0.43219,0.665603,0.960581
51-20 sp900762565,17.868174,0.189895,1.574539,0.120603,0.904005,0.989432
AF33-28 sp003477885,1236.134444,0.062647,0.307764,0.203555,0.838701,0.989432
AM07-15 sp003477405,2286.070983,0.431859,0.342256,1.261799,0.207021,0.690071
AM51-8 sp003478275,1146.681384,-0.023085,0.314878,-0.073315,0.941555,0.989432


array([0.2, 0. , 0. , 0.3, 0.6, 0. , 0. , 0. , 0. , 0.1, 0. , 0.7, 0.1,
       0. , 0.2, 0.1, 0.1, 0. , 0.6, 0. ])

In [3]:
#count table
table = pd.read_table('../../Combined_datasets/biom_to_txt/Franzosa2019IBD_CD.txt', sep = "\t", skiprows=1)

#table.set_index(table.iloc[0].values)
table = table.set_index ('#OTU ID')
metadata = pd.read_table('../../adonis/metadata_12dataset_1280_adonis_number_gender.txt', sep = "\t")
metadata_CD = metadata.loc[(metadata['Study'] == 'Franzosa2019IBD_CD')]
metadata_CD = metadata_CD.set_index ('featureid')
metadata_CD = metadata_CD.sort_index()
table_20microbes = table.head(20)
table_20microbes = table_20microbes.T
table_20microbes = table_20microbes.sort_index()
table_20microbes

#OTU ID,1XD42-69 sp014287635,28L sp000177555,43-108 sp001915545,51-20 sp001917175,51-20 sp900539605,51-20 sp900542055,51-20 sp900762565,AF33-28 sp003477885,AM07-15 sp003477405,AM51-8 sp003478275,AM51-8 sp900546435,AM51-8 sp900761925,Absicoccus porci,Absicoccus sp000434355,Acetatifactor sp003447295,Acetatifactor sp900066365,Acetatifactor sp900066565,Acetatifactor sp900554205,Acetatifactor sp900755865,Acetatifactor sp900760705
SRR6468502,108.0,26.0,0.0,0.0,0.0,0.0,0.0,798.0,444.0,281.0,148.0,66.0,0.0,114.0,564.0,876.0,366.0,151.0,493.0,284.0
SRR6468505,162.0,0.0,301.0,14.0,0.0,0.0,0.0,369.0,610.0,288.0,287.0,284.0,27.0,74.0,449.0,1172.0,3068.0,76.0,284.0,215.0
SRR6468507,286.0,0.0,0.0,12.0,0.0,0.0,0.0,369.0,804.0,921.0,280.0,42.0,17.0,36.0,114.0,2112.0,595.0,101.0,171.0,328.0
SRR6468514,369.0,0.0,10.0,0.0,0.0,0.0,0.0,508.0,607.0,414.0,776.0,144.0,10.0,159.0,1410.0,11471.0,4756.0,201.0,256.0,645.0
SRR6468517,219.0,10.0,14.0,0.0,0.0,0.0,0.0,127.0,340.0,252.0,222.0,56.0,0.0,63.0,252.0,634.0,306.0,89.0,241.0,406.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6468707,426.0,0.0,53.0,11.0,0.0,0.0,10.0,202.0,282.0,427.0,1810.0,58.0,0.0,61.0,471.0,1401.0,2056.0,121.0,31.0,624.0
SRR6468709,813.0,0.0,116.0,0.0,0.0,0.0,0.0,570.0,339.0,744.0,603.0,73.0,14.0,295.0,4646.0,12763.0,85821.0,223.0,235.0,428.0
SRR6468711,212.0,0.0,45.0,0.0,0.0,0.0,0.0,38.0,76.0,208.0,285.0,19.0,0.0,0.0,223.0,588.0,852.0,25.0,14.0,146.0
SRR6468712,94.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,2957.0,106.0,615.0,739.0,0.0,69.0,662.0,1491.0,9444.0,54.0,53.0,1390.0


In [4]:
#metadata_CD

table_20microbes.reindex(metadata_CD.index)

#OTU ID,1XD42-69 sp014287635,28L sp000177555,43-108 sp001915545,51-20 sp001917175,51-20 sp900539605,51-20 sp900542055,51-20 sp900762565,AF33-28 sp003477885,AM07-15 sp003477405,AM51-8 sp003478275,AM51-8 sp900546435,AM51-8 sp900761925,Absicoccus porci,Absicoccus sp000434355,Acetatifactor sp003447295,Acetatifactor sp900066365,Acetatifactor sp900066565,Acetatifactor sp900554205,Acetatifactor sp900755865,Acetatifactor sp900760705
featureid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SRR6468502,108.0,26.0,0.0,0.0,0.0,0.0,0.0,798.0,444.0,281.0,148.0,66.0,0.0,114.0,564.0,876.0,366.0,151.0,493.0,284.0
SRR6468505,162.0,0.0,301.0,14.0,0.0,0.0,0.0,369.0,610.0,288.0,287.0,284.0,27.0,74.0,449.0,1172.0,3068.0,76.0,284.0,215.0
SRR6468507,286.0,0.0,0.0,12.0,0.0,0.0,0.0,369.0,804.0,921.0,280.0,42.0,17.0,36.0,114.0,2112.0,595.0,101.0,171.0,328.0
SRR6468514,369.0,0.0,10.0,0.0,0.0,0.0,0.0,508.0,607.0,414.0,776.0,144.0,10.0,159.0,1410.0,11471.0,4756.0,201.0,256.0,645.0
SRR6468517,219.0,10.0,14.0,0.0,0.0,0.0,0.0,127.0,340.0,252.0,222.0,56.0,0.0,63.0,252.0,634.0,306.0,89.0,241.0,406.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6468707,426.0,0.0,53.0,11.0,0.0,0.0,10.0,202.0,282.0,427.0,1810.0,58.0,0.0,61.0,471.0,1401.0,2056.0,121.0,31.0,624.0
SRR6468709,813.0,0.0,116.0,0.0,0.0,0.0,0.0,570.0,339.0,744.0,603.0,73.0,14.0,295.0,4646.0,12763.0,85821.0,223.0,235.0,428.0
SRR6468711,212.0,0.0,45.0,0.0,0.0,0.0,0.0,38.0,76.0,208.0,285.0,19.0,0.0,0.0,223.0,588.0,852.0,25.0,14.0,146.0
SRR6468712,94.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,2957.0,106.0,615.0,739.0,0.0,69.0,662.0,1491.0,9444.0,54.0,53.0,1390.0


In [5]:
#deseq2
counts_df = table_20microbes

metadata_df = metadata_CD

dds = DeseqDataSet(
    counts_df,
    metadata_df,
    design_factors="disease",  # compare samples based on the "disease"
    # column ("B" vs "A")
    refit_cooks=True,
    n_cpus=8,
)
dds.deseq2()
stat_res = DeseqStats(dds, n_cpus=8)
stat_res.summary()

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.01 seconds.

Fitting dispersion trend curve...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Refitting 9 outlier genes.

Fitting dispersions...
... done in 0.00 seconds.

Fitting MAP dispersions...
... done in 0.00 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.00 seconds.

Log2 fold change & Wald test p-value: disease Healthy vs CD


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1XD42-69 sp014287635,1134.55089,0.206622,0.243936,0.847032,0.396978,0.443681
28L sp000177555,37.402176,-4.204268,1.151062,-3.652513,0.00026,0.000705
43-108 sp001915545,259.797831,-2.243019,0.521027,-4.304998,1.7e-05,0.000159
51-20 sp001917175,117.079188,-1.204608,1.027269,-1.172632,0.240943,0.305195
51-20 sp900539605,0.0,0.0,0.0,0.0,,
51-20 sp900542055,3.016793,-2.255008,1.5748,-1.431932,0.152163,0.206552
51-20 sp900762565,26.256248,-3.484711,1.50375,-2.317348,0.020485,0.032434
AF33-28 sp003477885,1467.801782,-1.182807,0.317764,-3.722288,0.000197,0.000625
AM07-15 sp003477405,3153.225764,-1.741027,0.361086,-4.821638,1e-06,2.7e-05
AM51-8 sp003478275,1146.681384,-0.837528,0.306216,-2.735093,0.006236,0.010772


In [278]:
stat_res.results_df['log2FoldChange']

#OTU ID
1XD42-69 sp014287635         0.206622
28L sp000177555             -4.204268
43-108 sp001915545          -2.243019
51-20 sp001917175           -1.204608
51-20 sp900539605            0.000000
51-20 sp900542055           -2.255008
51-20 sp900762565           -3.484711
AF33-28 sp003477885         -1.182807
AM07-15 sp003477405         -1.741027
AM51-8 sp003478275          -0.837528
AM51-8 sp900546435          -0.977013
AM51-8 sp900761925           0.056634
Absicoccus porci            -0.825523
Absicoccus sp000434355      -1.243387
Acetatifactor sp003447295    0.274790
Acetatifactor sp900066365    0.800935
Acetatifactor sp900066565    0.743313
Acetatifactor sp900554205   -1.178547
Acetatifactor sp900755865   -0.047552
Acetatifactor sp900760705   -1.284628
Name: log2FoldChange, dtype: float64

In [146]:
table_lfc = stat_res.results_df

In [152]:
table_lfc['CI_5'] = table_lfc['log2FoldChange'] - table_lfc['lfcSE']*1.96
table_lfc['CI_95'] = table_lfc['log2FoldChange'] + table_lfc['lfcSE']*1.96
i_negative = table_lfc.sort_values(by=['CI_95'],ascending=True).head(2)
i_positive = table_lfc.sort_values(by=['CI_5'],ascending=False).head(2)

In [153]:
i_negative

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,CI_5,CI_95
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
28L sp000177555,37.402176,-4.204268,1.151062,-3.652513,0.00026,0.000705,-6.460349,-1.948187
43-108 sp001915545,259.797831,-2.243019,0.521027,-4.304998,1.7e-05,0.000159,-3.264231,-1.221806


In [154]:
i_positive

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,CI_5,CI_95
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Acetatifactor sp900066365,9698.845418,0.800935,0.230302,3.477755,0.000506,0.001201,0.349543,1.252328
Acetatifactor sp900066565,13033.09105,0.743313,0.262469,2.832005,0.004626,0.008789,0.228874,1.257752


In [156]:
eggNOG_dir = 'http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0/species_catalogue/'
metadata_new = pd.read_table('../../Meta_diseases_analyses/table/eggNOG_species_rep.txt')

In [162]:
set_cd_positive = set(i_positive.index)
set_cd_negative = set(i_negative.index)
cd_positive_rep = metadata_new[metadata_new['Species'].isin(set_cd_positive)]
cd_negative_rep = metadata_new[metadata_new['Species'].isin(set_cd_negative)]
Species_rep_ids_cd_positive = cd_positive_rep['Species_rep'].drop_duplicates()
Species_rep_ids_cd_negative = cd_negative_rep['Species_rep'].drop_duplicates()


In [163]:
Species_rep_ids_cd_positive

40     MGYG000000041
216    MGYG000000217
Name: Species_rep, dtype: object

In [164]:
Species_rep_ids_cd_positive

40     MGYG000000041
216    MGYG000000217
Name: Species_rep, dtype: object

In [173]:
#pull down all the eggNOG annotation of the species involved in Negative
os.mkdir('../Species_table/permutation_test_n')
for i in Species_rep_ids_cd_negative:
    os.system("wget '{}/{}/{}/genome/{}_eggNOG.tsv' -O {}/{}_eggNOG.tsv".format(eggNOG_dir, i[:-2], i, i, '../Species_table/permutation_test_n/', i))

--2023-02-13 22:04:54--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0/species_catalogue//MGYG0000012/MGYG000001281/genome/MGYG000001281_eggNOG.tsv
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444028 (434K) [text/tab-separated-values]
Saving to: ‘../Species_table/permutation_test_n//MGYG000001281_eggNOG.tsv’

     0K .......... .......... .......... .......... .......... 11%  267K 1s
    50K .......... .......... .......... .......... .......... 23%  533K 1s
   100K .......... .......... .......... .......... .......... 34% 27.1M 1s
   150K .......... .......... .......... .......... .......... 46%  533K 0s
   200K .......... .......... .......... .......... .......... 57%  544K 0s
   250K .......... .......... .......... .......... .......... 69%  532K 0s
   300K .......... .......... .......... .......... ....

In [174]:
os.mkdir('../Species_table/permutation_test_p')
for i in Species_rep_ids_cd_positive:
    os.system("wget '{}/{}/{}/genome/{}_eggNOG.tsv' -O {}/{}_eggNOG.tsv".format(eggNOG_dir, i[:-2], i, i, '../Species_table/permutation_test_p/', i))

--2023-02-13 22:05:08--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0/species_catalogue//MGYG0000000/MGYG000000041/genome/MGYG000000041_eggNOG.tsv
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.138
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.138|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1044171 (1020K) [text/tab-separated-values]
Saving to: ‘../Species_table/permutation_test_p//MGYG000000041_eggNOG.tsv’

     0K .......... .......... .......... .......... ..........  4%  266K 4s
    50K .......... .......... .......... .......... ..........  9%  529K 3s
   100K .......... .......... .......... .......... .......... 14% 97.7M 2s
   150K .......... .......... .......... .......... .......... 19%  533K 2s
   200K .......... .......... .......... .......... .......... 24%  539K 1s
   250K .......... .......... .......... .......... .......... 29%  533K 1s
   300K .......... .......... .......... .......... ..

In [184]:
#CD positive
df_list_p = []
for i in Species_rep_ids_cd_positive:
    f_name = '../Species_table/permutation_test_p/{}_eggNOG.tsv'.format(i)
    df_parsed = parse_genome(pd.read_table(f_name))
    df_list_p.append(df_parsed)
    
df_cat_p = pd.concat(df_list_p, axis=0)
genome_kegg_counts_p = to_sparse_matrix(df_cat_p)
genome_kegg_counts_p.to_csv('../Species_table/genome_kegg_counts_p_CD_test.txt', sep = '\t') 

In [185]:
genome_kegg_counts_p

Unnamed: 0,ko:K07029,ko:K07098,ko:K05896,ko:K10188,ko:K10193,ko:K20866,ko:K01628,ko:K02315,ko:K02028,ko:K18707,...,ko:K07736,ko:K21479,ko:K14540,ko:K02189,ko:K13049,ko:K07139,ko:K05306,ko:K02073,ko:K05350,ko:K02652
MGYG000000217,1,3,1,1,1,0,2,2,5,1,...,1,1,1,0,1,1,1,1,1,1
MGYG000000041,1,3,1,2,0,1,1,2,5,1,...,1,2,1,1,1,1,0,1,1,1


In [186]:
#CD negative
df_list_n = []
for i in Species_rep_ids_cd_negative:
    f_name = '../Species_table/permutation_test_n/{}_eggNOG.tsv'.format(i)
    df_parsed = parse_genome(pd.read_table(f_name))
    df_list_n.append(df_parsed)
    
df_cat_n = pd.concat(df_list_n, axis=0)
genome_kegg_counts_n = to_sparse_matrix(df_cat_n)
genome_kegg_counts_n.to_csv('../Species_table/genome_kegg_counts_n_CD_test.txt', sep = '\t') 

In [187]:
genome_kegg_counts_n

Unnamed: 0,ko:K08169,ko:K07098,ko:K05896,ko:K20866,ko:K01628,ko:K02028,ko:K18707,ko:K02111,ko:K17722,ko:K01579,...,ko:K07657,ko:K06920,ko:K02065,ko:K14540,ko:K02189,ko:K09774,ko:K07139,ko:K05306,ko:K03640,ko:K02652
MGYG000001281,0,1,1,0,1,2,1,1,0,1,...,0,1,1,1,1,2,1,0,0,1
MGYG000003992,1,2,0,1,0,1,1,1,1,1,...,1,1,1,0,0,0,1,1,1,0


In [192]:
#btest
kegg = btest(genome_kegg_counts_p, genome_kegg_counts_n, return_proportions=True)
kegg = kegg.loc[kegg['side'] == 'groupB']
kegg = kegg.loc[kegg['pval'] <= 0.001]
kegg.to_csv('../Species_table/kegg_cd_p_vs_n_more_abundant_in_cases.txt', sep = '\t')
#gene number
#41

In [196]:
len(kegg.index)

41

In [198]:
#btest
kegg2 = btest(genome_kegg_counts_p, genome_kegg_counts_n, return_proportions=True)
kegg2 = kegg2.loc[kegg2['side'] == 'groupA']
kegg2 = kegg2.loc[kegg2['pval'] <= 0.001]
kegg2.to_csv('../Species_table/kegg_cd_p_vs_n_more_abundant_in_controls.txt', sep = '\t')
kegg2
#gene number
#1216

Unnamed: 0,groupA,groupB,pval,side
ko:K00002,1.0,0.0,2.470966e-04,groupA
ko:K00003,5.0,1.0,1.767894e-16,groupA
ko:K00008,2.0,0.0,6.105674e-08,groupA
ko:K00012,3.0,1.0,4.826023e-10,groupA
ko:K00013,2.0,1.0,7.324395e-07,groupA
...,...,...,...,...
ko:K22278,2.0,0.0,6.105674e-08,groupA
ko:K22341,1.0,0.0,2.470966e-04,groupA
ko:K22432,2.0,0.0,6.105674e-08,groupA
ko:K22445,2.0,0.0,6.105674e-08,groupA
