### This notebook is to use spearman correlation to check association between 24,25-(OH)2D and interested taxa

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests

import matplotlib.pylab as plt
%matplotlib inline

### merge taxonomy with mapping file

In [2]:
taxa = pd.read_csv('../data/RF_taxa_2425.txt', sep='\t', index_col='#OTU ID')

In [3]:
print(taxa.shape)
taxa.head()

(5, 2)


Unnamed: 0_level_0,Taxon,importance
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGTAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002546
TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCCGGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGGAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002119
AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002336
TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGGGGAGCAAATA,k__Bacteria; p__Firmicutes; c__Erysipelotrichi...,0.002441
TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACCGGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGTGTGGGGAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002223


In [4]:
otu = pd.read_csv('../Qiime_updated/feature-table-rare5807.txt', sep='\t', skiprows=1, index_col='#OTU ID')
otu.head()

Unnamed: 0_level_0,MN1696,PI4923,PA3754,PI4717,PA3960,MN1521,PA3828,PO6848,PA3199,PO7473,...,PO7027,BI0686,MN2037,PO7199,MN2405,MN2498,BI0397,SD8175,PO7488,PO6966
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTGGGTATCAAAC,2011.0,0.0,32.0,37.0,0.0,0.0,18.0,111.0,0.0,157.0,...,0.0,0.0,0.0,256.0,0.0,0.0,0.0,0.0,438.0,443.0
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTCGAAACTGGCAGGCTAGAGTCTTGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAAC,520.0,250.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,916.0,0.0,235.0,0.0,0.0,51.0,0.0,0.0,0.0
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAAC,457.0,66.0,0.0,14.0,64.0,0.0,0.0,0.0,295.0,21.0,...,31.0,0.0,0.0,8.0,123.0,242.0,0.0,131.0,0.0,10.0
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTTTTAAGTCAGCGGTGAAAGTCTGTGGCTCAACCATAGAATTGCCGTTGAAACTGGGGGGCTTGAGTATGTTTGAGGCAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTGCCAAGCCATTACTGACGCTGATGCACGAAAGCGTGGGGATCAAAC,460.0,0.0,0.0,160.0,0.0,0.0,93.0,0.0,0.0,0.0,...,70.0,119.0,0.0,0.0,41.0,0.0,0.0,0.0,9.0,463.0
AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCATGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,380.0,108.0,12.0,0.0,13.0,253.0,90.0,120.0,0.0,0.0,...,304.0,0.0,0.0,0.0,0.0,287.0,26.0,163.0,0.0,315.0


In [5]:
# combine taxa with otu
taxa = pd.merge(taxa, otu, left_index=True, right_index=True).transpose()
taxa.shape

(601, 5)

In [6]:
taxa.head()

#OTU ID,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGTAGCAAAC,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCCGGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGGAGCAAAC,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGGGGAGCAAATA,TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACCGGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGTGTGGGGAGCAAAC
Taxon,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,k__Bacteria; p__Firmicutes; c__Erysipelotrichi...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...
importance,0.00254559,0.00211931,0.00233583,0.00244112,0.00222336
MN1696,5,2,0,0,0
PI4923,4,2,103,2,0
PA3754,5,10,37,15,10


In [7]:
mf = pd.read_csv('../data/mros_mapping_alpha.txt', sep='\t', index_col='#SampleID')

In [8]:
mf = mf[['OHV1D3', 'OHV24D3', 'OHVD3', 'ratio_activation', 'ratio_catabolism', 'VDstatus']]
mf.head()

Unnamed: 0_level_0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,VDstatus
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BI0023,0.0393,1.77,25.8,0.001523,0.068605,sufficiency
BI0056,0.0619,3.91,39.2,0.001579,0.099745,sufficiency
BI0131,0.0521,1.49,23.1,0.002255,0.064502,sufficiency
BI0153,0.0431,2.14,27.3,0.001579,0.078388,sufficiency
BI0215,0.0502,3.62,33.0,0.001521,0.109697,sufficiency


In [9]:
# need to substract the six otu abundance for each subject
sample_ids = mf.index
taxa = taxa.loc[sample_ids]
taxa.head()

#OTU ID,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGTAGCAAAC,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCCGGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGGAGCAAAC,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGGGGAGCAAATA,TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACCGGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGTGTGGGGAGCAAAC
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BI0023,17,2,79,4,1
BI0056,9,2,0,0,1
BI0131,1,2,100,0,1
BI0153,4,1,712,7,16
BI0215,2,8,227,0,7


In [10]:
dat = pd.merge(mf, taxa, left_index=True, right_index=True)
dat.head()

Unnamed: 0_level_0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,VDstatus,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGTAGCAAAC,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCCGGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGGAGCAAAC,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGGGGAGCAAATA,TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACCGGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGTGTGGGGAGCAAAC
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BI0023,0.0393,1.77,25.8,0.001523,0.068605,sufficiency,17,2,79,4,1
BI0056,0.0619,3.91,39.2,0.001579,0.099745,sufficiency,9,2,0,0,1
BI0131,0.0521,1.49,23.1,0.002255,0.064502,sufficiency,1,2,100,0,1
BI0153,0.0431,2.14,27.3,0.001579,0.078388,sufficiency,4,1,712,7,16
BI0215,0.0502,3.62,33.0,0.001521,0.109697,sufficiency,2,8,227,0,7


In [11]:
vars_vd = np.array(['OHVD3', 'OHV1D3', 'OHV24D3', 'ratio_activation', 'ratio_catabolism'])
dat[vars_vd] = dat[vars_vd].apply(pd.to_numeric, errors='coerce')

In [12]:
dat[vars_vd].describe()

Unnamed: 0,OHVD3,OHV1D3,OHV24D3,ratio_activation,ratio_catabolism
count,556.0,567.0,567.0,556.0,556.0
mean,35.229137,0.057775,3.430864,0.001772,0.094776
std,12.450758,0.019773,1.834771,0.000735,0.02977
min,7.8,0.0107,0.3,0.000398,0.018788
25%,27.4,0.0441,2.175,0.001316,0.074216
50%,33.65,0.0555,3.18,0.00166,0.092821
75%,41.825,0.0663,4.235,0.002081,0.112849
max,104.0,0.156,14.07,0.006727,0.197786


### correlation

In [13]:
otu_cols = dat.columns[mf.shape[1]:dat.shape[1]]
len(otu_cols)

5

In [14]:
bt = pd.read_csv('../data/RF_taxa_2425.txt', sep='\t', index_col='#OTU ID')
bt.head()

Unnamed: 0_level_0,Taxon,importance
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGTAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002546
TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGTGGCAAGGCAAGCCAGAAGTGAAAACCCGGGGCTCAACCGCGGGATTGCTTTTGGAACTGTCATGCTAGAGTGCAGGAGGGGTGAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGAGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGGAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002119
AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002336
TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCCATGGAAACCGTAGAGCTAGAGTGTGTGAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGCGCATAACTGACGCTCAGTCCCGAAAGCGTGGGGAGCAAATA,k__Bacteria; p__Firmicutes; c__Erysipelotrichi...,0.002441
TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGACTGTCAAGTCAGTCGTGAAATACCGGGGCTTAACCCCGGGGCTGCGATTGAAACTGACAGCCTTGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGTGTGGGGAGCAAAC,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.002223


In [15]:
bt.index == dat.columns[mf.shape[1]:dat.shape[1]]

array([ True,  True,  True,  True,  True])

In [16]:
results= []
i = 2
print(vars_vd[i])
for j in range(len(otu_cols)):
    tmp = dat[[vars_vd[i], otu_cols[j]]].dropna(axis=0, how='any')
    rho, pval = spearmanr(tmp[vars_vd[i]], tmp[otu_cols[j]])
    tax = bt['Taxon'][otu_cols[j]]
    results.append([vars_vd[i], otu_cols[j], tax, rho, pval])
    results.append([vars_vd[i], otu_cols[j], rho, pval])

# output table    
results = pd.DataFrame(results, columns=['vars', 'otu', 'tax',
                                         'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method = 'fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)

# specific bacteria
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.tax[index[i]], results['fdr pval'][index[i]])

OHV24D3
k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__ 0.00017421833910782926
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__ 0.021614439348257845
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__ 0.021614439348257845


In [17]:
# check
results

Unnamed: 0,vars,otu,tax,rho,pval,fdr pval
6,OHV24D3,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGG...,k__Bacteria; p__Firmicutes; c__Erysipelotrichi...,0.172903,3.5e-05,0.000174
0,OHV24D3,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.10429,0.012969,0.021614
2,OHV24D3,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.108855,0.009486,0.021614
4,OHV24D3,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.066873,0.111695,0.139618
8,OHV24D3,TACGTAGGTGACAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.061203,0.145533,0.145533


In [18]:
results.to_csv('../data/correlation_2425.txt', sep='\t')

## double check results

In [22]:
dat.rename(columns={dat.columns[6]: bt.Taxon[dat.columns[6]],
                    dat.columns[7]: bt.Taxon[dat.columns[7]],
                    dat.columns[8]: bt.Taxon[dat.columns[8]],
                    dat.columns[9]: bt.Taxon[dat.columns[9]],
                    dat.columns[10]: bt.Taxon[dat.columns[10]]}, inplace=True)
dat.head()

Unnamed: 0_level_0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,VDstatus,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Faecalibacterium; s__prausnitzii,k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BI0023,0.0393,1.77,25.8,0.001523,0.068605,sufficiency,17,2,79,4,1
BI0056,0.0619,3.91,39.2,0.001579,0.099745,sufficiency,9,2,0,0,1
BI0131,0.0521,1.49,23.1,0.002255,0.064502,sufficiency,1,2,100,0,1
BI0153,0.0431,2.14,27.3,0.001579,0.078388,sufficiency,4,1,712,7,16
BI0215,0.0502,3.62,33.0,0.001521,0.109697,sufficiency,2,8,227,0,7


In [23]:
tmp = dat[['OHV24D3', 'k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.17290299872094958, pvalue=3.4843667821565854e-05)

In [24]:
tmp = dat[['OHV24D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=array([[1.        , 0.10429007, 0.06120272],
       [0.10429007, 1.        , 0.00913884],
       [0.06120272, 0.00913884, 1.        ]]), pvalue=array([[0.        , 0.01296866, 0.14553281],
       [0.01296866, 0.        , 0.82810217],
       [0.14553281, 0.82810217, 0.        ]]))

In [25]:
tmp = dat[['OHV24D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.1088545734552748, pvalue=0.009486339065688413)