### This notebook is to use spearman correlation to check association between 1,25-(OH)2D and interested taxa

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests

import matplotlib.pylab as plt
%matplotlib inline

### merge taxonomy with mapping file

In [3]:
taxa = pd.read_csv('../data/RF_taxa_125.txt', sep='\t', index_col='#OTU ID')

In [4]:
print(taxa.shape)
taxa.head()

(13, 3)


Unnamed: 0_level_0,Taxon,Confidence,importance
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...,0.999945,0.002495
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...,0.871193,0.002257
TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGCTTGGTAAGTCAGGGGTGAAATCCCACAGCCCAACTGTGGAACTGCCTTTGATACTGCCAGGCTTGAGTACCGGAGAGGGTGGCGGAATTCCAGG,k__Bacteria; p__Proteobacteria; c__Deltaproteo...,1.0,0.002239
TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCG,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.999736,0.002144
TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAG,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.998628,0.002623


In [5]:
otu = pd.read_csv('../data/57316_mros_otus_rare_exp/57316_feature-table-rare.txt', sep='\t', skiprows=1, index_col='#OTU ID',dtype = str)
print(otu.shape)
otu.head()

(25292, 598)


Unnamed: 0_level_0,MN1837,PO7410,SD8837,MN2250,SD8603,PI5263,MN2373,PI5340,MN1590,PI4847,...,BI0904,PA3993,PA3846,PO7476,BI0730,PO6998,BI0539,MN2181,PO7226,BI0552
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,3080.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,1.0,286.0,1.0,0.0,0.0,145.0,0.0
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,1062.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,178.0,1.0,0.0,0.0,331.0,0.0
TACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGCGGCGTAAGTCAGTTGTGAAATCGTGCGGCTTAACCGTGCAATTGCAGTTGATACTGCGTCGCTTGAGTGCACACAGGGATGTTGGAATTCATGG,527.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGAGGCTCAACCTCTGCACTGCAGCGCGAACTGGTCTTCTTGAGTACGCACAACGTGGGCGGAATTCGTGG,505.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTTTGTTAAGCGTGTTGTGAAATGTCGGGGCTCAACCTGGGCATTGCAGCGCGAACTGGCAGACTTGAGTGCGCGGGAAGTAGGCGGAATTCGTCG,499.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# subset otu table with intersted ones only
otu_sub = otu.loc[taxa.index].transpose()
print(otu_sub.shape)
otu_sub.head()

(598, 13)


#OTU ID,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGCTTGGTAAGTCAGGGGTGAAATCCCACAGCCCAACTGTGGAACTGCCTTTGATACTGCCAGGCTTGAGTACCGGAGAGGGTGGCGGAATTCCAGG,TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTGCAAGTCAGATGTGAAAACTGGGGGCTCAACCTCCAGCCTGCATTTGAAACTGTAGTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAGGCAAGTCAGATGTGAAAACTATGGGCTCAACCCATAGCCTGCATTTGAAACTGTTTTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGTGGCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAAATTGCAGGCTCAACCTGGAAAGTGCATTTGAAACTGCCGTTCTTGAGAGTCGGAGAGGTAAATGGAATTCCCGG,TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGATATCAAGTCAGAAGTGAAAATTACGGGCTCAACTCGTAACCTGCTTTTGAAACTGACATTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCTAG,TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATTTCGGGGCTCAACCCTGAACGTGCCTCTAATACTGTTGAGCTAGAGAGTAGTTGCGGTAGGCGGAATGTATGG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGCAGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCCGGACTGCATTGGAAACTGTTGATCTTGAGTACCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGGCAAGTAGAATGTTAAATCCATCGGCTCAACCGGTGGCTGCGTTCTAAACTGCCGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGT
MN1837,262.0,32.0,17.0,21.0,6.0,9.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0
PO7410,790.0,7.0,28.0,91.0,9.0,2.0,2.0,5.0,0.0,1055.0,245.0,0.0,0.0
SD8837,427.0,250.0,35.0,34.0,11.0,2.0,5.0,6.0,5.0,5.0,280.0,0.0,0.0
MN2250,533.0,124.0,13.0,1.0,0.0,26.0,0.0,0.0,0.0,254.0,150.0,149.0,6.0
SD8603,2490.0,274.0,33.0,51.0,1.0,14.0,3.0,4.0,0.0,7.0,0.0,0.0,6.0


In [7]:
mf = pd.read_csv('../data/mros_mapping_alpha.txt', sep='\t', index_col='#SampleID')

In [8]:
mf = mf[['OHV1D3', 'OHV24D3', 'OHVD3', 'ratio_activation', 'ratio_catabolism']]
print(mf.shape)
mf.head()

(598, 5)


Unnamed: 0_level_0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BI0023,0.0393,1.77,25.8,0.001523,0.068605
BI0056,0.0619,3.91,39.2,0.001579,0.099745
BI0131,0.0521,1.49,23.1,0.002255,0.064502
BI0153,0.0431,2.14,27.3,0.001579,0.078388
BI0215,0.0502,3.62,33.0,0.001521,0.109697


In [9]:
dat = pd.merge(mf, otu_sub, left_index=True, right_index=True)
print(dat.shape)
dat.head()

(598, 18)


Unnamed: 0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGCTTGGTAAGTCAGGGGTGAAATCCCACAGCCCAACTGTGGAACTGCCTTTGATACTGCCAGGCTTGAGTACCGGAGAGGGTGGCGGAATTCCAGG,TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTGCAAGTCAGATGTGAAAACTGGGGGCTCAACCTCCAGCCTGCATTTGAAACTGTAGTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAGGCAAGTCAGATGTGAAAACTATGGGCTCAACCCATAGCCTGCATTTGAAACTGTTTTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGTGGCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAAATTGCAGGCTCAACCTGGAAAGTGCATTTGAAACTGCCGTTCTTGAGAGTCGGAGAGGTAAATGGAATTCCCGG,TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGATATCAAGTCAGAAGTGAAAATTACGGGCTCAACTCGTAACCTGCTTTTGAAACTGACATTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCTAG,TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATTTCGGGGCTCAACCCTGAACGTGCCTCTAATACTGTTGAGCTAGAGAGTAGTTGCGGTAGGCGGAATGTATGG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGCAGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCCGGACTGCATTGGAAACTGTTGATCTTGAGTACCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGGCAAGTAGAATGTTAAATCCATCGGCTCAACCGGTGGCTGCGTTCTAAACTGCCGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGT
BI0023,0.0393,1.77,25.8,0.001523,0.068605,2181.0,77.0,11.0,325.0,4.0,0.0,43.0,8.0,0.0,0.0,108.0,0.0,0.0
BI0056,0.0619,3.91,39.2,0.001579,0.099745,3718.0,238.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,52.0,98.0,0.0,0.0
BI0131,0.0521,1.49,23.1,0.002255,0.064502,222.0,40.0,6.0,34.0,1.0,0.0,7.0,0.0,0.0,12.0,61.0,0.0,0.0
BI0153,0.0431,2.14,27.3,0.001579,0.078388,217.0,39.0,30.0,0.0,5.0,11.0,6.0,14.0,0.0,0.0,234.0,0.0,0.0
BI0215,0.0502,3.62,33.0,0.001521,0.109697,542.0,211.0,50.0,38.0,1.0,1.0,40.0,25.0,5.0,0.0,235.0,0.0,1.0


In [10]:
# convert to float type
dat = dat.apply(pd.to_numeric, errors='coerce')
dat.describe()

Unnamed: 0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGCTTGGTAAGTCAGGGGTGAAATCCCACAGCCCAACTGTGGAACTGCCTTTGATACTGCCAGGCTTGAGTACCGGAGAGGGTGGCGGAATTCCAGG,TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTGCAAGTCAGATGTGAAAACTGGGGGCTCAACCTCCAGCCTGCATTTGAAACTGTAGTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAAGGCAAGTCAGATGTGAAAACTATGGGCTCAACCCATAGCCTGCATTTGAAACTGTTTTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG,TACGTAGGTGGCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGTGTAGGCGGGACGACAAGTCAGATGTGAAAATTGCAGGCTCAACCTGGAAAGTGCATTTGAAACTGCCGTTCTTGAGAGTCGGAGAGGTAAATGGAATTCCCGG,TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGATATCAAGTCAGAAGTGAAAATTACGGGCTCAACTCGTAACCTGCTTTTGAAACTGACATTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCTAG,TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATTTCGGGGCTCAACCCTGAACGTGCCTCTAATACTGTTGAGCTAGAGAGTAGTTGCGGTAGGCGGAATGTATGG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGCAGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCCGGACTGCATTGGAAACTGTTGATCTTGAGTACCGGAGAGGTAAGCGGAATTCCTAG,TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGGCAAGTAGAATGTTAAATCCATCGGCTCAACCGGTGGCTGCGTTCTAAACTGCCGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGT
count,566.0,566.0,555.0,555.0,555.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0
mean,0.057802,3.431343,35.224865,0.001774,0.094795,1148.324415,171.076923,28.862876,70.904682,4.757525,15.998328,19.125418,11.026756,2.909699,106.132107,139.319398,5.510033,1.371237
std,0.01978,1.836358,12.461583,0.000735,0.029794,1258.928111,342.4499,39.500117,170.446195,9.034815,33.194796,58.332145,12.907451,7.345177,214.872821,155.227622,28.921642,3.146236
min,0.0107,0.3,7.8,0.000398,0.018788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.044125,2.1675,27.4,0.001319,0.074162,117.75,15.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0
50%,0.0555,3.185,33.6,0.001661,0.092838,769.5,69.5,12.0,19.5,2.0,4.0,7.0,7.0,0.0,10.0,113.0,0.0,0.0
75%,0.06635,4.2375,41.85,0.002081,0.113049,1748.0,195.75,41.75,72.75,6.0,18.0,20.0,16.0,2.0,106.75,221.0,1.0,1.0
max,0.156,14.07,104.0,0.006727,0.197786,8696.0,5363.0,258.0,2640.0,113.0,472.0,1202.0,104.0,61.0,1780.0,1942.0,336.0,32.0


In [11]:
dat[dat.columns[7]].dtype

dtype('float64')

### correlation

In [12]:
otu_cols = dat.columns[mf.shape[1]:dat.shape[1]]
otu_cols

Index(['TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG',
       'TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG',
       'TACGGAGGGTGCAAGCGTTAATCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGCTTGGTAAGTCAGGGGTGAAATCCCACAGCCCAACTGTGGAACTGCCTTTGATACTGCCAGGCTTGAGTACCGGAGAGGGTGGCGGAATTCCAGG',
       'TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCG',
       'TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCGGAGCAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACGGCTTTTGAAACTGCCCTGCTTGATTTCAGGAGAGGTAAGCGGAATTCCTAG',
       'TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGATTGCAAGTCAGATGTGAAAACTGGGGGCTCAACCTCCAGCCTGCATTTGAAACTGTAGTTCTTGAGTGCTGGAGAGGCAATCGGAATTCCGTG',
       'TACGTAGGGGGCAAGCGTTATCCGGA

In [13]:
vars_vd = np.array(['OHV1D3', 'OHV24D3', 'OHVD3', 'ratio_activation', 'ratio_catabolism'])

In [14]:
results= []
i = 0
print(vars_vd[i])
for j in range(len(otu_cols)):
    tmp = dat[[vars_vd[i], otu_cols[j]]].dropna(axis=0, how='any')
    rho, pval = spearmanr(tmp[vars_vd[i]], tmp[otu_cols[j]])
    tax = taxa['Taxon'][otu_cols[j]]
    results.append([vars_vd[i], otu_cols[j], tax, rho, pval])
    results.append([vars_vd[i], otu_cols[j], rho, pval])

# output table    
results = pd.DataFrame(results, columns=['vars', 'otu', 'tax',
                                         'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method = 'fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)

# specific bacteria
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.tax[index[i]], results['rho'][index[i]])

OHV1D3
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Oscillospira; s__ -0.23605084920967676
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Ruminococcus; s__ 0.19855961073907846
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__ -0.19587413160442294
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Coprococcus; s__catus 0.12734589430401755
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__obeum 0.10981067834195901
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Anaerotruncus; s__ -0.0988819997748866


In [15]:
results

Unnamed: 0,vars,otu,tax,rho,pval,fdr pval
10,OHV1D3,TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,-0.236051,1.316354e-08,1.711261e-07
18,OHV1D3,TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.19856,1.927086e-06,1.153895e-05
22,OHV1D3,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,-0.195874,2.662835e-06,1.153895e-05
8,OHV1D3,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.127346,0.002402777,0.007809025
12,OHV1D3,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.109811,0.008932822,0.02322534
24,OHV1D3,TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,-0.098882,0.01862034,0.04034407
16,OHV1D3,TACGTAGGTGGCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTG...,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.082958,0.04853277,0.08193005
20,OHV1D3,TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTG...,k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...,0.082276,0.05041849,0.08193005
0,OHV1D3,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAG...,k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...,-0.037949,0.3674952,0.5308264
2,OHV1D3,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAG...,k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...,-0.026513,0.5290311,0.6877404


In [16]:
results.to_csv('../data/correlation_125.txt', sep='\t')

In [16]:
dat.head()

Unnamed: 0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,TACGGAGGATTCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAAGTTAGAGGTGAAATTTCGGGGCTCAACCCTGAACGTGCCTCTAATACTGTTGAGCTAGAGAGTAGTTGCGGTAGGCGGAATGTATGGTGTAGCGGTGAAATGCTTAGAGATCATACAGAACACCGATTGCGAAGGCAGCTTACCAAACTATATCTGACGTTGAGGCACGAAAGCGTGGGGAGCAAAC,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAGTCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTCAAGCTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTGGGGAGCAAAC,TACGTAGGGGGCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGTAGGCGGCCCGGCAGGCCGGGGGTCGAAGCGGGGGGCTCAACCCCCCGAAGCCCCCGGAACCTCCGCGGCTTGGGTCCGGTAGGGGAGGGTGGAACACCCGGTGTAGCGGTGGAATGCGCAGATATCGGGTGGAACACCGGTGGCGAAGGCGGCCCTCTGGGCCGAGACCGACGCTGAGGCGCGAAAGCTGGGGGAGCGAAC,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTAAAGCAAGTCTGAAGTGAAAGCCCGCGGCTCAACTGCGGGACTGCTTTGGAAACTGTTTAACTGGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTACTGGACGATAACTGACGTTGAGGCTCGAAAGCGTGGGGAGCAAAC,TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGGAGGGCAAGTCAGATGTGAAATCCACGGGCTCAACTCGTGAACTGCATTTGAAACTACTCTTCTTGAGTATCGGAGAGGCAATCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGATTGCTGGACGACAACTGACGGTGAGGCGCGAAAGCGTGGGGAGCAAAC,TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGATATCAAGTCAGAAGTGAAAATTACGGGCTCAACTCGTAACCTGCTTTTGAAACTGACATTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAAC,TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGTTAATTAAGTTGGATGTGAAATTCCCGGGCTTAACTTGGGAGCTGCATTCAAAACTGGTTAACTAGAGTTCAGGAGAGGGAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTTCTGGACTGACACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAAC
BI0023,0.0393,1.77,25.8,0.001523,0.068605,56.0,0.0,5.0,0.0,125.0,0.0,16.0
BI0056,0.0619,3.91,39.2,0.001579,0.099745,50.0,4.0,2.0,0.0,0.0,24.0,0.0
BI0131,0.0521,1.49,23.1,0.002255,0.064502,39.0,0.0,0.0,0.0,18.0,10.0,6.0
BI0153,0.0431,2.14,27.3,0.001579,0.078388,176.0,0.0,9.0,0.0,114.0,0.0,0.0
BI0215,0.0502,3.62,33.0,0.001521,0.109697,150.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
# check
dat.rename(columns={dat.columns[5]: taxa.Taxon[dat.columns[5]],
                    dat.columns[6]: taxa.Taxon[dat.columns[6]],
                    dat.columns[7]: taxa.Taxon[dat.columns[7]],
                    dat.columns[8]: taxa.Taxon[dat.columns[8]],
                    dat.columns[9]: taxa.Taxon[dat.columns[9]],
                    dat.columns[10]: taxa.Taxon[dat.columns[10]],
                    dat.columns[11]: taxa.Taxon[dat.columns[11]]}, inplace=True)
dat.head()

Unnamed: 0,OHV1D3,OHV24D3,OHVD3,ratio_activation,ratio_catabolism,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__,k__Bacteria; p__Actinobacteria; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella; s__aerofaciens,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__.1,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Christensenellaceae; g__; s__
BI0023,0.0393,1.77,25.8,0.001523,0.068605,56.0,0.0,5.0,0.0,125.0,0.0,16.0
BI0056,0.0619,3.91,39.2,0.001579,0.099745,50.0,4.0,2.0,0.0,0.0,24.0,0.0
BI0131,0.0521,1.49,23.1,0.002255,0.064502,39.0,0.0,0.0,0.0,18.0,10.0,6.0
BI0153,0.0431,2.14,27.3,0.001579,0.078388,176.0,0.0,9.0,0.0,114.0,0.0,0.0
BI0215,0.0502,3.62,33.0,0.001521,0.109697,150.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.19262928366201476, pvalue=3.836328870511795e-06)

In [19]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=array([[ 1.        , -0.1194831 , -0.17774718],
       [-0.1194831 ,  1.        ,  0.3248879 ],
       [-0.17774718,  0.3248879 ,  1.        ]]), pvalue=array([[0.00000000e+00, 4.38517803e-03, 2.07116211e-05],
       [4.38517803e-03, 0.00000000e+00, 2.09903585e-15],
       [2.07116211e-05, 2.09903585e-15, 0.00000000e+00]]))

In [20]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Christensenellaceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.16856828105409885, pvalue=5.484660631719191e-05)

In [21]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.15339088959899927, pvalue=0.0002462150444195747)

In [22]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Actinobacteria; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella; s__aerofaciens']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.1246241651960069, pvalue=0.0029531935642018786)

In [23]:
tmp = dat[['OHV1D3', 'k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__; s__']].dropna(axis=0, how='any')
spearmanr(tmp[tmp.columns[0]], tmp[tmp.columns[1]])

SpearmanrResult(correlation=0.0717610361397878, pvalue=0.08778725235208422)