#### This notebook is to perform Non-Redundant Analysis on alpha diversities
#### reference: Falony G, Joossens M, Vieira-Silva S, et al. Population-level analysis of gut microbiome variation. Science 2016;352:560-4

In [1]:
library(vegan)

Loading required package: permute
Loading required package: lattice
This is vegan 2.5-2


In [2]:
mf = read.csv('../data/mapping_sleep_alpha.txt', stringsAsFactors=FALSE, sep='\t')
colnames(mf)[1] = 'SampleName'
dim(mf)

In [3]:
head(mf)

SampleName,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,⋯,M1ADEPR,M1BENZO,AMAMPT_C1,AMFVT_C1,AMPHIT_15SD,Description,alpha_pd,observed_otus,shannon,pielou_e
BI0023,TCTGGTGACATT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,Missing:not collected,Missing:not collected,Missing:not collected,Orwoll.BI0023.BI,27.77117,302,5.727116,0.6951729
BI0056,CAAGCATGCCTA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,1.0,1.0,0.0,Orwoll.BI0056.BI,17.93266,173,3.888281,0.522996
BI0131,CTATTTGCGACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,1.0,1.0,0.0,Orwoll.BI0131.BI,21.45366,223,4.05073,0.5192645
BI0153,ATCGGCGTTACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,1.0,1.0,-1.0,Orwoll.BI0153.BI,18.46968,223,4.894253,0.6273959
BI0215,CCTCTCGTGATC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,1.0,0.0,-1.0,Orwoll.BI0215.BI,20.04983,222,5.295055,0.6793396
BI0353,TGCCATCTGAAT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D study,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,⋯,0: No,0: No,1.0,1.0,0.0,Orwoll.BI0353.BI,13.75183,155,4.213016,0.5790192


In [4]:
#sapply(mf, class)

In [5]:
names(mf)

In [6]:
# correlated variables are not selected, such as the same variable yet one categorical and one continuous
# for example, choose categorical not continuous: 'AMAMPT_C1', 'AMFVT_C1', 'AMPHIT_15SD', 'PQBADSLP', 'PQPEFFIC'

# convert categorial to factors
vars_cat = c('GIERACE', 'SITE', 'GIMSTAT', 'MHDIAB', 'MHRHEU1', 'MHOA', 'MHCHF', 'MHMI', 'MHDEPR', 
             'TUDRAMT_REVISED', 'PQPSLMED', 'PQPSQUAL', 'QLCOMP', 'SLEEPHRS', 'PQBADSLP', 
             'TURSMOKE', 'M1ADEPR', 'M1BENZO', 'PQPEFFIC', 'AMAMPT_C1', 'AMFVT_C1', 'AMPHIT_15SD')
mf[vars_cat] = lapply(mf[vars_cat], factor)

# # convert continuous to numeric
vars_cts = c('Age', 'BMI', 'PASCORE', 'alpha_pd')
mf[vars_cts] = lapply(mf[vars_cts], as.numeric)

In [7]:
length(vars_cat)
length(vars_cts)

In [8]:
dat = mf[c(vars_cat, vars_cts)]
dim(dat)
head(dat)

GIERACE,SITE,GIMSTAT,MHDIAB,MHRHEU1,MHOA,MHCHF,MHMI,MHDEPR,TUDRAMT_REVISED,⋯,M1ADEPR,M1BENZO,PQPEFFIC,AMAMPT_C1,AMFVT_C1,AMPHIT_15SD,Age,BMI,PASCORE,alpha_pd
1:WHITE,Birmingham,2: Widowed,0: No,0: No,0: No,0: No,0: No,0: No,1: Less than one drink per week,⋯,0: No,0: No,0:>85% EFFICIENCY,Missing:not collected,Missing:not collected,Missing:not collected,83,28.89012,91.0,27.77117
1:WHITE,Birmingham,1: Married,0: No,1: Yes,1: Yes,1: Yes,1: Yes,0: No,0:None drinker,⋯,0: No,0: No,3:<65% EFFICIENCY,1.0,1.0,0.0,81,28.5398,199.17857,17.93266
1:WHITE,Birmingham,1: Married,0: No,0: No,0: No,0: No,0: No,0: No,0:None drinker,⋯,0: No,0: No,0:>85% EFFICIENCY,1.0,1.0,0.0,83,25.01424,161.71429,21.45366
1:WHITE,Birmingham,1: Married,0: No,0: No,0: No,0: No,0: No,0: No,4: 6-13 drinks per week,⋯,0: No,0: No,0:>85% EFFICIENCY,1.0,1.0,-1.0,79,30.87637,88.21429,18.46968
1:WHITE,Birmingham,1: Married,0: No,0: No,1: Yes,0: No,0: No,0: No,3: 3-5 drinks per week,⋯,0: No,0: No,0:>85% EFFICIENCY,1.0,0.0,-1.0,81,33.58739,256.82143,20.04983
1:WHITE,Birmingham,1: Married,0: No,0: No,0: No,0: No,0: No,0: No,0:None drinker,⋯,0: No,0: No,1:>=75% TO <=85% EFFICIENCY,1.0,1.0,0.0,80,26.41523,179.57143,13.75183


In [9]:
summary(dat)

               GIERACE             SITE                         GIMSTAT   
 1:WHITE           :520   Birmingham : 75   1: Married              :428  
 2:AFRICAN AMERICAN: 24   Minneapolis: 91   2: Widowed              :120  
 3:ASIAN           : 34   Palo Alto  : 86   4: Divorced             : 35  
 4:HISPANIC        : 12   Pittsburgh : 92   5: Single, never married: 16  
 5:OTHER           :  9   Portland   :121                                 
                          San Diego  :134                                 
                                                                          
    MHDIAB      MHRHEU1        MHOA        MHCHF         MHMI        MHDEPR   
 0: No :508   0: No :552   0: No :450   0: No :550   0: No :520   0: No :542  
 1: Yes: 91   1: Yes: 47   1: Yes:149   1: Yes: 49   1: Yes: 79   1: Yes: 57  
                                                                              
                                                                              
     

In [10]:
dat = dat[complete.cases(dat), ]
print(dim(dat))
alpha = dat$alpha_pd
dat = dat[, -which(names(dat) %in% 'alpha_pd')]
dim(dat)

[1] 599  26


In [11]:
mod0 <- rda(alpha ~ 1., dat)  # Model with intercept only
mod1 <- rda(alpha ~ ., dat)  # Model with all explanatory variables

In [12]:
step.res <- ordiR2step(mod0, mod1, perm.max = 1000)

Step: R2.adj= 0 
Call: alpha ~ 1 
 
                    R2.adjusted
<All variables>    0.0760813638
+ MHDIAB           0.0282094594
+ GIERACE          0.0186947251
+ SITE             0.0165868860
+ PQBADSLP         0.0129516906
+ TUDRAMT_REVISED  0.0114233656
+ M1ADEPR          0.0090012065
+ M1BENZO          0.0053595118
+ PQPSQUAL         0.0051267187
+ BMI              0.0048169669
+ AMPHIT_15SD      0.0037344622
+ GIMSTAT          0.0037162637
+ PASCORE          0.0036274312
+ PQPEFFIC         0.0022141982
+ QLCOMP           0.0020128693
+ SLEEPHRS         0.0015793774
+ Age              0.0008848537
+ MHDEPR           0.0006627175
+ AMFVT_C1         0.0002148739
<none>             0.0000000000
+ MHRHEU1         -0.0008302351
+ MHOA            -0.0010655942
+ MHMI            -0.0014456000
+ MHCHF           -0.0015798390
+ TURSMOKE        -0.0023219533
+ AMAMPT_C1       -0.0031391180
+ PQPSLMED        -0.0036370345

         Df    AIC      F Pr(>F)   
+ MHDIAB  1 2203.7 18.359  0.00

In [13]:
table = step.res$anova
table

Unnamed: 0,R2.adj,Df,AIC,F,Pr(>F)
+ MHDIAB,0.02820946,1.0,2203.715,18.358943,0.002
+ GIERACE,0.04439227,4.0,2197.629,3.527485,0.006
+ SITE,0.0551421,5.0,2195.78,2.349336,0.048
+ M1ADEPR,0.06599851,1.0,2189.838,7.834645,0.002
+ BMI,0.07331497,1.0,2186.106,5.634544,0.016
<All variables>,0.07608136,,,,


In [14]:
table$ES.RDA = c(table$R2.adj[1], table$R2.adj[2]-table$R2.adj[1], 
                       table$R2.adj[3]-table$R2.adj[2], table$R2.adj[4]-table$R2.adj[3],
                       table$R2.adj[5]-table$R2.adj[4], table$R2.adj[6]-table$R2.adj[5])
table = table[-6, ]

In [15]:
step.res$call

rda(formula = alpha ~ MHDIAB + GIERACE + SITE + M1ADEPR + BMI, 
    data = dat)

In [16]:
table

Unnamed: 0,R2.adj,Df,AIC,F,Pr(>F),ES.RDA
+ MHDIAB,0.02820946,1,2203.715,18.358943,0.002,0.028209459
+ GIERACE,0.04439227,4,2197.629,3.527485,0.006,0.016182807
+ SITE,0.0551421,5,2195.78,2.349336,0.048,0.010749839
+ M1ADEPR,0.06599851,1,2189.838,7.834645,0.002,0.010856409
+ BMI,0.07331497,1,2186.106,5.634544,0.016,0.007316461


In [17]:
rownames(table) = c('Diabetes', 'Race', 'Site', 'Antidepressant Use', 'BMI')

In [18]:
library(ggplot2)
pdf('../figures/RDA_PD_alpha.pdf')
covariates = rownames(table)
ggplot(table, aes(x=reorder(covariates, ES.RDA), y=ES.RDA, fill=covariates)) +
  labs(x = 'Non-redundant Covariants', y = 'Effect Size (PD Alpha Diversity)') +
  geom_bar(stat='identity') +
 theme(axis.text=element_text(size=10), 
       axis.title=element_text(size=14,face="bold"), 
       legend.position="none") + 
  coord_flip()
dev.off()