#### This notebook performs Principal Component Analysis (PCA) on the vioscreen data and colors the samples by their answer to the American Gut diet-type question

## Prepare data for PCA analysis

In [2]:
# Silence all warnings for the rest of the session: a negative `warn`
# level tells R to ignore warning conditions entirely.
options(warn = -1)

In [3]:
# Load the tab-delimited mapping file; the first column supplies the row names.
AG <- read.delim("ag_map_with_alpha.txt", row.names = 1)

In [4]:
# select vioscreen variables and convert to numeric
library(dplyr)
vios_cols <- select(AG, contains("vioscreen"))
# read.csv() yields factors for text columns under R < 4.0 but plain character
# vectors under R >= 4.0, so both kinds have to be picked up here; checking
# is.factor() alone leaves text columns unconverted on current R versions.
indx <- vapply(
  vios_cols,
  function(x) is.factor(x) || is.character(x),
  logical(1)
)
# Go through as.character() so factor labels (not the integer level codes)
# are parsed; entries that are not numbers become NA.
vios_cols[indx] <- lapply(
  vios_cols[indx],
  function(x) as.numeric(as.character(x))
)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [5]:
# drop samples without vioscreen information
# (rows whose values sum to zero once NAs are ignored)
ids_keep <- vios_cols[rowSums(vios_cols, na.rm = TRUE) != 0, , drop = FALSE] # 1762 * 258

# drop categorical vioscreen variables, or with mean zero or with sd zero
# A column that is entirely NA gives NA for this test; which() discards those
# NAs, so such columns are not removed here (they fail the 90% filter below).
col_mean <- vapply(ids_keep, mean, numeric(1), na.rm = TRUE)
col_sd <- vapply(ids_keep, sd, numeric(1), na.rm = TRUE)
vios_zero <- names(ids_keep)[which(col_mean == 0 | col_sd == 0)]
# Use a logical mask rather than `-which(...)`: a negative empty index would
# select zero columns whenever there is nothing to drop.
dat <- ids_keep[, !(names(ids_keep) %in% vios_zero), drop = FALSE] # 1762 * 252

# keep only vioscreen variables with more than 90% response
# (a "response" is a strictly positive value; NAs count as no response)
dat_binary <- +(dat > 0)
n <- nrow(dat_binary)
cols_90 <- names(dat)[colSums(dat_binary, na.rm = TRUE) > 0.9 * n] # 207
dat_90 <- dat[, names(dat) %in% cols_90, drop = FALSE] # 1762 * 207

# drop samples with missing vioscreen information
dat_90_complete <- na.omit(dat_90) # 1596 * 207

# add AG diet_type variable
AG_diet <- AG[, 'diet_type', drop = FALSE] # 9511 * 1
# Inner join on the row names; merge() puts the shared row names in column 1.
df_90 <- merge(AG_diet, dat_90_complete, by = 'row.names')
colnames(df_90)[1] <- 'SampleID'

In [6]:
# Show the number of rows and columns of the merged table.
dim(df_90)

In [7]:
# Preview the first rows of the merged table (SampleID, diet_type, then the
# vioscreen variables).
head(df_90)

SampleID,diet_type,vioscreen_add_sug,vioscreen_calcium_from_dairy_servings,vioscreen_calcium_servings,vioscreen_discfat_oil,vioscreen_discfat_sol,vioscreen_d_cheese,vioscreen_d_milk,vioscreen_d_total,⋯,vioscreen_vitc,vioscreen_vitd,vioscreen_vitd3,vioscreen_vitd_iu,vioscreen_vite_iu,vioscreen_vitk,vioscreen_water,vioscreen_wgrain,vioscreen_xylitol,vioscreen_zinc
10317.000001171,Omnivore,20.982847,1.32359,3.12474,9.975316,34.70888,0.053890411,0.26652053,1.32358897,⋯,94.02155,4.974822,4.974,198.9929,10.89772,265.6982,3481.122,0.7428219,0.02747945,8.860438
10317.00000123,Omnivore,4.427753,0.66414,3.57156,32.137814,113.47641,0.336602747,0.32753423,0.66413701,⋯,280.86652,23.219454,23.216467,928.7782,30.10846,288.4739,6306.16,1.0293424,0.04073973,58.210789
10317.000001377,Omnivore,6.436877,0.19877,3.51477,56.263641,39.73383,0.002191781,0.19654794,0.19876711,⋯,46.22097,14.212247,7.457671,568.4899,21.95663,155.8026,2319.67,0.7225479,0.01980822,15.659973
10317.000001792,Omnivore,4.513343,0.02841,1.57992,22.547892,62.004,0.0,0.02838356,0.02841096,⋯,209.71599,6.07674,6.07674,243.0696,18.84419,430.3969,7217.774,0.0,0.08460273,15.860876
10317.00000182,Omnivore,7.931206,0.62581,2.91416,47.920521,38.58582,0.071999997,0.1799452,0.62580824,⋯,205.79251,10.839206,10.839206,433.5682,24.2191,543.6381,4430.21,5.2092605,0.03843836,16.354656
10317.000001882,Omnivore,7.475096,1.95184,4.07726,64.560745,88.72372,0.455945253,0.99594516,1.95183551,⋯,211.77725,12.909616,12.907616,516.3846,28.27493,581.557,5959.042,0.2600822,0.07169863,22.326469


## Perform PCA on vioscreen data

In [8]:
# Run PCA on the vioscreen variables only (exclude SampleID and diet_type).
# Columns are centered (prcomp default) and scaled to unit variance.
# The argument is spelled `scale.`; `scale` only worked via partial matching.
pr.out <- prcomp(
  df_90[, !(names(df_90) %in% c('SampleID', 'diet_type')), drop = FALSE],
  scale. = TRUE
)

In [9]:
# Share of the total variance carried by each principal component.
# Component variances are the squared standard deviations returned by prcomp().
pr.var <- pr.out$sdev^2
pve <- prop.table(pr.var)
# Display the shares for PC1 and PC2.
pve[seq_len(2)]

In [10]:
# Count the samples in each diet category.
table(df_90$diet_type)
# One plotting color per sample, looked up by diet category.
diet_palette <- c(
  'Omnivore' = 'red',
  'Omnivore but do not eat red meat' = 'orange',
  'Vegan' = 'green',
  'Vegetarian' = 'purple',
  'Vegetarian but eat seafood' = 'blue'
)
colors <- unname(diet_palette[as.character(df_90$diet_type)])
# Anything not listed above (e.g. 'Unspecified' or a missing value) stays black.
colors[is.na(colors)] <- 'black'


                        Omnivore Omnivore but do not eat red meat 
                            1263                              108 
                     Unspecified                            Vegan 
                              21                               48 
                      Vegetarian       Vegetarian but eat seafood 
                              74                               82 

In [11]:
# color first 2 principal components with diet_type
# Percent of variance per component, computed from the fit so the axis labels
# cannot drift from the data (they were previously typed in by hand).
pc_share <- 100 * pr.out$sdev^2 / sum(pr.out$sdev^2)
diet_labels <- c('Unspecified', 'Omnivore', 'Omnivore but \n do not eat red meat',
                 'Vegan', 'Vegetarian', 'Vegetarian but \n eat seafood')
diet_colors <- c('black', 'red', 'orange', 'green', 'purple', 'blue')

pdf('vioscreen_pca.pdf')
# Widen the right margin and turn off clipping to the plot region; presumably
# this lets the legend placed at (20, 20) extend into the margin.
par(xpd = TRUE, mar = par()$mar + c(0, 0, 0, 7))
plot(pr.out$x[, 1:2], pch = 20,
     xlab = sprintf('PC1 (%.1f%% variation explained)', pc_share[1]),
     ylab = sprintf('PC2 (%.1f%% variation explained)', pc_share[2]),
     col = colors, bty = 'n')
legend(20, 20, diet_labels, col = diet_colors, pch = 19)
# No par() reset is needed: graphical parameters belong to the pdf device and
# are discarded when it is closed.
dev.off()

In [12]:
# export PC files
# Sample IDs next to the PC scores.
# NOTE(review): cbind() of the ID column with the numeric score matrix
# presumably produces a character matrix, and the ID column is written with an
# empty header because the argument is unnamed -- confirm downstream readers
# expect that before changing it.
pc_info <- cbind(df_90$SampleID, pr.out$x)
write.table(pc_info, file = 'vioscreen_pc_orig.txt', sep = '\t',
            row.names = FALSE) # delete index columns

# Eigenvalues (component variances) and their share of the total, each written
# as one tab-separated line.
# `append` expects a logical; the string 'TRUE' only worked through implicit
# coercion.
# NOTE(review): appending means every re-run adds another line to these
# files -- confirm that is intended.
eigenvals <- pr.out$sdev^2
write(paste(as.character(eigenvals), collapse = '\t'), 'eigenvals.txt',
      append = TRUE)
prop_pc <- eigenvals / sum(eigenvals)
write(paste(as.character(prop_pc), collapse = '\t'), 'prop_pc.txt',
      append = TRUE)