In [None]:
library(FactoMineR)
library(factoextra)
library(corrplot)
library(NbClust)
library(cluster)
library(ggplot2)
library(reshape2)
library(repr)

In [None]:
# Load the sample metadata table; the first column supplies the row names.
env_data_file <- "/Volumes/KeithSSD/CB_V4/otu_data/WaterQualityData/matched_cleaned_data/all_mdata_with_habitat.txt"
env_data <- read.delim(env_data_file, row.names = 1)

# Load the transect metadata table the same way.
transect_data_f <- "../otu_data/WaterQualityData/matched_cleaned_data/transect_mdata_colset_1.tsv"
tran_df <- read.delim(transect_data_f, row.names = 1)

# Columns present in the transect table but absent from env_data.
missing_columns <- setdiff(colnames(tran_df), colnames(env_data))

# Add each missing column to env_data as all-NA, then fill in the rows that
# share a row name with tran_df.
# NOTE(review): a tran_df row name that does not exist in env_data would be
# appended as a new row by this assignment -- confirm that is intended.
for (new_col in missing_columns) {
    env_data[[new_col]] <- NA
    env_data[rownames(tran_df), new_col] <- tran_df[[new_col]]
}

In [None]:
# Load the FAPROTAX table (rows are matched to env_data by row name below;
# columns are functional groups).
fapro_fn <- "/Users/login/Google Drive/SiYi_Xiaotong_Materials/FAPROTAX_counts.txt"
fapro_df <- read.delim(fapro_fn, row.names = 1)

# Per-column totals, used to discard functions that never occur.
col_Sum <- colSums(fapro_df)

# Keep only the rows that appear in env_data and the columns with a non-zero
# total. The original filtered on `col_Var`, which is never defined; the
# vector computed above is `col_Sum`.
fapro_set1 <- fapro_df[rownames(env_data), names(col_Sum[col_Sum != 0])]

# log(x + 1) transform of every column, done in one vectorised pass while
# keeping the data.frame structure and dimnames.
fapro_lset <- fapro_set1
fapro_lset[] <- lapply(fapro_set1, function(x) log(x + 1))

In [None]:
# Functional groups excluded from the downstream analysis.
to_drop <- c('methylotrophy', 'aerobic_ammonia_oxidation', 'sulfate_respiration',
             'dark_sulfite_oxidation', 'arsenate_respiration', 'nitrite_ammonification',
             'dissimilatory_arsenate_reduction', 'nitrite_denitrification',
             'nitrous_oxide_denitrification', 'nitrate_denitrification', 'fumarate_respiration',
             'mammal_gut', 'plant_pathogen', 'cyanobacteria', 'phototrophy',
             'anoxygenic_photoautotrophy_S_oxidizing', 'anoxygenic_photoautotrophy_Fe_oxidizing',
             'nitrate_respiration', 'aliphatic_non_methane_hydrocarbon_degradation',
             'aerobic_chemoheterotrophy', 'nitrite_respiration')

# The raw and log-transformed tables share column names, so one mask serves
# both. drop = FALSE keeps a data.frame even if a single column survives.
keep_cols <- !(colnames(fapro_set1) %in% to_drop)
fapro_set2 <- fapro_set1[, keep_cols, drop = FALSE]
fapro_setl2 <- fapro_lset[, keep_cols, drop = FALSE]

print(dim(fapro_set2))

# Report every pair of distinct columns whose absolute Pearson correlation
# exceeds 0.95. Each unordered pair is visited once.
n_fxn <- ncol(fapro_set2)
for (var1 in seq_len(n_fxn)) {
    var1n <- colnames(fapro_set2)[var1]
    for (var2 in var1:n_fxn) {
        if (var1 == var2) {
            next  # a column is trivially correlated with itself
        }
        var2n <- colnames(fapro_set2)[var2]
        this_cor <- abs(cor(fapro_set2[, var1n], fapro_set2[, var2n]))
        # cor() returns NA for a constant column; guard so `if` does not
        # fail with "missing value where TRUE/FALSE needed".
        if (!is.na(this_cor) && this_cor > 0.95) {
            two_names <- paste(var1n, var2n, sep = " & ")
            print(c(two_names, this_cor))
        }
    }
}


In [None]:
# Use a 6 x 6 plotting area for notebook figures.
options(repr.plot.width = 6, repr.plot.height = 6)

# PCA of the log-transformed functional table, without FactoMineR's own plots.
res.pca <- PCA(X = fapro_setl2, graph = FALSE)

# Scree plot with the percentage labelled on each bar and the y axis fixed
# to 0-50.
fviz_eig(
    res.pca,
    addlabels = TRUE,
    ylim = c(0, 50)
)

In [None]:
# For each of the first (up to) four principal components, print the
# coordinates of the variables with the largest absolute coordinate.
variables_ <- get_pca_var(res.pca)
n_top <- 40

# Do not assume four dimensions were retained by the PCA.
n_dims <- min(4, ncol(variables_$coord))
for (i in seq_len(n_dims)) {
    subvar <- sort(abs(variables_$coord[, i]), decreasing = TRUE)
    print(c("DIM", i))
    # head() caps at the number of available variables; `subvar[1:40]` would
    # pad with NA names and trigger a subscript error when there are fewer.
    print(variables_$coord[names(head(subvar, n_top)), i])
}

In [None]:
# Centre each column to mean 0 and scale it to unit standard deviation
# (the defaults of scale(), written out explicitly).
fapro_scaled <- scale(fapro_setl2, center = TRUE, scale = TRUE)

In [None]:
# Cluster-validity indices to evaluate with NbClust, one at a time.
selected <- c("ratkowsky", "pseudot2", "duda", "kl", "ch", "hartigan", "ball",
              "cindex", "db", "silhouette", "sdbw", "ptbiserial", "dindex",
              "dunn", "sdindex", "mcclain", "frey", "hubert")

# Run NbClust (Ward.D2 linkage on Euclidean distances, 2 to 15 clusters) for
# each index and print the number of clusters it recommends. The try() result
# is stored under the index name either way.
results <- list()
for (index_name in selected) {
    print(index_name)
    nb_fit <- try(NbClust(data = fapro_scaled, distance = "euclidean",
                          min.nc = 2, max.nc = 15, method = "ward.D2",
                          index = index_name))
    results[[index_name]] <- nb_fit
    # A failed run yields a "try-error" character vector; `$` on it raises an
    # error and would abort the whole loop, defeating the try().
    if (inherits(nb_fit, "try-error")) {
        message("NbClust failed for index '", index_name, "'; skipping.")
        next
    }
    print(nb_fit$Best.nc['Number_clusters'])
}



In [None]:
# Optional diagnostics (left disabled): variable contributions to PC1 / PC2.
#fviz_contrib(res.pca, choice = "var", axes = 1, top = 20)
#fviz_contrib(res.pca, choice = "var", axes = 2, top = 20)

# Variables to display on the PC1 vs PC2 variable plot.
first_two_vars <- c(
    'photoautotrophy', 'hydrocarbon_degradation', 'anoxygenic_phototrophy',
    'respiration_of_sulfur_compounds', 'methanogenesis', 'sulfur_respiration',
    'fermentation', 'nitrogen_respiration', 'iron_respiration',
    'chemoheterotrophy', 'human_pathogens_all', 'denitrification',
    'dark_sulfur_oxidation', 'chloroplasts', 'photoheterotrophy'
)

# Draw only the named variables, in black, with non-overlapping labels.
fviz_pca_var(
    res.pca,
    axes = c(1, 2),
    col.var = "black",
    select.var = list(name = first_two_vars),
    repel = TRUE
)

In [None]:
# Variable plot for the third and fourth principal components, all variables
# drawn in black.
fviz_pca_var(
    res.pca,
    col.var = "black",
    axes = c(3, 4)
)

In [None]:
# Partition the scaled profiles into three clusters around medoids, using
# Euclidean distances computed from the data matrix (not a dissimilarity).
pam.res <- pam(x = fapro_scaled, k = 3, diss = FALSE, metric = "euclidean")

# Cluster membership as a factor, and one colour per cluster.
cluster_groups <- as.factor(pam.res$clustering)
cluster_palette <- c("#00AFBB", "#E7B800", "#FC4E07")

# Individuals on the PCA map, coloured by PAM cluster.
fviz_pca_ind(
    res.pca,
    geom.ind = "point",        # points only, no text labels
    col.ind = cluster_groups,  # colour by cluster
    palette = cluster_palette,
    addEllipses = TRUE,
    ellipse.type = "convex",   # convex hull around each group
    legend.title = "Groups"
)

In [None]:
library(glmnet)
library(caret)
library(randomForest)  # randomForest() is called directly below

# Benchmark how well the log-transformed functional profiles predict each
# metadata variable. For every response column and every repetition, the
# samples with a non-missing response are split 75/25 into train/test, three
# models are fitted (ridge, lasso, random forest), and the mean absolute
# error on the held-out quarter is stored in scores_per_model.
#
# A stray `mean(abs(error))` that preceded this code was removed: `error` is
# not defined anywhere in the visible notebook.

select_cols <- c('Year', 'Month','Discharge_Susquehanna_14', 'day_length', 'Latitude', 'Depth',
                 'habitat', 'enspie', 'faith_pd', 'StationName', 'CollectionAgency', 'anti_day_length',
                 'Month_Year', 'WTEMP', 'SALINITY', 'DO', 'DOP', 'NO2F', 'PC', 'PHEO', 'NO3F', 'NH4F',
                 'TP', 'CHLA', 'TDN')

scores_per_model <- list()
folds <- 10

# One row per response variable, one column per repetition (all NA to start).
score_matrix <- data.frame(data = matrix(nrow = length(select_cols), ncol = folds),
                           row.names = select_cols)
# `[[<-` stores the whole data.frame; `[<-` would keep only its first column.
scores_per_model[["ridge"]] <- score_matrix
scores_per_model[["lasso"]] <- score_matrix
scores_per_model[["randomforest"]] <- score_matrix

# Loop-invariant settings.
predictor_cols <- colnames(fapro_setl2)
lambdas <- 10^seq(2, -3, by = -.1)
control <- trainControl(method = "repeatedcv", number = 5, repeats = 1)
# mtry cannot exceed the number of predictors.
tunegrid <- expand.grid(.mtry = seq_len(min(20, length(predictor_cols))))

counter <- 0
for (a_col in select_cols) {
    # The models below are regressions scored by absolute error, so a
    # non-numeric (or absent) response cannot be evaluated; its scores stay NA.
    if (!is.numeric(env_data[[a_col]])) {
        message("Skipping non-numeric response column: ", a_col)
        counter <- counter + folds  # keep seeds tied to (column, repetition)
        next
    }

    # Response restricted to samples where it is observed, named by sample.
    has_resp <- !is.na(env_data[[a_col]])
    resp_vector <- env_data[has_resp, a_col]
    names(resp_vector) <- rownames(env_data)[has_resp]

    # glmnet requires a numeric matrix of predictors, not a data.frame.
    pred_matrix <- as.matrix(fapro_setl2[names(resp_vector), predictor_cols, drop = FALSE])

    for (fold in colnames(score_matrix)) {
        counter <- counter + 1
        set.seed(123 * counter)

        # Random 75/25 train/test split.
        n_obs <- nrow(pred_matrix)
        smp_size <- floor(0.75 * n_obs)
        train_ind <- sample(seq_len(n_obs), size = smp_size)
        train_x <- pred_matrix[train_ind, , drop = FALSE]
        train_y <- resp_vector[train_ind]
        test_x <- pred_matrix[-train_ind, , drop = FALSE]
        test_y <- resp_vector[-train_ind]

        # Ridge regression (alpha = 0), lambda chosen by cross-validation.
        cv_ridge <- cv.glmnet(train_x, train_y, alpha = 0, lambda = lambdas)
        ridge_pred_test <- predict(cv_ridge, s = cv_ridge$lambda.min, newx = test_x)

        # Lasso regression (alpha = 1), lambda chosen by 5-fold CV.
        lasso_reg <- cv.glmnet(train_x, train_y, alpha = 1, lambda = lambdas,
                               standardize = TRUE, nfolds = 5)
        lasso_pred_test <- predict(lasso_reg, s = lasso_reg$lambda.min, newx = test_x)

        # Random forest: tune mtry with caret, then refit with the best value.
        fit <- train(x = train_x, y = train_y,
                     method = "rf", metric = "RMSE", tuneGrid = tunegrid,
                     trControl = control, ntree = 1000)
        # bestTune holds the mtry with the lowest RMSE; the original picked
        # the row with the highest RMSE.
        opt_mtry <- fit$bestTune$mtry
        # Reproducibility comes from set.seed() above; randomForest has no
        # `random_state` argument.
        rf_model <- randomForest(x = train_x, y = train_y,
                                 ntree = 1000, mtry = opt_mtry)
        rf_pred_test <- predict(rf_model, newdata = test_x)

        # Mean absolute error on the held-out samples.
        scores_per_model[["ridge"]][a_col, fold] <- mean(abs(as.numeric(ridge_pred_test) - test_y))
        scores_per_model[["lasso"]][a_col, fold] <- mean(abs(as.numeric(lasso_pred_test) - test_y))
        scores_per_model[["randomforest"]][a_col, fold] <- mean(abs(as.numeric(rf_pred_test) - test_y))
    }
}



In [None]:
library(vegan)


In [None]:
# Reshape the functional table to long format (one row per sample x function)
# and draw a stacked bar per sample.
# fapro_set1 keeps sample names as row names, so they are first copied into
# an explicit "Sample" column for melt() to use as the id variable. The
# measured variables are the function columns, not the row names.
fapro_wide <- data.frame(Sample = rownames(fapro_set1), fapro_set1,
                         check.names = FALSE)
fapro_melt1 <- melt(data = fapro_wide, id.vars = "Sample",
                    measure.vars = colnames(fapro_set1))
colnames(fapro_melt1) <- c('samp_name', 'fxn', 'rel_abund')

# NOTE(review): the column is called rel_abund but the values come straight
# from fapro_set1 without normalisation -- confirm the intended units.
p <- ggplot(fapro_melt1, aes(fill = fxn, y = rel_abund, x = samp_name)) +
    geom_bar(position = "stack", stat = "identity")
p <- p + theme(legend.position = "bottom")
p <- p + guides(fill = guide_legend(nrow = 5, byrow = TRUE))
p

In [None]:
# Stacked + percent: each sample's bar is rescaled to sum to 1, so segments
# show the share of each function within the sample.
# The original call referenced `data`, `condition`, `value` and `specie`,
# none of which are defined in this notebook; it is pointed at the long-format
# table fapro_melt1 and its columns instead.
# NOTE(review): confirm fapro_melt1 is the table this plot was meant to show.
ggplot(fapro_melt1, aes(fill = fxn, y = rel_abund, x = samp_name)) +
    geom_bar(position = "fill", stat = "identity")