In [None]:
library(FactoMineR)
library(factoextra)
library(corrplot)
library(NbClust)
library(cluster)
library(repr)
options(repr.plot.width=4, repr.plot.height=4)
library(randomForest)
library(mlbench)
library(caret)


In [None]:
# Locations of the two cleaned, tab-separated metadata tables.
all_data_f <- "../otu_data/WaterQualityData/matched_cleaned_data/all_mdata_colset_2.tsv"
transect_data_f <- "../otu_data/WaterQualityData/matched_cleaned_data/transect_mdata_colset_1.tsv"

# The first column of each file supplies the row names.
all_df <- read.delim(file = all_data_f, row.names = 1)
tran_df <- read.delim(file = transect_data_f, row.names = 1)

# Report (rows, columns) for each table, transect table first.
print(dim(tran_df))
print(dim(all_df))

In [None]:
# Columns used as active variables for the transect-only analysis.
active_tran <- c(
  "WTEMP", "SPCOND", "DO", "DOP", "NO2F", "PC",
  "PHEO", "NO3F", "NH4F", "TP", "CHLA"
)
# Columns selected from the full table.
active_all <- c("WTEMP", "SALINITY", "DO", "PH")

# Column subsets of each table, restricted to the active variables.
all_active <- all_df[, active_all]
tran_active <- tran_df[, active_tran]

# Sanity output: subset dimensions, then every column name in the transect table.
print(dim(all_active))
print(dim(tran_active))
print(colnames(tran_df))

In [None]:
# Principal component analysis of the active transect variables.
# graph = FALSE keeps PCA() from drawing its own plots, so the figure below
# is the only output of this cell.
res.pca <- PCA(tran_active, graph = FALSE)
# Scree plot of the PCA eigenvalues with bar labels; y-axis limited to 0-50.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

In [None]:
# Variable plot for res.pca on principal axes 1 and 2, variables drawn in black.
fviz_pca_var(res.pca, axes = c(1, 2), col.var = "black")


In [None]:
# Same variable plot as for axes 1-2, but on principal axes 3 and 4.
fviz_pca_var(res.pca, axes = c(3, 4), col.var = "black")

In [None]:
# Widen the notebook figure for the contribution matrix.
options(repr.plot.width = 6, repr.plot.height = 4)

# Per-variable PCA results. Stored as `pca_var` rather than `var` so the
# object does not mask the name of stats::var().
pca_var <- get_pca_var(res.pca)

# Plot the transposed contribution matrix. is.corr = FALSE because the
# contributions are not correlation coefficients.
corrplot(t(pca_var$contrib), is.corr = FALSE)

In [None]:
# Standardize the active transect variables with scale() (default arguments).
tran_active_sc <- scale(tran_active)
# Evaluate candidate cluster counts from 2 to 15 with NbClust, using Euclidean
# distance, Ward (ward.D2) linkage and index = 'all'.
# NOTE(review): `number` is not read again in the visible code; the cluster
# count passed to pam() further down is a hard-coded literal — confirm it was
# chosen from this output.
number <- NbClust(tran_active_sc, distance="euclidean", min.nc=2, max.nc=15, method='ward.D2', index='all')

In [None]:
# Partition the standardized samples into 3 clusters with PAM, working on the
# observations themselves (diss = FALSE) with the Euclidean metric.
pam.res <- pam(x = tran_active_sc, k = 3, diss = FALSE, metric = "euclidean")

# Cluster membership as a factor, and one colour per cluster.
habitat_groups <- as.factor(pam.res$clustering)
habitat_palette <- c("#00AFBB", "#E7B800", "#FC4E07")

# Individuals plot of the PCA, coloured by PAM cluster.
fviz_pca_ind(
  res.pca,
  geom.ind = "point",        # show points only (not text labels)
  col.ind = habitat_groups,  # colour by cluster membership
  palette = habitat_palette,
  addEllipses = TRUE,
  ellipse.type = "convex",   # convex hull around each group
  legend.title = "Groups"
)

In [None]:
# Wider figure for the biplot.
options(repr.plot.width = 6, repr.plot.height = 4)

# Continuous colouring variable for the individuals.
depth_pct <- tran_df[, "Depth_Percentage"]
# Only these variables are drawn as arrows.
shown_vars <- c("SPCOND", "PHEO", "WTEMP", "NO2F", "TP", "NO3F", "DO")

fviz_pca_biplot(
  res.pca,
  label = "var",
  col.var = "black",
  geom.ind = "point",    # show points only (not text labels)
  col.ind = depth_pct,   # colour individuals along a continuous gradient
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  legend.title = "Pct of Total Depth",
  select.var = list(name = shown_vars),
  repel = TRUE
)


In [None]:
# Summarise each PAM cluster ("habitat") by the median and IQR of every
# variable, expressed in units of the overall standard deviation, and rank
# variables by how strongly the cluster medians deviate from the overall mean.

# extract the classification
habitat <- as.data.frame(pam.res$clustering)
colnames(habitat) <- "HABITAT"
habitat$HABITAT <- factor(habitat$HABITAT)
n_clusts <- nlevels(habitat$HABITAT)

# add a few more variables of interest
active_tran_plus <- c(active_tran, "day_length", "Depth_Percentage", "Latitude")
tran_active_plus <- tran_df[rownames(habitat), active_tran_plus]

# Per-cluster median and interquartile range of every variable.
clust_med <- aggregate(x = tran_active_plus, by = habitat, FUN = median)
clust_iqr <- aggregate(x = tran_active_plus, by = habitat, FUN = IQR)

# Row labels are built from the groups aggregate() actually returned, so they
# cannot drift out of step with the row order. Both label sets use the same
# "-" separator (the median labels previously used a space).
med_cols <- paste(clust_med$HABITAT, "Med", sep = "-")
iqr_cols <- paste(clust_iqr$HABITAT, "IQR", sep = "-")
print(med_cols)
rownames(clust_med) <- med_cols
print(iqr_cols)
rownames(clust_iqr) <- iqr_cols

clust_data <- rbind(clust_med, clust_iqr)

# Drop the grouping column by name rather than by position.
clust_data$HABITAT <- NULL

for (cn in colnames(clust_data)) {
  overall_mean <- mean(tran_active_plus[, cn])
  overall_sd <- sd(tran_active_plus[, cn])
  # Cluster medians as z-scores relative to the overall mean and sd.
  clust_data[med_cols, cn] <- round(
    (clust_data[med_cols, cn] - overall_mean) / overall_sd, 2
  )
  # Cluster IQRs in units of the overall sd.
  clust_data[iqr_cols, cn] <- round(clust_data[iqr_cols, cn] / overall_sd, 2)
  # Separation score: sum of absolute (rounded) standardized medians.
  clust_data["separation", cn] <- sum(abs(clust_data[med_cols, cn]))
}

# Variables as rows, statistics as columns, most separating variables first.
clust_data_t <- as.data.frame(t(clust_data[sort(rownames(clust_data)), ]))
print(sum(clust_data_t$separation))
clust_data_t[order(-clust_data_t$separation), ]


In [None]:
# Build the labelled dataset (samples that were clustered) and the unlabelled
# dataset (every other sample in all_df) for the habitat classifier.

# Named vector: sample ID -> PAM cluster.
habitat <- pam.res$clustering
# Predictors used by the classifier; replaces the earlier 4-column definition.
active_all <- c("WTEMP", "SALINITY", "DO", "PH", "Latitude", "Depth_Percentage")

# Fail early if a clustered sample is absent from all_df: indexing by an
# unknown row name would otherwise silently produce an all-NA row.
stopifnot(all(names(habitat) %in% rownames(all_df)))

# Samples with no cluster label are the ones to be predicted later.
test_rows <- setdiff(rownames(all_df), names(habitat))
print(c(length(test_rows), length(rownames(all_df)), length(names(habitat))))
test_dataset <- all_df[test_rows, active_all]

# Labelled samples: predictors plus the cluster as a factor response.
all_in_tran <- all_df[names(habitat), active_all]
simple_dataset <- cbind(all_in_tran, factor(habitat))
colnames(simple_dataset) <- c(active_all, "HABITAT")

# str() prints its own output; wrapping it in print() only added a stray NULL.
str(simple_dataset)
str(test_dataset)

In [None]:
# Resampling scheme: 10-fold cross-validation, repeated twice.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
# Candidate values of mtry to evaluate.
tunegrid <- expand.grid(.mtry = 1:5)
modellist <- list()
set.seed(4)
# Random forest on all predictors in simple_dataset, scored by accuracy;
# ntree is forwarded through train()'s extra arguments.
fit <- train(
  HABITAT ~ .,
  data = simple_dataset,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = tunegrid,
  trControl = control,
  ntree = 1500
)
# print(fit)
attributes(fit)

In [None]:
# mtry value(s) whose resampled accuracy equals the best accuracy observed
# (first column of fit$results is mtry, second is Accuracy).
cv_results <- fit$results
cv_results$mtry[cv_results$Accuracy == max(cv_results$Accuracy)]

In [None]:
# Hold-out evaluation: fit a random forest on 75% of the labelled samples and
# cross-tabulate its predictions against the true habitat on the other 25%.
set.seed(95)
smp_size <- floor(0.75 * nrow(simple_dataset))
train_ind <- sample(seq_len(nrow(simple_dataset)), size = smp_size)
train_df <- simple_dataset[train_ind, ]
test_df <- simple_dataset[-train_ind, ]

# Reproducibility comes from set.seed() above. The former `random_state = 0`
# argument is a scikit-learn name that randomForest() does not define; it was
# silently absorbed by `...` and had no effect, so it is removed.
# NOTE(review): mtry = 2 is hard-coded — confirm it matches the value selected
# by the cross-validated tuning run.
classifier <- randomForest(
  x = train_df[, active_all],
  y = train_df[, "HABITAT"],
  ntree = 1000,
  mtry = 2
)

# Rows: true habitat; columns: predicted habitat.
y_pred <- predict(classifier, newdata = test_df[, active_all])
cm <- table(test_df[, "HABITAT"], y_pred)
cm

In [None]:
# Predict a habitat for every sample that was not part of the clustering.
y2_pred <- predict(classifier, newdata = test_dataset)

# These samples have no true labels, so there is nothing to cross-tabulate:
# report the share of predictions falling in each habitat. The divisor is the
# actual number of predictions instead of a hard-coded 110.
cm2 <- table(y2_pred)
cm2 / length(y2_pred)

In [None]:
# Attach a habitat id to every sample in all_df and write the table out.
# 0 marks "not yet assigned"; the count of remaining zeros is printed after
# each step as a progress check.
ad_out <- "../otu_data/WaterQualityData/matched_cleaned_data/all_mdata_with_habitat.txt"
all_df[, "habitat"] <- 0
print(sum(all_df[, "habitat"] == 0))

# Habitat values are factors whose labels are the cluster numbers. Convert via
# the labels explicitly instead of relying on implicit factor coercion when
# assigning into a numeric column.
all_df[rownames(simple_dataset), "habitat"] <-
  as.integer(as.character(simple_dataset[, "HABITAT"]))
print(sum(all_df[, "habitat"] == 0))

# Predicted habitats for the remaining samples, matched by sample name.
all_df[names(y2_pred), "habitat"] <- as.integer(as.character(y2_pred))
print(sum(all_df[, "habitat"] == 0))

write.table(x = all_df, file = ad_out, sep = "\t")