Skip to content

kmiller222/es494_data

Repository files navigation

# Capstone Final Plot Code
# load packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(stringr)
library(vegan) # for diversity indices
library(gridExtra)
# import data file 1
dat <- read.csv("data_sheet.csv")
dat1 <- dat %>% 
  select(ID, Site, Date, Diatoms, Dinoflagellates, 
         Cyanobacteria, Chlorophyta, Chrysophyta, TotalCells) %>% 
  na.omit(dat) # omit rows with NA values
# df with proportions
dat_prop <- dat1 %>% 
  mutate(Diatoms_prop = Diatoms/TotalCells,
        Dinoflagellates_prop = Dinoflagellates/TotalCells,
        Cyanobacteria_prop = Cyanobacteria/TotalCells,
        Chlorophyta_prop = Chlorophyta/TotalCells,
        Chrysophyta_prop = Chrysophyta/TotalCells,
        uniqueID = paste(Date, Site),
        Date = as.character(Date),
        Month = str_split_fixed(Date, "/", 2)[ ,1],
        Day = str_split_fixed(Date, "/", 2)[ ,2],
        Date = dmy(paste(Day, Month, 2020)))
# df for pie chart 
dat_pie_prop <- dat_prop %>% 
  select(-Diatoms, -Dinoflagellates, -Cyanobacteria, -Chlorophyta, -Chrysophyta, -TotalCells) %>% 
  pivot_longer(cols = 4:8, names_to = "phyto_type") %>% 
  group_by(ID) 
# pie chart of proportions for each sample
ggplot(dat_pie_prop, aes(x="", y=value, fill=phyto_type)) +
  facet_wrap(~ ID, ncol = 10) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y") + 
  scale_fill_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                             "#308DF5","#D83535"),
                    labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                             "Diatoms", "Dinoflagellates"),
                    name="Phytoplankton taxonmic group") +
  ylab("Site") + xlab("Date") +
  theme(axis.text.x=element_blank()) +
  ggtitle("Proportion of phytoplankton taxonomic groups in each sample")
# line plot proportions for each site over time
dat_line_prop <- dat_prop %>% 
  select(-Diatoms, -Dinoflagellates, -Cyanobacteria, -Chlorophyta, -Chrysophyta, -TotalCells) %>% 
  pivot_longer(cols = 4:8, names_to = "phyto_type") %>% 
  group_by(Site)
ggplot(dat_line_prop, aes(x=Date, y=value, color=phyto_type)) +
  facet_wrap(~ Site, nrow =2) + geom_line() + 
  scale_color_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                               "#308DF5","#D83535", "black"),
                     labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                              "Diatoms", "Dinoflagellates"),
                     name="Phytoplankton taxonomic group") +
  theme_classic() + ylab("Proportion") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 1, byrow = TRUE)) +
  theme(legend.text=element_text(size=12))
# 5 taxonomic groups averaged by date over time
dateavg_datp <- dat_prop %>% 
  group_by(Date) %>% 
  summarise(Diatoms_mean = mean(Diatoms_prop),
            Dinoflagellates_mean = mean(Dinoflagellates_prop),
            Cyanobacteria_mean = mean(Cyanobacteria_prop),
            Chlorophyta_mean = mean(Chlorophyta_prop),
            Chrysophyta_mean = mean(Chrysophyta_prop)) %>% 
  pivot_longer(cols = -1, names_to = "phyto_type")
# Create date vector (IMPORTANT FOR ALL PLOTS FOLLOWING)
dates <- unique(dateavg_datp$Date)
# most important capstone plot: phyto community structure over time
ggplot(dateavg_datp, aes(x=Date, y=value, color=phyto_type)) + geom_line() + geom_point() +
  scale_color_manual(values=c("#24AB55", "#FF7A32", "#FFC433", 
                              "#308DF5","#D83535", "black"),
                     labels=c("Chlorophytes", "Chrysophytes", "Cyanobacteria", 
                              "Diatoms", "Dinoflagellates"),
                     name="Phytoplankton taxonomic group") +
  theme_classic() + ylab("Proportion") + 
  scale_x_date(breaks = dates, date_labels = "%b %d") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 3, byrow = TRUE))
# Simpson and Shannon Diversity indices plots
phytoData2 <- read.csv("sites_data.csv") #load phytoplankton data
phytoData3 <- phytoData2 %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020)))
phytoData3$simpDiversity <- diversity(phytoData2[,3:7], "simpson") #simpson index
phytoData3$shanDiversity <- diversity(phytoData2[,3:7], "shannon") #shannon index
# simpson boxplot
ggplot(phytoData3, aes(x = Date, y = simpDiversity, group = Date)) + 
  geom_boxplot() + ylab("Simpson Diversity Index") +
  scale_x_date(breaks = dates, date_labels = "%b %d") + theme_classic()
# shannon boxplot
ggplot(phytoData3, aes(x = Date, y = shanDiversity, group = Date)) + 
  geom_boxplot(fill = "bisque") + ylab("Shannon Diversity Index")
# Boxplot and ANOVA tests
# boxplots
ggplot(dat_prop, aes(x = Date, y = Cyanobacteria_prop, group = Date)) + 
  geom_boxplot(fill = "bisque") + ylab("Proportion of Cyanobacteria") +
  theme_classic() + scale_x_date(breaks = dates, date_labels = "%b %d")
# anova
anova.test <- anova(lm(Cyanobacteria_prop ~ Date, dat_prop))
# anova test for difference in date
as.factor(dat_prop$Date) -> dat_prop$Date
aov(Cyanobacteria_prop ~ Date, dat_prop) -> aov.test
summary(aov.test)
TukeyHSD(aov.test)
# anova test for difference in site
anova.test_cyano_site <- anova(lm(Cyanobacteria_prop ~ Site, dat_prop))
aov.test_cyano_site <- aov(Cyanobacteria_prop ~ Site, dat_prop)
summary(aov.test_cyano_site)
TukeyHSD(aov.test_cyano_site)
# Environmental Variables plots
# import the data
env_dat <- read.csv("environmental_variables_data.csv")
env_dat2 <- env_dat %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020)))
# Chl-a and cyano final plot
coeff <- 100
# create plot 
ggplot(env_dat2, aes(x=Date)) +
  geom_line(aes(y=Prop_cyano, color = "Proportion of Cyanobacteria")) + 
  geom_point(aes(y=Prop_cyano, color = "Proportion of Cyanobacteria")) +
  geom_line(aes(y=Chl_a_conc / coeff, color = "Chl-a concentration")) + 
  geom_point(aes(y=Chl_a_conc / coeff, color = "Chl-a concentration")) +
  scale_y_continuous(name = "Proportion of Cyanobacteria",
    sec.axis = sec_axis(~.*coeff, name="Chl-a concentration")) +
  theme_classic() + scale_x_date(breaks = dates, date_labels = "%b %d") +
  theme(legend.position="bottom") +
  guides(col = guide_legend(nrow = 1, byrow = TRUE)) +
  theme(legend.text=element_text(size=12), 
        axis.text=element_text(size=12),
        axis.title =element_text(size =14))
# load the nutrient data
nut_datb <- read.csv("nutrientdat copy.csv")
# correlation: cyano vs nutrient ratio 
ggplot(nut_datb, aes(x = din_srp_ratio, y=cyano_prop)) + geom_point() +
  ylab("Proportion of Cyanobacteria") + xlab("DIN:SRP ratio") +
  theme_classic() + stat_smooth(method ="lm")
regression1 <- lm(cyano_prop ~ din_srp_ratio, dat = nut_datb)
summary(regression1)
# Abiotic factors plots
se_samp_dat <- read.csv("sites_environmental_data.csv")
se_samp_dat2 <- se_samp_dat %>% 
  mutate(Date = as.character(Date),
         Month = str_split_fixed(Date, "/", 2)[ ,1],
         Day = str_split_fixed(Date, "/", 2)[ ,2],
         Date = dmy(paste(Day, Month, 2020))) %>% 
  select(-Month, -Day)
# average by date
se_samp_dat_sum <- se_samp_dat2 %>%   
  group_by(Date) %>% 
  summarise(avg_s.m = mean(S.m),
            avg_temp = mean(Temp),
            avg_secchi = mean(Secchi))
# errors table
error_se_samp_dat_sum <- se_samp_dat2 %>%   
  group_by(Date) %>% 
  summarise(s.m_se = mean(S.m_se),
            temp_se = mean(Temp_se),
            secchi_se = mean(Secchi_se))
# flip errors table
error_flip <- error_se_samp_dat_sum %>% 
  rename("Mean Secchi Depth (m)" = secchi_se,
         "Mean Temperature (˚C)" = temp_se,
         "Mean Conductivity (S/m)" = s.m_se) %>% 
  pivot_longer(cols = 2:4, names_to = "Variable") %>% 
  arrange(Date, Variable) %>% 
  mutate(unique_id = c(1:18)) %>% 
  rename("se" = value)
# plot df
se_samp_dat_flip <- se_samp_dat_sum %>% 
  rename("Mean Secchi Depth (m)" = avg_secchi,
         "Mean Temperature (˚C)" = avg_temp,
         "Mean Conductivity (S/m)" = avg_s.m) %>% 
  pivot_longer(cols = 2:4, names_to = "Variable") %>% 
  arrange(Date, Variable) %>% 
  mutate(unique_id = c(1:18))
# join the dfs
se_samp_dat_plot <- left_join(se_samp_dat_flip, error_flip, by = "unique_id")
se_samp_dat_plot2 <- se_samp_dat_plot %>% 
  rename("Date" = Date.x,
         "Variable" = Variable.x) %>% 
  select(Date, Variable, value, se)
# 3 tier env variable plot for our SAMPLE data:
ggplot(se_samp_dat_plot2, aes(x=Date, y=value, group = 1, col = Variable)) +  
  geom_line() + geom_point() +
  geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.2) +
  facet_grid(Variable ~ ., scales = "free") + theme_classic() +
  scale_x_date(breaks = dates, date_labels = "%b %d")