# Abundance Figures

In [None]:
library(tidyverse)
library(scatterpie)
library(scales)
library(cowplot)
library(ggnewscale)

In [None]:
# Sample metadata from NCBI. The join key used below is the column that
# read_csv names `...1` (i.e. the CSV's first column has no header).
metadata <- read_csv("./data/tara_traces_merged.csv")

# Estimated abundances at family and genus level
family_data <- read_csv("./data/sar86_families_global_abundance_estimates.csv")
genera_data <- read_csv("./data/sar86_genera_global_abundance_estimates.csv")

# Full outer joins (all = TRUE): samples present in only one of the two tables
# are kept, with NA in the columns coming from the other table.
merged_data <- merge(
  x = family_data,
  y = metadata,
  by.x = "sample",
  by.y = "...1",
  all = TRUE
)
merged_genera_data <- merge(
  x = genera_data,
  y = metadata,
  by.x = "sample",
  by.y = "...1",
  all = TRUE
)

In [None]:
#' Convert proportions to percentages.
#'
#' @param x Numeric vector (or anything supporting `*`) of proportions.
#' @return `x` multiplied by 100, same length as `x`.
convert_to_percent <- function(x) {
  100 * x
}

# Turn proportions to percents for every numeric column of the family table.
# `across(where(is.numeric), ...)` replaces the superseded `mutate_if()`.
# NOTE(review): `merged_data` is built from `family_data` before this
# conversion, so the merged table is not rescaled by this step - confirm
# that the figures labelled "(%)" receive percent-scaled values.
family_data <- family_data %>%
  mutate(across(where(is.numeric), convert_to_percent))

## Size Fractionation figure

In [None]:
# Size Fractionation
# Boxplot of per-family abundance for each filter size class (TARA samples
# only). `merged_data_300` and `family_level_order` are defined outside this
# cell.
merged_data_300 %>%
  filter(startsWith(sample, "TARA")) %>%
  # Label each sample with its "lower upper" size-fraction thresholds
  mutate(
    concat_filter = paste(
      `Size fraction lower threshold [micrometre]`,
      `Size fraction upper threshold [micrometre]`
    )
  ) %>%
  # Long format: one row per sample x family
  pivot_longer(
    cols = all_of(family_level_order),
    names_to = "family",
    values_to = "abundance"
  ) %>%
  ggplot(aes(
    x = concat_filter,
    y = abundance,
    # Spell out `levels` (the original relied on partial matching of `level`)
    fill = factor(family, levels = family_level_order)
  )) +
  geom_boxplot() +
  labs(x = "", y = "Relative Abundance (%)") +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0),
    legend.position = c(0.85, 0.85),
    legend.background = element_rect(fill = "white", color = "black")
  ) +
  # pseudo-log keeps zero abundances plottable on a log-like axis
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = scales::pretty_breaks(n = 8)
  )

## Abundance Figures

In [None]:
# Filter down the data to what the depth-horizon figure needs and derive the
# SAR86 total and the depth zone in a single pipeline.
base_data_family <- merged_data %>%
  # Remove virus filter sample sizes
  filter(concat_filter != "< 0.22", concat_filter != "0.1 0.22") %>%
  select(sample, "Sampling depth [m]", SAR156, RedeBAC7D11, "CHAB-I-7", "Suzuki") %>%
  # SAR86 total is the sum of the four family columns
  mutate(SAR86 = `Suzuki` + `CHAB-I-7` + RedeBAC7D11 + SAR156) %>%
  # Depth zones: 0-100, 100-200, 200-300 and everything deeper.
  # case_when() evaluates top-down, so each branch only needs its upper bound;
  # the explicit NA branch keeps samples without a depth as NA.
  mutate(DZ = case_when(
    is.na(`Sampling depth [m]`) ~ NA_character_,
    `Sampling depth [m]` <= 100 ~ "Upper Euphotic (0-100m)",
    `Sampling depth [m]` <= 200 ~ "Lower Euphotic (100-200m)",
    `Sampling depth [m]` <= 300 ~ "Upper Mesopelagic (200-300m)",
    TRUE ~ "Below 300m"
  ))

In [None]:
# Depth horizon SAR86 + families figure
# Plotting order of the depth zones; samples deeper than 300 m are filtered
# out below, so "Below 300m" never reaches the plot.
DZ_order <- c("Upper Euphotic (0-100m)", "Lower Euphotic (100-200m)", "Upper Mesopelagic (200-300m)")
base_data_family %>%
  filter(`Sampling depth [m]` <= 300) %>%
  # Long format: one row per sample x family (SAR86 total included)
  pivot_longer(
    cols = all_of(c("SAR86", family_level_order)),
    names_to = "family",
    values_to = "abundance"
  ) %>%
  ggplot(aes(
    # `levels` spelled out (the original relied on partial matching of `level`)
    x = factor(family, levels = c("SAR86", family_level_order)),
    y = abundance,
    fill = factor(DZ, levels = DZ_order)
  )) +
  geom_boxplot() +
  scale_fill_manual(
    name = "Depth Horizons",
    values = c("#F9FCCA", "#66C2A4", "#3669D5"),
    labels = DZ_order
  ) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.8),
    legend.position = c(0.85, 0.75),
    legend.background = element_rect(fill = "white", color = NA)
  ) +
  # The x axis title is intentionally blank (the original set "Family" and
  # then overrode it with xlab("")).
  labs(x = "", y = "Relative Abundance (%)") +
  # Rename the named classifications
  scale_x_discrete(labels = sar156_family_rename_alt) +
  scale_y_continuous(trans = scales::pseudo_log_trans(base = 10))

In [None]:
# Display the genus table without the virus-sized filter fractions.
# NOTE(review): the filtered result is only shown, not assigned, so
# `merged_genera_data` itself still contains these fractions - confirm intent.
merged_genera_data %>%
  filter(concat_filter != "< 0.22", concat_filter != "0.1 0.22")

In [None]:
# Genus columns, in plotting order (grouped by family prefix)
sar86_generas <- c(
  "Suzuki_G01", "Suzuki_G02", "Suzuki_G03", "Suzuki_G04", "Suzuki_G05",
  "Suzuki_G06", "Suzuki_G07", "Suzuki_G08", "Suzuki_G09", "Suzuki_G10",
  "CHAB-I-7_G1", "CHAB-I-7_G2", "CHAB-I-7_G3", "CHAB-I-7_G4",
  "RedeBAC7D11_G1", "RedeBAC7D11_G2", "RedeBAC7D11_G3", "RedeBAC7D11_G4",
  "SAR156_G1", "SAR156_G2", "SAR156_G3", "SAR156_G4"
)

# Filter down the data (sample id and depth only)
base_data_genera <- merged_genera_data %>%
  select(sample, "Sampling depth [m]")

# Depth zones: 0-100, 100-200, 200-300 and everything deeper.
# The zone is computed from merged_genera_data's own depth column instead of
# being copied positionally from `base_data_genera`, so the assignment no
# longer depends on the two tables staying row-aligned.
# Samples without a depth keep NA.
merged_genera_data <- merged_genera_data %>%
  mutate(DZ = case_when(
    is.na(`Sampling depth [m]`) ~ NA_character_,
    `Sampling depth [m]` <= 100 ~ "Upper Euphotic (0-100m)",
    `Sampling depth [m]` <= 200 ~ "Lower Euphotic (100-200m)",
    `Sampling depth [m]` <= 300 ~ "Upper Mesopelagic (200-300m)",
    TRUE ~ "Below 300m"
  ))

In [None]:
# Depth horizon generas figure
# Boxplots of genus abundance per depth zone, faceted by family.
# `family_level_order`, `DZ_order`, `mike_formatting`, `genus_list_link_alt`
# and `sar156_family_rename` are defined outside this cell.
merged_genera_data %>%
  filter(`Sampling depth [m]` <= 300) %>%
  # Long format: one row per sample x genus
  pivot_longer(
    cols = all_of(sar86_generas),
    names_to = "genus",
    values_to = "abundance"
  ) %>%
  # Create a family column for the facet plot: the genus name up to the
  # first underscore
  mutate(family = factor(str_replace(genus, "_.*", ""), levels = family_level_order)) %>%
  ggplot(aes(
    # `levels` spelled out (the original relied on partial matching of `level`)
    x = factor(genus, levels = sar86_generas),
    y = abundance,
    fill = factor(DZ, levels = DZ_order)
  )) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(name = "Depth Horizons", values = c("#F9FCCA", "#66C2A4", "#3669D5")) +
  labs(y = "Relative Abundance (%)") +
  mike_formatting +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.85),
    legend.position = c(0.85, 0.85),
    legend.background = element_blank(),
    strip.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA),
    strip.text.x = element_text(size = 16, colour = "black"),
    strip.placement = "outside"
  ) +
  scale_y_continuous(trans = scales::pseudo_log_trans(base = 10)) +
  facet_grid(
    . ~ family,
    scales = "free_x",
    space = "free_x",
    labeller = as_labeller(sar156_family_rename),
    switch = "x"
  ) +
  # Rename the genus tick labels. Only one x scale is kept: the original
  # added scale_x_discrete(labels = sar156_genus_rename_full) first, which
  # ggplot2 discarded in favour of this later scale.
  scale_x_discrete(labels = genus_list_link_alt) +
  # Blank x axis title (set last so it wins over any earlier label)
  xlab("")

In [None]:
# Global Ocean data averages figures
# Add ocean basin sections column for BIOGEOTRACES samples (ids starting with
# "BGT"), limited to depths <= `max_depth` (defined outside this cell).
# Samples matching none of the boxes below get NA as `ocean_section`.
bgt_data <- merged_data %>%
  filter(startsWith(sample, "BGT")) %>%
  filter(`Sampling depth [m]` <= max_depth) %>%
  mutate(ocean_section = case_when(
    `Longitude [degrees East]` > -100 & `Longitude [degrees East]` < -5 &
      `Latitude [degrees North]` > 0 ~ "(NAO) North Atlantic Ocean",
    `Longitude [degrees East]` > -70 & `Longitude [degrees East]` < 20 &
      `Latitude [degrees North]` < 0 ~ "(SAO) South Atlantic Ocean",
    # Parentheses are required: `&` binds tighter than `|`, so without them
    # every sample east of 145 degrees was labelled South Pacific regardless
    # of latitude.
    (`Longitude [degrees East]` > 145 | `Longitude [degrees East]` < -70) &
      `Latitude [degrees North]` < 0 ~ "(SPO) South Pacific Ocean"
  ))

# Filter out samples we don't want and combine TARA with BIOGEOTRACES
full_data <- merged_data %>%
  filter(startsWith(sample, "TARA")) %>%
  # Remove virus filter sample sizes
  filter(concat_filter != "< 0.22") %>%
  filter(concat_filter != "0.1 0.22") %>%
  # Remove samples deeper than `max_depth`
  filter(`Sampling depth [m]` <= max_depth) %>%
  # Drop the trailing MRGID portion (the last 13 characters) of the region name
  mutate(ocean_section = str_sub(`Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]`, end = -14)) %>%
  # Append the BGT rows; bind_rows() matches columns by name and avoids the
  # "y."-prefixed row names produced by rbind(y = ...)
  bind_rows(bgt_data)

# Mean abundance of each family per ocean section. Output columns are
# ocean_section followed by mean_<family> in the order listed here.
avg_data <- full_data %>%
  group_by(ocean_section) %>%
  summarise(across(
    all_of(c("CHAB-I-7", "RedeBAC7D11", "SAR156", "Suzuki")),
    mean,
    .names = "mean_{.col}"
  ))

# Manually set locations for the piecharts: one latitude/longitude pair per
# row of `avg_data`, in row order (this fails if the summary does not have
# exactly eight ocean sections).
avg_data <- avg_data %>%
  mutate(
    lats = c(-12.657709, 58, 33.455246, 31.244798, 8, -24.162734, -62.732558, -31.312293),
    longs = c(78.435551, 40, -47.928822, -156.682216, 23, -12.717372, -47.346278, -131.018153)
  )

# Get sample counts per ocean section.
# `full_data` already carries the size-fraction and depth filters, so it is
# counted as-is. Re-applying the size-fraction filter here would also hit the
# BGT rows (which the averages include unfiltered), and the reported n could
# then disagree with the samples that were averaged.
sample_counts <- full_data %>%
  count(ocean_section)

# Manually set locations for sample counts
# Order: IO, MS, NAO, NPO, RS, SAO, SO, SPO
base_lats <- c(3, 78, 43, 45, 20, -15, -59, -21)

sample_counts <- sample_counts %>%
  mutate(
    lats = base_lats,
    # Second, lower anchor latitude: base latitude minus a per-region offset
    lower_lats = base_lats - c(30, 40, 20, 27, 24, 18, 8, 20),
    longs = c(78.435551, 40, -47.928822, -156.682216, 23, -12.717372, -47.346278, -131.018153),
    # Acronym = text before the first whitespace, with the parentheses /
    # brackets on both sides removed
    ocean_section_acro = str_replace_all(
      str_replace(ocean_section, "\\s.*", ""),
      "\\(|\\[|\\)|\\]",
      ""
    ),
    # Label shown on the map, e.g. "<acronym> (<n>)"
    ocean_label = paste0(ocean_section_acro, " (", n, ")")
  )

In [None]:
# Save the per-region averages.
# NOTE(review): a later cell writes `ocean_averages` to this same path, which
# overwrites this file - confirm which table is meant to be kept.
write_csv(avg_data, "../workflow-read-recruitment/data/11_abundance_estimates/ocean_region_average.csv")

In [None]:
# Create base map: world polygons used as the background layer
world <- map_data("world")

# Gray land with black outlines, fixed lon/lat window, and no axis text,
# ticks, titles, panel border or grid lines. `mike_formatting` (defined
# outside this cell) is applied last.
mean_map <- ggplot(world, aes(long, lat)) +
  geom_map(map = world, aes(map_id = region), fill = "gray", color = "black") +
  coord_quickmap(expand = FALSE) +
  xlim(-180, 180) +
  ylim(-84, 85) +
  labs(x = "", y = "") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  mike_formatting

# Add the plots and everything else to the map.
# `sample_locations`, `sample_locations_labels`, `avg_data_no_southern_ocean`,
# `mean_sub_clades`, `colors`, `radius_multiple` and `sar156_family_rename`
# are defined outside this cell.
mean_map + 
    sample_locations +
    sample_locations_labels +
    # Hide the legends of the colour/fill scales used by the layers above
    scale_color_discrete(guide="none") +
    scale_fill_discrete(guide="none") + 
    # ggnewscale: start fresh fill/colour scales so the pies below get their
    # own fill scale; these must stay after the scales above and before the
    # pie layer
    new_scale_fill() +
    new_scale_color() +
    # One pie per row of the data, sliced by the `mean_sub_clades` columns,
    # with radius taken from `mean_sar86`
    geom_scatterpie(aes(x = longs, y = lats, r = mean_sar86), data = avg_data_no_southern_ocean, cols = mean_sub_clades, color = NA, alpha = 0.9) +
    scale_fill_manual(values = colors, name = paste0("Relative Abundance\n(0-",max_depth,"m)"), labels = c("Suzuki", "CHAB-I-7", "RedeBAC7D11", sar156_family_rename[["SAR156"]])) +
    # Radius legend; the labeller divides by `radius_multiple` - presumably
    # `mean_sar86` was scaled by that factor for display (confirm)
    geom_scatterpie_legend(avg_data_no_southern_ocean$mean_sar86, x = 150, y = 20, n = 3, labeller = function(x) signif(x/radius_multiple, 1)) +
    # NOTE(review): element_rect(size = ) is deprecated in favour of
    # `linewidth` from ggplot2 3.4.0 - update once the minimum version allows
    theme(legend.text = element_text(size = 14),
        legend.title = element_text(size = 18),
        legend.position = c(0.76, 0.80),
        legend.background = element_rect(fill = "gray", size = 0.5, colour = "gray"))

In [None]:
# Publication-style table of the per-region averages.
# `mean_sar86` and `radius_multiple` are defined outside this cell; dividing
# the former by the latter gives the `Magnimaribacterales` column.
ocean_averages <- avg_data %>%
  mutate(Magnimaribacterales = mean_sar86 / radius_multiple) %>%
  # Select and rename in one step; column order follows this list
  select(
    `Ocean Region` = ocean_section,
    `Suzuki` = mean_Suzuki,
    `CHAB-I-7` = `mean_CHAB-I-7`,
    `RedeBAC7D11` = `mean_RedeBAC7D11`,
    `Magnimaribacteraceae` = `mean_SAR156`,
    `Magnimaribacterales`
  )

# Get a data table for the averages
write_csv(ocean_averages, "../workflow-read-recruitment/data/11_abundance_estimates/ocean_region_average.csv")

In [None]:
# Get the genus averages figure

# BIOGEOTRACES samples (ids starting with "BGT") at depths <= `max_depth`
# (defined outside this cell), labelled with an ocean section from their
# coordinates. Samples matching none of the boxes below get NA.
bgt_genera_samples <- merged_genera_data %>%
  filter(startsWith(sample, "BGT")) %>%
  filter(`Sampling depth [m]` <= max_depth) %>%
  mutate(ocean_section = case_when(
    `Longitude [degrees East]` > -100 & `Longitude [degrees East]` < -5 &
      `Latitude [degrees North]` > 0 ~ "(NAO) North Atlantic Ocean",
    `Longitude [degrees East]` > -70 & `Longitude [degrees East]` < 20 &
      `Latitude [degrees North]` < 0 ~ "(SAO) South Atlantic Ocean",
    # Parentheses are required: `&` binds tighter than `|`, so without them
    # every sample east of 145 degrees was labelled South Pacific regardless
    # of latitude.
    (`Longitude [degrees East]` > 145 | `Longitude [degrees East]` < -70) &
      `Latitude [degrees North]` < 0 ~ "(SPO) South Pacific Ocean"
  ))

# Filter out samples we don't want and combine TARA with BIOGEOTRACES
full_genera_data <- merged_genera_data %>%
  filter(startsWith(sample, "TARA")) %>%
  # Remove virus filter sample sizes
  filter(concat_filter != "< 0.22") %>%
  filter(concat_filter != "0.1 0.22") %>%
  # Remove samples deeper than `max_depth`
  filter(`Sampling depth [m]` <= max_depth) %>%
  # Drop the trailing MRGID portion (the last 13 characters) of the region name
  mutate(ocean_section = str_sub(`Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]`, end = -14)) %>%
  # Append the BGT rows; bind_rows() matches columns by name and avoids the
  # "y."-prefixed row names produced by rbind(y = ...)
  bind_rows(bgt_genera_samples)

# Genus abundance per ocean region, one facet per family.
# `family_level_order`, `color_values`, `mike_formatting`, `genus_list_link`
# and `sar156_family_rename` are defined outside this cell.
full_genera_data %>%
  filter(`Sampling depth [m]` <= max_depth) %>%
  # Long format: one row per sample x genus
  pivot_longer(
    cols = all_of(sar86_generas),
    names_to = "genus",
    values_to = "abundance"
  ) %>%
  # Create a family column for the facet plot: the genus name up to the
  # first underscore
  mutate(family = factor(str_replace(genus, "_.*", ""), levels = family_level_order)) %>%
  ggplot(aes(x = genus, y = abundance, fill = ocean_section)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(name = paste0("Ocean Region (0-", max_depth, "m)"), values = color_values) +
  labs(x = "", y = "Relative Abundance (%)") +
  mike_formatting +
  scale_y_continuous(trans = scales::pseudo_log_trans(base = 10)) +
  facet_wrap(
    . ~ family,
    scales = "free",
    nrow = 2,
    labeller = as_labeller(sar156_family_rename),
    strip.position = "top"
  ) +
  # Rename the genus tick labels. Only one x scale is kept: the original
  # added scale_x_discrete(labels = sar156_genus_rename_full) first, which
  # ggplot2 discarded in favour of this later scale.
  scale_x_discrete(labels = genus_list_link) +
  theme(
    legend.position = c(0.50, 0.35),
    strip.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA),
    strip.text.x = element_text(size = 16, colour = "black"),
    strip.placement = "outside",
    panel.spacing = unit(0.5, "lines")
  ) +
  # Lay the region legend out as a single row
  guides(fill = guide_legend(nrow = 1))