Skip to content

Latest commit

 

History

History
213 lines (172 loc) · 5.42 KB

bacteria-and-archaea.md

File metadata and controls

213 lines (172 loc) · 5.42 KB
---
title: "archaea vs bacteria infecting viruses"
output:
  html_document:
    keep_md: true
---

#loading packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggthemes)
library(paletteer)
library(ggplot2)
library(RColorBrewer)

#setting up aesthetics for later

# Discrete "Pamplemousse" palette from LaCroixColoR; the bar charts further
# down pass it to scale_fill_manual() as their fill colours.
colors <- LaCroixColoR::lacroix_palette(
  "Pamplemousse",
  type = "discrete"
)

#importing data

# Read the virus genome table and normalise the column names with
# janitor::clean_names().
# BioSample is pinned to character: when its type is left to readr's
# guessing it is inferred as logical, and accession values that appear later
# in the file (e.g. "SAMN02981359") are then reported as parsing failures.
# Every other column keeps its guessed type.
viruses <- readr::read_csv(
  "data/viruses.csv",
  col_types = readr::cols(BioSample = readr::col_character())
) %>%
  clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Organism Name` = col_character(),
##   `Organism Groups` = col_character(),
##   BioSample = col_logical(),
##   BioProject = col_character(),
##   Assembly = col_character(),
##   Level = col_character(),
##   `Size(Mb)` = col_double(),
##   `GC%` = col_double(),
##   Host = col_character(),
##   CDS = col_double(),
##   Neighbors = col_double(),
##   `Release Date` = col_datetime(format = ""),
##   `GenBank FTP` = col_character(),
##   `RefSeq FTP` = col_character(),
##   Genes = col_double(),
##   Scaffolds = col_double()
## )
## Warning: 344 parsing failures.
##  row       col           expected       actual               file
## 1438 BioSample 1/0/T/F/TRUE/FALSE SAMN02981359 'data/viruses.csv'
## 4401 BioSample 1/0/T/F/TRUE/FALSE SAMN02981224 'data/viruses.csv'
## 8940 BioSample 1/0/T/F/TRUE/FALSE SAMN01137200 'data/viruses.csv'
## 8941 BioSample 1/0/T/F/TRUE/FALSE SAMN01137212 'data/viruses.csv'
## 8944 BioSample 1/0/T/F/TRUE/FALSE SAMN01137140 'data/viruses.csv'
## .... ......... .................. ............ ..................
## See problems(...) for more details.

#cleaning data

# Keep only complete assemblies and the columns used in the analysis, then
# split the ";"-delimited organism_groups string into domain, group and
# family columns.
viruses <- viruses %>%
  filter(level == "Complete") %>%
  select(
    organism_name, organism_groups, level,
    size_mb, gc_percent, host, genes
  ) %>%
  separate(
    col = organism_groups,
    into = c("domain", "group", "family"),
    sep = ";"
  )

#glimpse of data

# Print a compact column-by-column overview of the cleaned table.
viruses %>%
  glimpse()
## Rows: 32,211
## Columns: 9
## $ organism_name <chr> "White spot syndrome virus", "Posavirus sp.", "Hepatitis…
## $ domain        <chr> "Viruses", "Viruses", "Viruses", "Viruses", "Viruses", "…
## $ group         <chr> "Other", "unclassified viruses", "Other", "Other", "uncl…
## $ family        <chr> "Nimaviridae", "unclassified", "Hepeviridae", "Marnaviri…
## $ level         <chr> "Complete", "Complete", "Complete", "Complete", "Complet…
## $ size_mb       <dbl> 0.309286, 0.009570, 0.007176, 0.008587, 0.018098, 0.0044…
## $ gc_percent    <dbl> 40.9000, 32.8000, 57.9000, 46.9000, 47.2000, 51.7000, 42…
## $ host          <chr> "invertebrates", "vertebrates", "human,vertebrates", NA,…
## $ genes         <dbl> 177, 1, 3, 1, 25, 3, 3, 3, 3, 75, 265, 51, 7, 1, 7, 10, …

#comparing viruses with bacterial vs. archaeal hosts
#viruses of bacteria have, on average, larger genomes, more genes, and higher GC content than viruses of archaea
#the gene-to-genome-size ratio is similar for both host groups

# Restrict to viruses whose host is recorded as exactly "archaea" or
# "bacteria"; rows with any other host value, or a missing host, are dropped.
arch_bact <- viruses %>%
  filter(host %in% c("archaea", "bacteria"))

# Per-host means of genome size, GC content and gene count, ignoring missing
# values. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
data <- arch_bact %>%
  group_by(host) %>%
  summarize(
    size = mean(size_mb, na.rm = TRUE),
    gc = mean(gc_percent, na.rm = TRUE),
    gene = mean(genes, na.rm = TRUE)
  )
# Bar chart of the mean genome size for each host group.
ggplot(data = data, mapping = aes(x = host, y = size, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Genome Size of Viruses by Host",
    x = "Host",
    y = "Genome Size (Mb)"
  ) +
  theme_solarized()

# Bar chart of the mean GC content for each host group.
ggplot(data = data, mapping = aes(x = host, y = gc, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "GC Content of Viruses by Host",
    x = "Host",
    y = "GC Content (Percent)"
  ) +
  theme_solarized()

# Bar chart of the mean number of genes for each host group.
ggplot(data = data, mapping = aes(x = host, y = gene, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Number of Genes in Virus Genomes by Host",
    x = "Host",
    y = "Number of Genes"
  ) +
  theme_solarized()

# Mean gene density (genes per Mb of genome) for each host group.
# Rows with a gene count of zero are excluded; filter() also drops rows
# whose gene count is missing. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
arch_bact %>%
  filter(genes != 0) %>%
  mutate(gene_mb_ratio = genes / size_mb) %>%
  group_by(host) %>%
  summarize(gene_content = mean(gene_mb_ratio, na.rm = TRUE)) %>%
  ggplot(aes(x = host, y = gene_content, fill = host)) +
  geom_col() +
  theme_solarized() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Gene to Genome Size Ratio by Host",
    x = "Host",
    y = "Genes to Genome Size (Genes/Mb)"
  )