Run gRodon to estimate maximal growth rates for single amplified genomes (SAGs)
### Citation 
Weissman JL, Hou S, Fuhrman JA. 2020. Estimating maximal microbial growth rates from cultures, metagenomes, and single cells via codon usage patterns. bioRxiv 118:1–10.



In [1]:
library(gRodon)
library(Biostrings)
library(tools)
library(tidyverse)

# Make the gRodon analysis folder the working directory for the session
setwd('/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/gRodon')

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min


Loading required package: S4Vectors

Loading required pac

In [2]:

# output filename
out_name <- "/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/gRodon/gRodon_SAG_growth_rate.csv"

# input ffn directories (each path must end in "/" because file names are
# appended directly to it)
ffn_dirs <- c("/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_190709_genes_ffn/",
              "/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_190402_genes_ffn/",
              "/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_181030_genes_ffn/",
              "/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_171102_genes_ffn/",
              "/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_170818_genes_ffn/",
              "/mnt/scgc/simon/microg2p/Data/SAG_genes/GoM_170412_genes_ffn/")

# A "ribosomal protein" gene only counts as highly expressed when it is longer
# than this many nucleotides (new idea that Jacob is testing).
min_gene_length <- 240

# Minimum number of highly expressed ribosomal protein genes a SAG needs before
# gRodon is run on it. The article suggests 10 as a good cutoff; it is kept at 1
# here because the CSV can also be filtered afterwards on N_Highly_expressed.
min_highly_expressed <- 1

# Run gRodon on a single .ffn file.
#
# file_name:  path to a FASTA file of gene sequences for one SAG
# min_length: ribosomal protein genes must be longer than this (nt)
# min_hits:   minimum number of qualifying ribosomal protein genes required
#
# Returns the unlisted predictGrowth() output extended with the SAG name, the
# total number of genes and the number of highly expressed genes, or NULL when
# the SAG has fewer than min_hits qualifying genes.
predict_sag_growth <- function(file_name, min_length = 240, min_hits = 1) {
    genes <- readDNAStringSet(file_name)

    # TRUE for genes annotated as "ribosomal protein" that also pass the length
    # filter; lengths come from the sequences already in memory, so the file is
    # not read a second time.
    highly_expressed <- grepl("ribosomal protein", names(genes), ignore.case = TRUE) &
        (width(genes) > min_length)

    if (sum(highly_expressed) < min_hits) {
        return(NULL)
    }

    pG <- predictGrowth(genes, highly_expressed, mode = "partial")

    # Store the SAG name, the number of total genes in the ffn file, and the
    # number of "highly expressed ribosomal proteins" alongside the prediction
    pG[["SAG"]] <- file_path_sans_ext(basename(file_name))
    pG[["Total_genes"]] <- length(highly_expressed)
    pG[["N_Highly_expressed"]] <- sum(highly_expressed)

    unlist(pG)
}

# store warning status and turn off warnings
oldw <- getOption("warn")
options(warn = -1)

# one entry per successfully processed SAG; combined into a matrix afterwards
# instead of growing a matrix with rbind() on every iteration
results <- list()

# cycle through all the files in the input directories; the finally clause
# guarantees the warning level is restored even if the loop is interrupted
tryCatch({
    for (ffn_dir in ffn_dirs) {
        for (ffn_file in list.files(path = ffn_dir)) {
            file_name <- paste0(ffn_dir, ffn_file)

            # A failure on one file is reported and skipped so that the
            # remaining SAGs are still processed and the CSV is still written.
            sag_result <- tryCatch(
                predict_sag_growth(file_name,
                                   min_length = min_gene_length,
                                   min_hits = min_highly_expressed),
                error = function(e) {
                    message("Skipping ", file_name, ": ", conditionMessage(e))
                    NULL
                }
            )

            if (!is.null(sag_result)) {
                results[[length(results) + 1L]] <- sag_result
            }
        }
    }
}, finally = options(warn = oldw))

# variable that holds all of the outputs (one row per SAG; NULL if none ran)
total_pG <- do.call(rbind, results)

# write csv
write.csv(file = out_name, total_pG)


In [3]:
# Temporary duplicate of the CSV export: it stays in its own cell until the
# error is fixed, after which it will be moved back up.
write.csv(total_pG, file = out_name)

In [4]:
# Display the accumulated results (one row per SAG that gRodon was run on)
total_pG

CUBHE,ConsistencyHE,CPB,FilteredSequences,d,LowerCI,UpperCI,SAG,Total_genes,N_Highly_expressed
0.481753997765038,0.458151049372234,-0.353868133537936,39,7.88606142748699,5.65408922957662,11.1813318420474,AH-700-A01,386,11
0.506449633644106,0.0434782608695652,-0.312448379342824,1,0.0681887247429016,0.0257385395495833,0.21115383193757,AH-700-A02,39,1
0.659936810220889,0.556610092516263,-0.397104118752264,71,5.59724570468886,4.38504984340748,7.20715795604723,AH-700-A04,1235,25
0.614825486400018,0.298165137614679,-0.42988966977566,15,0.34677587730856,0.192647666772853,0.658407033259115,AH-700-A05,230,1
0.571011768215598,0.533397492963343,-0.321667968915774,113,9.97397288174628,7.72728777431789,12.9975057214274,AH-700-A06,1921,37
0.552036766334537,0.410932187704819,-0.4010472755882,65,2.04095767152639,1.40451332063642,3.02831602830297,AH-700-A07,665,2
0.51737172906488,0.470009847287821,-0.374566696828607,21,6.51862664196612,4.94138655364835,8.6969938506091,AH-700-A08,362,27
0.468654867087528,0.395139373096451,-0.354883860512215,41,3.53479884891472,2.21857581572509,5.82003216630094,AH-700-A09,375,5
0.639132908698955,0.503667330537754,-0.325439733683894,149,3.20303117334922,2.75146102618526,3.74117884752833,AH-700-A11,1812,17
0.624245168537885,0.524990873448266,-0.40164861620838,117,5.01403085707391,4.19465471876338,6.02121023087885,AH-700-A13,1745,43
