.Rhistory

#'
#' @details The genomic heterozygosity (also known as the autosomal
#' heterozygosity) has been demonstrated to be the more accurate and robust
#' measure of heterozygosity (Schmidt et al. 2021). SNP-wise heterozygosity can
#' suffer from sampling biases (filtering, missing data, sample size, etc).
#' Note, estimates of genomic heterozygosity are orders of magnitude less than
#' SNP-wise heterozygosity (so do not be alarmed if you see very small values!).
#'
#' Heterozygsity for population pools is calculated using the method from
#' Ferretti et al. (2013). You can calculate genomic heterozygosity for population
#' pools (standardising by total covered sites), or SNP-wise heterozygosity too
#' (standardising by the number of polymorphic sites observed).
#'
#' You must specify both \code{type} and \code{method}. This will dicate the
#' required column needed for \code{snpData} and \code{extraData}.
#'
#' If \code{type=='genos'} and \code{method=='genomic'}:
#' \enumerate{
#'    \item \code{snpData} requires columns specified in:
#'    \code{chromCol}, \code{posCol}, \code{locusCol} and \code{sampCol}
#'
#'    \item \code{extraData} requires columns specified in:
#'    \code{chromCol}, \code{sampCol}, and \code{covCol}.
#' }
#'
#' If \code{type=='genos'} and \code{method=='snpwise'}
#' \enumerate{
#'    \item \code{snpData} requires columns specified in:
#'    \code{chromCol}, \code{posCol}, \code{locusCol} and \code{sampCol}
#'
#'    \item \code{extraData} will NOT be used.
#' }
#'
#' If \code{type=='freqs'} and \code{method=='genomic'}:
#' \enumerate{
#'    \item \code{snpData} requires columns specified in:
#'    \code{chromCol}, \code{posCol}, \code{locusCol}, \code{popCol},
#'    \code{roCol}, and \code{aoCol}.
#'
#'    \item \code{extraData} requires columns specified in:
#'    \code{chromCol}, \code{sampCol}, \code{covCol}, and \code{indsCol}.
#' }
#'
#' If \code{type=='freqs'} and \code{method=='snpwise'}:
#' \enumerate{
#'    \item \code{snpData} requires columns specified in:
#'    \code{chromCol}, \code{posCol}, \code{locusCol}, \code{popCol},
#'    \code{roCol}, and \code{aoCol}.
#'
#'    \item \code{extraData} requires columns specified in:
#'    \code{chromCol}, \code{sampCol}, and \code{indsCol}.
#' }
#'
#' @return Returns a data table of heterozygosity estimates per sample or population.
#' Genomic heterozygosity is reported per chromosome, SNP-wise heterozygosity is
#' reported across all SNPs.
#'
#' @references
#' Ferretti et al. (2013) Molecular Ecology. DOI: 10.1111/mec.12522 \cr
#' Schmidt et al. (2021) Methods in Ecology and Evolution. DOI: 10.1111/2041-210X.13659
#'
#' @examples
#' library(genomalicious)
#'
#' data(data_Genos)
#' data(data_PoolFreqs)
#'
#' # Convert genos to characters
#' data_Genos[, GT:=genoscore_converter(GT)]
#'
#' # Make extra data for the samples and populatin pools
#' extraSampInfo <- CJ(
#'   SAMPLE=unique(data_Genos$SAMPLE),
#'   CHROM=unique(data_Genos$CHROM),
#'   COV.SITES=150
#'   )
#'
#' extraPoolInfo <- CJ(
#'   POP=unique(data_PoolFreqs$POP),
#'   CHROM=unique(data_PoolFreqs$CHROM),
#'   COV.SITES=150,
#'   INDS=30
#'   )
#'
#' # Genomic heterozygosity of individuals, per chromosome/contig
#' het_calc(data_Genos, extraSampInfo, type='genos', method='genomic')
#'
#' # SNP-wise heterozygosity of indivdiuals, SNP-wise
#' het_calc(data_Genos, extraSampInfo, type='genos', method='snpwise')
#'
#' # Genomic heterozygosity of population pools, per chromosome/contig
#' het_calc(data_PoolFreqs, extraPoolInfo, type='freqs', method='genomic')
#'
#' # Genomic heterozygosity of population pools, SNP-wise
#' het_calc(data_PoolFreqs, extraPoolInfo, type='freqs', method='snpwise')
#' @export
het_calc <- function(
snpData, extraData, method, type, chromCol='CHROM', posCol='POS', locusCol='LOCUS',
sampCol='SAMPLE', genoCol='GT', popCol='POP', roCol='RO', aoCol='AO',
covCol='COV.SITES', indsCol='INDS'
){
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
####   CHECK AND ENVIRONMENT   ####
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Check type
if(!type %in% c('genos','freqs')){
stop('Argument `type` must be one of "genos" or "freqs". See ?het_calc.')
}
# Check method
if(!method %in% c('genomic','snpwise')){
stop('Argument `method` must be one of "genomic" or "snpwise". See ?het_calc.')
}
# Check the columns in snpData and extraData
if(type=='genos'){
# Genomic heterozygosity
if(method=='genomic'){
# SNP data
check.col.snp <- c(chromCol, posCol, locusCol, sampCol, genoCol)
if(length(check.col.snp)!=5){
stop('Argument `type`=="genos" and `method`=="genomic". All columns `chromCol`, `posCol`, `locusCol` and `sampCol` must be in `snpData`. See ?het_calc.')
}
snpData <- copy(snpData) %>%
setnames(., check.col.snp, c('CHROM','POS','LOCUS','SAMPLE','GT'))
# Extra data
check.col.extra <- c(chromCol, sampCol, covCol)
if(length(check.col.extra)!=3){
stop('Argument `type`=="genos" and `method`=="genomic". All columns `chromCol`, `sampCol`, and `covCol` must be in `extraData`. See ?het_calc.')
}
extraData <- copy(extraData) %>%
setnames(., check.col.extra, c('CHROM','SAMPLE','COV.SITES'))
}
# SNP-wise heterozygosity
if(method=='snpwise'){
# SNP data
check.col.snp <- c(chromCol, posCol, locusCol, sampCol, genoCol)
if(length(check.col.snp)!=5){
stop('Argument `type`=="genos" and `method`=="snpwise". All columns `chromCol`, `posCol`, `locusCol` and `sampCol` must be in `snpData`. See ?het_calc.')
}
snpData <- copy(snpData) %>%
setnames(., check.col.snp, c('CHROM','POS','LOCUS','SAMPLE','GT'))
}
}
if (type=='freqs'){
# Genomic heterozygosity
if(method=='genomic'){
# SNP data
check.col.snp <- c(chromCol, posCol, locusCol, popCol, roCol, aoCol)
if(length(check.col.snp)!=6){
stop('Argument `type`=="freqs" and `method`=="genomic". All columns `chromCol`, `posCol`, `locusCol`, `popCol`, `roCol` and `aoCol` must be in `snpData`. See ?het_calc.')
}
snpData <- copy(snpData) %>%
setnames(., check.col.snp, c('CHROM','POS','LOCUS','POP','RO','AO'))
# Extra data
check.col.extra <- c(chromCol, popCol, covCol, indsCol)
if(length(check.col.extra)!=4){
stop('Argument `type`=="freqs" and `method`=="genomic". All columns `chromCol`, `sampCol`, `covCol`, and `indsCol` must be in `extraData`. See ?het_calc.')
}
extraData <- copy(extraData) %>%
setnames(., check.col.extra, c('CHROM', 'POP', 'COV.SITES', 'INDS'))
}
# SNP-wise heterozygosity
if(method=='snpwise'){
# SNP data
check.col.snp <- c(chromCol, posCol, locusCol, popCol, roCol, aoCol)
if(length(check.col.snp)!=6){
stop('Argument `type`=="freqs" and `method`=="snpwise". All columns `chromCol`, `posCol`, `locusCol`, `popCol`, `roCol` and `aoCol` must be in `snpData`. See ?het_calc.')
}
snpData <- copy(snpData) %>%
setnames(., check.col.snp, c('CHROM', 'POS', 'LOCUS', 'POP', 'RO', 'AO'))
# Extra data
check.col.extra <- c(chromCol, popCol, indsCol)
if(length(check.col.extra)!=3){
stop('Argument `type`=="freqs" and `method`=="snpwise". All columns `chromCol`, `sampCol`, and `indsCol` must be in `extraData`. See ?het_calc.')
}
extraData <- copy(extraData) %>%
setnames(., check.col.extra, c('CHROM', 'POP', 'INDS'))
}
}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
####   GENOMIC HETEROZYGOSITY   ####
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# For genomic heterozygosity, iterate over combinations of
# chromosome and populations in the extra data table.
if(method=='genomic'){
# Using individual genotypes
if(type=='genos'){
snpData[, ALLELE.1:=sub('/.*','',GT), by=c('SAMPLE','LOCUS')]
snpData[, ALLELE.2:=sub('.*/','',GT), by=c('SAMPLE','LOCUS')]
snpData[, HET:=(ALLELE.1!=ALLELE.2), by=c('SAMPLE','LOCUS')]
result <- snpData[, .(HET=sum(HET)), by=c('CHROM','SAMPLE')] %>%
merge.data.table(., extraData) %>%
.[, HET:=HET/COV.SITES] %>%
.[, c('CHROM','SAMPLE','HET')]
}
# Using population allele frequencies
if(type=='freqs'){
snpData <- snpData %>%
.[, RO:=as.integer(RO)] %>%
.[, AO:=as.integer(AO)] %>%
.[, DP:=AO+RO]
result <- lapply(1:nrow(extraData), function(i){
chrom <- extraData$CHROM[i]
pop <- extraData$POP[i]
cov.sites <- extraData$COV.SITES[i]
inds <- extraData$INDS[i]
allele.frac <- snpData[POP==pop] %>%
.[, NUMER:=2*(AO*(DP-AO)), by=LOCUS] %>%
.[, DENOM:=DP*(DP-1)] %>%
.[, sum(NUMER/DENOM)]
het <- (inds/(inds-1)) * (1/cov.sites) * allele.frac
data.table(CHROM=chrom, POP=pop, HET=het)
}) %>%
do.call('rbind',.)
}
}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
####   SNP-WISE HETEROZYGOSITY   ####
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
if(method=='snpwise'){
# Using individual allele frequencies
if(type=='genos'){
snpData[, ALLELE.1:=sub('/.*','',GT), by=c('SAMPLE','LOCUS')]
snpData[, ALLELE.2:=sub('.*/','',GT), by=c('SAMPLE','LOCUS')]
snpData[, HET:=(ALLELE.1!=ALLELE.2), by=c('SAMPLE','LOCUS')]
result <- snpData[, .(HET=sum(HET)/length(HET)), by=c('SAMPLE')]
}
# Using population allele frequencies
if(type=='freqs'){
snpData <- snpData %>%
.[, RO:=as.integer(RO)] %>%
.[, AO:=as.integer(AO)] %>%
.[, DP:=AO+RO]
result <- lapply(1:nrow(extraData), function(i){
chrom <- extraData$CHROM[i]
pop <- extraData$POP[i]
inds <- extraData$INDS[i]
num.sites <- snpData[POP==pop] %>% nrow
allele.frac <- snpData[POP==pop] %>%
.[, NUMER:=2*(AO*(DP-AO)), by=LOCUS] %>%
.[, DENOM:=DP*(DP-1)] %>%
.[, sum(NUMER/DENOM)]
het <- (inds/(inds-1)) * (1/num.sites) * allele.frac
data.table(CHROM=chrom, POP=pop, HET=het)
}) %>%
do.call('rbind',.)
}
}
# Output
return(result)
}
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make just those documents that have changed
roxygenise()
library(genomalicious)
# Make just those documents that have changed
roxygenise()
library(genomalicious)
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make just those documents that have changed
roxygenise()
library(genomalicious)
# Make just those documents that have changed
roxygenise()
library(genomalicious)
library(genomalicious)
library(genomalicious)
data(data_Genos)
data(data_PoolFreqs)
data(data_PoolInfo)
data_Genos$GT %>% head
data_Genos[, GT:=genoscore_converter(GT)]
data_Genos$GT %>% head
# Set allele counts and individuals in pool-seq data
data_PoolFreqs %>% head
data_PoolInfo %>% head
data_PoolFreqs[, COUNTS:=paste(RO,AO,sep=',')]
data_PoolFreqs$INDS <- data_PoolInfo$INDS[
match(data_PoolFreqs$POOL, data_PoolInfo$POOL)
]
head(data_PoolFreqs)
# Genotypes and global F-statistics
fstat_calc(
dat=data_Genos,
type='genos', method='global', fstatVec=c('FST','FIS','FIT'),
popCol='POP', sampCol='SAMPLE',
locusCol='LOCUS', genoCol='GT',
permute=FALSE
)
fstat_calc(
dat=data_Genos,
type='genos', method='pairwise', fstatVec=c('FST','FIS','FIT'),
popCol='POP', sampCol='SAMPLE',
locusCol='LOCUS', genoCol='GT',
permute=FALSE
)
fstat_calc(
dat=data_PoolFreqs,
type='freqs', method='global', fstatVec=NULL,
popCol='POP', locusCol='LOCUS',
countCol='COUNTS', indsCol='INDS',
permute=FALSE
)
fstat_calc(
dat=data_PoolFreqs,
type='freqs', method='pairwise', fstatVec=NULL,
popCol='POP', locusCol='LOCUS',
countCol='COUNTS', indsCol='INDS',
permute=FALSE
)
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make all documents
roxygenise(clean=TRUE)
library(genomalicious)
library(genomalicious)
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make just those documents that have changed
roxygenise()
library(genomalicious)
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Load currently installed genomalicious
library(genomalicious)
# Make just those documents that have changed
roxygenise()
library(genomalicious)
genomaliciousExtData <- paste0(find.package('genomalicious'), '/extdata')
list.files(path=genomaliciousExtData, pattern='indseq.vcf')
vcfPath <- paste0(genomaliciousExtData, '/data_indseq.vcf')
# You can read the file in as lines to see what it
# looks like:
readLines(vcfPath) %>%  head
readLines(vcfPath) %>%  tail
# Now read it in as a data table
readVcf1 <- vcf2DT(vcfFile=vcfPath)
readVcf1 %>% print()
#' VCF file to data table
#'
#' Reads a VCF file and converts to a long format data table. Note, that whilst
#' the \code{data.table} object class is very memory efficient, very large genomic
#' datasets might take longer to read in, and/or be difficult to hold in
#' memory. Take your operating system and the size of your input dataset into
#' consideration when using this function.
#'
#' @param vcfFile Character: The path to the input VCF file.
#'
#' @param dropCols Character: Vector of column names from the VCF that you
#' want to drop from the output data table. Use this for any column that occurs
#' before the 'FORMAT' column in the original VCF file. Default = \code{NULL}.
#'
#' @param keepComments Logical: Should the VCF comments be kept?
#' Default = \code{FALSE}. See Details for parameterisation.
#'
#' @param keepInfo Logical: Should the VCF info for each locus be kept?
#' Default = \code{FALSE}.
#'
#' @details Firstly, it should be noted that while data tables are a really
#' excellent way of handling genotype and sequence read information in R,
#' they are not necessarily the most efficient way to do so for very large
#' genomic datasets. Take your operating system and/or dataset in mind before
#' using this function. Most RADseq datasets should be manageable, but
#' whole-genome data can be challenging if you do not have a lot of available
#' memory. You can always try loading in subsets (e.g., by chromosome or contigs)
#' of your dataset to see how feasible it is to load with this function.
#'
#' @return A \code{data.table} object is returned with all the columns contained in
#' the original VCF file with some additions:
#' \itemize{
#'     \item A column called \code{$LOCUS} is generated. This is the concatenation of the
#'              \code{$CHROM} and \code{$POS} column to form a locus ID. "CHROM:POS".
#'     \item A column called \code{$SAMPLE} is generated. This contains the sample IDs that
#'              are the columns that follow the \code{$FORMAT} column in the original VCF.
#'     \item The items in the original \code{$FORMAT} column of the VCF are given their own columns.
#' } \cr
#' Note, for VCF files produced by Stacks, the $CHROM is given the same value
#' as the $ID column. \cr\cr
#' When \code{keepInfo==TRUE} and/or \code{keepComments==TRUE}, these are returned
#' as attributes. E.g., if the returned object is \code{vcfDT}, then you can
#' access Info and Comments (respectively) with: \code{attr(vcfDT, 'vcf_info')}
#' and \code{attr(vcfDT, 'vcf_comments')}.
#'
#' @examples
#' # Create a link to raw external datasets in genomalicious
#' genomaliciousExtData <- paste0(find.package('genomalicious'), '/extdata')
#'
#' # This command here shows you the VCF file that comes with genomalicious
#' list.files(path=genomaliciousExtData, pattern='indseq.vcf')
#'
#' # Use this to create a path to that file
#' vcfPath <- paste0(genomaliciousExtData, '/data_indseq.vcf')
#'
#' # You can read the file in as lines to see what it
#' # looks like:
#' readLines(vcfPath) %>%  head
#' readLines(vcfPath) %>%  tail
#'
#' # Now read it in as a data table
#' readVcf1 <- vcf2DT(vcfFile=vcfPath)
#' readVcf1 %>% print()
#'
#' # Read in VCF, but drop some columns,
#' # and keep comments and info.
#' readVcf2 <- vcf2DT(vcfPath
#'    , dropCols=c('QUAL')
#'    , keepComments=TRUE
#'    , keepInfo=TRUE)
#'
#' readVcf2 %>% print
#'
#' attr(readVcf2, 'vcf_comments')
#' attr(readVcf2, 'vcf_info')
#'
#' @export
vcf2DT <- function(vcfFile, dropCols=NULL, keepComments=FALSE, keepInfo=FALSE){
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# #### Libraries and assertions            ####
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
require(data.table); require(tidyverse)
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# #### Code: VCF to data table             ####
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Read file from header
cat('(1/4) Reading in VCF as a data table', sep='\n')
vcfDT <- fread(vcfFile, skip='#CHROM', sep='\t', header=TRUE)
# Adjust header
colnames(vcfDT) <- gsub(pattern='#', replace='', x=colnames(vcfDT))
# Which columns are the sample? The ones after the FORMAT column.
sampCols <- colnames(vcfDT)[(which(colnames(vcfDT)=='FORMAT')+1):ncol(vcfDT)]
# Generate a $LOCUS column, place at the start of the data table
cat('(2/4) Generating locus IDs', sep='\n')
vcfDT[, LOCUS:=paste0(CHROM, ':', POS)]
# Get the locus info as a vector and drop from data table
if(keepInfo==TRUE){
vcfInfo <- vcfDT$INFO
names(vcfInfo) <- vcfDT$LOCUS
}
vcfDT <- vcfDT[, !'INFO']
# Drop unwanted columns here to save memory
if(is.null(dropCols)==FALSE){
vcfDT <- vcfDT[, !dropCols, with=FALSE]
}
# Now convert the data from wide to long
cat('(3/4) Converting from wide to long format', sep='\n')
vcfDT <- data.table::melt(
data=vcfDT,
id.vars=colnames(vcfDT)[!colnames(vcfDT)%in%sampCols],
measure.vars=sampCols,
variable.name='SAMPLE',
value.name='DATA') %>%
as.data.table()
# Make sure SAMPLE is a character
vcfDT[, SAMPLE:=as.character(SAMPLE)]
# Separate out the FORMAT data components into their own columns
cat('(4/4) Parsing data for each sample', sep='\n')
# ... Get the format names
formatNames <- unlist(strsplit(vcfDT$FORMAT[1], split=':'))
# ... Drop the format column to save space now
vcfDT <- vcfDT[, !'FORMAT']
# ... If the $DATA column is '.', add in NA
vcfDT[DATA=='.', DATA:=NA]
# ... Separate $DATA by $FORMAT names
vcfDT <- vcfDT[, tstrsplit(DATA, ':', names=formatNames)] %>%
cbind(vcfDT[, !'DATA'], .) %>%
as.data.table()
# .... Replace '.' values in the data columns with NA
for(f in formatNames){
vcfDT[[f]][vcfDT[[f]]=='.'] <- NA
}
# Make sure DP and RO integers
if('DP' %in% colnames(vcfDT)){ vcfDT[, DP:=as.integer(DP)] }
if('RO' %in% colnames(vcfDT)){ vcfDT[, RO:=as.integer(RO)] }
# Attach header as an attribute, if specified.
if(keepComments==TRUE){
attr(vcfDT, 'vcf_comments') <- readLines(vcfFile, n=headPos-1)
}
# Attach info as an attribute if, if specified.
if(keepInfo==TRUE){
attr(vcfDT, 'vcf_info') <- vcfInfo
}
# Finish
cat('All done! <3', '\n')
# Return the data.table, drop any columns if specified.
return(vcfDT)
}
genomaliciousExtData <- paste0(find.package('genomalicious'), '/extdata')
list.files(path=genomaliciousExtData, pattern='indseq.vcf')
vcfPath <- paste0(genomaliciousExtData, '/data_indseq.vcf')
readLines(vcfPath) %>%  head
readLines(vcfPath) %>%  tail
readVcf1 <- vcf2DT(vcfFile=vcfPath)
readVcf1 %>% print()
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make just those documents that have changed
roxygenise()
library(genomalicious)
# Make just those documents that have changed
roxygenise()
# Developer libraries
libs <- c('devtools', 'roxygen2', 'testthat', 'knitr', 'data.table', 'tidyverse')
for(L in libs){require(L, character.only=TRUE)}
# Make just those documents that have changed
roxygenise()