Skip to content

Commit

Permalink
Added PCA function, PCA article, rebuilt site
Browse files Browse the repository at this point in the history
  • Loading branch information
kfarleigh committed Apr 2, 2024
1 parent 5346851 commit 8dbd55b
Show file tree
Hide file tree
Showing 30 changed files with 337 additions and 604 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -8,6 +8,7 @@ export(Div_Stats_Map)
export(Div_stats)
export(Heterozygosity)
export(Network_map)
export(PCA)
export(Pairwise_heatmap)
export(Piechart_map)
export(Plot_ancestry)
Expand Down
111 changes: 111 additions & 0 deletions R/PCA.R
@@ -0,0 +1,111 @@
#' A function to perform principal component analysis (PCA) on genetic data. Loci with missing data will be removed prior to PCA.
#'
#' @param data Character. String indicating the name of the vcf file, geno file or vcfR object to be used in the analysis.
#' @param center Boolean. Whether or not to center the data before principal component analysis.
#' @param scale Boolean. Whether or not to scale the data before principal component analysis.
#' @param missing_value Character. String indicating missing data in the input data. It is assumed to be NA, but that may not be true (is likely not) in the case of geno files.
#' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be two files, one for the individual loadings and the other for the percent variance explained by each axis.
#' @param prefix Character. Optional argument. String that will be appended to file output. Please provide a prefix if write is set to TRUE.
#' @return A list containing two elements: the loadings of individuals on each principal component and the variance explained by each principal component.
#'
#' @author Keaka Farleigh
#' @export
#'
#' @examples
#' \donttest{
#' data("HornedLizard_VCF")
#' Test <- PCA(data = HornedLizard_VCF)}
PCA <- function(data, center = TRUE, scale = FALSE, missing_value = NA, write = FALSE, prefix = NULL) {

# Read in files and convert to necessary formats
if(missing(data)){
stop("Please supply a file or vcfR object for analysis")
}
if(methods::is(data,"vcfR")){
Dat <- data
print("vcfR object detected, proceeding to formatting.")
# Convert the vcf gt slot to a geno style table for calculations
gt <- vcfR::extract.gt(Dat)
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2

# Transpose the numeric gt matrix
Dat <- as.data.frame(t(as.matrix(gt)))

# Preserve individual names
Inds <- rownames(Dat)
Dat <- sapply(Dat, as.numeric)
}
else if(tools::file_ext(data) == 'vcf') {
Dat <- vcfR::read.vcfR(data, verbose = FALSE)
print("VCF file detected, proceeding to formatting.")
# Convert the vcf gt slot to a geno style table for calculations
gt <- vcfR::extract.gt(Dat)
gt[gt == "0/0"] <- 0
gt[gt == "0/1" | gt == "1/0"] <- 1
gt[gt == "1/1"] <- 2

# Transpose the numeric gt matrix
Dat <- as.data.frame(t(as.matrix(gt)))

# Preserve individual names
Inds <- rownames(Dat)
Dat <- sapply(Dat, as.numeric)
}
else if(tools::file_ext(data) == 'geno'){
Dat <- utils::read.table(data)
print("Geno file detected, proceeding to formatting. Note, PopGenHelpR assumes that your individuals in the geno file and
popmap are in the same order, please check to avoid erroneous inferences.")
}
else {
stop("Please supply a geno file, vcf file, or vcfR object for analysis")
}

# Replace missing data value with NA
if(is.na(missing_value) == FALSE){
Dat[Dat == missing_value] <- NA
}

if(any(is.na(Dat))){
n_NA <- colSums(is.na(Dat))
Loc2rem <- which(n_NA > 0)

print(paste(colnames(Dat)[Loc2rem], " contain NAs which cannot occur in PCA, they have been removed from subsequent analysis. Please impute these sites if you wish to include them. Alternatively, see the R packages LEA and adegenet.", sep = ''))

Dat <- Dat[,-Loc2rem]
}

# Scale the data
Dat_scaled <- scale(Dat, center = center, scale = scale)

# Run the pca
Dat_pca <- stats::prcomp(Dat_scaled)

# Calculate variance explained by each principal component
Dat_pc_var <- summary(Dat_pca)$importance[2:3,]*100
rownames(Dat_pc_var) <- c("Percent variance explained", "Cumulative percent variance explained")

Dat_loadings <- Dat_pca$x
rownames(Dat_loadings) <- Inds

Output <- list(Dat_loadings, Dat_pc_var)

names(Output) <- c("Individual Loadings", "Variance Explained")

if(write & !is.null(prefix)){

utils::write.csv(t(Dat_pc_var), paste(prefix, "_pc_variance_explained.csv", sep = '_'))
utils::write.csv(Dat_loadings, paste(prefix, "_pc_loadings.csv", sep = '_'))

} else if(write & is.null(prefix)){

utils::write.csv(t(Dat_pc_var), "Pc_variance_explained.csv")
utils::write.csv(Dat_loadings, "Pc_loadings.csv")

}

return(Output)

}

2 changes: 2 additions & 0 deletions _pkgdown.yml
Expand Up @@ -25,6 +25,8 @@ navbar:
href: articles/PopGenHelpR_benchmarking.html
- text: Choosing heterozygosity
href: articles/PopGenHelpR_heterozygosity.html
- text: Principal Component Analysis
href: articles/PopGenHelpR_PCA.html
- text: Creating a Q-matrix
href: articles/PopGenHelpR_createQmatrix.html
- text: Sorting a Q-matrix
Expand Down
5 changes: 3 additions & 2 deletions docs/404.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion docs/LICENSE.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 8dbd55b

Please sign in to comment.