-
Notifications
You must be signed in to change notification settings - Fork 0
/
Div_Stats.R
171 lines (147 loc) · 8.32 KB
/
Div_Stats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#' \bold{WARNING! This function has been deprecated and is no longer supported. Please use the Heterozygosity and Private.alleles functions.}
#' A function to estimate heterozygosity and the number of private alleles from a vcf file.
#'
#' @param VCF Character string indicating the name of the vcf file to be used in analysis.
#' @param pops Character string indicating the name of the population assignment file. This file should have four columns and be in the same order as your vcf file. The first column named Sample indicates the sample name. The second column named Population indicates the population assignment of each individual. The third column named Longitude indicates the longitude of the sample. The fourth column named Latitude indicates the latitude of the sample.
#' @param ploidy Numeric. The ploidy of the data.
#' @param prefix Character string that will be appended to file output.
#' @param write Boolean. Whether or not to write the output to a file in the current working directory.
#'
#' @return A list containing the estimated diversity statistics, model output from linear regression of these statistics against latitude, and model plots.
#' @export
#'
#' @examples
#' \donttest{
#' data("HornedLizard_Pop")
#' data("HornedLizard_VCF")
#' Test <- Div_stats(VCF = HornedLizard_VCF, pops = HornedLizard_Pop,
#' ploidy = 2, write = FALSE)}
Div_stats <- function(VCF, pops, ploidy, write = FALSE, prefix) {
.Deprecated("Heterozygosity", msg = "The Div_Stats function has been deprecated as of PopGenHelpR v1.3.0 and will dissappear in v2.0.0. Please use the Heterozygosity function if you wish to estimate heterozygosity or the Private.alleles function if you wish to calculate the number of private alleles per population. Please use the Point_Map function if you wish to visualize the results on a map or plot.")
Latitude <- Heterozygosity <- Pop <- Standard.Deviation <- Private.Alleles <- NULL
# Read in files and convert to necessary formats
if(missing(VCF)){
stop("Please supply a vcf file for analysis")
}
if(methods::is(VCF,"vcfR")){
Dat <- VCF
Glight <- vcfR::vcfR2genlight(Dat)
}
else if(is.character(VCF)) {
Dat <- vcfR::read.vcfR(VCF, verbose = FALSE)
Glight <- vcfR::vcfR2genlight(Dat)
}
else{
stop("Please supply a vcf file or object of class vcfR for analysis")
}
# Set the ploidy of the genlight
adegenet::ploidy(Glight) <- ploidy
# Make sure that the individuals in the vcf/genlight are in the same order as your popmap
if(is.data.frame(pops)){
Pops <- pops
}
else if(is.character(pops)){
Pops <- utils::read.csv(pops)
}
else{
stop("Please supply a csv file or data frame for population assignment")
}
P <- Pops
if(any(Glight@ind.names %in% P[,1] == FALSE)){
stop("Sample names in the VCF and Population file are not in the same order or samples are missing,
The sample(s) in question are: ",
paste(Glight@ind.names[!(Glight@ind.names %in% P[,1])], collapse = ' ')) }
# Set the populations in the genlight
Pops <- as.factor(Pops[,2])
Glight@pop <- Pops
# Make an genind object and convert to format for heterozygosity calculations
Genind <- dartR::gl2gi(Glight, v = 0)
Hstat <- hierfstat::genind2hierfstat(Genind)
message('Formatting has finished, moving onto calculations')
##########################
##### Heterozygosity #####
##########################
# Calculate heterozygosity and standard deviation for each population
Het <- hierfstat::basic.stats(Hstat)
# Get per populations heterozygosity
H_all <- data.frame(colMeans(Het$Ho, na.rm = TRUE))
H_all$Pop <- rownames(H_all)
colnames(H_all) <- c('Heterozygosity', 'Pop')
# Standard deviation
H_all$StandardDeviation <- stats::sd(H_all$Heterozygosity)
# Attach coordinates
# First we check to make sure that the populations are in the right order
if(any(H_all$Pop != unique(P[,2]))){
stop("Populations are not in the correct order, if you see this please email the package authors")
}
# Pull populations coordinates
Pop_coords <- P[!duplicated(P$Population),]
# Append to heterozygosity estimates
H_all[,4:5] <- Pop_coords[,3:4]
colnames(H_all) <- c('Heterozygosity', 'Pop', 'Standard.Deviation','Longitude', 'Latitude')
# Is there a statistical relationship between heterozygosity and latitude
Het_model <- stats::lm(H_all$Heterozygosity ~ H_all$Latitude)
message('Heterozygosity calculated, moving to private alleles')
###########################
##### Private Alleles #####
###########################
# Get private alleles per population
Private_alleles <- data.frame(rowSums(poppr::private_alleles(Genind)))
Private_alleles$Pop <- rownames(Private_alleles)
colnames(Private_alleles) <- c('PrivateAlleles', 'Pop')
# Attach coordinates
# First we check to make sure that the populations are in the right order
if(any(Private_alleles$Pop != unique(P[,2]))){
stop("Populations are not in the correct order, if you see this please email the package authors")
}
Private_alleles[,3:4] <- Pop_coords[,3:4]
colnames(Private_alleles) <- c('Private.Alleles', 'Pop','Longitude', 'Latitude')
# Is there a statistical relationship between the number of private alleles and latitude
PA_model <- stats::lm(Private_alleles$Private.Alleles ~ Private_alleles$Latitude)
message('Private Alleles have been calculated, moving onto plotting')
##########################
##### Visualizations #####
##########################
##### Heterozygosity #####
# We will make a plot of heterozygostiy estimates and an interpolated map of heterozygosity values for each population
# Plot of heterozygosity values (y-axis) and latitude (x-axis)
Het_plot <- ggplot2::ggplot(data = H_all, ggplot2::aes(x = Latitude, y = Heterozygosity)) + ggplot2::geom_point(ggplot2::aes(color = Pop),size = 3) +
ggplot2::geom_errorbar(data = H_all, ggplot2::aes(x = Latitude, ymin = Heterozygosity-Standard.Deviation, ymax = Heterozygosity+Standard.Deviation,
color = Pop)) +
ggplot2::stat_smooth(method = 'lm', formula = y ~ x, size =1, colour = 'black') +
ggplot2::labs(x = 'Latitude', y = 'Heterozygosity') +
ggplot2::ggtitle(expression(atop("Obersved Heterozygosity of Localities"))) +
ggplot2::theme_classic() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),
legend.title = ggplot2::element_blank())
##### Private Alleles #####
# Plot of private allele values (y-axis) and latitude (x-axis)
Paf_plot <- ggplot2::ggplot(data = Private_alleles, ggplot2::aes(x = Latitude, y = Private.Alleles)) + ggplot2::geom_point(ggplot2::aes(color = Pop), size = 3) +
ggplot2::stat_smooth(method = 'lm', formula = y ~ x, size =1, colour = 'black') +
ggplot2::labs(x = 'Latitude', y = 'Private Alleles') +
ggplot2::ggtitle(expression(atop("Private Alleles of Localities"))) +
ggplot2::theme_classic() + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),
legend.title = ggplot2::element_blank())
Output <- list(H_all, Private_alleles, Het_model, PA_model, Het_plot, Paf_plot)
names(Output) <- c('Heterozygosity_calculations', 'Private_Allele_Calculations', "Heterozygosity_vs_Latitude_Model",
"Private_Allleles_vs_Latitude_Model", "Heterozygosity_vs_Latitude_plot",
"PA_vs_Latitude_plot")
if(write == TRUE){
# Write out heterozygosity results
utils::write.csv(H_all, file = paste(as.character(prefix), '_Heterozygosity.csv', sep = ''), row.names = FALSE)
summary(Het_model)
# Write out linear regression results heterozygosity ~ latitude
sink(paste(as.character(prefix), '_Heterozygosity_lm.txt', sep = ''))
summary(Het_model)
sink()
# Write out private allele results
utils::write.csv(Private_alleles, file = paste(as.character(prefix), '_PrivateAlleles.csv', sep = ''), row.names = FALSE)
# Write out linear regression results private alleles ~ latitude
summary(PA_model)
sink(paste(as.character(prefix), '_Private.Alleles_lm.txt', sep = ''))
summary(PA_model)
sink()
}
message("Calculations have finished, the packages used to perform file formatting and calculations were
vcfR, adegenet, and dartR for formatting, hierfstat to calculate heterozygosity, and poppr to calculate private alleles")
return(Output)
}