First commit of hierfstat

jgx65 · Mar 18, 2015 · a217203 · a217203
commit a217203
Show file tree

Hide file tree

Showing 105 changed files with 11,012 additions and 0 deletions.
diff --git a/CHANGES b/CHANGES
@@ -0,0 +1,81 @@
+#version 0.04-16 18.03.2025
+-
+-
+-
+-
+
+#version 0.04-15 30.09.2014
+-fixed a bug in basic.stats, when only one population with index 2, caused a non conformable array error message. 
+Now add 1 to the last element of the population vector
+
+#version 0.04-14 25.09.2014
+-Modified sim.genot to allow for different island sizes and independant inbreeding coefficient
+
+#version 0.04-12 06.02.2014
+-caught bugs in getal and getal.b relative to number of digits encoding alleles (1001-> modulo 1000 not 100)
+
+
+#version 0.04-11 31.10.2013
+-fixed a bug in boot.ppfst which stop the looping over populations to be carried out correctly
+-basic.stats can now take input from data set made of one sample only (a dummy extra line is added to the data set)
+
+#version 0.04-10 09.04.2013
+-changed the help file for read.fstat and read.fstat.data so that the examples now call find.package rather than .find.package
+-added function to write fstat files, structure files, and sub-sampled individuals from populations
+-changed the printout of basic.stats, now an object
+-created a function (requiring ade4) indpca carrying out a PCA on individual allele frequencies
+-Fixed various other small bugs
+
+
+#version 0.04-9 04.10.2012
+modified bs and wc to allow for haploid data and for better output
+#version 0.04-8 30.07.2012
+modified getal and pp.fst to allow for non continuous numbering of populations
+#version 0.04-7 26.07.2012
+added beta.r for estimation of WH02 beta
+#version 0.04-6 23.11.2011
+cleaned up things to avoid warning messages
+#version 0.04-5 30.07.2010
+changed \format to \value in Rd file
+#version 0.04-4 15.08.2006
+removed data frame names (names.data<-names(data)) in prepdata, caused a bug from R version 2.3.1 when only one level and does not seem to do anything
+
+
+#version 0.04-3 30.06.2006
+fixed a bug in boot.vc, now the function removes monomorphic loci before carrying out the bootstrap, and gives an error message if the number of polymorphic loci is less than 5
+
+#version 0.04-2
+
+fixed a bug in genot2al when alleles were coded with 3 digits but with number not exceeding 9
+
+#version 0.04-1
+
+boot.vc now provides bootstrap CI for variance components.
+
+#version 0.03-4
+
+fixed bug in samp.within, now it can handle lev with only one subgroup
+
+#version 0.03-3
+
+Again fixed bugs in test.between, test.between.within and test.within. replaced as.integer(x) with as.integer(factor(x)) forcing consecutive numbering of factor levels. 
+
+#version 0.03-2
+Again fixed bugs in test.between, test.between.within and test.within. No renumbering of factor levels, factors are just forced as integers and sorted once.
+
+updated some references
+
+#version 0.03-1
+in varcomp.glob, added diploid as a parameter passed to varcomp. Absence of this option caused the function to report an error message when the data were haploid
+Changes were made to functions test.between, test.between.within and test.within. The vector rand.unit and test.lev are transformed such that the different units are numbered consecutively
+Updated the reference list
+
+#version 0.02.2
+in samp.between and samp.within, changed x<-NULL (which caused message Error: more elements supplied than there are to replace) into x<-list().  
+
+#version 0.02
+
+The function test.between, test.between.within and test.within now sort the data before doing the resampling. This avoids errors when the vector within, rand.unit and test.lev are not sorted. 
+
+In function test.between and test.between.within, the vector rand.unit is transformed such that the different units are numbered consecutively
+
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,11 @@
+Package: hierfstat
+Version: 0.04-14
+Date: 2014-09-25
+Title: Estimation and tests of hierarchical F-statistics
+Author: Jerome Goudet <jerome.goudet@unil.ch>
+Maintainer: Jerome Goudet <jerome.goudet@unil.ch>
+Depends: gtools,ade4
+Suggests: adegenet,ape
+Description: This R package allows the estimation of hierarchical F-statistics from haploid or diploid genetic data with any numbers of levels in the hierarchy, following the algorithm of Yang (Evolution, 1998, 52(4):950-956). Functions are also given to test via randomisations the significance of each F and variance components, using the likelihood-ratio statistics G -see Goudet etal (Genetics, 1996, 144(4): 1933-1940) 
+License: GPL (>= 2)
+URL: http://www.r-project.org, http://www.unil.ch/popgen/softwares/hierfstat.htm
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,22 @@
+export(allele.count,allelic.richness,
+basic.stats,betai,boot.ppfis,boot.ppfst,boot.vc,
+cfe.dist,da.dist,eucl.dist,eucl.dist.trait,
+g.stats,g.stats.glob,
+genot2al,getal,getal.b,
+ind.count,
+indpca,print.indpca,plot.indpca,
+mat2vec,
+nb.alleles,
+nei.dist,pcoa,pop.freq,pp.fst,
+pp.sigma.loc,prepdata,print.pp.fst,
+read.fstat,read.fstat.data,
+samp.between,samp.between.within,samp.within,
+sim.freq,sim.genot,
+subsampind,
+test.between,test.between.within,test.g,test.within,
+varcomp,varcomp.glob,
+vec2mat,wc,write.fstat,write.struct)
+
+# Import all packages listed as Imports or Depends
+importFrom(gtools,"rdirichlet")
+importFrom(ade4,"dudi.pca")
diff --git a/R/FSTAT.INI b/R/FSTAT.INI
@@ -0,0 +1 @@
+23531   787456
diff --git a/R/beta.R b/R/beta.R
@@ -0,0 +1,69 @@
+my.betai <- function(dat){
+# june 21st 2012
+# estimates betas from Weir & Hill 2002 Ann Rev Genet
+# gendata is a data frame with first column containing pop id and remaining cols containing genotypes
+# betai are from (7) of WH2002
+# betaio are population estimates over loci 1-(sum of nums)/(sum of dens)
+# betaw are average over populations using (9) of WH2002
+# betaii' can be extracted with betas[1,,] etc...
+  ind.count.n<-function(data){
+    dum<-function(x){
+      a<-!is.na(x)
+      tapply(a,data[,1],sum)
+    }
+    data[,1]<-factor(data[,1])
+    apply(data[,-1],2,dum)
+
+  }
+
+  npop<-length(table(dat[,1]))
+  nloc<-dim(dat)[2]-1
+  al.c.t<-apply(dat[,-1],2,function(x) table(factor(x)))
+
+  al.c.pp<-apply(dat[,-1],2,function(x) tapply(factor(x),dat[,1],table))
+  al.c.pp.m<-lapply(al.c.pp,function(x) matrix(unlist(x),byrow=TRUE,nrow=npop))
+  pb<-lapply(al.c.t,function(x) {dum<-sum(x);if (dum>0.0) x/dum})
+  p<-lapply(al.c.pp.m,function(x) {apply(x,1,
+                                             function(y) {dum<-sum(y); if (dum>0.0) y/dum else y})
+  })
+
+
+ninds <- ind.count.n(dat) 
+#p <- pop.freq(gendata)
+#pb <- pop.freq(cbind(rep(1,dim(gendata)[1]),gendata[,-1]))
+n2 <- ninds^2
+n2 <- sweep(n2,2,apply(ninds,2,sum),FUN="/")
+nic <- ninds-n2 #top of p729, WH2002
+snic <- apply(nic,2,sum,na.rm=TRUE) #sum over pop
+betas <- array(dim=c(npop,npop,nloc))
+nums <- array(dim=c(npop,npop,nloc))
+lden <- numeric(nloc)
+for (il in 1:nloc){
+  dum1 <- sweep(sweep(p[[il]],1,pb[[il]],FUN="-")^2,2,ninds[,il],FUN="*")
+  dum2 <- sweep(p[[il]]*(1.0-p[[il]]),2,nic[,il],FUN="*")
+  sden <- sum(dum1+dum2,na.rm=TRUE)
+  lden[il] <- sden
+  for (ip1 in 1:npop){
+    for (ip2 in ip1:npop){
+      if (ip1==ip2){
+        dum1 <- p[[il]][,ip1]*(1.0-p[[il]][,ip1])
+        nums[ip1,ip1,il] <- snic[il]*ninds[ip1,il]/(ninds[ip1,il]-1.0)*sum(dum1,na.rm=TRUE)
+        betas[ip1,ip1,il] <- 1.0-nums[ip1,ip1,il]/sden
+      }
+      else{
+        dum1 <- p[[il]][,ip1]*(1.0-p[[il]][,ip2])
+        dum2 <- p[[il]][,ip2]*(1.0-p[[il]][,ip1])
+        nums[ip1,ip2,il] <- snic[il]*sum(dum1+dum2,na.rm=TRUE)
+        betas[ip1,ip2,il]  <- 1.0-nums[ip1,ip2,il]/2.0/sden
+      }
+    }
+  }
+}
+betai  <- t(apply(betas,3,diag)) # betai per pop and locus
+
+betaio  <- 1-apply(apply(nums,3,diag),1,sum,na.rm=TRUE)/sum(lden,na.rm=TRUE) # beta per pop over loci, as sums of numerators divided by sums of denominators
+
+betaw  <- 1-apply(apply(nums,3,diag)*ninds,2,sum,na.rm=TRUE)/(apply(ninds,2,sum,na.rm=TRUE)*lden) # beta overall per loci, eq 9 p 734 of Weir & Hill 2002,
+
+return(list(betaiip=betas,nic=snic,betai=betai,betaiov=betaio,betaw=betaw))
+}
diff --git a/R/boot.R b/R/boot.R
@@ -0,0 +1,53 @@
+boot.vc<-function (levels = levels, loci = loci, diploid = TRUE, nboot = 1000, 
+    quant = c(0.025, 0.5, 0.975)) 
+{
+    gf <- function(dat, num, den) {
+        sum(dat[num])/sum(dat[den])
+    }
+    nloc <- dim(loci)[2]
+    if (nloc < 5) {
+        stop("Not enough loci to bootstrap. Exiting")
+    }
+    x <- varcomp.glob(levels = levels, loci = loci, diploid = diploid)
+    x.loc <- data.frame(x$loc)
+    rows<-complete.cases(x.loc)
+    if(sum(rows)<5){
+	stop("Not enough polymorphic loci to bootstrap. Exiting")
+    }
+    nloc<-sum(rows)
+    x.loc<-x.loc[rows,]
+    nlev <- dim(x.loc)[2]
+    names(x.loc) <- names(x$overall)
+    mat.boot <- data.frame(matrix(rep(0, nboot * nlev), ncol = nlev))
+    for (i in 1:nboot) {
+        mat.boot[i, ] <- apply(x.loc[sample(nloc, replace = TRUE), 
+            ], 2, sum)
+    }
+    nam <- vector(length = nlev + 1)
+    nam[-1] <- names(x.loc)
+    nam[1] <- "Total"
+    names(mat.boot) <- names(x.loc)
+    mat.res <- data.frame(matrix(rep(0, nboot * (nlev * (nlev + 
+        1)/2)), nrow = nboot))
+    names.res <- vector(length = nlev * (nlev + 1)/2)
+    mat.res[, 1] <- apply(mat.boot, 1, sum)
+    acc <- 0
+    for (i in 1:(nlev - 1)) {
+        acc <- acc + 1
+        mat.res[, acc] <- apply(mat.boot[, c(i:nlev)], 1, sum)/nloc
+        names.res[acc] <- paste("H-", nam[i], sep = "")
+        for (j in i:(nlev - 1)) {
+            acc <- acc + 1
+            mat.res[, acc] <- apply(mat.boot, 1, gf, num = i:j, 
+                den = i:nlev)
+            names.res[acc] <- paste("F-", nam[j + 1], "/", nam[i], 
+                sep = "")
+        }
+    }
+    acc <- acc + 1
+    mat.res[, acc] <- mat.boot[, nlev]/nloc
+    names.res[acc] <- "Hobs"
+    names(mat.res) <- names.res
+    return(list(boot = round(mat.boot,digits=4), res = round(mat.res,digits=4), ci = round(apply(mat.res, 2, quantile, quant),digits=4)))
+}
+