In [1]:
library(dplyr)

getEmpiricalDistrRaw <- function(n, k, reps=1e7){
  # Draws `reps` samples of size `k` without replacement from 1..n.
  #
  # Args:
  #   n:    population size (passed to sample()).
  #   k:    number of items drawn per sample.
  #   reps: number of samples to draw.
  #
  # Returns:
  #   A reps x k matrix with one sample per row. The items within each row
  #   are sorted in increasing order; the rows themselves are left in the
  #   order they were drawn.
  draws <- replicate(reps, sort(sample(n, k)))
  # replicate() simplifies to a k x reps matrix when k >= 2 but to a plain
  # vector when k == 1, in which case t() would give a 1 x reps matrix.
  # Filling by row with explicit dimensions is correct in both cases.
  samReplicates <- matrix(draws, nrow = reps, ncol = k, byrow = TRUE)
  samReplicates
}


getEmpiricalDistr <- function(samReplicates){
  # Tabulates how often each distinct row occurs.
  #
  # Args:
  #   samReplicates: matrix with one sample per row, as returned by
  #                  getEmpiricalDistrRaw.
  #
  # Returns:
  #   A list with one element per distinct row, ordered by the row's text key.
  #   Each element is a list with
  #     sample: the row as a numeric vector,
  #     freq:   the number of rows equal to it (unnamed integer).
  #   Rows that never occur are not represented (there are no zero counts).
  samReplicates <- as.matrix(samReplicates)

  # One text key per row. A space cannot occur inside a formatted number, so
  # distinct rows cannot collide (a "." separator would clash with decimals).
  keys <- apply(samReplicates, 1, paste0, collapse = " ")

  firstRow <- which(!duplicated(keys))
  uniqueKeys <- keys[firstRow]
  counts <- tabulate(match(keys, uniqueKeys), nbins = length(uniqueKeys))

  # Radix ordering is locale-independent, so the output order is reproducible.
  keyOrder <- order(uniqueKeys, method = "radix")

  # Take the sample values from the matrix itself instead of parsing the key.
  lapply(keyOrder, function(j){
    list(
      sample = as.numeric(samReplicates[firstRow[j], ]),
      freq = counts[j]
    )
  })
}


getEmpiricalDistr_slow <- function(samReplicates){
  # Brute-force tabulation of the rows of samReplicates (the matrix produced
  # by getEmpiricalDistrRaw): each distinct row is compared against every row
  # of the input. Returns a list with one entry per distinct row, each holding
  # `sample` (the row) and `freq` (the number of rows equal to it).
  distinctRows <- unique(samReplicates)
  lapply(seq_len(nrow(distinctRows)), function(idx){
    current <- distinctRows[idx, ]
    isSame <- apply(samReplicates, 1, function(candidate) all(candidate == current))
    list(sample = current, freq = sum(isSame))
  })
}
                                             
                                             
getItemCounts <- function(samReplicates){
  # Counts how many times each item value occurs anywhere in samReplicates
  # (the matrix produced by getEmpiricalDistrRaw). Returns a data frame with
  # one row per distinct item and the columns "Item" and "Count".
  tally <- data.frame(table(samReplicates))
  setNames(tally, c("Item", "Count"))
}

                                             
getEmpiricalDistr_old <- function(n, k, reps=1e7){
  # Older one-step variant: draws the samples and tabulates them by brute force.
  #
  # Args:
  #   n:    population size (passed to sample()).
  #   k:    number of items drawn per sample.
  #   reps: number of samples to draw.
  #
  # Returns:
  #   A list with one element per distinct sample that was drawn, each a list
  #   with `sample` (the sorted items) and `freq` (how often it was drawn).
  draws <- replicate(reps, sort(sample(n, k)))
  # replicate() returns a plain vector when k == 1; building the matrix with
  # explicit dimensions keeps one sample per row in that case as well.
  samReplicates <- matrix(draws, nrow = reps, ncol = k, byrow = TRUE)

  uniqueSampleVec <- unique(samReplicates)
  lapply(seq_len(nrow(uniqueSampleVec)), function(i){
    sam <- uniqueSampleVec[i, ]
    # Every distinct sample is compared against every draw (quadratic work).
    isSame <- apply(samReplicates, 1, function(row) all(row == sam))
    list(sample = sam, freq = sum(isSame))
  })
}
                                                                          

getItemCounts_old <- function(n, k, reps = 10^7){
  # Older one-step variant: draws `reps` sorted samples of size k from 1..n and
  # counts how often each item appears across all of them. Returns a data
  # frame with the columns "Item" and "Count".
  drawn <- t(replicate(reps, sort(sample(n, k))))
  tally <- data.frame(table(drawn))
  names(tally) <- c("Item", "Count")
  tally
}

                                         
getItemFreq <- function(itemCounts, reps = 10^7){
  # Adds a "Probability" column to itemCounts, computed as Count / reps, and
  # returns the augmented table.
  share <- itemCounts$Count / reps
  itemCounts$Probability <- share
  itemCounts
}


computeMaxProbRatio <- function(probs){
  # Ratio of the largest to the smallest value in probs.
  largest <- max(probs)
  smallest <- min(probs)
  largest / smallest
}

                                         
conductChisqTest <- function(counts){
  # Runs chisq.test() on counts and returns the test statistic, its degrees
  # of freedom and the p-value as a list named "Statistic", "DF", "Pvalue".
  fit <- chisq.test(counts)
  out <- list(fit$statistic, fit$parameter, fit$p.value)
  names(out) <- c("Statistic", "DF", "Pvalue")
  out
}


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
library(testthat)

distrNormalRange <- function(w, n){
  # CDF, evaluated at w, of the range (max - min) of n IID standard normals:
  #   n * integral over x of dnorm(x) * (pnorm(x + w) - pnorm(x))^(n - 1),
  # with the integral evaluated numerically over the whole real line.
  integrand <- function(x){
    dnorm(x) * (pnorm(x + w) - pnorm(x))^(n - 1)
  }
  area <- integrate(integrand, lower = -Inf, upper = Inf)$value
  n * area
}


test_distrNormalRange <- function(){
  # Monte Carlo check of distrNormalRange: simulates the range of 100 standard
  # normals 100000 times under a fixed seed and compares the empirical CDF
  # with the numerical one at w = 3, 3.5, ..., 6 (tolerance 0.005).
  sampleSize <- 100
  set.seed(12345)

  simulatedRanges <- replicate(100000, {
    draws <- rnorm(sampleSize)
    diff(range(draws))
  })

  cutoffs <- seq(3, 6, by = 0.5)
  for(cutoff in cutoffs){
    empirical <- mean(simulatedRanges <= cutoff)
    expect_equal(distrNormalRange(cutoff, sampleSize), empirical, tolerance = 0.005)
  }
}

distrMultinomialRange <- function(w, n, k){
  # Approximate CDF, evaluated at w, of the range of multinomial counts for
  # n draws over k categories that each have probability 1/k. The argument is
  # shifted by 1/(2n) and rescaled by sqrt(k/n), then passed to the CDF of the
  # range of k standard normals.
  shifted <- w - 1/(2*n)
  rescaled <- shifted * sqrt(k/n)
  distrNormalRange(rescaled, k)
}

test_distrMultinomialRange <- function(){
  # Monte Carlo check of distrMultinomialRange: simulates 100000 multinomial
  # vectors (10000 draws over 15 equally likely bins) under a fixed seed and
  # compares the empirical CDF of their range with the approximation at
  # w = 10, 20, ..., 200 (tolerance 0.015).
  drawsPerVector <- 10000
  binCount <- 15
  set.seed(12345)

  simulatedRanges <- replicate(100000, {
    counts <- rmultinom(n = 1, size = drawsPerVector, prob = rep(1/binCount, binCount))
    diff(range(counts))
  })

  cutoffs <- (1:20)*10
  for(cutoff in cutoffs){
    empirical <- mean(simulatedRanges <= cutoff)
    expect_equal(distrMultinomialRange(cutoff, drawsPerVector, binCount), empirical, tolerance = 0.015)
  }
}
                   
# Run both Monte Carlo self-checks now. A failing expect_equal() raises an
# error, so no output here means every comparison was within its tolerance.
test_distrNormalRange()
test_distrMultinomialRange()


Attaching package: ‘testthat’

The following object is masked from ‘package:dplyr’:

    matches



In [3]:
# Simulation settings: replications per run, population sizes, sample sizes

reps <- 1e5
n <- c(13, 30, 90)
k <- c(4, 10, 20)

# Result accumulators; each starts out empty (NULL) and is extended with c()
maxProb <- minProb <- meanProb <- maxProbRatio <- NULL
nvalues <- kvalues <- NULL
prng <- seed <- NULL

# FO = first order selection probabilities
chisqStatistic_FO <- chisqDF_FO <- chisqPvalue_FO <- NULL
rangeStat_FO <- rangePvalue_FO <- NULL

# US = unique sample selection probabilities
chisqStatistic_US <- chisqDF_US <- chisqPvalue_US <- NULL
rangeStat_US <- rangePvalue_US <- NULL

# Super-Duper

In [4]:
# Simulation runs using the "Super-Duper" generator.
# For every (nn, kk) pair with kk < nn and every seed, draw `reps` samples and
# append one entry to each accumulator vector. `n`, `k`, `reps` and the
# accumulators must already exist in the session.
seedvalues <- c(100, 233424280)

for(nn in n){
  for(kk in k){
    # Skip pairs where the sample size is not smaller than the population size
    if(kk >= nn){
      next
    }
    
    for(ss in seedvalues){
      
      # Passing `kind` selects the generator for the session as well as seeding it
      set.seed(ss, kind = "Super-Duper")
      edistr <- getEmpiricalDistrRaw(nn, kk, reps)
      # Per-item counts and selection frequencies (Count / reps)
      itemFreq <- getItemFreq(getItemCounts(edistr), reps)
      # Frequency of each distinct sample that was drawn
      samFreq <- getEmpiricalDistr(edistr)
      
      # Summary of the per-item selection frequencies, plus the run's labels
      maxProb <- c(maxProb, max(itemFreq$Probability))
      minProb <- c(minProb, min(itemFreq$Probability))
      meanProb <- c(meanProb, mean(itemFreq$Probability))
      maxProbRatio <- c(maxProbRatio, computeMaxProbRatio(itemFreq$Probability))
      nvalues <- c(nvalues, nn)
      kvalues <- c(kvalues, kk)
      prng <- c(prng, "Super-Duper")
      seed <- c(seed, ss) 
      
      # First order
      chisqtest <- conductChisqTest(itemFreq$Count)
      chisqDF_FO <- c(chisqDF_FO, chisqtest$DF)
      chisqStatistic_FO <- c(chisqStatistic_FO, chisqtest$Statistic)
      chisqPvalue_FO <- c(chisqPvalue_FO, chisqtest$Pvalue)
        
      # Range test: the item counts are treated as reps*kk draws over nn
      # equally likely categories; the p-value is the upper tail 1 - CDF.
      rangeObserved_FO <- diff(range(itemFreq$Count))
      rangeStat_FO <- c(rangeStat_FO, rangeObserved_FO)
      rangePvalue_FO <- c(rangePvalue_FO, 1-distrMultinomialRange(rangeObserved_FO, reps*kk, nn))
        
      # Unique samples
      # NOTE(review): sampleFreqVec holds counts only for samples that were
      # actually drawn, while the range approximation below is given
      # choose(nn, kk) categories. Samples that never occurred contribute no
      # zero counts to the chi-squared test or to the range -- confirm that
      # this is intended when reps is not large relative to choose(nn, kk).
      sampleFreqVec <- sapply(samFreq, function(x) x$freq)
      chisqtest <- conductChisqTest(sampleFreqVec)
      chisqDF_US <- c(chisqDF_US, chisqtest$DF)
      chisqStatistic_US <- c(chisqStatistic_US, chisqtest$Statistic)
      chisqPvalue_US <- c(chisqPvalue_US, chisqtest$Pvalue)
      
      rangeObserved_US <- diff(range(sampleFreqVec))
      rangeStat_US <- c(rangeStat_US, rangeObserved_US)
      rangePvalue_US <- c(rangePvalue_US, 1-distrMultinomialRange(rangeObserved_US, reps, choose(nn, kk)))
    }
  }
}

“Chi-squared approximation may be incorrect”

In [5]:
# Simulation runs using the "Mersenne-Twister" generator.
# For every (nn, kk) pair with kk < nn and every seed, draw `reps` samples and
# append one entry to each accumulator vector. `n`, `k`, `reps` and the
# accumulators must already exist in the session.
seedvalues <- c(100, 233424280, 429496729)

for(nn in n){
  for(kk in k){
    # Skip pairs where the sample size is not smaller than the population size
    if(kk >= nn){
      next
    }
    
    for(ss in seedvalues){
      
      # Passing `kind` selects the generator for the session as well as seeding it
      set.seed(ss, kind = "Mersenne-Twister")
      edistr <- getEmpiricalDistrRaw(nn, kk, reps)
      # Per-item counts and selection frequencies (Count / reps)
      itemFreq <- getItemFreq(getItemCounts(edistr), reps)
      # Frequency of each distinct sample that was drawn
      samFreq <- getEmpiricalDistr(edistr)

      # Summary of the per-item selection frequencies, plus the run's labels
      maxProb <- c(maxProb, max(itemFreq$Probability))
      minProb <- c(minProb, min(itemFreq$Probability))
      meanProb <- c(meanProb, mean(itemFreq$Probability))
      maxProbRatio <- c(maxProbRatio, computeMaxProbRatio(itemFreq$Probability))
      nvalues <- c(nvalues, nn)
      kvalues <- c(kvalues, kk)
      # The label is written with a space, unlike the `kind` string above
      prng <- c(prng, "Mersenne Twister")
      seed <- c(seed, ss)
      
      # First order
      chisqtest <- conductChisqTest(itemFreq$Count)
      chisqDF_FO <- c(chisqDF_FO, chisqtest$DF)
      chisqStatistic_FO <- c(chisqStatistic_FO, chisqtest$Statistic)
      chisqPvalue_FO <- c(chisqPvalue_FO, chisqtest$Pvalue)
        
      # Range test: the item counts are treated as reps*kk draws over nn
      # equally likely categories; the p-value is the upper tail 1 - CDF.
      rangeObserved_FO <- diff(range(itemFreq$Count))
      rangeStat_FO <- c(rangeStat_FO, rangeObserved_FO)
      rangePvalue_FO <- c(rangePvalue_FO, 1-distrMultinomialRange(rangeObserved_FO, reps*kk, nn))
        
      # Unique samples
      # NOTE(review): sampleFreqVec holds counts only for samples that were
      # actually drawn, while the range approximation below is given
      # choose(nn, kk) categories. Samples that never occurred contribute no
      # zero counts to the chi-squared test or to the range -- confirm that
      # this is intended when reps is not large relative to choose(nn, kk).
      sampleFreqVec <- sapply(samFreq, function(x) x$freq)
      chisqtest <- conductChisqTest(sampleFreqVec)
      chisqDF_US <- c(chisqDF_US, chisqtest$DF)
      chisqStatistic_US <- c(chisqStatistic_US, chisqtest$Statistic)
      chisqPvalue_US <- c(chisqPvalue_US, chisqtest$Pvalue)
      
      rangeObserved_US <- diff(range(sampleFreqVec))
      rangeStat_US <- c(rangeStat_US, rangeObserved_US)
      rangePvalue_US <- c(rangePvalue_US, 1-distrMultinomialRange(rangeObserved_US, reps, choose(nn, kk)))
    }
  }
}

“Chi-squared approximation may be incorrect”

# First-order selection probabilities, summary statistics


In [6]:
# Combine the per-run summaries column-wise, one row per simulation run.
# Because `prng` is character, cbind() produces a character matrix, so the
# numeric columns are shown as strings.
d1 <- cbind(nvalues, kvalues, prng, seed, minProb, meanProb, maxProb, maxProbRatio)
rownames(d1) <- NULL

# Display order: by population size, then sample size, then PRNG, then seed
ord <- order(nvalues, kvalues, prng, seed)
d1[ord, ]

nvalues,kvalues,prng,seed,minProb,meanProb,maxProb,maxProbRatio
13,4,Mersenne Twister,100,0.30392,0.307692307692308,0.31095,1.02313108712819
13,4,Mersenne Twister,233424280,0.30355,0.307692307692308,0.30996,1.02111678471422
13,4,Mersenne Twister,429496729,0.3045,0.307692307692308,0.31002,1.01812807881773
13,4,Super-Duper,100,0.30574,0.307692307692308,0.31049,1.01553607640479
13,4,Super-Duper,233424280,0.30506,0.307692307692308,0.31204,1.02288074477152
13,10,Mersenne Twister,100,0.76694,0.769230769230769,0.77084,1.00508514355751
13,10,Mersenne Twister,233424280,0.76747,0.769230769230769,0.77187,1.00573312311882
13,10,Mersenne Twister,429496729,0.7674,0.769230769230769,0.77173,1.00564242898097
13,10,Super-Duper,100,0.76746,0.769230769230769,0.77096,1.00456049826701
13,10,Super-Duper,233424280,0.76699,0.769230769230769,0.77114,1.00541076154839


# First order selection probabilities, chi-squared test and range statistic

We first test whether each item in the population is selected with equal probability. We do two tests: the usual chi-squared test and another test based on the range of the multinomial counts, $\max_i n_i - \min_i n_i$, where $n_1, \dots, n_m$ are the counts in $m$ cells that each have probability $1/m$, and $n = \sum_i n_i$ is the total count.

Johnson and Young (1960) and Young (1962) provide the following approximation to the distribution of the range

$$P(\max_i n_i - \min_i n_i \leq r) \approx P(W_m \leq (r-(2n)^{-1})(m/n)^{1/2})$$

where $W_m$ denotes the sample range of $m$ independent standard normal random variables. It is a known result (see e.g. Pearson and Hartley p. 43, 1954 or Ruben, 1960) that the distribution function for the range of IID normal samples is given by

$$R(w) = P(W_m \leq w) = m \int_{-\infty}^{\infty} \phi(x)\left[ \Phi(x+w) - \Phi(x)\right]^{m-1}dx$$

where $\phi$ and $\Phi$ are the standard normal density and cumulative distribution function, respectively.  We leverage these two results to approximate the p-value of the range statistic.

In [7]:
# First-order results per run: chi-squared statistic, degrees of freedom and
# p-value, plus the observed range of the item counts and its p-value.
d2 <- cbind(nvalues, kvalues, prng, seed, chisqStatistic_FO, chisqDF_FO, chisqPvalue_FO, rangeStat_FO, rangePvalue_FO)
rownames(d2) <- NULL
# `ord` is the row ordering computed earlier with order(nvalues, kvalues, prng, seed)
d2[ord, ]

nvalues,kvalues,prng,seed,chisqStatistic_FO,chisqDF_FO,chisqPvalue_FO,rangeStat_FO,rangePvalue_FO
13,4,Mersenne Twister,100,10.85956,12,0.54099128727911,703,0.186515087785709
13,4,Mersenne Twister,233424280,10.54873,12,0.567935882805461,641,0.31990507395412
13,4,Mersenne Twister,429496729,10.455715,12,0.57604665489425,552,0.572500663217839
13,4,Super-Duper,100,11.014,12,0.527719265030047,475,0.787070878809145
13,4,Super-Duper,233424280,13.47854,12,0.335238088863653,698,0.195575636664423
13,10,Mersenne Twister,100,2.177122,12,0.999082217703549,390,0.998840270418633
13,10,Mersenne Twister,233424280,2.77247,12,0.99694831066334,440,0.996307279066042
13,10,Mersenne Twister,429496729,2.928418,12,0.996028631859736,433,0.996822148904004
13,10,Super-Duper,100,1.863744,12,0.999587952539855,350,0.99960969510869
13,10,Super-Duper,233424280,2.477838,12,0.998241320438647,415,0.997877058821156


# Selection probabilities for unique samples, chi-squared test

In [8]:
# Unique-sample results per run: chi-squared statistic, degrees of freedom and
# p-value, plus the observed range of the sample frequencies and its p-value.
d3 <- cbind(nvalues, kvalues, prng, seed, chisqStatistic_US, chisqDF_US, chisqPvalue_US, rangeStat_US, rangePvalue_US)
rownames(d3) <- NULL
# `ord` is the row ordering computed earlier with order(nvalues, kvalues, prng, seed)
d3[ord, ]

nvalues,kvalues,prng,seed,chisqStatistic_US,chisqDF_US,chisqPvalue_US,rangeStat_US,rangePvalue_US
13,4,Mersenne Twister,100,674.2022,714,0.854439233091403,67,0.904172473345367
13,4,Mersenne Twister,233424280,718.2605,714,0.448221654618962,76,0.35957903649256
13,4,Mersenne Twister,429496729,671.857,714,0.868690003386312,72,0.625296501574707
13,4,Super-Duper,100,709.4517,714,0.541004573175527,65,0.984998743680458
13,4,Super-Duper,233424280,690.8188,714,0.726584562007638,80,0.167957671342366
13,10,Mersenne Twister,100,286.3004,285,0.467202947457186,102,0.667771235436501
13,10,Mersenne Twister,233424280,288.2166,285,0.435602866775834,97,0.840935238900607
13,10,Mersenne Twister,429496729,290.93932,285,0.391679825395672,112,0.290051846417693
13,10,Super-Duper,100,283.8408,285,0.508250242391722,105,0.547499420477291
13,10,Super-Duper,233424280,255.155,285,0.897672661305453,94,0.913712827464592
