In [1]:
#' Empirical distribution of unordered size-k samples
#'
#' Draws `reps` samples with `sample(n, k)`, sorts each one so that the order
#' within a sample is ignored, and counts how often each distinct sample
#' occurs.
#'
#' @param n First argument handed to `sample()`.
#' @param k Number of items per sample.
#' @param reps Number of samples to draw.
#' @return A list with one entry per distinct sample, in order of first
#'   appearance. Each entry is a list with `sample` (the sorted sample) and
#'   `freq` (how many of the `reps` draws equal it).
getEmpiricalDistr <- function(n, k, reps=1e7){
  draws <- replicate(reps, sort(sample(n, k)), simplify = FALSE)
  # Build the reps x k matrix explicitly: t(replicate(...)) produces a
  # 1 x reps matrix when k == 1, because replicate() then simplifies its
  # result to a plain vector.
  samReplicates <- matrix(unlist(draws), ncol = k, byrow = TRUE)

  # One string key per row lets every distinct sample be counted in a single
  # pass instead of rescanning all replicates once per distinct sample.
  # The keys are exact for the integer labels sample() returns for scalar n.
  columns <- lapply(seq_len(ncol(samReplicates)), function(j) samReplicates[, j])
  keys <- do.call(paste, c(columns, sep = "\r"))
  firstSeen <- !duplicated(keys)
  uniqueSampleVec <- samReplicates[firstSeen, , drop = FALSE]
  freqs <- tabulate(match(keys, keys[firstSeen]), nbins = sum(firstSeen))

  lapply(seq_len(nrow(uniqueSampleVec)), function(i) {
    list(sample = uniqueSampleVec[i, ], freq = freqs[i])
  })
}
                                                                          

#' Count how often each item is selected across repeated samples
#'
#' Draws `reps` samples with `sample(n, k)` and tallies how many times each
#' item appears. Items that were never drawn are reported with a count of 0
#' rather than being dropped, so the result always has one row per item.
#'
#' @param n First argument handed to `sample()`.
#' @param k Number of items per sample.
#' @param reps Number of samples to draw.
#' @return A data frame with columns `Item` (factor) and `Count` (integer).
getItemCounts <- function(n, k, reps = 10^7){
  # Order within and across samples does not affect per-item tallies, so the
  # raw draws are pooled without sorting or transposing.
  drawn <- unlist(replicate(reps, sample(n, k), simplify = FALSE))

  # Mirror sample()'s convention: a single finite number >= 1 stands for the
  # items 1..n; anything else is the item set itself.
  items <- if (length(n) == 1L && is.numeric(n) && is.finite(n) && n >= 1) {
    seq_len(n)
  } else {
    sort(unique(n))
  }

  # Fixing the factor levels keeps never-drawn items in the table as zeros.
  itemCounts <- as.data.frame(table(factor(drawn, levels = items)))
  colnames(itemCounts) <- c("Item", "Count")
  itemCounts
}

                                         
#' Add per-item selection probabilities to a table of item counts
#'
#' @param itemCounts Data frame with a `Count` column.
#' @param reps Divisor applied to `Count`; callers should pass the number of
#'   samples that produced the counts.
#' @return `itemCounts` with an extra `Probability` column equal to
#'   `Count / reps`.
getItemFreq <- function(itemCounts, reps = 10^7){
  probability <- itemCounts$Count / reps
  itemCounts$Probability <- probability
  itemCounts
}


#' Ratio of the largest to the smallest probability
#'
#' @param probs Numeric vector of probabilities.
#' @return `max(probs) / min(probs)`.
computeMaxProbRatio <- function(probs){
  largest <- max(probs)
  smallest <- min(probs)
  largest / smallest
}

                                         
#' Run `chisq.test()` on a set of counts and keep the headline numbers
#'
#' @param counts Counts passed unchanged to `chisq.test()`.
#' @return A list with elements `Statistic`, `DF` and `Pvalue`, taken from the
#'   `statistic`, `parameter` and `p.value` components of the test result.
conductChisqTest <- function(counts){
  fit <- chisq.test(counts)
  # Output name -> component of the test result it is copied from.
  fields <- c(Statistic = "statistic", DF = "parameter", Pvalue = "p.value")
  lapply(fields, function(field) fit[[field]])
}

In [2]:
# Simulation settings and result accumulators

reps <- 10^7          # samples drawn per run
n <- c(13, 30, 90)    # population sizes to loop over
k <- c(4, 10, 20)     # sample sizes to loop over

# Accumulators for the selection-probability summaries and run labels;
# the simulation loops append one value to each per run.
maxProb <- minProb <- meanProb <- maxProbRatio <- c()
nvalues <- kvalues <- c()
prng <- seed <- c()

# Accumulators for the chi-square test results.
chisqStatistic <- chisqDF <- chisqPvalue <- c()

# Super-Duper

In [3]:
# Run the simulation with the Super-Duper generator for every (n, k, seed)
# combination and append the results to the shared accumulators.
seedvalues <- c(100, 233424280)

for(nn in n){
  for(kk in k){
    # A sample must be strictly smaller than the population.
    if(kk >= nn){
      next
    }
    
    for(ss in seedvalues){
      
      # Also switches the session's RNG kind, not just the seed.
      set.seed(ss, kind = "Super-Duper")
      # Pass reps explicitly so the probabilities are divided by the number of
      # samples actually drawn, not by getItemFreq()'s default.
      itemFreq <- getItemFreq(getItemCounts(nn, kk, reps), reps)
      
      maxProb <- c(maxProb, max(itemFreq$Probability))
      minProb <- c(minProb, min(itemFreq$Probability))
      meanProb <- c(meanProb, mean(itemFreq$Probability))
      maxProbRatio <- c(maxProbRatio, computeMaxProbRatio(itemFreq$Probability))
      nvalues <- c(nvalues, nn)
      kvalues <- c(kvalues, kk)
      prng <- c(prng, "Super-Duper")
      seed <- c(seed, ss) 
      
      # Chi-square test on the per-item counts.
      chisqtest <- conductChisqTest(itemFreq$Count)
      chisqDF <- c(chisqDF, chisqtest$DF)
      chisqStatistic <- c(chisqStatistic, chisqtest$Statistic)
      chisqPvalue <- c(chisqPvalue, chisqtest$Pvalue)
    }
  }
}

In [4]:
# Run the simulation with the Mersenne-Twister generator for every
# (n, k, seed) combination and append the results to the shared accumulators.
seedvalues <- c(100, 233424280, 429496729)

for(nn in n){
  for(kk in k){
    # A sample must be strictly smaller than the population.
    if(kk >= nn){
      next
    }
    
    for(ss in seedvalues){
      
      # Also switches the session's RNG kind, not just the seed.
      set.seed(ss, kind = "Mersenne-Twister")
      # Pass reps explicitly so the probabilities are divided by the number of
      # samples actually drawn, not by getItemFreq()'s default.
      itemFreq <- getItemFreq(getItemCounts(nn, kk, reps), reps)
      
      maxProb <- c(maxProb, max(itemFreq$Probability))
      minProb <- c(minProb, min(itemFreq$Probability))
      meanProb <- c(meanProb, mean(itemFreq$Probability))
      maxProbRatio <- c(maxProbRatio, computeMaxProbRatio(itemFreq$Probability))
      nvalues <- c(nvalues, nn)
      kvalues <- c(kvalues, kk)
      # Label is stored without the hyphen used in the RNG kind name.
      prng <- c(prng, "Mersenne Twister")
      seed <- c(seed, ss)
      
      # Chi-square test on the per-item counts.
      chisqtest <- conductChisqTest(itemFreq$Count)
      chisqDF <- c(chisqDF, chisqtest$DF)
      chisqStatistic <- c(chisqStatistic, chisqtest$Statistic)
      chisqPvalue <- c(chisqPvalue, chisqtest$Pvalue)
    }
  }
}

# First-order selection probabilities, summary statistics


In [5]:
# Summary of the selection probabilities, one row per simulation run.
# data.frame() keeps the numeric columns numeric; cbind() on a mix of numeric
# and character vectors coerces every column to character.
d1 <- data.frame(nvalues, kvalues, prng, seed, minProb, meanProb, maxProb,
                 maxProbRatio, stringsAsFactors = FALSE)
# Row order by configuration: n, then k, then PRNG label, then seed.
ord <- order(nvalues, kvalues, prng, seed)
d1[ord, ]

nvalues,kvalues,prng,seed,minProb,meanProb,maxProb,maxProbRatio
13,4,Mersenne Twister,100,0.3074506,0.307692307692308,0.308033,1.00189428805798
13,4,Mersenne Twister,233424280,0.307329,0.307692307692308,0.3080307,1.00228322091309
13,4,Mersenne Twister,429496729,0.307523,0.307692307692308,0.3079343,1.00133746093788
13,4,Super-Duper,100,0.3073586,0.307692307692308,0.3079046,1.00177642662349
13,4,Super-Duper,233424280,0.3072619,0.307692307692308,0.3079051,1.00209332819982
13,10,Mersenne Twister,100,0.7689525,0.769230769230769,0.7694902,1.00069926295838
13,10,Mersenne Twister,233424280,0.7690658,0.769230769230769,0.7694497,1.00049917705351
13,10,Mersenne Twister,429496729,0.7690473,0.769230769230769,0.7694752,1.00055640270761
13,10,Super-Duper,100,0.7690176,0.769230769230769,0.7695378,1.00067644745712
13,10,Super-Duper,233424280,0.7689145,0.769230769230769,0.7695121,1.00077719954559


In [6]:
# Chi-square results, one row per simulation run.
# data.frame() keeps the numeric columns numeric (cbind() would coerce all of
# them to character). unname() drops the names carried over from the test
# result, which otherwise become a repeated "X-squared" row label.
d2 <- data.frame(nvalues, kvalues, prng, seed,
                 chisqStatistic = unname(chisqStatistic),
                 chisqDF = unname(chisqDF),
                 chisqPvalue,
                 stringsAsFactors = FALSE)
d2[ord, ]

Unnamed: 0,nvalues,kvalues,prng,seed,chisqStatistic,chisqDF,chisqPvalue
X-squared,13,4,Mersenne Twister,100,9.58361395,12,0.652437901583894
X-squared,13,4,Mersenne Twister,233424280,10.4428574,12,0.57716918502981
X-squared,13,4,Mersenne Twister,429496729,5.91376855,12,0.920366318138758
X-squared,13,4,Super-Duper,100,6.1487546,12,0.908398342344128
X-squared,13,4,Super-Duper,233424280,10.1248137,12,0.605011701620448
X-squared,13,10,Mersenne Twister,100,4.35195914,12,0.976215892067408
X-squared,13,10,Mersenne Twister,233424280,2.42364392,12,0.998424546758798
X-squared,13,10,Mersenne Twister,429496729,2.20075294,12,0.999030458635763
X-squared,13,10,Super-Duper,100,3.3611748,12,0.992412325311763
X-squared,13,10,Super-Duper,233424280,5.272479,12,0.948253181405639
