In [1]:
from __future__ import division
import numpy as np
from prng import lcgRandom
from sample import PIKK
from scipy.misc import comb, factorial
from scipy.stats import chisquare, norm 
import scipy.integrate as integrate
import pandas as pd

In [2]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    uniqueSamples = dict()

    for i in range(reps): # use range in python 3, xrange in python 2
        sam = frozenset(PIKK(n, k, randomObject))
        if sam in uniqueSamples:
            uniqueSamples[sam] += 1
        else:
            uniqueSamples[sam] = 1
    return uniqueSamples
    

def getItemCounts(uniqueSamples):
    itemCounts = dict()
    for u,v in uniqueSamples.items():
        for i in u:
            if i in itemCounts:
                itemCounts[i] += v
            else:
                itemCounts[i] = v
    return itemCounts


def printItemFreq(itemCounts, verbose=False):
    for i in itemCounts.keys():
        itemCounts[i] /= reps
        if verbose:
            print(i, itemCounts[i])
    return itemCounts
    

def printMaxProbRatio(itemCounts, verbose=False):
    freq = list(itemCounts.values())
    pr = np.amax(freq)/np.amin(freq)
    if verbose:
        print("Max ratio of selection probs: " + str(pr))
    return pr


def conductChiSquareTest(itemCounts):
    freq = list(itemCounts.values())
    return(chisquare(freq))

In [3]:
def distrNormalRange(w, n):
    '''
    The CDF of the range of n IID standard normals evaluated at w
    '''
    innerInt = lambda x: norm.pdf(x)*(norm.cdf(x+w) - norm.cdf(x))**(n-1)
    tmp = integrate.quad(innerInt, -np.inf, np.inf)
    return n*tmp[0]


def test_distrNormalRange():
    n = 100
    np.random.seed(12345)

    empiricalRangeDistr = np.array([np.ptp(norm.rvs(size=n)) for i in range(100000)])
    for w in np.array(range(6,13))/2:
        emp = np.mean(empiricalRangeDistr <= w)
        theoretical = distrNormalRange(w, n)
        assert np.abs(emp - theoretical) <= 0.005
    return None


def distrMultinomialRange(w, n, k):
    '''
    CDF of the range of multinomial variables, evaluated at w
    n draws, k categories each having probability 1/k
    '''
    cutoff = (w - 1/(2*n))*np.sqrt(k/n)
    return distrNormalRange(cutoff, k)


def test_distrMultinomialRange():
    reps = 10000
    bins = 15
    np.random.seed(12345)

    empiricalRangeDistr = np.ptp(np.random.multinomial(n=reps, pvals=[1/bins]*bins, size=100000), axis=1)
    for w in np.array(range(20))*10:
        emp = np.mean(empiricalRangeDistr <= w)
        theoretical = distrMultinomialRange(w, reps, bins)
        assert np.abs(emp - theoretical) <= 0.05
    return None

# will be silent if there are no errors
test_distrNormalRange()
test_distrMultinomialRange()

In [4]:
# Boilerplate stuff

reps = int(10**5)
n = [13, 30, 90]
k = [4, 10, 20]

maxProb = []
minProb = []
meanProb = []
maxProbRatio = []
nvalues = []
kvalues = []
prng = []
seed = []

# FO = first order selection probabilities
chisqStatistic_FO = []
chisqDF_FO = []
chisqPvalue_FO = []
rangeStat_FO = []
rangePvalue_FO = []

# US = unique sample selection probabilities
chisqStatistic_US = []
chisqDF_US = []
chisqPvalue_US = []
rangeStat_US = []
rangePvalue_US = []

# RANDU

In [5]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        lcg = lcgRandom(seed=100) # set seed of RANDU to 100

        uniqueSampleCounts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)
        itemCounts = getItemCounts(uniqueSampleCounts)
        
        # First order
        chisqTestResults = conductChiSquareTest(itemCounts)
        chisqDF_FO = chisqDF_FO + [len(itemCounts)]
        chisqStatistic_FO = chisqStatistic_FO + [chisqTestResults[0]]
        chisqPvalue_FO = chisqPvalue_FO + [chisqTestResults[1]]
        
        rangeStatObserved = np.ptp(list(itemCounts.values()))
        rangeStat_FO = rangeStat_FO + [rangeStatObserved]
        rangePvalue_FO = rangePvalue_FO + [1-distrMultinomialRange(rangeStatObserved, reps*kk, nn)]
        
        # Unique samples
        chisqTestResults = conductChiSquareTest(uniqueSampleCounts)
        chisqDF_US = chisqDF_US + [len(uniqueSampleCounts)]
        chisqStatistic_US = chisqStatistic_US + [chisqTestResults[0]]
        chisqPvalue_US = chisqPvalue_US + [chisqTestResults[1]]
        
        rangeStatObserved = np.ptp(list(itemCounts.values()))
        rangeStat_US = rangeStat_US + [rangeStatObserved]
        rangePvalue_US = rangePvalue_US + [1-distrMultinomialRange(rangeStatObserved, reps, comb(nn, kk))]
        
        # Selection probability summary stats
        itemFreq = printItemFreq(itemCounts)
        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['RANDU']
        seed = seed + [100]

# Super Duper LCG

In [6]:
# Parameters for the Super Duper LCG
A_SD = 0
B_SD = 69069
M_SD = 2**32

In [7]:
seedvalues = [100, 233424280]

for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        for ss in seedvalues:
            sdlcg = lcgRandom(seed=ss, A=A_SD, B=B_SD, M=M_SD)

            uniqueSampleCounts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            itemCounts = getItemCounts(uniqueSampleCounts)
        
            # First order
            chisqTestResults = conductChiSquareTest(itemCounts)
            chisqDF_FO = chisqDF_FO + [len(itemCounts)]
            chisqStatistic_FO = chisqStatistic_FO + [chisqTestResults[0]]
            chisqPvalue_FO = chisqPvalue_FO + [chisqTestResults[1]]
        
            rangeStatObserved = np.ptp(list(itemCounts.values()))
            rangeStat_FO = rangeStat_FO + [rangeStatObserved]
            rangePvalue_FO = rangePvalue_FO + [1-distrMultinomialRange(rangeStatObserved, reps*kk, nn)]
        
            # Unique samples
            chisqTestResults = conductChiSquareTest(uniqueSampleCounts)
            chisqDF_US = chisqDF_US + [len(uniqueSampleCounts)]
            chisqStatistic_US = chisqStatistic_US + [chisqTestResults[0]]
            chisqPvalue_US = chisqPvalue_US + [chisqTestResults[1]]
        
            rangeStatObserved = np.ptp(list(itemCounts.values()))
            rangeStat_US = rangeStat_US + [rangeStatObserved]
            rangePvalue_US = rangePvalue_US + [1-distrMultinomialRange(rangeStatObserved, reps, comb(nn, kk))]
        
            # Selection probability summary stats
            itemFreq = printItemFreq(itemCounts)
            maxProb = maxProb + [np.amax(list(itemFreq.values()))]
            minProb = minProb + [np.amin(list(itemFreq.values()))]
            meanProb = meanProb + [np.mean(list(itemFreq.values()))]
            maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super-Duper']
            seed = seed + [ss]

# Mersenne Twister

In [8]:
seedvalues = [100, 233424280, 429496729]

for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        for ss in seedvalues:
            mt = np.random
            mt.seed(ss)

            uniqueSampleCounts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            itemCounts = getItemCounts(uniqueSampleCounts)
        
            # First order
            chisqTestResults = conductChiSquareTest(itemCounts)
            chisqDF_FO = chisqDF_FO + [len(itemCounts)]
            chisqStatistic_FO = chisqStatistic_FO + [chisqTestResults[0]]
            chisqPvalue_FO = chisqPvalue_FO + [chisqTestResults[1]]
        
            rangeStatObserved = np.ptp(list(itemCounts.values()))
            rangeStat_FO = rangeStat_FO + [rangeStatObserved]
            rangePvalue_FO = rangePvalue_FO + [1-distrMultinomialRange(rangeStatObserved, reps*kk, nn)]
        
            # Unique samples
            chisqTestResults = conductChiSquareTest(uniqueSampleCounts)
            chisqDF_US = chisqDF_US + [len(uniqueSampleCounts)]
            chisqStatistic_US = chisqStatistic_US + [chisqTestResults[0]]
            chisqPvalue_US = chisqPvalue_US + [chisqTestResults[1]]
        
            rangeStatObserved = np.ptp(list(itemCounts.values()))
            rangeStat_US = rangeStat_US + [rangeStatObserved]
            rangePvalue_US = rangePvalue_US + [1-distrMultinomialRange(rangeStatObserved, reps, comb(nn, kk))]
        
            # Selection probability summary stats
            itemFreq = printItemFreq(itemCounts)
            maxProb = maxProb + [np.amax(list(itemFreq.values()))]
            minProb = minProb + [np.amin(list(itemFreq.values()))]
            meanProb = meanProb + [np.mean(list(itemFreq.values()))]
            maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Mersenne Twister']
            seed = seed + [ss]

# First-order selection probabilities, summary statistics

In [9]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Min Prob' : minProb,
     'Mean Prob' : meanProb,
     'Max Prob' : maxProb,
     'Max Selection Prob Ratio' : maxProbRatio,
     'seed' : seed
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols.reverse()
cols = [cols[2]] + [cols[1]] + [cols[0]] + cols[3:6] + [cols[7]] + [cols[6]]
resTable[cols].sort_values(['Pop size', 'Sample size', 'PRNG', 'seed'])

Unnamed: 0,Pop size,Sample size,seed,PRNG,Min Prob,Mean Prob,Max Prob,Max Selection Prob Ratio
24,13,4,100,Mersenne Twister,0.30512,0.307692,0.31068,1.018222
25,13,4,233424280,Mersenne Twister,0.3055,0.307692,0.31008,1.014992
26,13,4,429496729,Mersenne Twister,0.30549,0.307692,0.30888,1.011097
0,13,4,100,RANDU,0.30526,0.307692,0.31053,1.017264
8,13,4,100,Super-Duper,0.30199,0.307692,0.31008,1.026789
9,13,4,233424280,Super-Duper,0.30545,0.307692,0.3102,1.015551
27,13,10,100,Mersenne Twister,0.76686,0.769231,0.77085,1.005203
28,13,10,233424280,Mersenne Twister,0.76791,0.769231,0.77164,1.004857
29,13,10,429496729,Mersenne Twister,0.76637,0.769231,0.77096,1.005989
1,13,10,100,RANDU,0.76582,0.769231,0.77223,1.00837


# First order selection probabilities, chi-squared test and range statistic

We first test whether each item $1, \dots, k$ is selected with equal probability. We do two tests: the usual chi-squared test and another test based on the range of the multinomial values, $max_i n_i - min_i n_i$, where $n_1, \dots, n_k$ are the number of items in each of $k$ cells that have equal probability $1/k$.

Johnson and Young (1960) and Young (1962) provide the following approximation to the distribution of the range

$$P(\max_i n_k - \min_i n_k \leq r) \approx P(W_m \leq (r-(2n)^{-1})(m/n)^{1/2})$$

where $W_m$ denotes the sample range of $m$ independent standard normal random variables. It is a known result (see e.g. Pearson and Hartley p. 43, 1954 or Ruben, 1960) that the distribution function for the range of IID normal samples is given by

$$R(w) = n \int_{-\infty}^{\infty} \phi(x)\left[ \Phi(x+w) - \Phi(x)\right]^{n-1}dx$$

where $\phi$ and $\Phi$ are the standard normal density and cumulative distribution function, respectively.  We leverage these two results to approximate the p-value of the range statistic.

In [10]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Max Selection Prob Ratio' : maxProbRatio,
     'seed' : seed,
     'Chi-squared' : chisqStatistic_FO,
     'Df' : chisqDF_FO,
     'P-value' : chisqPvalue_FO,
     'Range' : rangeStat_FO,
     'Range P-value' : rangePvalue_FO
    }
resTable = pd.DataFrame(d)
cols = ['Pop size', 'Sample size', 'PRNG', 'seed', 'Chi-squared', 'Df', 'P-value', 'Range', 'Range P-value']
resTable[cols].sort_values(['Pop size', 'Sample size', 'PRNG', 'seed'])

Unnamed: 0,Pop size,Sample size,PRNG,seed,Chi-squared,Df,P-value,Range,Range P-value
24,13,4,Mersenne Twister,100,7.88438,13,0.794099,556,0.560505
25,13,4,Mersenne Twister,233424280,10.06032,13,0.610669,458,0.826192
26,13,4,Mersenne Twister,429496729,4.07694,13,0.982008,339,0.979089
0,13,4,RANDU,100,9.79512,13,0.633929,527,0.646642
8,13,4,Super-Duper,100,18.204235,13,0.109629,809,0.058586
9,13,4,Super-Duper,233424280,9.11249,13,0.693293,475,0.787071
27,13,10,Mersenne Twister,100,2.212378,13,0.999004,399,0.998549
28,13,10,Mersenne Twister,233424280,2.30174,13,0.998784,373,0.999256
29,13,10,Mersenne Twister,429496729,2.256448,13,0.9989,459,0.994548
1,13,10,RANDU,100,5.140914,13,0.953106,641,0.919437


# Selection probabilities for unique samples, chi-squared test + range test

In [11]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Max Selection Prob Ratio' : maxProbRatio,
     'seed' : seed,
     'Chi-squared' : chisqStatistic_US,
     'Df' : chisqDF_US,
     'P-value' : chisqPvalue_US,
     'Range' : rangeStat_US,
     'Range P-value' : rangePvalue_US
    }
resTable = pd.DataFrame(d)
cols = ['Pop size', 'Sample size', 'PRNG', 'seed', 'Chi-squared', 'Df', 'P-value', 'Range', 'Range P-value']
resTable[cols].sort_values(['Pop size', 'Sample size', 'PRNG', 'seed'])

Unnamed: 0,Pop size,Sample size,PRNG,seed,Chi-squared,Df,P-value,Range,Range P-value
24,13,4,Mersenne Twister,100,730.7301,715,0.323863,556,-2.220446e-16
25,13,4,Mersenne Twister,233424280,689.8607,715,0.735144,458,-2.220446e-16
26,13,4,Mersenne Twister,429496729,686.8291,715,0.761349,339,-2.220446e-16
0,13,4,RANDU,100,702.8022,715,0.610347,527,-2.220446e-16
8,13,4,Super-Duper,100,785.4848,715,0.032192,809,-2.220446e-16
9,13,4,Super-Duper,233424280,724.6955,715,0.382373,475,-2.220446e-16
27,13,10,Mersenne Twister,100,340.68044,286,0.013105,399,2.298162e-14
28,13,10,Mersenne Twister,233424280,291.7344,286,0.379134,373,2.298162e-14
29,13,10,Mersenne Twister,429496729,238.71,286,0.978656,459,2.298162e-14
1,13,10,RANDU,100,321.70148,286,0.06636,641,2.298162e-14
