In [1]:
from __future__ import division
import numpy as np
from prng import lcgRandom
from sample import PIKK
from scipy.misc import comb, factorial
import pandas as pd
import math

In [2]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    uniqueSamples = dict()

    for i in range(reps): # use range in python 3, xrange in python 2
        sam = frozenset(PIKK(n, k, randomObject))
        if sam in uniqueSamples:
            uniqueSamples[sam] += 1
        else:
            uniqueSamples[sam] = 1
    return uniqueSamples


def getPopMean(x):
    return(np.mean(x))


def getSampleMean(x, uniqueSamples):
    m = 0
    totCnt = 0
    for sam, cnt in uniqueSamples.items():
        m += np.mean([x[i] for i in sam])*cnt
        totCnt += cnt
    sampleMean = m/totCnt
    return(sampleMean)

    
def makePopulation(n, p):
    '''
    Create a population of 0s and 1s
    n = pop size
    p = number of 1s in the population
    '''
    x = [0]*n
    x[:p] = [1]*p
    return(x)

In [3]:
# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
se = []
bias = []
relBias = []
theoreticalSE = []

# RANDU

In [4]:
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            lcg = lcgRandom(seed=100) # set seed of RANDU to 100
            randu_counts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, randu_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['RANDU']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            se = se + [theoreticalSE]
            relBias = relBias + [estimBias/truePopMean]
            
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [5]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop mean' : popMean,
     'Sample mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'Theoretical SE' : theoreticalSE
    }
resTable = pd.DataFrame(d)
cols = ['Pop size', 'Sample size', 'Pop mean', 'Sample mean', 'Bias', 'Relative bias', 'Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,Pop mean,Sample mean,Bias,Relative bias,Theoretical SE
0,13,4,0.384615,0.38825,0.003635,0.00945,0.002107
1,13,4,0.769231,0.770525,0.001294,0.001682,0.001824
2,13,10,0.384615,0.38356,-0.001055,-0.002744,0.000769
3,13,10,0.769231,0.76837,-0.000861,-0.001119,0.000666
4,30,4,0.166667,0.165,-0.001667,-0.01,0.001764
5,30,4,0.333333,0.333175,-0.000158,-0.000475,0.002232
6,30,4,0.666667,0.662525,-0.004142,-0.006212,0.002232
7,30,10,0.166667,0.16518,-0.001487,-0.00892,0.000979
8,30,10,0.333333,0.33148,-0.001853,-0.00556,0.001238
9,30,10,0.666667,0.66544,-0.001227,-0.00184,0.001238


# Super Duper LCG

In [6]:
# Parameters for the Super Duper LCG
A_SD = 0
B_SD = 69069
M_SD = 2**32

# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
seed = []
bias = []
relBias = []
theoreticalSE = []

In [7]:
# Super-Duper, seed=100
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=100, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, sdlcg_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super Duper']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [100]
            
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [8]:
# Super-Duper, seed = 2334242802
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=2334242802, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, sdlcg_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super Duper']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [2334242802]
             
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * nn * (nn-1)))]

In [9]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'seed' : seed,
     'Theoretical SE' : theoreticalSE
    }
resTable = pd.DataFrame(d)
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", 'Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE
0,13,4,100,0.384615,0.385325,0.00071,0.001845,0.002107
1,13,4,100,0.769231,0.7705,0.001269,0.00165,0.001824
22,13,4,2334242802,0.384615,0.384975,0.00036,0.000935,0.000584
23,13,4,2334242802,0.769231,0.77285,0.003619,0.004705,0.000506
2,13,10,100,0.384615,0.38545,0.000835,0.00217,0.000769
3,13,10,100,0.769231,0.76944,0.000209,0.000272,0.000666
24,13,10,2334242802,0.384615,0.38587,0.001255,0.003262,0.000213
25,13,10,2334242802,0.769231,0.77081,0.001579,0.002053,0.000185
4,30,4,100,0.166667,0.165525,-0.001142,-0.00685,0.001764
5,30,4,100,0.333333,0.33235,-0.000983,-0.00295,0.002232


# Mersenne Twister

In [10]:
# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
bias = []
relBias = []
seed = []
theoreticalSE = []

In [11]:
# MT, seed = 100
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(100)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [100]
            
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [12]:
# MT, seed = 2334242802
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(2334242802)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [2334242802]
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [13]:
# MT, seed = 4294967295
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(4294967295)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [4294967295]
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [14]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'seed' : seed,
     'Theoretical SE' : theoreticalSE
    }
resTable = pd.DataFrame(d)
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", 'Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE
0,13,4,100,0.384615,0.382075,-0.002540,-0.006605,0.002107
1,13,4,100,0.769231,0.768775,-0.000456,-0.000593,0.001824
22,13,4,2334242802,0.384615,0.385175,0.000560,0.001455,0.002107
23,13,4,2334242802,0.769231,0.767525,-0.001706,-0.002218,0.001824
44,13,4,4294967295,0.384615,0.385125,0.000510,0.001325,0.002107
45,13,4,4294967295,0.769231,0.769025,-0.000206,-0.000268,0.001824
2,13,10,100,0.384615,0.385080,0.000465,0.001208,0.000769
3,13,10,100,0.769231,0.768670,-0.000561,-0.000729,0.000666
24,13,10,2334242802,0.384615,0.384140,-0.000475,-0.001236,0.000769
25,13,10,2334242802,0.769231,0.768560,-0.000671,-0.000872,0.000666


If all samples were *actually* equally likely, then the sum of $1$s drawn from the population is distributed as hypergeometric with $p = n\times \text{popMean}$ "good" items, $n-p = n*(1-\text{popMean})$ "bad" items, and $k$ draws.
This random variable $X$ is distributed with

$$E(X) = k\text{popMean} = \frac{kp}{n}$$ and 
$$var(X) = \frac{k\text{popMean}(1-\text{popMean})(n-k)}{n-1} = \frac{kp(1-p)(n-k)}{n^2(n-1)}.$$ 

Instead of the sum, we look at the mean of the $k$ draws, $\bar{X}$. It has a scaled hypergeometric distribution, with

$$E(\bar{X}) = \text{popMean} = \frac{p}{n}$$ and 
$$var(\bar{X}) = \frac{\text{popMean}(1-\text{popMean})(n-k)}{k(n-1)} = \frac{p(1-p)(n-k)}{kn^2(n-1)}.$$

Finally, we sample from this distribution $B$ times and take the sample average -- this is the column Sample Mean in the table. This is an average of IID random variables, so it has mean $\text{popMean} = \frac{p}{n}$ and variance

$$\frac{\text{popMean}(1-\text{popMean})(n-k)}{Bk(n-1)} = \frac{p(1-p)(n-k)}{Bkn^2(n-1)}.$$