In [None]:
from __future__ import division
import numpy as np
from prng import lcgRandom
from sample import PIKK
from scipy.misc import comb, factorial
import pandas as pd
import math

In [None]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    '''
    Tally how often each distinct sample is produced by PIKK.

    PIKK(n, k, randomObject) is called reps times. Each result is converted
    to a frozenset, so the order of its elements is ignored and it can be
    used as a dictionary key.

    Returns a dict mapping each frozenset to the number of times it occurred.
    '''
    tally = {}
    for _ in range(reps): # use range in python 3, xrange in python 2
        drawn = frozenset(PIKK(n, k, randomObject))
        tally[drawn] = tally.get(drawn, 0) + 1
    return tally


def getPopMean(x):
    '''
    Return the arithmetic mean of the population values in x (via np.mean).
    '''
    popAverage = np.mean(x)
    return popAverage


def getSampleMean(x, uniqueSamples):
    '''
    Average the per-sample means over every recorded draw.

    x is the population, indexed by the elements of each sample.
    uniqueSamples maps each sample (an iterable of indices into x) to the
    number of times it was drawn; each sample's mean is weighted by that count.

    Returns the count-weighted average of the sample means.
    '''
    weightedTotal = 0
    nDraws = 0
    for members, timesDrawn in uniqueSamples.items():
        memberValues = [x[idx] for idx in members]
        nDraws += timesDrawn
        weightedTotal += np.mean(memberValues)*timesDrawn
    return weightedTotal/nDraws

    
def makePopulation(n, p):
    '''
    Create a population of 0s and 1s
    n = pop size
    p = number of 1s in the population

    The 1s occupy the first p positions and are followed by n-p 0s.

    Raises ValueError unless 0 <= p <= n. Without this check an out-of-range
    p silently yields a list whose length is not n.
    '''
    if not 0 <= p <= n:
        raise ValueError("p must satisfy 0 <= p <= n (got n=%r, p=%r)" % (n, p))
    return [1]*p + [0]*(n - p)

In [None]:
# Simulation settings and empty result accumulators

reps = 10**4          # samples drawn per configuration
n = [13, 30, 90]      # population sizes
k = [4, 10, 20]       # sample sizes
p = [5, 10, 20]       # numbers of 1s in the population

# One entry is added to each of these lists per simulated configuration
popMean, sampleMean = [], []
nvalues, kvalues = [], []
prng = []
se, theoreticalSE = [], []
bias, relBias = [], []

# RANDU

In [None]:
# For every admissible (population size, sample size, number of 1s) setting,
# draw reps samples with a freshly seeded RANDU generator and record how the
# averaged sample mean compares with the true population mean.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            lcg = lcgRandom(seed=100) # set seed of RANDU to 100
            randu_counts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, randu_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction).
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('RANDU')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            # Store one scalar per configuration; appending the theoreticalSE
            # list itself here would nest a list inside se.
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Assemble the collected results into one table, a row per simulated configuration
cols = ['Pop size', 'Sample size', 'Pop mean', 'Sample mean', 'Bias', 'Relative bias', 'Theoretical SE']
d = dict([
    ('Sample size', kvalues),
    ('Pop size', nvalues),
    ('Pop mean', popMean),
    ('Sample mean', sampleMean),
    ('Bias', bias),
    ('Relative bias', relBias),
    ('Theoretical SE', theoreticalSE),
])
resTable = pd.DataFrame(d)

# Display the columns in reading order, sorted by population then sample size
orderedTable = resTable[cols]
orderedTable.sort_values(['Pop size', 'Sample size'])

# Super Duper LCG

In [None]:
# Super Duper LCG constants, passed to lcgRandom as A, B and M
A_SD, B_SD, M_SD = 0, 69069, 2**32

# Simulation settings and empty result accumulators

reps = 10**4          # samples drawn per configuration
n = [13, 30, 90]      # population sizes
k = [4, 10, 20]       # sample sizes
p = [5, 10, 20]       # numbers of 1s in the population

# One entry is added to each of these lists per simulated configuration and seed
popMean, sampleMean = [], []
nvalues, kvalues = [], []
prng, seed = [], []
se, theoreticalSE = [], []
bias, relBias = [], []

In [None]:
# Super Duper LCG, seed 100: for every admissible (population size, sample
# size, number of 1s) setting, draw reps samples with a freshly seeded
# generator and record how the averaged sample mean compares with the truth.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=100, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, sdlcg_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction). Kept in its own name so the
            # theoreticalSE list is not overwritten by a float, which made the
            # later list concatenation raise TypeError.
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('Super Duper')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            seed.append(100)
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Super Duper LCG, seed 2334242802: for every admissible (population size,
# sample size, number of 1s) setting, draw reps samples with a freshly seeded
# generator and record how the averaged sample mean compares with the truth.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=2334242802, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, sdlcg_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction). Kept in its own name so the
            # theoreticalSE list is not overwritten by a float, which made the
            # later list concatenation raise TypeError.
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('Super Duper')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            seed.append(2334242802)
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Assemble the collected results into one table, a row per configuration and seed
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", 'Theoretical SE']
d = dict([
    ('Sample size', kvalues),
    ('Pop size', nvalues),
    ('Pop Mean', popMean),
    ('Sample Mean', sampleMean),
    ('Bias', bias),
    ('Relative bias', relBias),
    ('seed', seed),
    ('Theoretical SE', theoreticalSE),
])
resTable = pd.DataFrame(d)

# Display the columns in reading order, sorted by population then sample size
orderedTable = resTable[cols]
orderedTable.sort_values(['Pop size', 'Sample size'])

# Mersenne Twister

In [None]:
# Simulation settings and empty result accumulators

reps = 10**4          # samples drawn per configuration
n = [13, 30, 90]      # population sizes
k = [4, 10, 20]       # sample sizes
p = [5, 10, 20]       # numbers of 1s in the population

# One entry is added to each of these lists per simulated configuration and seed
popMean, sampleMean = [], []
nvalues, kvalues = [], []
prng, seed = [], []
se, theoreticalSE = [], []
bias, relBias = [], []

In [None]:
# Mersenne Twister (numpy's global generator), seed 100: for every admissible
# (population size, sample size, number of 1s) setting, reseed, draw reps
# samples and record how the averaged sample mean compares with the truth.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(100)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, mt_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction). Kept in its own name so the
            # theoreticalSE list is not overwritten by a float, which made the
            # later list concatenation raise TypeError.
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('MT')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            seed.append(100)
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Mersenne Twister (numpy's global generator), seed 2334242802: for every
# admissible (population size, sample size, number of 1s) setting, reseed,
# draw reps samples and record how the averaged sample mean compares with
# the truth.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(2334242802)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, mt_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction). Kept in its own name so the
            # theoreticalSE list is not overwritten by a float, which made the
            # later list concatenation raise TypeError.
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('MT')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            seed.append(2334242802)
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Mersenne Twister (numpy's global generator), seed 4294967295: for every
# admissible (population size, sample size, number of 1s) setting, reseed,
# draw reps samples and record how the averaged sample mean compares with
# the truth.
for nn in n:
    for kk in k:
        for pp in p:
            # Only keep settings with fewer than nn ones and a sample smaller
            # than the population.
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(4294967295)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            estimMean = getSampleMean(x, mt_counts)
            estimBias = estimMean - truePopMean

            # SE of one sample's mean if the kk draws were independent
            # (no finite-population correction). Kept in its own name so the
            # theoreticalSE list is not overwritten by a float, which made the
            # later list concatenation raise TypeError.
            singleSampleSE = math.sqrt(truePopMean * (1-truePopMean)/kk)

            # SE of the average of reps sample means. Each sample mean is a
            # scaled hypergeometric with variance
            #     popMean*(1-popMean)*(n-k) / (k*(n-1)),
            # and averaging reps IID copies divides the variance by reps (not
            # reps**2). truePopMean already equals p/n, so there is no extra
            # n**2 in the denominator.
            averageSE = math.sqrt(
                truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))

            popMean.append(truePopMean)
            sampleMean.append(estimMean)
            nvalues.append(nn)
            kvalues.append(kk)
            prng.append('MT')
            bias.append(estimBias)
            relBias.append(estimBias/truePopMean)
            seed.append(4294967295)
            se.append(singleSampleSE)
            theoreticalSE.append(averageSE)

In [None]:
# Assemble the collected results into one table, a row per configuration and seed
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", 'Theoretical SE']
d = dict([
    ('Sample size', kvalues),
    ('Pop size', nvalues),
    ('Pop Mean', popMean),
    ('Sample Mean', sampleMean),
    ('Bias', bias),
    ('Relative bias', relBias),
    ('seed', seed),
    ('Theoretical SE', theoreticalSE),
])
resTable = pd.DataFrame(d)

# Display the columns in reading order, sorted by population then sample size
orderedTable = resTable[cols]
orderedTable.sort_values(['Pop size', 'Sample size'])

If all samples were *actually* equally likely, then the number of $1$s drawn from the population is distributed as hypergeometric with $p = n\times \text{popMean}$ "good" items, $n-p = n\times(1-\text{popMean})$ "bad" items, and $k$ draws.
This random variable $X$ is distributed with

$$E(X) = k\text{popMean} = \frac{kp}{n}$$ and 
$$var(X) = \frac{k\text{popMean}(1-\text{popMean})(n-k)}{n-1} = \frac{kp(n-p)(n-k)}{n^2(n-1)}.$$ 

Instead of the sum, we look at the mean of the $k$ draws, $\bar{X}$. It has a scaled hypergeometric distribution, with

$$E(\bar{X}) = \text{popMean} = \frac{p}{n}$$ and 
$$var(\bar{X}) = \frac{\text{popMean}(1-\text{popMean})(n-k)}{k(n-1)} = \frac{p(n-p)(n-k)}{kn^2(n-1)}.$$

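Finally, we sample from this distribution $B$ times and take the sample average -- this is the column Sample Mean in the table. This is an average of $B$ IID random variables, so it has mean $\text{popMean} = \frac{p}{n}$ and variance $var(\bar{X})/B$, that is,

$$\frac{\text{popMean}(1-\text{popMean})(n-k)}{Bk(n-1)} = \frac{p(n-p)(n-k)}{Bkn^2(n-1)}.$$

For large $B$, this is small, but it shrinks only like $1/B$. If $B = 10^4$, the variance is on the order of $10^{-6}$ for the population and sample sizes used here, i.e. a standard error on the order of $10^{-3}$. The bias we see, also on the order of $10^{-3}$, must therefore be judged against this standard error before it is attributed to the generator.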