In [1]:
from __future__ import division
import numpy as np
from prng import lcgRandom
from sample import PIKK
from scipy.misc import comb, factorial
import pandas as pd
import math

In [2]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    uniqueSamples = dict()

    for i in range(reps): # use range in python 3, xrange in python 2
        sam = frozenset(PIKK(n, k, randomObject))
        if sam in uniqueSamples:
            uniqueSamples[sam] += 1
        else:
            uniqueSamples[sam] = 1
    return uniqueSamples


def getItemCounts(uniqueSamples):
    itemCounts = dict()
    for u,v in uniqueSamples.items():
        for i in u:
            if i in itemCounts:
                itemCounts[i] += v
            else:
                itemCounts[i] = v
    return itemCounts


def printItemFreq(itemCounts, verbose=False):
    for i in itemCounts.keys():
        itemCounts[i] /= reps
        if verbose:
            print(i, itemCounts[i])
    return itemCounts


def findFreqItems(itemCounts, m):
    '''
    Return indices of the m most frequently occurring items
    '''
    ordered = sorted(enumerate(list(itemCounts.values())), key = lambda x: x[1], reverse = True)
    topM = ordered[:m]
    grabIndex = [i[0] for i in topM]
    return grabIndex


def getPopMean(x):
    return(np.mean(x))


def getSampleMean(x, uniqueSamples):
    m = 0
    totCnt = 0
    for sam, cnt in uniqueSamples.items():
        m += np.mean([x[i] for i in sam])*cnt
        totCnt += cnt
    sampleMean = m/totCnt
    return(sampleMean)

    
def makePopulation(n, p):
    '''
    Create a population of 0s and 1s
    n = pop size
    p = number of 1s in the population
    '''
    x = [0]*n
    x[:p] = [1]*p
    return(x)


def makeAdversarialPopulation(n, indices):
    '''
    Create a population of 0s and 1s
    n = pop size
    indices = locations to put the 1s
    '''
    x = [0]*n
    for i in indices:
        x[i] = 1
    return x

In [3]:
# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
se = []
bias = []
relBias = []
theoreticalSE = []

# RANDU

In [4]:
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            lcg = lcgRandom(seed=100) # set seed of RANDU to 100
            randu_counts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(randu_counts), pp)
            
#            x = makePopulation(nn, pp)
            x = makeAdversarialPopulation(nn, most_freq_p)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, randu_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['RANDU']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            se = se + [theoreticalSE]
            relBias = relBias + [estimBias/truePopMean]
            
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [5]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'Theoretical SE' : theoreticalSE,
     'Bias/Theoretical SE' : np.array(bias)/np.array(theoreticalSE)
    }
resTable = pd.DataFrame(d)
cols = ["Pop size", "Sample size", "Pop Mean", "Sample Mean", "Bias", "Relative bias", \
        'Theoretical SE', 'Bias/Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE,Bias/Theoretical SE
0,13,4,0.384615,0.392375,0.00776,0.020175,0.002107,3.683434
1,13,4,0.769231,0.775,0.005769,0.0075,0.001824,3.162278
2,13,10,0.384615,0.38695,0.002335,0.00607,0.000769,3.035
3,13,10,0.769231,0.77161,0.002379,0.003093,0.000666,3.571489
4,30,4,0.166667,0.17315,0.006483,0.0389,0.001764,3.674573
5,30,4,0.333333,0.342625,0.009292,0.027875,0.002232,4.163343
6,30,4,0.666667,0.677175,0.010508,0.015763,0.002232,4.708498
7,30,10,0.166667,0.17021,0.003543,0.02126,0.000979,3.620448
8,30,10,0.333333,0.33855,0.005217,0.01565,0.001238,4.213891
9,30,10,0.666667,0.67182,0.005153,0.00773,0.001238,4.162732


# Super Duper LCG

In [6]:
# Parameters for the Super Duper LCG
A_SD = 0
B_SD = 69069
M_SD = 2**32

# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
seed = []
bias = []
relBias = []
theoreticalSE = []

In [7]:
# Super-Duper, seed=100
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=100, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(sdlcg_counts), pp)
            
            x = makeAdversarialPopulation(nn, most_freq_p)
            #x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, sdlcg_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super Duper']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [100]
            
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [8]:
# Super-Duper, seed = 2334242802
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            sdlcg = lcgRandom(seed=2334242802, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(sdlcg_counts), pp)
            
            #x = makePopulation(nn, pp)
            x = makeAdversarialPopulation(nn, most_freq_p)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, sdlcg_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super Duper']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [2334242802]
             
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * nn * (nn-1)))]

In [9]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'seed' : seed,
     'Theoretical SE' : theoreticalSE,
     'Bias/Theoretical SE' : np.array(bias)/np.array(theoreticalSE)
    }
resTable = pd.DataFrame(d)
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", \
        'Theoretical SE', 'Bias/Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE,Bias/Theoretical SE
0,13,4,100,0.384615,0.389075,0.00446,0.011595,0.002107,2.116948
1,13,4,100,0.769231,0.772975,0.003744,0.004867,0.001824,2.052318
22,13,4,2334242802,0.384615,0.393225,0.00861,0.022385,0.000584,14.735611
23,13,4,2334242802,0.769231,0.775075,0.005844,0.007597,0.000506,11.549977
2,13,10,100,0.384615,0.3862,0.001585,0.00412,0.000769,2.06
3,13,10,100,0.769231,0.7706,0.001369,0.00178,0.000666,2.055367
24,13,10,2334242802,0.384615,0.38658,0.001965,0.005108,0.000213,9.208578
25,13,10,2334242802,0.769231,0.77095,0.001719,0.002235,0.000185,9.305047
4,30,4,100,0.166667,0.171125,0.004458,0.02675,0.001764,2.52686
5,30,4,100,0.333333,0.3413,0.007967,0.0239,0.002232,3.569647


# Mersenne Twister

In [10]:
# Boilerplate stuff

reps = int(10**4)
n = [13, 30, 90]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
bias = []
relBias = []
seed = []
theoreticalSE = []

In [11]:
# MT, seed = 100
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(100)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(mt_counts), pp)
            
            x = makeAdversarialPopulation(nn, most_freq_p)
            #x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [100]
            
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [12]:
# MT, seed = 2334242802
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(2334242802)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(mt_counts), pp)
            
            x = makeAdversarialPopulation(nn, most_freq_p)
            #x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [2334242802]
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [13]:
# MT, seed = 4294967295
for nn in n:
    for kk in k:
        for pp in p:
            if pp >= nn or kk >= nn:
                continue
            mt = np.random
            mt.seed(4294967295)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            most_freq_p = findFreqItems(getItemCounts(mt_counts), pp)
            
            x = makeAdversarialPopulation(nn, most_freq_p)
            #x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            relBias = relBias + [estimBias/truePopMean]
            seed = seed + [4294967295]
                        
            theoreticalSE = theoreticalSE + \
                [math.sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1)))]

In [14]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Relative bias' : relBias,
     'seed' : seed,
     'Theoretical SE' : theoreticalSE,
     'Bias/Theoretical SE' : np.array(bias)/np.array(theoreticalSE)
    }
resTable = pd.DataFrame(d)
cols = ["Pop size", "Sample size", "seed", "Pop Mean", "Sample Mean", "Bias", "Relative bias", \
        'Theoretical SE', 'Bias/Theoretical SE']
resTable[cols].sort_values(['Pop size', 'Sample size'])

Unnamed: 0,Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE,Bias/Theoretical SE
0,13,4,100,0.384615,0.390000,0.005385,0.014000,0.002107,2.556039
1,13,4,100,0.769231,0.773475,0.004244,0.005517,0.001824,2.326382
22,13,4,2334242802,0.384615,0.390100,0.005485,0.014260,0.002107,2.603508
23,13,4,2334242802,0.769231,0.773200,0.003969,0.005160,0.001824,2.175647
44,13,4,4294967295,0.384615,0.389350,0.004735,0.012310,0.002107,2.247488
45,13,4,4294967295,0.769231,0.772850,0.003619,0.004705,0.001824,1.983802
2,13,10,100,0.384615,0.387150,0.002535,0.006590,0.000769,3.295000
3,13,10,100,0.769231,0.771410,0.002179,0.002833,0.000666,3.271267
24,13,10,2334242802,0.384615,0.386650,0.002035,0.005290,0.000769,2.645000
25,13,10,2334242802,0.769231,0.770760,0.001529,0.001988,0.000666,2.295545


If all samples were *actually* equally likely, then the sum of $1$s drawn from the population is distributed as hypergeometric with $p = n\times \text{popMean}$ "good" items, $n-p = n*(1-\text{popMean})$ "bad" items, and $k$ draws.
This random variable $X$ is distributed with

$$E(X) = k\text{popMean} = \frac{kp}{n}$$ and 
$$var(X) = \frac{k\text{popMean}(1-\text{popMean})(n-k)}{n-1} = \frac{kp(1-p)(n-k)}{n^2(n-1)}.$$ 

Instead of the sum, we look at the mean of the $k$ draws, $\bar{X}$. It has a scaled hypergeometric distribution, with

$$E(\bar{X}) = \text{popMean} = \frac{p}{n}$$ and 
$$var(\bar{X}) = \frac{\text{popMean}(1-\text{popMean})(n-k)}{k(n-1)} = \frac{p(1-p)(n-k)}{kn^2(n-1)}.$$

Finally, we sample from this distribution $B$ times and take the sample average -- this is the column Sample Mean in the table. This is an average of IID random variables, so it has mean $\text{popMean} = \frac{p}{n}$ and variance

$$\frac{\text{popMean}(1-\text{popMean})(n-k)}{Bk(n-1)} = \frac{p(1-p)(n-k)}{Bkn^2(n-1)}.$$