In [1]:
from __future__ import division
import numpy as np
from prng import lcgRandom, MT19937
from sample import PIKK
from scipy.misc import comb, factorial
import pandas as pd
import math

In [2]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    uniqueSamples = dict()

    for i in range(reps): # use range in python 3, xrange in python 2
        sam = frozenset(PIKK(n, k, randomObject))
        if sam in uniqueSamples:
            uniqueSamples[sam] += 1
        else:
            uniqueSamples[sam] = 1
    return uniqueSamples


def getPopMean(x):
    return(np.mean(x))


def getSampleMean(x, uniqueSamples):
    m = 0
    totCnt = 0
    for sam, cnt in uniqueSamples.items():
        m += np.mean([x[i] for i in sam])*cnt
        totCnt += cnt
    sampleMean = m/totCnt
    return(sampleMean)

    
def makePopulation(n, p):
    '''
    Create a population of 0s and 1s
    n = pop size
    p = number of 1s in the population
    '''
    x = [0]*n
    x[:p] = [1]*p
    return(x)

In [3]:
# Boilerplate stuff

reps = int(10**5)
n = [13, 30]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
se = []
bias = []
stBias = []

# RANDU

In [4]:
for nn in n:
    for kk in k:
        for pp in p:
            if p >= n:
                pass
            lcg = lcgRandom(seed=100) # set seed of RANDU to 100
            randu_counts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)

            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, randu_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['RANDU']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            theoreticalSE = math.sqrt(truePopMean * (1-truePopMean)/kk)
            se = se + [theoreticalSE]
            stBias = stBias + [estimBias/theoreticalSE]

In [5]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Standardized bias' : stBias
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols = [cols[3]] + [cols[5]] + [cols[1]] + [cols[2]] + [cols[4]] + [cols[0]] + [cols[-1]]
resTable[cols].sort_values(['Sample size', 'Pop size'])

Unnamed: 0,Pop size,Sample size,PRNG,Pop Mean,Sample Mean,Bias,Standardized bias
0,13,4,RANDU,0.384615,0.383773,-0.000843,-0.003465
1,13,4,RANDU,0.769231,0.768532,-0.000698,-0.003315
2,13,4,RANDU,1.0,1.0,0.0,
9,30,4,RANDU,0.166667,0.167148,0.000481,0.00258
10,30,4,RANDU,0.333333,0.333883,0.000549,0.00233
11,30,4,RANDU,0.666667,0.666397,-0.000269,-0.001142
3,13,10,RANDU,0.384615,0.383891,-0.000724,-0.004708
4,13,10,RANDU,0.769231,0.768832,-0.000399,-0.002993
5,13,10,RANDU,1.0,1.0,0.0,
12,30,10,RANDU,0.166667,0.166931,0.000264,0.002243


# Super Duper LCG

In [6]:
# Parameters for the Super Duper LCG
A_SD = 0
B_SD = 69069
M_SD = 2**32

# Boilerplate stuff

reps = int(10**3)
n = [13, 30]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
se = []
bias = []
stBias = []

In [7]:
for nn in n:
    for kk in k:
        for pp in p:
            if p >= n:
                pass
            sdlcg = lcgRandom(seed=100, A=A_SD, B=B_SD, M=M_SD)
            sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, sdlcg_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['Super Duper']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            theoreticalSE = math.sqrt(truePopMean * (1-truePopMean)/kk)
            se = se + [theoreticalSE]
            stBias = stBias + [estimBias/theoreticalSE]

In [8]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Standardized bias' : stBias
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols = [cols[3]] + [cols[5]] + [cols[1]] + [cols[2]] + [cols[4]] + [cols[0]] + [cols[-1]]
resTable[cols].sort_values(['Sample size', 'Pop size'])

Unnamed: 0,Pop size,Sample size,PRNG,Pop Mean,Sample Mean,Bias,Standardized bias
0,13,4,Super Duper,0.384615,0.379,-0.005615,-0.023085
1,13,4,Super Duper,0.769231,0.76375,-0.005481,-0.026017
2,13,4,Super Duper,1.0,1.0,0.0,
9,30,4,Super Duper,0.166667,0.16275,-0.003917,-0.021019
10,30,4,Super Duper,0.333333,0.323,-0.010333,-0.043841
11,30,4,Super Duper,0.666667,0.66475,-0.001917,-0.008132
3,13,10,Super Duper,0.384615,0.383,-0.001615,-0.0105
4,13,10,Super Duper,0.769231,0.7703,0.001069,0.008025
5,13,10,Super Duper,1.0,1.0,0.0,
12,30,10,Super Duper,0.166667,0.1678,0.001133,0.009617


# Mersenne Twister

In [9]:
# Boilerplate stuff

reps = int(10**3)
n = [13, 30]
k = [4, 10, 20]
p = [5, 10, 20]

popMean = []
sampleMean = []
nvalues = []
kvalues = []
prng = []
se = []
bias = []
stBias = []

In [10]:
for nn in n:
    for kk in k:
        for pp in p:
            if p >= n:
                pass
            mt = MT19937(seed=100)
            mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
            
            x = makePopulation(nn, pp)
            truePopMean = getPopMean(x)
            popMean = popMean + [truePopMean]
            sampleMean = sampleMean + [getSampleMean(x, mt_counts)]
            nvalues = nvalues + [nn]
            kvalues = kvalues + [kk]
            prng = prng + ['MT']
            
            estimBias = sampleMean[-1] - truePopMean
            bias = bias + [estimBias]
            theoreticalSE = math.sqrt(truePopMean * (1-truePopMean)/kk)
            se = se + [theoreticalSE]
            stBias = stBias + [estimBias/theoreticalSE]

In [11]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Pop Mean' : popMean,
     'Sample Mean' : sampleMean,
     'Bias' : bias,
     'Standardized bias' : stBias
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols = [cols[3]] + [cols[5]] + [cols[1]] + [cols[2]] + [cols[4]] + [cols[0]] + [cols[-1]]
resTable[cols].sort_values(['Sample size', 'Pop size'])

Unnamed: 0,Pop size,Sample size,PRNG,Pop Mean,Sample Mean,Bias,Standardized bias
0,13,4,MT,0.384615,0.38775,0.003135,0.012886
1,13,4,MT,0.769231,0.77125,0.002019,0.009585
2,13,4,MT,1.0,1.0,0.0,
9,30,4,MT,0.166667,0.17125,0.004583,0.024597
10,30,4,MT,0.333333,0.33325,-8.3e-05,-0.000354
11,30,4,MT,0.666667,0.66225,-0.004417,-0.018738
3,13,10,MT,0.384615,0.3844,-0.000215,-0.0014
4,13,10,MT,0.769231,0.7682,-0.001031,-0.007736
5,13,10,MT,1.0,1.0,0.0,
12,30,10,MT,0.166667,0.1691,0.002433,0.020648
