In [2]:
from __future__ import division
import numpy as np
from prng import lcgRandom
from sample import PIKK
from scipy.misc import comb, factorial
from scipy.stats import chisquare
import pandas as pd

In [3]:
def getEmpiricalDistr(randomObject, n, k, reps=10**7):
    uniqueSamples = dict()

    for i in range(reps): # use range in python 3, xrange in python 2
        sam = frozenset(PIKK(n, k, randomObject))
        if sam in uniqueSamples:
            uniqueSamples[sam] += 1
        else:
            uniqueSamples[sam] = 1
    return uniqueSamples
    

def getItemCounts(uniqueSamples):
    itemCounts = dict()
    for u,v in uniqueSamples.items():
        for i in u:
            if i in itemCounts:
                itemCounts[i] += v
            else:
                itemCounts[i] = v
    return itemCounts


def printItemFreq(itemCounts, verbose=False):
    for i in itemCounts.keys():
        itemCounts[i] /= reps
        if verbose:
            print(i, itemCounts[i])
    return itemCounts


def printMaxProbRatio(itemCounts, verbose=False):
    freq = list(itemCounts.values())
    pr = np.amax(freq)/np.amin(freq)
    if verbose:
        print("Max ratio of selection probs: " + str(pr))
    return pr


def conductChiSquareTest(itemCounts):
    freq = list(itemCounts.values())
    return(chisquare(freq))

In [4]:
# Boilerplate stuff

reps = int(10**7)
n = [13, 30, 90]
k = [4, 10, 20]

maxProb = []
minProb = []
meanProb = []
maxProbRatio = []
nvalues = []
kvalues = []
prng = []
seed = []

chisqStatistic = []
chisqDF = []
chisqPvalue = []

# RANDU

In [14]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        lcg = lcgRandom(seed=100) # set seed of RANDU to 100

        randu_counts = getEmpiricalDistr(lcg, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(randu_counts)))
        itemFreq = printItemFreq(getItemCounts(randu_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['RANDU']
        seed = seed + [100]
        
        chisqDF = chisqDF + [len(randu_counts)]
        chisqTestResults = conductChiSquareTest(randu_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

# Super Duper LCG

In [6]:
# Parameters for the Super Duper LCG
A_SD = 0
B_SD = 69069
M_SD = 2**32

In [7]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        sdlcg = lcgRandom(seed=100, A=A_SD, B=B_SD, M=M_SD)

        sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(sdlcg_counts)))
        itemFreq = printItemFreq(getItemCounts(sdlcg_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['Super Duper']
        seed = seed + [100]
        
        chisqDF = chisqDF + [len(sdlcg_counts)]
        chisqTestResults = conductChiSquareTest(sdlcg_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

In [8]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        sdlcg = lcgRandom(seed=2334242802, A=A_SD, B=B_SD, M=M_SD)

        sdlcg_counts = getEmpiricalDistr(sdlcg, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(sdlcg_counts)))
        itemFreq = printItemFreq(getItemCounts(sdlcg_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['Super Duper']
        seed = seed + [2334242802]
                
        chisqDF = chisqDF + [len(sdlcg_counts)]
        chisqTestResults = conductChiSquareTest(sdlcg_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

# Mersenne Twister

In [9]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        mt = np.random
        mt.seed(100)

        mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(mt_counts)))
        itemFreq = printItemFreq(getItemCounts(mt_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['MT']
        seed = seed + [100]
                
        chisqDF = chisqDF + [len(mt_counts)]
        chisqTestResults = conductChiSquareTest(mt_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

In [10]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        mt = np.random
        mt.seed(2334242802)

        mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(mt_counts)))
        itemFreq = printItemFreq(getItemCounts(mt_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['MT']
        seed = seed + [2334242802]
                        
        chisqDF = chisqDF + [len(mt_counts)]
        chisqTestResults = conductChiSquareTest(mt_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

In [11]:
for nn in n:
    for kk in k:
        if kk >= nn:
            continue
        
        mt = np.random
        # mt.seed(38245944668371091219) # 20 digits is too long - max seed is 4294967295
        mt.seed(4294967295)

        mt_counts = getEmpiricalDistr(mt, n=nn, k=kk, reps=reps)
        #print("Number of missing samples: " + str(comb(n, k) - len(mt_counts)))
        itemFreq = printItemFreq(getItemCounts(mt_counts))

        maxProb = maxProb + [np.amax(list(itemFreq.values()))]
        minProb = minProb + [np.amin(list(itemFreq.values()))]
        meanProb = meanProb + [np.mean(list(itemFreq.values()))]
        maxProbRatio = maxProbRatio + [printMaxProbRatio(itemFreq)]
        nvalues = nvalues + [nn]
        kvalues = kvalues + [kk]
        prng = prng + ['MT']
        seed = seed + [4294967295]
                        
        chisqDF = chisqDF + [len(mt_counts)]
        chisqTestResults = conductChiSquareTest(mt_counts)
        chisqStatistic = chisqStatistic + [chisqTestResults[0]]
        chisqPvalue = chisqPvalue + [chisqTestResults[1]]

# First-order selection probabilities, summary statistics

In [15]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Min Prob' : minProb,
     'Mean Prob' : meanProb,
     'Max Prob' : maxProb,
     'Max Selection Prob Ratio' : maxProbRatio,
     'seed' : seed
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols.reverse()
cols = [cols[2]] + [cols[1]] + [cols[0]] + cols[3:6] + [cols[7]] + [cols[6]]
resTable[cols].sort_values(['Pop size', 'Sample size', 'seed'])

Unnamed: 0,Pop size,Sample size,seed,PRNG,Min Prob,Mean Prob,Max Prob,Max Selection Prob Ratio
0,13,4,100,Super Duper,0.307363,0.307692,0.308004,1.002086
16,13,4,100,MT,0.307419,0.307692,0.30786,1.001434
40,13,4,100,RANDU,0.307477,0.307692,0.308064,1.001908
8,13,4,2334242802,Super Duper,0.307391,0.307692,0.307964,1.001861
24,13,4,2334242802,MT,0.30752,0.307692,0.307906,1.001257
32,13,4,4294967295,MT,0.307481,0.307692,0.307984,1.001639
1,13,10,100,Super Duper,0.768996,0.769231,0.769404,1.000531
17,13,10,100,MT,0.768851,0.769231,0.769445,1.000772
41,13,10,100,RANDU,0.76909,0.769231,0.769504,1.000539
9,13,10,2334242802,Super Duper,0.769103,0.769231,0.769607,1.000655


# Chi-squared test for uniformity of first-order selection probabilities

In [16]:
d = {'Sample size' : kvalues,
     'Pop size' : nvalues,
     'PRNG' : prng,
     'Max Selection Prob Ratio' : maxProbRatio,
     'seed' : seed,
     'Chi-squared' : chisqStatistic,
     'Df' : chisqDF,
     'P-value' : chisqPvalue
    }
resTable = pd.DataFrame(d)
cols = resTable.columns.tolist()
cols.reverse()
cols = [cols[2]] + [cols[1]] + [cols[0]] + [cols[3]] + [cols[7]] + [cols[6]] + cols[4:5]
resTable[cols].sort_values(['Pop size', 'Sample size', 'seed'])

Unnamed: 0,Pop size,Sample size,seed,PRNG,Chi-squared,Df,P-value
0,13,4,100,Super Duper,666.8423,715,0.8959426
16,13,4,100,MT,699.9703,715,0.6391
40,13,4,100,RANDU,959.7907,715,1.856374e-09
8,13,4,2334242802,Super Duper,698.4942,715,0.653822
24,13,4,2334242802,MT,685.3385,715,0.7737201
32,13,4,4294967295,MT,692.14,715,0.7145748
1,13,10,100,Super Duper,258.1213,286,0.8718435
17,13,10,100,MT,274.0896,286,0.668194
41,13,10,100,RANDU,561.8441,286,2.6248279999999997e-20
9,13,10,2334242802,Super Duper,260.4413,286,0.8488809
