# Sequential probability ratio test, applied to PRNGs

We modify the sequential testing procedure from Weiss (1962) to be a more powerful conditional test.

The method tests the null hypothesis that a multinomial random variable has equal category probabilities $1/k$. We condition on a sample being among the most frequent $s$ or least frequent $s$ categories observed so far. Then, under the null, it's equally likely for the sample to be in the top or bottom $s$, i.e. the conditional probability of landing in the top $s$ is $\frac{1}{2}$. The alternative hypothesis is that this conditional probability is $\frac{m}{2}$ for some $1 < m \le 2$.

The function `sequential_multinomial_conditional_test` tests the null with type 1 error at most $\alpha$ and power at least $1 - \beta$.

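In this particular use case, we want to test whether samples of size $k$ drawn from a population of size $n$ using a particular PRNG and sampling algorithm actually occur with equal frequency. (Here $k$ denotes the sample size, not the number of categories as above.) Under the null, the ${n \choose k}$ possible samples are the categories of a multinomial distribution, each with probability ${n \choose k}^{-1}$.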

In [1]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import math
import numpy as np
import sys
sys.path.append('../modules')
from sample import PIKK, sample_by_index
from sha256prng import SHA256
from prng import lcgRandom
from scipy.misc import comb

In [2]:
def sequential_multinomial_conditional_test(sampling_function, alpha, beta, multiplier,
                                            s, maxsteps=10**5):
    '''
    Conduct Wald's SPRT for a multinomial distribution, conditional on each draw
    landing in the s most frequent or s least frequent categories seen so far.

    H_0: all categories are equally likely, so a draw that lands in the top or
         bottom s categories is in the top s with probability p0 = 1/2.
    H_1: that conditional probability is p1 = multiplier/2 > 1/2.

    sampling_function: a function of no arguments which generates a random number
        or random sample. Its return value must be sortable; the string form of
        the sorted value is used as the category label.
    alpha: desired type 1 error rate (must be nonzero; it is a divisor of the
        upper threshold).
    beta: desired type 2 error rate, i.e. the desired power is 1 - beta.
    multiplier: value in (1, 2]. Determines the alternative: p1 = multiplier/2.
    s: tuning parameter, the number of top (and of bottom) categories considered.
        Either a single integer >= 1 or an iterable of such integers, in which
        case one test per distinct value is run on the same stream of draws.
        The caller's object is never modified.
    maxsteps: max number of samples before the algorithm terminates.

    Returns a dict with the thresholds ('lower_threshold', 'upper_threshold') and,
    keyed by each value of s: 'decision' (1 = reject the null, 0 = accept the null,
    the string "None" = no decision within maxsteps), 'LR' (the likelihood-ratio
    path, starting at 1), 'steps' (number of draws used, maxsteps if undecided),
    'event_occurs' (draws that landed in the top or bottom s) and 'top_s_occurs'
    (draws that landed in the top s).

    Note: before testing starts, the function keeps drawing until it has seen
    2*max(s) distinct categories, so it does not terminate if the sampler cannot
    produce that many.
    '''

    assert 1 < multiplier <= 2, "multiplier must lie in (1, 2] so that multiplier/2 is a probability"
    assert maxsteps > 0

    # Accept either a single value or an iterable, and work on a private,
    # de-duplicated copy so the caller's list is left untouched.
    try:
        requested = list(s)
    except TypeError:
        requested = [s]
    s_values = []
    for ss in requested:
        if ss not in s_values:
            s_values.append(ss)
    assert s_values and all(ss >= 1 for ss in s_values), "every s must be >= 1"

    # Wald's stopping thresholds for the likelihood ratio
    lower = beta/(1-alpha)
    upper = (1-beta)/alpha

    # Seed the frequency table with zero counts for the first 2*max(s)
    # distinct categories, so that top and bottom groups are defined.
    sampleCounts = dict()
    categories_needed = 2*max(s_values)
    while len(sampleCounts) < categories_needed:
        sampleCounts.setdefault(str(sorted(sampling_function())), 0)

    steps = 0
    event_occurs = {ss: 0 for ss in s_values}
    top_s_occurs = {ss: 0 for ss in s_values}
    LR = {ss: [1] for ss in s_values}
    decision = {ss: "None" for ss in s_values}
    num_steps = {ss: maxsteps for ss in s_values}
    active = list(s_values)     # tests that have not reached a decision yet

    p1 = multiplier / 2.0       # conditional probability of "top s" under H_1

    # Draw samples
    while active and steps < maxsteps:
        Xn = str(sorted(sampling_function()))

        # Rank the categories by their counts *before* Xn is recorded;
        # ties keep the table's existing order because sorted() is stable.
        top_categories = sorted(sampleCounts, key=sampleCounts.get, reverse=True)
        num_categories = len(top_categories)
        rank = top_categories.index(Xn) if Xn in sampleCounts else None

        # add Xn to sampleCounts
        sampleCounts[Xn] = sampleCounts.get(Xn, 0) + 1
        steps += 1

        if rank is None:
            # A never-seen category is in neither the top nor the bottom group.
            continue

        # Iterate over a snapshot: finished tests are removed from `active`
        # inside the loop, which must not disturb the iteration.
        for ss in list(active):
            in_top = rank < ss
            in_bottom = rank >= num_categories - ss
            if not (in_top or in_bottom):
                continue
            event_occurs[ss] += 1

            if in_top:
                top_s_occurs[ss] += 1
                ratio = LR[ss][-1] * multiplier         # p1/p0 = multiplier
            else:
                ratio = LR[ss][-1] * (1 - p1)*2         # (1-p1)/(1-p0)
            LR[ss].append(ratio)

            # Run test at step n
            if ratio <= lower:
                outcome = 0     # accept the null and stop
            elif ratio >= upper:
                outcome = 1     # reject the null and stop
            else:
                continue
            decision[ss] = outcome
            num_steps[ss] = steps
            active.remove(ss)

    return {'decision' : decision,
            'lower_threshold' : lower,
            'LR' : LR,
            'upper_threshold' : upper,
            'steps' : num_steps,
            'event_occurs' : event_occurs,
            'top_s_occurs' : top_s_occurs
            }

In [21]:
# Choose pop and sample size, multiplier > 1, alpha, beta
n = 13  # population size
k = 3  # sample size
alpha = 0.05  # type 1 error rate; it is the divisor in the upper threshold (1 - beta)/alpha
# With beta = 0 the lower threshold beta/(1 - alpha) is 0. The likelihood ratio is only
# ever multiplied by positive factors for this multiplier, so it never reaches 0: each
# run below can reject the null or run out of steps, but cannot accept the null.
beta = 0
multiplier = 1.01  # alternative: conditional top-s probability multiplier/2 = 0.505


# RANDU

In [22]:
# LCG with lcgRandom's default parameters (presumably RANDU, per the section title -- confirm).
prng = lcgRandom(100)  # from random.org Timestamp: 2017-01-14 22:56:40 UTC


def sampling_func():
    """Draw one size-k sample from a population of n using PIKK."""
    return PIKK(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [23]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 49733})

In [24]:
# Same seeded LCG as the PIKK run, re-created so both runs start from the same seed.
prng = lcgRandom(100)  # from random.org Timestamp: 2017-01-14 22:56:40 UTC


def sampling_func():
    """Draw one size-k sample from a population of n using sample_by_index."""
    return sample_by_index(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [25]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 1}, {143: 6310}, {143: 3321})

# Super-Duper LCG

In [26]:
# Parameters for the Super Duper LCG
# NOTE(review): whether A or B is the multiplier (vs. the increment) depends on
# lcgRandom's signature, which is not shown here -- confirm.
A_SD, B_SD, M_SD = 0, 69069, 2**32
sdlcg = lcgRandom(seed=547691802, A=A_SD, B=B_SD, M=M_SD)


def sampling_func():
    """Draw one size-k sample from a population of n using PIKK."""
    return PIKK(n, k, sdlcg)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [27]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 50108})

In [28]:
# Parameters for the Super Duper LCG
# NOTE(review): whether A or B is the multiplier (vs. the increment) depends on
# lcgRandom's signature, which is not shown here -- confirm.
A_SD, B_SD, M_SD = 0, 69069, 2**32
sdlcg = lcgRandom(seed=547691802, A=A_SD, B=B_SD, M=M_SD)


def sampling_func():
    """Draw one size-k sample from a population of n using sample_by_index."""
    return sample_by_index(n, k, sdlcg)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [29]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 50102})

# Mersenne Twister

In [30]:
# Use NumPy's module-level generator and seed it for reproducibility.
prng = np.random
prng.seed(547691802)


def sampling_func():
    """Draw one size-k sample from a population of n using PIKK."""
    return PIKK(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [31]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 49959})

In [32]:
# Use NumPy's module-level generator, re-seeded so this run starts from the same seed.
prng = np.random
prng.seed(547691802)


def sampling_func():
    """Draw one size-k sample from a population of n using sample_by_index."""
    return sample_by_index(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [33]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 50098})

# SHA256

In [34]:
# SHA256-based PRNG from the project's sha256prng module, with a fixed seed.
prng = SHA256(547691802)


def sampling_func():
    """Draw one size-k sample from a population of n using PIKK."""
    return PIKK(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [35]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 49833})

In [36]:
# SHA256-based PRNG, re-created so this run starts from the same seed.
prng = SHA256(547691802)


def sampling_func():
    """Draw one size-k sample from a population of n using sample_by_index."""
    return sample_by_index(n, k, prng)


sampling_func()  # one draw whose result is discarded

res = sequential_multinomial_conditional_test(
    sampling_func,
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=int(comb(n, k) / 2),  # half of the C(n, k) possible samples
)

In [37]:
# Per value of s: decision (1 = reject null, 0 = accept null, 'None' = undecided),
# number of draws in the top or bottom s, and number of those in the top s.
res['decision'], res['event_occurs'], res['top_s_occurs']

({143: 'None'}, {143: 100000}, {143: 50006})