# Sequential probability ratio test, applied to PRNGs

Sequential testing procedure from Weiss (1962)

The method tests the null hypothesis that a multinomial random variable has equal category probabilities $1/k$ against the alternative that the $s$ most common categories together have probability greater than $s/k$.

The function `sequential_multinomial_test` tests the null with type 1 error at most $\alpha$ and power at least $1 - \beta$.

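In this particular use case, we want to test whether samples of size $k$ drawn from a population of size $n$ using a particular PRNG and sampling algorithm actually occur with equal frequency.  Here the categories are the ${n \choose k}$ possible samples (so $k$ now denotes the sample size, not the number of categories), and under the null hypothesis each of them should occur with probability ${n \choose k}^{-1}$.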

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import math
import numpy as np
import sys
sys.path.append('../modules')
from sample import PIKK, sample_by_index
from sha256prng import SHA256
from prng import lcgRandom
from scipy.misc import comb

In [2]:
def sequential_multinomial_test(sampling_function, num_categories, alpha, beta, multiplier,
                                s=None, maxsteps=10**5):
    '''
    Conduct Wald's SPRT for multinomial distribution with num_categories categories
    Let p = sum_{s most frequent categories} p_category
    H_0: selection probabilities are all 1/num_categories so p=s/num_categories
    H_1: p = p1 = multiplier * s/num_categories

    Before testing starts, samples are drawn until s distinct categories have
    been seen; they enter the count table with a count of zero and are not
    counted in `steps`. This warm-up does not terminate if sampling_function
    can never produce s distinct categories.

    At each step a new sample is drawn and the "event" is that it falls among
    the s categories with the highest counts so far. The likelihood ratio is
    multiplied by p1/p0 when the event occurs and by (1-p1)/(1-p0) otherwise.

    sampling_function: a function of no arguments returning a random sample as
        a sortable iterable; a category is identified by str(sorted(sample)).
    num_categories: number of categories
    alpha: desired type 1 error rate
    beta: desired type 2 error rate (the desired power is 1 - beta)
    multiplier: value larger than 1. Determines alternative: p1 = multiplier * s/num_categories
    s: tuning parameter, integer between 1 and k. Default is 1% of num_categories
        (rounded up).
    maxsteps: maximum number of test steps (samples drawn after the warm-up).

    Returns a dict with keys
        'decision': 0 if the null was accepted, 1 if it was rejected, None if
                    no boundary was crossed within maxsteps steps
        'lower':    acceptance threshold beta/(1-alpha)
        'LR':       list of likelihood ratios, starting with 1, one entry per step
        'upper':    rejection threshold (1-beta)/alpha
        'steps':    number of test steps taken
        'pvalue':   min(1/LR[-1], 1)

    Raises AssertionError if multiplier <= 1, maxsteps <= 0, s is not an int,
    s < 1, or p0 or p1 is not below 1.
    '''

    assert multiplier > 1
    assert maxsteps > 0

    if s is None:
        s = math.ceil(0.01*num_categories)
    assert isinstance(s, int)
    # s = 0 would give p0 = p1 = 0 and an undefined likelihood ratio p1/p0
    assert s >= 1

    k = num_categories # Rename for ease of use!

    # Set p0 = s/k, p1 = multiplier*(s/k)
    p0 = s/k
    p1 = multiplier*s/k
    assert p1 < 1
    assert p0 < 1

    # Set parameters: stopping thresholds and the two per-step LR factors
    lower = beta/(1-alpha)
    upper = (1-beta)/alpha
    lr_occurs = p1/p0
    lr_doesnotoccur = (1 - p1)/(1 - p0)

    # Initialize counter: draw until s distinct categories are known (count 0)
    sampleCounts = dict()
    while len(sampleCounts) < s:
        sampleCounts.setdefault(str(sorted(sampling_function())), 0)
    steps = 0
    LR = [1]
    decision = None

    # Draw samples
    while lower < LR[-1] < upper and steps < maxsteps:
        Xn = str(sorted(sampling_function()))
        steps += 1

        # Event occurs if Xn is among the top s most frequent values of X1,...,X_n-1
        top_categories = sorted(sampleCounts, key=sampleCounts.get, reverse=True)[:s]
        if Xn in top_categories:
            LR.append(LR[-1] * lr_occurs)
        else:
            LR.append(LR[-1] * lr_doesnotoccur)

        # Run test at step n
        if LR[-1] <= lower:
            # accept the null and stop
            decision = 0
            break
        if LR[-1] >= upper:
            # reject the null and stop
            decision = 1
            break

        # add Xn to sampleCounts and repeat
        sampleCounts[Xn] = sampleCounts.get(Xn, 0) + 1

    return {'decision' : decision,
            'lower' : lower,
            'LR' : LR,
            'upper' : upper,
            'steps' : steps,
            'pvalue' : min(1/LR[-1], 1)
            }

In [3]:
# Settings used by the test cells: population and sample size,
# error rates alpha and beta, multiplier > 1, and the tuning parameter s
n, k = 13, 4           # population size, sample size
alpha = beta = 0.05    # alpha and beta arguments of the sequential test
multiplier = 1.2       # must be larger than 1
s = 10                 # number of most frequent categories tracked by the test

# RANDU

In [4]:
# lcgRandom with seed 100 and otherwise default parameters, sampled with PIKK
prng = lcgRandom(100) # from random.org Timestamp: 2017-01-14 22:56:40 UTC


def sampling_func():
    '''Draw one sample of size k out of n with PIKK, using the current prng.'''
    return PIKK(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 24957, 1)

In [5]:
# lcgRandom with seed 100 and otherwise default parameters, sampled with sample_by_index
prng = lcgRandom(100) # from random.org Timestamp: 2017-01-14 22:56:40 UTC


def sampling_func():
    '''Draw one sample of size k out of n with sample_by_index, using the current prng.'''
    return sample_by_index(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(1, 32652, 0.051428137921121123)

# Super-Duper LCG

In [6]:
# Parameters for the Super Duper LCG, passed to lcgRandom as A, B and M
A_SD, B_SD, M_SD = 0, 69069, 2**32
sdlcg = lcgRandom(seed=547691802, A=A_SD, B=B_SD, M=M_SD)


def sampling_func():
    '''Draw one sample of size k out of n with PIKK, using sdlcg.'''
    return PIKK(n, k, sdlcg)


# One draw before the test; the result is discarded
# (presumably this also advances sdlcg -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 2862, 1)

In [7]:
# Parameters for the Super Duper LCG, passed to lcgRandom as A, B and M
A_SD, B_SD, M_SD = 0, 69069, 2**32
sdlcg = lcgRandom(seed=547691802, A=A_SD, B=B_SD, M=M_SD)


def sampling_func():
    '''Draw one sample of size k out of n with sample_by_index, using sdlcg.'''
    return sample_by_index(n, k, sdlcg)


# One draw before the test; the result is discarded
# (presumably this also advances sdlcg -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 20329, 1)

# Mersenne Twister

In [8]:
# Use the numpy.random module itself as the generator, seeded for reproducibility
prng = np.random
prng.seed(547691802)


def sampling_func():
    '''Draw one sample of size k out of n with PIKK, using the current prng.'''
    return PIKK(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(1, 6771, 0.050605867869732174)

In [9]:
# Use the numpy.random module itself as the generator, seeded for reproducibility
prng = np.random
prng.seed(547691802)


def sampling_func():
    '''Draw one sample of size k out of n with sample_by_index, using the current prng.'''
    return sample_by_index(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 12182, 1)

# SHA256

In [10]:
# SHA256-based generator constructed with 547691802, sampled with PIKK
prng = SHA256(547691802)


def sampling_func():
    '''Draw one sample of size k out of n with PIKK, using the current prng.'''
    return PIKK(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 12117, 1)

In [11]:
# SHA256-based generator constructed with 547691802, sampled with sample_by_index
prng = SHA256(547691802)


def sampling_func():
    '''Draw one sample of size k out of n with sample_by_index, using the current prng.'''
    return sample_by_index(n, k, prng)


# One draw before the test; the result is discarded
# (presumably this also advances prng -- confirm in the sample module)
sampling_func()

res = sequential_multinomial_test(
    sampling_func,
    num_categories=comb(n, k),
    alpha=alpha,
    beta=beta,
    multiplier=multiplier,
    s=s,
)
# Displayed as the cell result: (decision, steps, pvalue)
res['decision'], res['steps'], res['pvalue']

(0, 3188, 1)