In [1]:
import numpy as np
import pymc3 as pm
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import aesara.tensor as at
from aesara.tensor.random.op import RandomVariable

from scipy import stats as st

import aesara

%config InlineBackend.figure_format = "retina"
%matplotlib inline

You are running the v4 development version of PyMC3 which currently still lacks key features. You probably want to use the stable v3 instead which you can either install via conda or find on the v3 GitHub branch: https://github.com/pymc-devs/pymc3/tree/v3


In [2]:
K = 10
N = 1000
N_dp = K*25

# data-generating parameters
M = 5
mu = 2.
sigma = 3.

rng = np.random.RandomState(seed=1234)

In [3]:
def generate_weights(M, rng, size):
    
    N, K = size
    betas = rng.beta(1., M, size=[N, K-1])
    
    sticks = np.hstack(
        [
            np.ones(shape=[N, 1]),
            np.cumprod(1 - betas[:, :-1], axis=1),
        ]
    )

    product = betas * sticks
    
    last_column = 1 - product.sum(axis=1)[..., np.newaxis]
    
    return np.hstack((product, last_column))


def stick_glueing(sticks):
    
    N = sticks.shape[0]
    
    """
    sticks = [w1, w2, ..., wK] such that:
    • sticks.sum() = 1
    • wK = 1 - w1 - w2 - ... - wK-1
    
    denominator = [1, -w1, -w2, ..., wK]
    • length is K + 1
    • after cumsum, denominator = [1, 1 - w1, 1 - w1 - w2, ..., 1 - w1 - w2 - ... - wK] (length K + 1)
    
    output is [w1/1, w2/(1 - w1), w3/(1 - w1 - w2), ..., wK/(1 - w1 - ... - wK-1)]
    • wK/(1 - w1 - ... - wK-1) = wK/wK = 1 by construction of wK (i.e. `last_column` in `generate_weights`)
    • denominator[:, :-1] is necessary because the cumsum runs from 1 to j for the (j+1)th term
    • output[:, :-1] (shape = [N, K-1]) is necessary to remove the column of 1's at the end
    """
    
    if len(sticks.shape) == 1:
        # testing for a non-matrix weight vector
        denominator = np.cumsum(np.concatenate(([1], sticks)))
        return (sticks/denominator[:-1])[:-1]
    
    denominator = np.cumsum(
        np.hstack(
            [
                np.ones(shape=[N, 1]),
                - sticks,
            ]
        ),
        axis=1,
    )
    
    output = (sticks/(denominator[:, :-1]))
    
    return output[:, :-1]

Everything up to this point is identical to `test-multiple-dp-samples.ipynb`

In [4]:
def create_dp_samples(M, rng, size):
    
    """
    Just looking at the first weight vector with weights[0]
    • Output temporarily does not include atoms
    """
    
    N, K = size
    
    atoms = rng.normal(loc=mu, scale=sigma, size=size)
    weights = generate_weights(M, rng, size)
    
    assert all([w_sum == 1 for w_sum in weights.sum(axis=1)])
    assert np.all(weights >= 0)
    assert np.all(weights <= 1)

    return rng.multinomial(n=N_dp, pvals=weights[0])

In [5]:
single_dp_sample = create_dp_samples(M, rng, size=[N, K])

In [6]:
single_dp_sample

array([49,  7, 72, 34, 10, 29,  4,  0,  0, 45])

In [7]:
empirical_frequency = np.sort(single_dp_sample)[::-1]/N_dp
stick_glueing(empirical_frequency)

array([0.288     , 0.15217391, 0.1212938 , 0.08173077, 0.06444444,
       0.02087683, 0.01431493, 0.00806452, 0.        ])