In [1]:
import pandas as pd
import numpy as np

In [85]:
data = pd.read_csv('./data/clean/move-lens-100k-all.csv')
data.head()

Unnamed: 0,user_id,item_id,timestamp,rating,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,...,genre_Western,genre_unknown,release_decade_1920.0,release_decade_1930.0,release_decade_1940.0,release_decade_1950.0,release_decade_1960.0,release_decade_1970.0,release_decade_1980.0,release_decade_1990.0
0,259,286,874724727,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,259,286,874724727,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,259,286,874724727,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,259,185,874724781,4,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,259,185,874724781,4,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## MV-CTR

The joint distribution is given by:

$$
P(f, z, \Phi, \theta | \alpha, \beta)
    = \prod_{t=1}^T P(\theta | \alpha) P(z_{t} | \theta) \prod_{x=1}^V  P(\Phi^{(l)} | \beta^{(l)}) P(f_{t}^{(l)} | z_t, \Phi^{(l)}) \\
    = \prod_{k=1}^K \theta_k^{\sum_{t=1}^T z_{t} + \alpha_k - 1} \prod_{x=1}^V  (\Phi_{xk}^{(l)})^{\sum_{t=1}^T \sum_{x=1}^V f_{tx}^{(l)}z_t + \beta_k^{(l)} - 1} 
$$

Marginalizing over $\theta$ and $\Phi$ gives:

$$
    P(f, z | \alpha, \beta) = \int P(f, z, \Phi_t, \theta | \alpha, \beta) d\theta d\Phi \\
        = \frac{\prod_{k=1}^K \Gamma(n_k + \alpha_k)}{\Gamma(\sum_{k=1}^K n_k + \alpha_0)} 
            \frac{\prod_{x=1}^V \prod_{k=1}^K \Gamma(n_{xk} + \beta_k)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{xk} + \beta_0)}
$$

Where we defined:

$$
    n_k \equiv \sum_{t=1}^T r_t z_{tk} \\
    n_{xk} \equiv \sum_{t=1}^T r_t f_{tx} z_{tk}
$$

where $r_t$ is the rating at $t$. Then, since $\Gamma(n+1) = n \Gamma(n)$:

$$
P(z_t = k' | z_{-t}, f, \alpha, \beta) \propto P(f, z | \alpha, \beta) \\
    \propto \prod_{k \neq k'} \Gamma(n_{k, -t} + \alpha_k) 
        \prod_{x=1}^V \frac{\Gamma(n_{xk, -t} + \beta_k)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{xk, -t} + \beta_0)}
        \Gamma(n_{k', -t} + \alpha_{k'} + 1)
        \prod_{x = 1}^V \frac{\Gamma(n_{xk', -t} + \beta_k' + 1)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{xk, -t} + \beta_0 + 1)} \\
    = \prod_{k = 1}^K \Gamma(n_{k, -t} + \alpha_k) 
        \prod_{x=1}^V \frac{\Gamma(n_{xk, -t} + \beta_k)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{xk, -t} + \beta_0)}
        (n_{k', -t} + \alpha_{k'}) \prod_{x =1}^V \frac{n_{xk', -t} + \beta_{k'}}{\sum_{x=1}^V \sum_{k=1}^K n_{xk, -t} + \beta_0} \\
    \propto (n_{k', -t} + \alpha_{k'}) \prod_{x = 1}^V  \frac{n_{xk', -t} + \beta_k}{\sum_{x'=1}^V \sum_{k=1}^K n_{xk', -t} + \beta_0} \\
$$

where,
$$
    n_{k, -t} \equiv \sum_{t' \neq t} r_{t'} z_{tk} \\
    n_{xk, -t} \equiv \sum_{t' \neq t} r_t f_{tx} z_{tk}
$$

Notice that the "point" removed from the products is that associated with a single rating point and feature-value. But because points stack up, we can simply pull out each aggregated rating simultaneously with the same result.

In [77]:
# training data
X = data.drop('rating', axis=1).to_numpy()
r = data.rating.astype(int).to_numpy()

V = data.shape[1] - 1
K = 3
T = len(data)

a, b = 1, 1
Phi = np.zeros((V, K))
theta = np.zeros(K)

# init topic assignment vector
topic_assignment = np.random.choice(K, T)

# init topic assignment counts
_, n = np.unique(topic_assignment, return_counts=True) 

def m(x, k):
    return (r * (topic_assignment == k).astype(int) * X[:, x]).sum()

m(0, 0)

z = 0
for t in range(T):
    # decrement
    n[z] -= 1 
    # sample from markov chain
    z = np.random.multinomial(
        1, pvals=(n + a) * np.array([[m(x, k) for k in range(K)] for x in range(V)]).prod()
    ).argmax()
    # update assignment
    topic_assignment[t] = z
    # increment
    n[z] += 1 


# # suppose we have a dataset with 30 datapoints
# # with the following cluster assignment counts
# n = np.array([10, 20, 70])

# r = []
# z_candidate = 0
# for _ in range(1000000):
#     n[z_candidate] -= 1 # decrement
#     # a-K = 1
#     z_candidate = np.random.multinomial(1, pvals=(n+1)/(n+1).sum()).argmax()
#     n[z_candidate] += 1 # increment

#     r.append(z_candidate)


# # print distribution
# t = 100000
# (pd.Series(r)[:t].value_counts() / len(r[:t])).sort_index()

23919

In [84]:
unique, counts = np.unique(topic_assignment, return_counts=True) 

print(unique)
print(counts)

k = 0
x = 0
r * (topic_assignment == k).astype(int) @ X[:, :]

[0 1 2]
[24947 24902 24718]


array([23919, 71863,  7282,  2488,   565,  8356,  8792,  1835,  3212,
        1749,   316,  1168,  4720,  2181,   770, 10350,  8289,  1235,
         959,  2494, 20907,  3596,  4518,    75,  7593, 37922, 25523,
       14531,  7835,  2098,   205, 14530,  7272,  1101,  2489,  8153,
        3830,     0, 16184,   416,   772,  1712,  1773,  2466,  9501,
        8601, 10790,  5857,   335,     0,     0,  1380,  1505,     0,
        3418, 11823, 19073, 58583])

In [5]:
from scipy.stats import multinomial

multinomial(8, np.ones(3)/3).rvs()

array([[2, 2, 4]])

## MV-CTR 2

The joint distribution is given by:

$$
P(f, z, \Phi, \theta | \alpha, \beta)
    = \prod_{k=1}^K
    P(\theta_k | \alpha) \prod_{t=1}^T P(z_{kt} | \theta_k) \prod_{x=1}^V  P(\Phi_{kx} | \beta_x) P(f_{tx} | z_{kt}, \Phi_{kx}) \\
    = \prod_{k=1}^K \theta_k^{\sum_{t=1}^T z_{kt} + \alpha_k - 1} \prod_{x=1}^V  (\Phi_{kx})^{\sum_{t=1}^T f_{tx}z_{kt} + \beta_x - 1} 
$$

Marginalizing over $\theta$ and $\Phi$ gives:

$$
    P(f, z | \alpha, \beta) = \int P(f, z, \Phi_t, \theta | \alpha, \beta) d\theta d\Phi \\
        = \frac{\prod_{k=1}^K \Gamma(n_k + \alpha_k)}{\Gamma(\sum_{k=1}^K n_k + \alpha_0)} 
            \frac{\prod_{x=1}^V \Gamma(n_{kx} + \beta_k)}{\Gamma(\sum_{x=1}^V n_{kx} + \beta_0)}
$$

Where we defined:

$$
    n_k \equiv \sum_{t=1}^T r_t z_{tk} \\
    n_{kx} \equiv \sum_{t=1}^T r_t f_{tx} z_{tk}
$$

where $r_t$ is the rating at $t$. Then, since $\Gamma(n+1) = n \Gamma(n)$:

$$
P(z_t = k' | z_{-t}, f, \alpha, \beta) \propto P(f, z | \alpha, \beta) \\
    \propto \prod_{k \neq k'} \Gamma(n_{k, -t} + \alpha_k) 
        \prod_{x=1}^V \frac{\Gamma(n_{kx, -t} + \beta_k)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{kx, -t} + \beta_0)}
        \Gamma(n_{k', -t} + \alpha_{k'} + 1)
        \prod_{x = 1}^V \frac{\Gamma(n_{k'x, -t} + \beta_k' + 1)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{kx, -t} + \beta_0 + 1)} \\
    = \prod_{k = 1}^K \Gamma(n_{k, -t} + \alpha_k) 
        \prod_{x=1}^V \frac{\Gamma(n_{kx, -t} + \beta_k)}{\Gamma(\sum_{x=1}^V \sum_{k=1}^K n_{kx, -t} + \beta_0)}
        (n_{k', -t} + \alpha_{k'}) \prod_{x =1}^V \frac{n_{xk', -t} + \beta_{k'}}{\sum_{x=1}^V \sum_{k=1}^K n_{kx, -t} + \beta_0} \\
    \propto (n_{k', -t} + \alpha_{k'}) \prod_{x = 1}^V  \frac{n_{k'x, -t} + \beta_k}{\sum_{x'=1}^V \sum_{k=1}^K n_{k'x, -t} + \beta_0} \\
$$

where,
$$
    n_{k, -t} \equiv \sum_{t' \neq t} r_{t'} z_{tk} \\
    n_{xk, -t} \equiv \sum_{t' \neq t} r_t f_{tx} z_{tk}
$$

Notice that the "point" removed from the products is that associated with a single rating point and feature-value. But because points stack up, we can simply pull out each aggregated rating simultaneously with the same result.