# Latent Dirichlet Allocation 

In [27]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import numpy as np

In [23]:
# Newsgroups to load. Only the first three entries are active; the other
# groups are left commented out so they can be re-enabled one at a time.
# 'comp.os.ms-windows.misc' is deliberately excluded and therefore does not
# appear in the list at all.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
#     "comp.sys.mac.hardware",
#     "comp.windows.x",
#     "misc.forsale",
#     "rec.autos",
#     "rec.motorcycles",
#     "rec.sport.baseball",
#     "rec.sport.hockey",
#     "sci.crypt",
#     "sci.electronics",
#     "sci.med",
#     "sci.space",
#     "soc.religion.christian",
#     "talk.politics.guns",
#     "talk.politics.mideast",
#     "talk.politics.misc",
#     "talk.religion.misc",
]

# No `subset` argument is passed, so the loader's default split is used.
newsgroups = fetch_20newsgroups(categories=categories)

In [24]:
# Unpack the fetched dataset into notebook-level variables.
# `data` wraps the documents in a NumPy array so they can be indexed with arrays.
data = np.array(newsgroups.data)
# Per-document labels (presumably integer indices into `group_names` -- confirm
# against the scikit-learn dataset documentation).
groups = newsgroups.target
group_names = newsgroups.target_names

In [43]:
# LDA is fitted on raw term counts (tf), so use CountVectorizer here rather
# than a tf-idf weighting.
n_words = 100  # upper bound on vocabulary size (passed as max_features)
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words="english",
    max_features=n_words,
    min_df=10,
    max_df=0.95,
)

# Document-term count matrix and the vocabulary term behind each column.
tf = tf_vectorizer.fit_transform(data)
feature_names = tf_vectorizer.get_feature_names_out()

Extracting tf features for LDA...


In [45]:
# Convert the term-count matrix to a dense array so the notebook displays the counts.
tf.toarray()

array([[2, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 2, 0, 0]])

## Two-Level Hierarchical Topic Model 

$$
\begin{aligned}
\theta &\sim \mathrm{Dir}(\alpha) \\
z_d &\sim \mathrm{Multinomial}(\theta) \\
w_{nd} &\sim p(w \mid z_d)
\end{aligned}
$$

In [26]:
import numpyro
from numpyro.infer import MCMC, NUTS, Predictive
import numpyro.distributions as dist
from jax import random

In [None]:
def topic_model2(words, n_topics=3):
    """Latent Dirichlet Allocation over a document-term count matrix.

    Generative story::

        alpha_k ~ Gamma(1, 1)                 concentration for topic k
        beta_k  ~ Dirichlet(1, ..., 1)        topic k's distribution over the vocabulary
        theta_d ~ Dirichlet(alpha)            topic proportions of document d
        z_nd    ~ Categorical(theta_d)        topic of the n-th word of document d
        w_nd    ~ Categorical(beta[z_nd])     the word itself

    The per-word topic assignments ``z`` are summed out analytically: given
    ``theta_d`` each word is drawn from the mixture ``theta_d @ beta``, so the
    count vector of a document is Multinomial with those probabilities.  This
    leaves only continuous latent variables (``alpha``, ``beta``, ``theta``),
    so no discrete-variable enumeration is needed during inference.

    Args:
        words: ``(n_docs, vocab_size)`` matrix of non-negative integer term
            counts, one row per document.  A SciPy sparse matrix (anything
            exposing ``toarray``) is converted to a dense array first.
        n_topics: Number of latent topics.

    Raises:
        ValueError: If ``words`` is not two-dimensional.

    Sample sites: ``alpha`` with shape ``(n_topics,)``, ``beta`` with shape
    ``(n_topics, vocab_size)``, ``theta`` with shape ``(n_docs, n_topics)``
    and the observed site ``w``.
    """
    if hasattr(words, "toarray"):
        # Accept the sparse output of a vectorizer directly.
        words = words.toarray()
    words = np.asarray(words)
    if words.ndim != 2:
        raise ValueError(
            "words must be a 2-D (n_docs, vocab_size) count matrix, "
            f"got an array with shape {words.shape}"
        )

    n_docs, vocab_size = words.shape
    # Number of tokens in each document; fixes the Multinomial total count.
    doc_lengths = words.sum(axis=-1)

    with numpyro.plate("topics", n_topics):
        # global variable alpha, shape = (n_topics,)
        alpha = numpyro.sample("alpha", dist.Gamma(1.0, 1.0))
        # global beta, shape = (n_topics, vocab_size); each row sums to one
        beta = numpyro.sample("beta", dist.Dirichlet(np.ones(vocab_size)))

    with numpyro.plate("in_corpus", n_docs):
        # sample topic proportions: \theta_d ~ Dir(alpha)
        theta = numpyro.sample("theta", dist.Dirichlet(concentration=alpha))
        # p(w | theta_d) = sum_k theta_dk * beta_k  (z marginalised out)
        word_probs = theta @ beta
        numpyro.sample(
            "w",
            dist.Multinomial(total_count=doc_lengths, probs=word_probs),
            obs=words,
        )
            

In [None]:
def model(patient_code, Weeks, FVC_obs=None):
    """Hierarchical linear regression of FVC on weeks with per-patient lines.

    Each patient gets their own intercept ``α`` and slope ``β``, both drawn
    from shared population-level Normal distributions whose location and scale
    are themselves sampled.  A single noise scale ``σ`` is shared by all
    observations.

    Args:
        patient_code: Per-observation patient identifier, used to index the
            per-patient parameters (assumes integer codes running from 0 to
            ``n_patients - 1`` -- TODO confirm against the caller).
        Weeks: Per-observation week value, aligned with ``patient_code``.
        FVC_obs: Observed FVC values; when ``None`` the ``obs`` site is
            sampled instead of conditioned on.
    """
    # Population-level hyperpriors for intercepts and slopes.
    intercept_loc = numpyro.sample("μ_α", dist.Normal(0.0, 500.0))
    intercept_scale = numpyro.sample("σ_α", dist.HalfNormal(100.0))
    slope_loc = numpyro.sample("μ_β", dist.Normal(0.0, 3.0))
    slope_scale = numpyro.sample("σ_β", dist.HalfNormal(3.0))

    patient_count = np.unique(patient_code).size

    # One intercept/slope pair per distinct patient.
    with numpyro.plate("plate_i", patient_count):
        intercepts = numpyro.sample("α", dist.Normal(intercept_loc, intercept_scale))
        slopes = numpyro.sample("β", dist.Normal(slope_loc, slope_scale))

    noise_scale = numpyro.sample("σ", dist.HalfNormal(100.0))
    # Look up each observation's patient-specific line and evaluate it.
    predicted_fvc = intercepts[patient_code] + slopes[patient_code] * Weeks

    with numpyro.plate("data", len(patient_code)):
        numpyro.sample("obs", dist.Normal(predicted_fvc, noise_scale), obs=FVC_obs)