In [1]:
using Turing, Turing.RandomMeasures
using Plots, StatsPlots
using Statistics, Random, LinearAlgebra
using MCMCChains

Define the LDA model.

In [2]:
# Latent Dirichlet Allocation expressed as a Turing model.
#
# Arguments
#   w: the documents; w[m][n] is the integer id of the n-th word of document m
#   K: number of topics
#   D: number of entries in each topic's word distribution (word ids index into it)
#
# Random variables
#   θ[m]    : topic proportions of document m, drawn from Dirichlet(K, α)
#   ψ[k]    : word distribution of topic k, drawn from Dirichlet(D, η)
#   z[m][n] : topic assigned to word n of document m, drawn from Categorical(θ[m])
#
# Every word w[m][n] is modelled as a draw from Categorical(ψ[z[m][n]]).
# The model returns w.
@model function LDA(w, K, D)
    # Concentration parameters of the two Dirichlet priors.
    α = 1.0
    η = 0.01

    # Number of documents: the extent of w along its first dimension.
    M = size(w, 1)

    # Per-document topic proportions; each θ[doc] has K entries.
    θ = Vector{Vector}(undef, M)
    for doc in 1:M
        θ[doc] ~ Dirichlet(K, α)
    end

    # Per-topic word distributions; each ψ[topic] has D entries.
    ψ = Vector{Vector}(undef, K)
    for topic in 1:K
        ψ[topic] ~ Dirichlet(D, η)
    end

    # Topic assignments: one inner vector per document, one Int per word.
    z = Vector{Vector{Int}}(undef, M)

    for doc in 1:M
        n_words = size(w[doc], 1)
        # Start every assignment at 0; each slot is overwritten by its draw below.
        z[doc] = zeros(Int, n_words)
        for pos in 1:n_words
            # Choose a topic for this word from the document's topic proportions.
            z[doc][pos] ~ Categorical(θ[doc])
            # The word itself follows the word distribution of the chosen topic.
            w[doc][pos] ~ Categorical(ψ[z[doc][pos]])
        end
    end
    return w
end


LDA (generic function with 2 methods)

In [4]:
# Problem dimensions shared by the cells below.
M = 5   # documents in the corpus
K = 10  # topics to fit
D = 10  # distinct word ids (length of each topic's word distribution)

10

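TODO: Import the data. It should be a collection of M documents, where each document is a vector of ints and each int represents a word. Documents may differ in length, so the data is stored as a vector of vectors (as in the placeholder below) rather than a strict M x N matrix.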

In [5]:
# Placeholder corpus: five documents of differing length, each a vector of
# integer word ids.
condition_data = Vector{Int}[
    [1, 2, 3, 4, 5, 6],
    [1, 2, 3, 4, 5, 6],
    [5, 4, 3, 2],
    [1],
    [7, 8, 9, 10],
]
condition_data

5-element Vector{Vector{Int64}}:
 [1, 2, 3, 4, 5, 6]
 [1, 2, 3, 4, 5, 6]
 [5, 4, 3, 2]
 [1]
 [7, 8, 9, 10]

Condition the model with the provided documents.

In [6]:
# Instantiate the model with the example documents as the observed words `w`,
# K topics, and D entries per topic word distribution.
conditioned_LDA = LDA(condition_data, K, D)

DynamicPPL.Model{typeof(LDA), (:w, :K, :D), (), (), Tuple{Vector{Vector{Int64}}, Int64, Int64}, Tuple{}, DynamicPPL.DefaultContext}(LDA, (w = [[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6], [5, 4, 3, 2], [1], [7, 8, 9, 10]], K = 10, D = 10), NamedTuple(), DynamicPPL.DefaultContext())

Sample the model. It currently uses a Sequential Monte Carlo (SMC) sampler, but it can also be configured to use importance sampling (IS), Metropolis–Hastings (MH), or Particle Gibbs (PG). Samplers can also be combined so that one is used for the discrete variables and a different one, such as Hamiltonian Monte Carlo (HMC) or the No-U-Turn Sampler (NUTS), is used for the continuous variables.

In [7]:
# Seed the global random number generator so that re-running this cell is
# intended to give the same draws; without a seed every run produces a
# different chain and the numbers below cannot be reproduced.
Random.seed!(1234)

# Draw 1000 samples from the conditioned model using the SMC sampler.
chain = sample(conditioned_LDA, SMC(), 1000)

Chains MCMC chain (1000×173×1 Array{Float64, 3}):

Log evidence      = -89.64882902264198
Iterations        = 1:1:1000
Number of chains  = 1
Samples per chain = 1000
Wall duration     = 19.18 seconds
Compute duration  = 19.18 seconds
parameters        = θ[1][1], θ[1][2], θ[1][3], θ[1][4], θ[1][5], θ[1][6], θ[1][7], θ[1][8], θ[1][9], θ[1][10], θ[2][1], θ[2][2], θ[2][3], θ[2][4], θ[2][5], θ[2][6], θ[2][7], θ[2][8], θ[2][9], θ[2][10], θ[3][1], θ[3][2], θ[3][3], θ[3][4], θ[3][5], θ[3][6], θ[3][7], θ[3][8], θ[3][9], θ[3][10], θ[4][1], θ[4][2], θ[4][3], θ[4][4], θ[4][5], θ[4][6], θ[4][7], θ[4][8], θ[4][9], θ[4][10], θ[5][1], θ[5][2], θ[5][3], θ[5][4], θ[5][5], θ[5][6], θ[5][7], θ[5][8], θ[5][9], θ[5][10], ψ[1][1], ψ[1][2], ψ[1][3], ψ[1][4], ψ[1][5], ψ[1][6], ψ[1][7], ψ[1][8], ψ[1][9], ψ[1][10], ψ[2][1], ψ[2][2], ψ[2][3], ψ[2][4], ψ[2][5], ψ[2][6], ψ[2][7], ψ[2][8], ψ[2][9], ψ[2][10], ψ[3][1], ψ[3][2], ψ[3][3], ψ[3][4], ψ[3][5], ψ[3][6], ψ[3][7], ψ[3][8], ψ[3][9], ψ[3][10], ψ[4][1], ψ[4][2], 

Compute the word distribution of each topic by averaging each ψ entry over the samples in the chain.

In [9]:
# Word distribution of each topic, estimated from the chain: entry
# [topic][word] is the mean over the samples of the parameter "ψ[topic][word]".
topic_word_dists = Vector{Float64}[
    [mean(chain, "ψ[$topic][$word]") for word in 1:D] for topic in 1:K
]
topic_word_dists

10-element Vector{Vector{Float64}}:
 [2.2766163621319826e-21, 1.1637022995069778e-21, 5.5938147947082435e-27, 3.6097202759207283e-306, 0.9999999999901361, 2.4890725053328744e-12, 6.331805594934613e-29, 5.545081779116791e-68, 7.375378833100619e-12, 1.0661670995337312e-23]
 [3.1397842044027584e-68, 0.884325984262162, 2.2924237346948127e-47, 4.669108220322716e-42, 9.609787347263898e-17, 2.6202826621161276e-36, 0.0016067595033762657, 0.11406725623446248, 3.184615123823896e-23, 9.974170106703793e-130]
 [2.1305012190661994e-118, 1.3618358374285582e-57, 6.3991265528278325e-9, 1.605196279117381e-23, 4.990826487337537e-45, 1.197194266404521e-12, 5.712610653461027e-15, 0.9999998496410728, 1.439585964834245e-7, 3.825931521017551e-73]
 [0.94582550248699, 1.9248487956267733e-59, 2.6035733151916175e-5, 1.1204403578084257e-64, 1.5471668381121686e-55, 4.513703497785489e-91, 2.551242902028252e-55, 2.845221201983633e-8, 0.0541484330983737, 2.2927208108715486e-10]
 [5.7494443825880506e-49, 3.687004436905

Query the distribution of topics in each document.

In [10]:
# Topic distribution of each document, estimated from the chain: entry
# [doc][topic] is the mean over the samples of the parameter "θ[doc][topic]".
document_topic_distributions = Vector{Float64}[
    [mean(chain, "θ[$doc][$topic]") for topic in 1:K] for doc in 1:M
]
document_topic_distributions

5-element Vector{Vector{Float64}}:
 [0.14769435482325843, 0.35730731003040367, 0.038796812086761286, 0.10378413747961478, 0.13001881216752248, 0.055272105655014954, 0.014937366538729346, 0.1274916960876791, 0.02407396879251538, 0.0006234363385005938]
 [0.06253960775065442, 0.05551944827250686, 0.03130570940591746, 0.4640553747039986, 0.0631813362676619, 0.0025671522660765094, 0.11236149487070009, 0.08500010246701618, 0.05927973697757995, 0.0641900370178884]
 [0.07319225708032638, 0.06342983146169087, 0.014130214059183828, 0.012134603712540944, 0.012830997561329854, 0.010347103746435445, 0.06688568371988787, 0.26507956693514373, 0.3207908524818106, 0.16117888924165044]
 [0.015944784780926976, 0.22145320287261155, 0.1008706728834599, 0.025330509564848995, 0.009420394757839769, 0.09642482806959077, 0.045097135103337155, 0.10680840339539827, 0.0039011637997435613, 0.3747489047722428]
 [0.056764954098153304, 0.16102695989495952, 0.12954303166533232, 0.04803807380745034, 0.0723326746884547, 

Get the highest-probability topic for each document (movie).

In [11]:
# For every document, find the topic with the largest mean probability.
# `findmax` returns (maximum value, index of its first occurrence); the pair is
# reversed so each entry keeps the (topic index, probability) layout.
highest_prob_topic_per_movie = Tuple{Int, Float64}[
    reverse(findmax(topic_probs)) for topic_probs in document_topic_distributions
]
highest_prob_topic_per_movie

5-element Vector{Tuple{Int64, Float64}}:
 (2, 0.35730731003040367)
 (4, 0.4640553747039986)
 (9, 0.3207908524818106)
 (10, 0.3747489047722428)
 (7, 0.18734784860859208)