In [2]:
using Pkg
using StatsBase, CSV, DataFrames
Pkg.activate("TopicModels")
import TopicModels

[32m[1m Activating[22m[39m environment at `~/Documents/Thesis/Git_Main/julia_bayes/TopicModels/Project.toml`


## Sample Run on Dummy Data

In [2]:
# Build the corpus object from the small example file news-en.txt.
corpus = TopicModels.documentset_readData("news-en.txt");

In [3]:
# Topic prior: one 0.01 entry for each of the M topics.
M = 3
alpha = fill(0.01, M);
topicPrior = TopicModels.Dirichlet(alpha);
# Word prior: 12 entries of 0.01 each.
# NOTE(review): 12 is presumably the vocabulary size of news-en.txt — confirm.
wordPrior = TopicModels.Dirichlet(12, 0.01);

In [4]:
# Combine the topic prior and the word prior into the LDA model object.
lda = TopicModels.LDA(topicPrior, wordPrior);

In [5]:
# Run the sampler over the corpus documents and keep its return value.
# NOTE(review): presumably this also updates `lda` in place, since the later
# lda_topicN calls are not given `samples` — confirm.
samples = TopicModels.lda_sample(corpus.documents, lda);

In [6]:
# Topic 3: its top 4 words along with their vocab proportions
# (arguments are the topic index first, then the number of words).
words, proportions = TopicModels.lda_topicN(3, 4, corpus, lda);

In [7]:
# Show the selected words, then their proportions, one line each.
println(words)
println(proportions)

Any["medal", "runner", "era", "culture"]
[0.39258, 0.19727, 0.19727, 0.19727]


## LDA on NIPS papers

In [3]:
# Import the papers and prepare the inputs for the preprocessing step.
papers = CSV.read("papers.csv", DataFrame);
papers_txt = papers.paper_text;
# Characters stripped from every token before filtering.
# NOTE(review): the quote entries are the curly ” and ’; the straight " and '
# are not in the list — confirm that is intended.
specialchars = ['!', '”', '#', '$', '%', '&', '’', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '>', '=', '@', '?', '[', ']', '^', '_', '{', '}', '|', '~']
# One stop word per line. `readlines` opens, reads and closes the file itself
# and returns a Vector{String} rather than an untyped `[]` filled by hand.
stopwords = readlines("stopwords.txt");

In [4]:
"""
    clean_words(docs::Array, stopwords::Array, specialchars::Array)

Tokenize and filter every document in `docs`.

Each document is split on whitespace. Every token has the characters listed in
`specialchars` removed and is then lowercased. A token is kept only when it is
not in `stopwords`, is longer than two characters, and is not a number.

# Arguments
- `docs`: collection of document strings.
- `stopwords`: words to drop; compared against the cleaned, lowercased token.
- `specialchars`: characters removed from every token.

# Returns
A `Vector{Any}` with one entry per document, each entry being a `Vector{Any}`
of the kept tokens in their original order.
"""
function clean_words(docs::Array, stopwords::Array, specialchars::Array)
    # Hash the stop words once so each lookup is O(1) instead of a linear scan.
    stopset = Set(stopwords)
    # The containers stay untyped on purpose so callers receive the same
    # return type as before.
    new_docs = []
    for line in docs
        kept = []
        for token in split(line)
            word = lowercase(replace(token, specialchars => ""))
            # `tryparse(Float64, ...)` also accepts digit-free spellings such as
            # "infinity", "inf" and "nan"; requiring a digit keeps those ordinary
            # words from being discarded as numbers.
            is_number = any(isdigit, word) && tryparse(Float64, word) !== nothing
            if !(word in stopset) && length(word) > 2 && !is_number
                push!(kept, word)
            end
        end
        push!(new_docs, kept)
    end
    return new_docs
end

# Clean every paper; `papers_txt` is rebound from the raw paper texts to one
# list of kept tokens per paper.
papers_txt = clean_words(papers_txt, stopwords, specialchars);

In [25]:
# Keep a 1000-document slice (indices 6242 through 7241) of the cleaned papers.
# NOTE(review): this rebinds `papers_txt`, so running the cell a second time
# indexes the already-shortened array — rerun the cleaning cell first.
papers_txt = papers_txt[6242:7241]
size(papers_txt)

(1000,)

In [26]:
# Build the corpus from the cleaned, tokenized papers, then display its
# `vocab_count` field.
corpus = TopicModels.documentset_readData(papers_txt);
corpus.vocab_count

108086

In [27]:
# Topic prior: one 0.01 entry for each of the M topics.
M = 20 # Number of topics
alpha = fill(0.01, M);
topicPrior = TopicModels.Dirichlet(alpha);
# Word prior: `corpus.vocab_count` entries of 0.01 each.
wordPrior = TopicModels.Dirichlet(corpus.vocab_count, 0.01);

In [28]:
# Combine the topic prior and the word prior into the LDA model for the papers.
lda = TopicModels.LDA(topicPrior, wordPrior);

In [29]:
# Run the sampler over the paper documents; the return value is discarded.
# NOTE(review): presumably the fitted state lives in `lda`, which is what the
# later lda_topicN calls read — confirm.
TopicModels.lda_sample(corpus.documents, lda);

In [31]:
# Print the top `top_N` words of each of the M topics, one line per topic.
top_N = 10
for i in 1:M
    # Uncomment to put a header line before each topic:
    #println("Topic $i top $top_N words:")
    # The proportions are returned as well but are not printed here.
    words, proportions = TopicModels.lda_topicN(i, top_N, corpus, lda);
    println(words)
end

Any["matrix", "sparse", "matrices", "time", "rank", "error", "problem", "linear", "algorithm", "tensor"]
Any["adversarial", "training", "generative", "data", "gan", "samples", "distribution", "generator", "discriminator", "objective"]
Any["policy", "learning", "state", "reinforcement", "reward", "action", "agent", "policies", "value", "using"]
Any["model", "models", "data", "used", "using", "speech", "figure", "prediction", "modeling", "neural"]
Any["distribution", "inference", "model", "data", "models", "log", "gaussian", "posterior", "variational", "bayesian"]
Any["time", "system", "control", "noise", "figure", "model", "state", "process", "memory", "point"]
Any["graph", "algorithm", "clustering", "nodes", "node", "set", "algorithms", "cluster", "graphs", "problem"]
Any["learning", "algorithm", "algorithms", "loss", "time", "online", "machine", "problem", "cost", "pages"]
Any["set", "algorithm", "local", "one", "data", "used", "new", "two", "structure", "figure"]
Any["learning", "dee

In [32]:
# Remove the word "model" from the fitted model.
# NOTE(review): the second argument (20) is presumably the topic index the
# word is removed from — confirm against lda_removeWord.
TopicModels.lda_removeWord("model", 20, corpus, lda);

TopicModels.Dirichlet(108086, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 1080.8500001000002)

In [33]:
# Run Gibbs sampling again on the same documents after the word removal.
# NOTE(review): the second argument (20) is presumably the number of
# iterations — confirm against lda_gibbsSampling.
TopicModels.lda_gibbsSampling(corpus.documents, 20, lda);

In [34]:
# Print the top `top_N` words of each of the M topics again, so the listing
# can be compared with the one printed before the word removal.
top_N = 10
for i in 1:M
    # Uncomment to put a header line before each topic:
    #println("Topic $i top $top_N words:")
    # The proportions are returned as well but are not printed here.
    words, proportions = TopicModels.lda_topicN(i, top_N, corpus, lda);
    println(words)
end

Any["matrix", "sparse", "matrices", "rank", "algorithm", "time", "estimation", "norm", "tensor", "problem"]
Any["adversarial", "training", "generative", "data", "gan", "samples", "distribution", "generator", "discriminator", "objective"]
Any["policy", "learning", "state", "reward", "reinforcement", "action", "agent", "policies", "using", "value"]
Any["model", "models", "data", "using", "speech", "neural", "modeling", "figure", "network", "used"]
Any["model", "distribution", "inference", "data", "models", "log", "gaussian", "posterior", "variational", "bayesian"]
Any["time", "model", "system", "noise", "state", "control", "figure", "process", "memory", "systems"]
Any["graph", "algorithm", "clustering", "nodes", "node", "set", "algorithms", "problem", "cluster", "graphs"]
Any["learning", "algorithm", "algorithms", "loss", "time", "online", "problem", "machine", "cost", "pages"]
Any["set", "algorithm", "local", "two", "data", "figure", "one", "used", "new", "node"]
Any["learning", "deep",

# Human-in-the-Loop Topic Modeling

In [6]:
# Rebuild the small news-en.txt example from scratch: corpus, priors, model,
# and an initial sampling run.
corpus = TopicModels.documentset_readData("news-en.txt")
M = 3
alpha = fill(0.01, M)
topicPrior = TopicModels.Dirichlet(alpha)
wordPrior = TopicModels.Dirichlet(12, 0.01)
lda = TopicModels.LDA(topicPrior, wordPrior)
TopicModels.lda_sample(corpus.documents, lda);

In [7]:
# Print the top 5 words and their proportions for every topic.
# Iterate over 1:M instead of a literal 1:3 so the loop follows the topic
# count the model was built with.
for m in 1:M
    words, proportions = TopicModels.lda_topicN(m, 5, corpus, lda)
    println(words)
    println(proportions)
    println("----------------------")
end

Any["Moscow", "tank", "Russian", "unit", "Olympic"]
[0.48786, 0.48786, 0.00243, 0.00243, 0.00243]
----------------------
Any["Olympic", "medal", "Russian", "Moscow", "Marathon"]
[0.24754, 0.24754, 0.12438, 0.12438, 0.12438]
----------------------
Any["Russian", "unit", "Self-Defense", "Defense", "era"]
[0.24754, 0.24754, 0.12438, 0.12438, 0.12438]
----------------------


In [8]:
# Human feedback step: remove the word "tank", then resample.
# NOTE(review): the 1 is presumably the topic index the word is removed from,
# and the 20 the number of Gibbs iterations — confirm against the package.
TopicModels.lda_removeWord("tank", 1, corpus, lda)
TopicModels.lda_gibbsSampling(corpus.documents, 20, lda);

5-element Array{Any,1}:
 Any[2, 1, 2, 3]
 Any[1, 1, 2, 1]
 Any[3, 2, 3, 3]
 Any[2, 1, 1, 1]
 Any[2, 1, 1, 3]

In [9]:
# Print the top 5 words and their proportions for every topic again, to see
# the effect of the word removal and resampling.
# Iterate over 1:M instead of a literal 1:3 so the loop follows the topic
# count the model was built with.
for m in 1:M
    words, proportions = TopicModels.lda_topicN(m, 5, corpus, lda)
    println(words)
    println(proportions)
    println("----------------------")
end

Any["Moscow", "Olympic", "medal", "runner", "era"]
[0.33041, 0.22064, 0.22064, 0.11087, 0.11087]
----------------------
Any["Russian", "tank", "Marathon", "Moscow", "unit"]
[0.49183, 0.32843, 0.16503, 0.00163, 0.00163]
----------------------
Any["unit", "Self-Defense", "Defense", "culture", "Russian"]
[0.39258, 0.19727, 0.19727, 0.19727, 0.00195]
----------------------
