In [1]:
using Pkg
using StatsBase
Pkg.activate("TopicModels")
import TopicModels

[32m[1m Activating[22m[39m environment at `~/Documents/Thesis/Git_Main/julia_bayes/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


## Sample Run on Dummy Data

In [2]:
# Load the sample corpus from "news-en.txt"; later cells read `corpus.documents` from it.
corpus = TopicModels.documentset_readData("news-en.txt");

In [3]:
# Number of topics for the sample run.
M = 3

# Word prior: 12 and 0.01 are handed straight to TopicModels.Dirichlet.
# NOTE(review): 12 is presumably the vocabulary size of the sample corpus — confirm
# (the NIPS run further down passes `corpus.vocab_count` in this position).
wordPrior = TopicModels.Dirichlet(12, 0.01)

# Topic prior: one concentration value of 0.01 per topic.
alpha = fill(0.01, M)
topicPrior = TopicModels.Dirichlet(alpha);

In [4]:
# Build the LDA model object from the topic prior and the word prior defined above.
lda = TopicModels.LDA(topicPrior, wordPrior);

In [5]:
# Run the sampler over the corpus documents.
# NOTE(review): presumably this also updates `lda` in place, since the NIPS run below
# discards the return value and then queries `lda` — confirm against the package.
samples = TopicModels.lda_sample(corpus.documents, lda);

In [6]:
# Topic 3: its top 4 words along with their vocab proportions
# (arguments are topic index, then number of words, as in `lda_topicN(i, top_N, ...)` below).
words, proportions = TopicModels.lda_topicN(3, 4, corpus, lda);

In [7]:
# Show the words selected for the topic, then their matching proportions.
println(words)
println(proportions)

Any["medal", "runner", "era", "culture"]
[0.39258, 0.19727, 0.19727, 0.19727]


## LDA on NIPS papers

In [3]:
# Import the papers and load the resources used by the preprocessing step.
# NOTE(review): this cell needs CSV and DataFrames to be loaded; no visible cell does so —
# confirm they are brought in by an earlier cell of the session.
papers = CSV.read("papers.csv", DataFrame);
papers_txt = papers.paper_text;

# Characters stripped from every token before filtering.
# NOTE(review): the list holds the typographic quotes '”' and '’' but not the ASCII
# '"' / '\'' characters — confirm that this is intended.
specialchars = [
    '!', '”', '#', '$', '%', '&', '’', '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '>', '=', '@', '?', '[', ']', '^', '_', '{', '}', '|', '~',
]

# One stop word per line; `readlines` opens, reads and closes the file and yields a
# typed Vector{String} instead of an untyped `[]` filled by hand.
stopwords = readlines("stopwords.txt");

In [4]:
"""
    clean_words(docs::Array, stopwords::Array, specialchars::Array)

Tokenize every document in `docs` and return the cleaned tokens, one list per document.

Each document is split on whitespace. For every token, all characters listed in
`specialchars` are removed and the result is lowercased. A token is kept only if it

- is not contained in `stopwords`,
- is longer than 2 characters, and
- cannot be parsed as a `Float64`.

# Arguments
- `docs`: collection of document strings.
- `stopwords`: words to drop (compared after stripping and lowercasing).
- `specialchars`: characters to strip from each token.

# Returns
A `Vector{Any}` with one `Vector{Any}` of cleaned tokens per input document, in input
order. Documents without surviving tokens yield an empty list.
"""
function clean_words(docs::Array, stopwords::Array, specialchars::Array)
    # Build the lookup once: membership tests become O(1) instead of scanning the
    # whole stop-word list for every token.
    stopset = Set(stopwords)

    # The containers stay untyped on purpose so callers receive the same container
    # types as before.
    new_docs = []
    for line in docs
        temp = []
        for token in split(line)
            word = lowercase(replace(token, specialchars => ""))
            word in stopset && continue
            # NOTE(review): tokens such as "nan", "inf" or "infinity" also parse as
            # Float64 and are therefore dropped — confirm this is acceptable.
            if length(word) > 2 && isnothing(tryparse(Float64, word))
                push!(temp, word)
            end
        end
        push!(new_docs, temp)
    end
    return new_docs
end

# Replace the raw paper texts with their cleaned token lists (one list per paper).
papers_txt = clean_words(papers_txt, stopwords, specialchars);

In [25]:
# Keep only the 1000 papers at positions 6242 through 7241.
# NOTE(review): this overwrites `papers_txt` with its own slice, so the cell cannot be
# re-run as is — a second run would index past the end of the 1000-element result.
papers_txt = papers_txt[6242:7241]
size(papers_txt)

(1000,)

In [26]:
# Build the corpus from the cleaned token lists, then display its vocabulary count.
corpus = TopicModels.documentset_readData(papers_txt);
corpus.vocab_count

108086

In [27]:
# Number of topics
M = 20

# Word prior sized by the corpus vocabulary count, with value 0.01.
wordPrior = TopicModels.Dirichlet(corpus.vocab_count, 0.01)

# Topic prior: one concentration value of 0.01 per topic.
alpha = fill(0.01, M)
topicPrior = TopicModels.Dirichlet(alpha);

In [28]:
# Build the LDA model object for the NIPS corpus from the two priors above.
lda = TopicModels.LDA(topicPrior, wordPrior);

In [29]:
# Run the sampler over the NIPS documents; the return value is discarded here.
# NOTE(review): presumably `lda` is updated in place, as it is queried right after — confirm.
TopicModels.lda_sample(corpus.documents, lda);

In [31]:
# Print the top `top_N` words of each of the M topics, one line per topic.
top_N = 10
for i in 1:M
    # Optional per-topic header, currently disabled:
    #println("Topic $i top $top_N words:")
    # `proportions` is returned alongside the words but is not printed here.
    words, proportions = TopicModels.lda_topicN(i, top_N, corpus, lda);
    println(words)
end

Any["matrix", "sparse", "matrices", "time", "rank", "error", "problem", "linear", "algorithm", "tensor"]
Any["adversarial", "training", "generative", "data", "gan", "samples", "distribution", "generator", "discriminator", "objective"]
Any["policy", "learning", "state", "reinforcement", "reward", "action", "agent", "policies", "value", "using"]
Any["model", "models", "data", "used", "using", "speech", "figure", "prediction", "modeling", "neural"]
Any["distribution", "inference", "model", "data", "models", "log", "gaussian", "posterior", "variational", "bayesian"]
Any["time", "system", "control", "noise", "figure", "model", "state", "process", "memory", "point"]
Any["graph", "algorithm", "clustering", "nodes", "node", "set", "algorithms", "cluster", "graphs", "problem"]
Any["learning", "algorithm", "algorithms", "loss", "time", "online", "machine", "problem", "cost", "pages"]
Any["set", "algorithm", "local", "one", "data", "used", "new", "two", "structure", "figure"]
Any["learning", "dee

In [32]:
# Refinement step: remove the word "model".
# NOTE(review): the second argument (20) is presumably the index of the topic to remove
# the word from — confirm against the TopicModels API.
TopicModels.lda_removeWord("model", 20, corpus, lda);

TopicModels.Dirichlet(108086, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 1080.8500001000002)

In [33]:
# Re-run Gibbs sampling on the documents after the refinement.
# NOTE(review): 20 is presumably the number of sampling iterations — confirm.
TopicModels.lda_gibbsSampling(corpus.documents, 20, lda);

In [34]:
# Print the top `top_N` words of each of the M topics again, for comparison with the
# listing produced before the refinement.
top_N = 10
for i in 1:M
    # Optional per-topic header, currently disabled:
    #println("Topic $i top $top_N words:")
    # `proportions` is returned alongside the words but is not printed here.
    words, proportions = TopicModels.lda_topicN(i, top_N, corpus, lda);
    println(words)
end

Any["matrix", "sparse", "matrices", "rank", "algorithm", "time", "estimation", "norm", "tensor", "problem"]
Any["adversarial", "training", "generative", "data", "gan", "samples", "distribution", "generator", "discriminator", "objective"]
Any["policy", "learning", "state", "reward", "reinforcement", "action", "agent", "policies", "using", "value"]
Any["model", "models", "data", "using", "speech", "neural", "modeling", "figure", "network", "used"]
Any["model", "distribution", "inference", "data", "models", "log", "gaussian", "posterior", "variational", "bayesian"]
Any["time", "model", "system", "noise", "state", "control", "figure", "process", "memory", "systems"]
Any["graph", "algorithm", "clustering", "nodes", "node", "set", "algorithms", "problem", "cluster", "graphs"]
Any["learning", "algorithm", "algorithms", "loss", "time", "online", "problem", "machine", "cost", "pages"]
Any["set", "algorithm", "local", "two", "data", "figure", "one", "used", "new", "node"]
Any["learning", "deep",

# Human-in-the-Loop Topic Modeling with APIs

In [2]:
# Build the corpus directly from "papers.csv" through the high-level API.
# NOTE(review): 1000 is presumably the number of documents to use — confirm.
corpus = TopicModels.preprocess("papers.csv", 1000);

In [3]:
# Train a model on the corpus.
# NOTE(review): 20 is presumably the number of topics — confirm.
lda = TopicModels.train(corpus, 20);

In [4]:
# Print the top 5 words of every topic together with their proportions.
TopicModels.show_topics(corpus, lda, 5);

Any["tree", "causal", "graph", "model", "structure"]
[0.01491, 0.01017, 0.00972, 0.0091, 0.00861]
----------------------
Any["distributed", "communication", "parallel", "speech", "system"]
[0.01602, 0.01339, 0.00854, 0.00807, 0.00761]
----------------------
Any["model", "learning", "models", "neural", "classification"]
[0.01847, 0.01383, 0.01044, 0.00929, 0.00909]
----------------------
Any["network", "networks", "neural", "learning", "training"]
[0.04085, 0.03544, 0.03032, 0.01448, 0.01397]
----------------------
Any["learning", "task", "tasks", "training", "information"]
[0.02579, 0.01454, 0.01381, 0.0123, 0.00985]
----------------------
Any["inference", "variational", "generative", "data", "distribution"]
[0.01724, 0.01572, 0.01421, 0.01284, 0.01253]
----------------------
Any["time", "model", "state", "system", "dynamics"]
[0.01939, 0.01934, 0.01736, 0.01468, 0.01301]
----------------------
Any["training", "set", "error", "input", "using"]
[0.01394, 0.01296, 0.01295, 0.00882, 0.008

In [7]:
# Apply a "remove" refinement for the word "model".
# NOTE(review): the last argument (1) is presumably the target topic index — confirm.
TopicModels.apply_refinement(corpus, lda, "remove", "model", 1);

In [8]:
# Print the top 5 words per topic again to inspect the effect of the removal.
TopicModels.show_topics(corpus, lda, 5);

Any["tree", "causal", "structure", "learning", "node"]
[0.01452, 0.01132, 0.008, 0.00684, 0.00671]
----------------------
Any["distributed", "communication", "time", "parallel", "computation"]
[0.0162, 0.01345, 0.0094, 0.00743, 0.00727]
----------------------
Any["model", "learning", "models", "neural", "classification"]
[0.01554, 0.0112, 0.01058, 0.00966, 0.00956]
----------------------
Any["network", "networks", "neural", "learning", "layer"]
[0.04174, 0.03607, 0.03141, 0.01505, 0.01477]
----------------------
Any["learning", "training", "task", "tasks", "model"]
[0.02812, 0.01439, 0.01394, 0.01243, 0.01097]
----------------------
Any["inference", "variational", "generative", "distribution", "latent"]
[0.01735, 0.01547, 0.01443, 0.01311, 0.01281]
----------------------
Any["model", "time", "state", "system", "dynamics"]
[0.0187, 0.01858, 0.01801, 0.01476, 0.01331]
----------------------
Any["training", "set", "error", "input", "one"]
[0.01268, 0.01188, 0.0105, 0.01001, 0.00951]
-----

In [9]:
# Apply an "add" refinement for the word "node".
# NOTE(review): the last argument (2) is presumably the target topic index — confirm.
TopicModels.apply_refinement(corpus, lda, "add", "node", 2);

In [11]:
# Print the top 5 words per topic again to inspect the effect of the addition.
TopicModels.show_topics(corpus, lda, 5);

Any["node", "tree", "causal", "variables", "structure"]
[0.01417, 0.01356, 0.01129, 0.00682, 0.00682]
----------------------
Any["node", "distributed", "communication", "parallel", "time"]
[0.03395, 0.01575, 0.01249, 0.00748, 0.00709]
----------------------
Any["model", "models", "learning", "attention", "classification"]
[0.01731, 0.01129, 0.01089, 0.00975, 0.00936]
----------------------
Any["network", "networks", "neural", "learning", "training"]
[0.04226, 0.03604, 0.03199, 0.01466, 0.01423]
----------------------
Any["learning", "training", "task", "tasks", "model"]
[0.02811, 0.01529, 0.01407, 0.01217, 0.00995]
----------------------
Any["inference", "variational", "generative", "distribution", "latent"]
[0.01722, 0.01538, 0.0139, 0.01368, 0.01257]
----------------------
Any["time", "state", "model", "system", "dynamics"]
[0.01847, 0.01811, 0.01695, 0.01467, 0.01329]
----------------------
Any["training", "set", "input", "error", "one"]
[0.01276, 0.01168, 0.01059, 0.01006, 0.00927]