In [1]:
using Pkg
using StatsBase, CSV, DataFrames
Pkg.activate("TopicModels")
import TopicModels

[32m[1m Activating[22m[39m environment at `~/Documents/Thesis/Git_Main/handson_julia/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


## Sample Run on Dummy Data

In [2]:
# Load the sample corpus from "news.txt" through TopicModels' reader; the expected
# file format is defined by that package. The result is bound to `corpus`, whose
# `documents` field is used by the sampling cell below.
corpus = TopicModels.documentset_readData("news.txt");

In [3]:
# Priors for the toy run: one Dirichlet for words, one for the M topics.
# NOTE(review): the first argument 12 is hard-coded here, while the NIPS run
# further down passes `corpus.vocab_count` in the same position -- confirm that 12
# matches the vocabulary size of news.txt.
M = 3
wordPrior = TopicModels.Dirichlet(12, 0.01)
# Every topic gets the same parameter value 0.01.
alpha = fill(0.01, M);
topicPrior = TopicModels.Dirichlet(alpha);

In [4]:
# Construct the TopicModels.LDA object from the topic prior and the word prior
# defined in the previous cell.
lda = TopicModels.LDA(topicPrior, wordPrior);

In [5]:
# Run TopicModels' sampler over the corpus documents with the model `lda`.
# NOTE(review): later cells query `lda` (not `samples`) for the top words, so
# this call presumably updates `lda` in place -- confirm against the package.
samples = TopicModels.lda_sample(corpus.documents, lda);

In [12]:
# Topic 3: its top 4 words along with their vocabulary proportions
# (arguments are the topic index, then the number of words to return).
words, proportions = TopicModels.lda_topicN(3, 4, corpus, lda);

In [13]:
# Print the selected words and their proportions, one collection per line.
foreach(println, (words, proportions))

Any["部隊", "選手", "時代", "文化"]
[0.39258, 0.19727, 0.19727, 0.19727]


## LDA on NIPS papers

In [17]:
# Load the NIPS papers table and keep the raw text column; tokenization and
# filtering are done afterwards by `clean_words`.
papers = CSV.read("papers.csv", DataFrame);
papers_txt = papers.paper_text;
# Characters stripped from every token before filtering.
# NOTE(review): the list contains the curly quotes '”' and '’' but not the ASCII
# quotes '"' and '\'' (nor '\\' or '`') -- confirm this is intentional and not
# the result of a smart-quote paste.
specialchars = ['!', '”', '#', '$', '%', '&', '’', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '>', '=', '@', '?', '[', ']', '^', '_', '{', '}', '|', '~']
# One stop word per line. `readlines` returns a Vector{String} with the line
# endings removed, replacing the untyped `[]` that was filled by hand.
stopwords = readlines("stopwords.txt");

In [18]:
"""
    clean_words(docs, stopwords, specialchars)

Tokenize every document in `docs` and drop uninformative tokens.

Each document is split on whitespace. Every token then has all characters listed
in `specialchars` removed and is lowercased. The cleaned token is kept only if

- it is longer than two characters,
- it is not contained in `stopwords`, and
- it cannot be parsed as a `Float64` (i.e. it is not a number).

# Arguments
- `docs`: collection of documents, each one a string.
- `stopwords`: collection of (lowercase) words to discard.
- `specialchars`: collection of characters to strip from every token.

# Returns
A `Vector{Any}` with one entry per document, in input order; each entry is a
`Vector{Any}` holding the surviving tokens as `String`s. A document with no
surviving tokens yields an empty vector. The untyped containers are kept on
purpose so that downstream consumers see the same types as before.
"""
function clean_words(
    docs::AbstractArray, stopwords::AbstractArray, specialchars::AbstractArray
)
    # Build the lookup once: hashing makes each membership test O(1) instead of
    # a linear scan over the whole stop-word list for every token.
    stopset = Set(stopwords)
    new_docs = []
    for line in docs
        kept = []
        for token in split(line)
            # Strip the special characters first, then normalise case, so the
            # stop-word comparison sees the cleaned, lowercase form.
            word = lowercase(replace(token, specialchars => ""))
            length(word) > 2 || continue
            word in stopset && continue
            # `tryparse` returns `nothing` for non-numeric text.
            tryparse(Float64, word) === nothing || continue
            push!(kept, word)
        end
        push!(new_docs, kept)
    end
    return new_docs
end

# Rebind `papers_txt` from the raw paper texts to the per-document lists of
# cleaned tokens returned by `clean_words`.
papers_txt = clean_words(papers_txt, stopwords, specialchars);

In [19]:
# Rebuild `corpus` from the cleaned token lists (this replaces the toy corpus
# bound to the same name earlier) and display its `vocab_count` field.
corpus = TopicModels.documentset_readData(papers_txt);
corpus.vocab_count

399311

In [20]:
# Priors for the NIPS run: the word prior takes the corpus' vocab_count as its
# first argument, the topic prior has one parameter per topic.
M = 30 # Number of topics
wordPrior = TopicModels.Dirichlet(corpus.vocab_count, 0.01)
# Every topic gets the same parameter value 0.01.
alpha = fill(0.01, M);
topicPrior = TopicModels.Dirichlet(alpha);

In [21]:
# Construct the TopicModels.LDA object for the NIPS corpus from the priors
# defined in the previous cell (rebinds `lda` from the toy run).
lda = TopicModels.LDA(topicPrior, wordPrior);

In [22]:
# Run TopicModels' sampler over the NIPS documents with the model `lda`.
# NOTE(review): the next cell reads the top words from `lda`, so this call
# presumably updates `lda` in place -- confirm against the package.
samples = TopicModels.lda_sample(corpus.documents, lda);

In [23]:
# For each of the M topics, print a header line followed by that topic's
# top_N words as returned by lda_topicN.
top_N = 10
for i in 1:M
    println("Topic $i top $top_N words:")
    # `proportions` is returned alongside the words but is not printed here.
    words, proportions = TopicModels.lda_topicN(i, top_N, corpus, lda);
    println(words)
end

Topic 1 top 10 words:
Any["distribution", "inference", "sampling", "posterior", "variational", "bayesian", "log", "variables", "models", "distributions"]
Topic 2 top 10 words:
Any["theorem", "function", "let", "bound", "case", "proof", "functions", "lemma", "following", "result"]
Topic 3 top 10 words:
Any["label", "labels", "learning", "active", "ranking", "query", "model", "set", "queries", "number"]
Topic 4 top 10 words:
Any["distribution", "estimation", "estimate", "probability", "sample", "information", "estimator", "density", "samples", "distributions"]
Topic 5 top 10 words:
Any["network", "networks", "neural", "input", "learning", "output", "layer", "units", "hidden", "training"]
Topic 6 top 10 words:
Any["training", "classification", "set", "error", "examples", "data", "classifier", "learning", "class", "test"]
Topic 7 top 10 words:
Any["learning", "loss", "bound", "algorithm", "bounds", "risk", "distribution", "complexity", "sample", "algorithms"]
Topic 8 top 10 words:
Any["mod