# DocumentSet Class using mutable struct

In [1]:
using StatsBase, CSV, DataFrames

In [2]:
mutable struct DocumentSet
    documents::Array
    document_size::Int
    vocab_count::Int
    vocabulary::Dict{}
    reverse_vocabulary::Array
end

function documentset_readData(path::String, documentset_obj::DocumentSet)
    open(path) do file
        for doc in eachline(file)
            documentset_addDocument(doc, documentset_obj)
        end
    end
end

function documentset_readData(texts::Array, documentset_obj::DocumentSet)
    for doc in texts
        documentset_addDocument(doc, documentset_obj)
    end
end

function documentset_addDocument(line::String, documentset_obj::DocumentSet)
    if isempty(line)
        return nothing
    end
    words = split(line)
    codes = []
    for i in words
        if haskey(documentset_obj.vocabulary, i)
            push!(codes, documentset_obj.vocabulary[i])
        else 
            documentset_obj.vocab_count+=1
            documentset_obj.vocabulary[i] = documentset_obj.vocab_count
            push!(documentset_obj.reverse_vocabulary, i)
            push!(codes, documentset_obj.vocab_count)
        end
    end
    push!(documentset_obj.documents, codes)
    documentset_obj.document_size+=1
end

function documentset_addDocument(words::Array, documentset_obj::DocumentSet)
    if isempty(words)
        return nothing
    end
    codes = []
    for i in words
        if haskey(documentset_obj.vocabulary, i)
            push!(codes, documentset_obj.vocabulary[i])
        else 
            documentset_obj.vocab_count+=1
            documentset_obj.vocabulary[i] = documentset_obj.vocab_count
            push!(documentset_obj.reverse_vocabulary, i)
            push!(codes, documentset_obj.vocab_count)
        end
    end
    push!(documentset_obj.documents, codes)
    documentset_obj.document_size+=1
end

function documentset_transform(line::String, documentset_obj::DocumentSet)
    words = split(line)
    codes = []
    for i in words
        code = get(documentset_obj.vocabulary, i, -1)
        if code != -1
            push!(codes, code)
        end  
    end
    return sort(codes)
end

function documentset_sampleDocuments(numDocs::Int, documentset_obj::DocumentSet)
    subD = sample(documentset_obj.documents, numDocs, replace = false)
    return subD  
end

function documentset_OnlinesampleDocuments(numDocs::Int, iter::Int, documentset_obj::DocumentSet)
    subD = documentset_obj.documents[(iter-1)*numDocs+1: (iter)*numDocs]
    return subD
end

function documentset_getTermFreq(documentset_obj::DocumentSet)
    tf = [0 for i=1:documentset_obj.vocab_count]
    for doc in documentset_obj.documents
        for w in doc
            tf[w]+=1 
        end
    end
    return tf
end

DocumentSet() = DocumentSet([],0, 0, Dict(), [])

DocumentSet

## Class call and using data

In [3]:
global corpus = DocumentSet()
documentset_readData("dummy.txt", corpus) #Or use readData(readlines("dummy.txt"), corpus)
V = size(corpus.reverse_vocabulary)[1]

10

## Extra function testing

In [28]:
println("Total docs: $(corpus.document_size)")

Total docs: 5


In [36]:
println(corpus.reverse_vocabulary)
println(documentset_getTermFreq(corpus))

Any["This", "is", "dummy", "line", "number", "1", "2", "3", "4th", "5th", "6th"]
[3, 3, 6, 6, 3, 1, 1, 1, 1, 1, 1]


In [37]:
corpus.documents

6-element Array{Any,1}:
 Any[1, 2, 3, 4, 5, 6]
 Any[1, 2, 3, 4, 5, 7]
 Any[1, 2, 3, 4, 5, 8]
 Any[9, 3, 4]
 Any[10, 3, 4]
 Any[11, 3, 4]

In [38]:
corpus.vocabulary

Dict{Any,Any} with 11 entries:
  "number" => 5
  "1"      => 6
  "is"     => 2
  "2"      => 7
  "6th"    => 11
  "dummy"  => 3
  "5th"    => 10
  "line"   => 4
  "This"   => 1
  "4th"    => 9
  "3"      => 8

In [43]:
documentset_transform("5th dummy line", corpus)

3-element Array{Any,1}:
  3
  4
 10

In [44]:
documentset_OnlinesampleDocuments(1,3, corpus)

1-element Array{Any,1}:
 Any[1, 2, 3, 4, 5, 8]

In [35]:
documentset_addDocument("6th dummy line", corpus)

6

# SymmetricDirichlet Class(not extending Dirichlet) using struct
Don't need to create this class, we can use Dirichlet Class for such kind of variables as well

In [3]:
struct SymmetricDirichlet
    K::Int16
    alpha::Float16
    sumAlpha::Float16
end

SymmetricDirichlet(K, a) = SymmetricDirichlet(K, a, K*a)

SymmetricDirichlet

## Class call

In [6]:
M=100
alpha = [0.01 for i in 1:M];
wordPrior = SymmetricDirichlet(V,0.01)

SymmetricDirichlet(10, Float16(0.01), Float16(0.1))

# Dirichlet Class using mutable struct

In [3]:
global MINIMUM_PARAM = 10E-200

mutable struct Dirichlet
    K::Int64
    alpha::Array{Float64}
    sumAlpha::Float64
end

function dirichlet_optimizeParam(Ck, ndkMax, C_, ndMax, numIteration, dirichlet_obj::Dirichlet)
    function digammaRecurrence(nMax, C, z)
        if z==0.0
            return 0.0
        end
        
        R=0; S=0;
        for n in 1:nMax
            R+= 1.0 / (n-1+z)
            S+= C[n]*R
        end
        return S
    end
            
    for i in 1:numIteration
        demon = digammaRecurrence(ndMax, C_, dirichlet_obj.sumAlpha)
        
        for k in 1:dirichlet_obj.K
            numer = digammaRecurrence(ndkMax[k], Ck[k], dirichlet_obj.alpha[k])
            dirichlet_obj.alpha[k] *= (numer/demon)
            dirichlet_obj.alpha[k] = max(dirichlet_obj.alpha[k], MINIMUM_PARAM)
        end
        dirichlet_obj.sumAlpha = sum(dirichlet_obj.alpha)
    end
end

function dirichlet_set(param, dirichlet_obj::Dirichlet)
    for k in 1:dirichlet_obj.K
        dirichlet_obj.alpha[k] = param[k] > MINIMUM_PARAM ? param[k] : MINIMUM_PARAM
    end
end

Dirichlet(param::Array) = Dirichlet(size(param)[1], param, sum(param))
Dirichlet(K::Int, a::Float64) = Dirichlet(K, [a for i in 1:K], K*a)

Dirichlet

## Function call

In [8]:
topicPrior = Dirichlet(alpha);

# Class Polya using mutable struct

In [4]:
mutable struct Polya
    K::Int64
    dir::Dirichlet
    n::Array{Int64}
    N::Int64
end

function polya_p(x::Int, polya_obj::Polya)
    return (polya_obj.n[x]+polya_obj.dir.alpha[x]) / (polya_obj.N+polya_obj.dir.sumAlpha)
end

function polya_p(X::Array, polya_obj::Polya)
    p = 1.0
    for x in X
        p*= polya_p(x, polya_obj)
        polya_observe(x, polya_obj)
    end
    for x in X
        polya_forget(x, polya_obj)
    end
    return p
end
  
function polya_observe(x::Int, polya_obj::Polya)
    polya_obj.n[x]+=1
    polya_obj.N+=1  
end

function polya_observe(X::Array, polya_obj::Polya)
    for x in X
        polya_observe(x, polya_obj)
    end
end

function polya_forget(x::Int, polya_obj::Polya)
    polya_obj.n[x] -= 1
    polya_obj.N -= 1
end

function polya_forget(X::Array, polya_obj::Polya)
    for x in X
        polya_forget(x, polya_obj)
    end
end

function polya_getCount(k::Int, polya_obj::Polya)
    return polya_obj.n[k]
end

Polya(param::Dirichlet) = Polya(param.K, param, [0 for i in 1:param.K], 0)

Polya

## Sampler_class's sample function is used in LDA class so defining only the function instead of complete class

In [5]:
function Sampler_sample(p)
    u = rand()*sum(p)
    for i in 1:size(p)[1]
        if u<=p[i]
            return i
        end
        u-=p[i]
    end
    return size(p)[1]
end

Sampler_sample (generic function with 1 method)

## Polya Test Class

In [6]:
param = [3.0, 2.0, 2.0]
sut = Polya(Dirichlet(param))

Polya(3, Dirichlet(3, [3.0, 2.0, 2.0], 7.0), [0, 0, 0], 0)

In [7]:
X = [1,3,3,1,1,2]
polya_p(X, sut) # In java test is(closeTo(0.00108225108, 10E-10))

0.001082251082251082

In [8]:
corpus1 = DocumentSet()
documentset_readData("news.txt", corpus1)
sut = Polya(Dirichlet(12, 1.0))
for doc in corpus1.documents
    for x in doc
        polya_observe(x, sut)
    end
end

In [9]:
println(polya_p(1, sut)) # in java is(0.125)
println(polya_p(5, sut)) # in java is(0.09375)
println(polya_p(8, sut)) # in java is(0.0625)

0.125
0.09375
0.0625


# Class LDA using mutable struct

In [6]:
mutable struct LDA
    numIteration::Int64
    M::Int64
    topicDir::Dirichlet
    wordPolya::Array{Polya}
    X::Array{Any,1}
    topicPolya::Array{Polya}
end

function lda_sample(docs, lda_obj::LDA)
    lda_obj.X = docs
    D = size(docs)[1]
    #lda_obj.topicPolya =  Polya[D]   not sure how can i make it happen, doing it with iterations, working fine
    ndMax = Int(floor(maximum([size(doc)[1] for doc in docs])))
    
    C_ = [0 for i in 1:ndMax+1]
    for d in 1:D
        C_[(size(docs[d])[1])+1]+=1 # In julia indexing start from 1 so adding 1 in every index to compensate 0 size docs
    end
    
    samples = [[] for i=1:D]
    for d in 1:D
        Nd = size(docs[d])[1]
        
        samples[d] = [0 for i in 1:Nd]
        push!(lda_obj.topicPolya, Polya(lda_obj.topicDir))
        
        temp = []
        #randomSamples = sample(1:lda_obj.M, Nd, replace=false)
        for i in 1:Nd
            randomSample = rand(1:lda_obj.M)
            push!(temp, randomSample)
            lda_addSample(d, i, randomSample, lda_obj)
        end
        samples[d] = temp
    end
    
    for iteration in 1:lda_obj.numIteration
        for d in 1:D
            Nd = size(docs[d])[1]
            for i in 1:Nd
                lda_removeSample(d, i, samples[d][i], lda_obj)
                samples[d][i] = Sampler_sample(lda_posterior(d, i, lda_obj))
                lda_addSample(d, i, samples[d][i], lda_obj)
            end
        end
     
        ndkMax = [0 for i in 1:lda_obj.M]
        Ck = [[0 for j in 1:ndMax+1] for i in 1:lda_obj.M]
        for m in 1:lda_obj.M
            for d in 1:D
                ndk = lda_obj.topicPolya[d].n[m]
                Ck[m][ndk+1]+=1
                ndkMax[m] = max(ndkMax[m], ndk)
            end
        end
        dirichlet_optimizeParam(Ck, ndkMax, C_, ndMax+1, 20, lda_obj.topicDir)
    end
    return samples
end

function lda_posterior(d::Int, i::Int, lda_obj::LDA)
    v = lda_obj.X[d][i]
    posterior = [0.0 for i in 1:lda_obj.M]
    for m in 1:lda_obj.M
        posterior[m] = polya_p(m, lda_obj.topicPolya[d]) * polya_p(v, lda_obj.wordPolya[m])
    end
    return posterior
end

function lda_addSample(d::Int, i::Int, m::Int, lda_obj::LDA)
    v = lda_obj.X[d][i]
    polya_observe(m, lda_obj.topicPolya[d])
    polya_observe(v, lda_obj.wordPolya[m])
end

function lda_removeSample(d::Int, i::Int, m::Int, lda_obj::LDA)
    v = lda_obj.X[d][i]
    polya_forget(m, lda_obj.topicPolya[d])
    polya_forget(v, lda_obj.wordPolya[m])
end

function lda_wordPredict(m::Int, v::Int, lda_obj::LDA)
    return polya_p(v, lda_obj.wordPolya[m])
end

function lda_topicPredict(d::Int, m::Int, lda_obj::LDA)
    return polya_p(m, lda_obj.topicPolya[d])
end

LDA(topicPrior::Dirichlet, wordPrior::Dirichlet) = LDA(100, topicPrior.K, topicPrior, [Polya(wordPrior) for i in 1:topicPrior.K], [[]], [])  

LDA

## First compelte test starts from here

In [10]:
corpus = DocumentSet()
documentset_readData("news.txt", corpus)

In [11]:
wordPrior = Dirichlet(12, 0.01)
M = 3
alpha = [0.01 for i in 1:M];
topicPrior = Dirichlet(alpha);

In [13]:
lda = LDA(topicPrior, wordPrior);

In [14]:
samples = lda_sample(corpus.documents, lda)

5-element Array{Array{Any,1},1}:
 [1, 1, 1, 1]
 [2, 2, 1, 2]
 [3, 1, 1, 3]
 [1, 3, 2, 2]
 [1, 3, 2, 3]

In [15]:
for m in 1:3
    for v in 1:12
        word = corpus.reverse_vocabulary[v]
        val = lda_wordPredict(m, v, lda)
        println("$m \t $word \t $val" )
    end
end

1 	 ロシア 	 0.3300438596491228
1 	 モスクワ 	 0.11074561403508773
1 	 戦車 	 0.22039473684210525
1 	 部隊 	 0.22039473684210525
1 	 五輪 	 0.0010964912280701756
1 	 メダル 	 0.0010964912280701756
1 	 自衛隊 	 0.0010964912280701756
1 	 国防 	 0.0010964912280701756
1 	 マラソン 	 0.11074561403508773
1 	 選手 	 0.0010964912280701756
1 	 時代 	 0.0010964912280701756
1 	 文化 	 0.0010964912280701756
2 	 ロシア 	 0.0016339869281045752
2 	 モスクワ 	 0.32843137254901955
2 	 戦車 	 0.0016339869281045752
2 	 部隊 	 0.0016339869281045752
2 	 五輪 	 0.32843137254901955
2 	 メダル 	 0.32843137254901955
2 	 自衛隊 	 0.0016339869281045752
2 	 国防 	 0.0016339869281045752
2 	 マラソン 	 0.0016339869281045752
2 	 選手 	 0.0016339869281045752
2 	 時代 	 0.0016339869281045752
2 	 文化 	 0.0016339869281045752
3 	 ロシア 	 0.001953125
3 	 モスクワ 	 0.001953125
3 	 戦車 	 0.001953125
3 	 部隊 	 0.001953125
3 	 五輪 	 0.001953125
3 	 メダル 	 0.001953125
3 	 自衛隊 	 0.197265625
3 	 国防 	 0.197265625
3 	 マラソン 	 0.001953125
3 	 選手 	 0.197265625
3 	 時代 	 0.197265625
3 	 文化 	 0.197265625


In [62]:
for d in 1:5
    for m in 1:4
        val = lda_topicPredict(d, m, lda)
        println("$d \t $m \t $val" )
    end
end

1 	 1 	 0.20038549687894103
1 	 2 	 0.3700239077010383
1 	 3 	 0.1181801937565314
1 	 4 	 0.3114104016634894
2 	 1 	 0.20038549687894103
2 	 2 	 0.14652320457795337
2 	 3 	 0.1181801937565314
2 	 4 	 0.5349111047865743
3 	 1 	 0.08863514531739855
3 	 2 	 0.5935246108241233
3 	 3 	 0.1181801937565314
3 	 4 	 0.1996600501019469
4 	 1 	 0.08863514531739855
4 	 2 	 0.14652320457795337
4 	 3 	 0.1181801937565314
4 	 4 	 0.6466614563481168
5 	 1 	 0.08863514531739855
5 	 2 	 0.14652320457795337
5 	 3 	 0.4534312484411588
5 	 4 	 0.3114104016634894


In [15]:
for m in 1:M
    val = topicPrior.alpha[m]
    println("$m \t alpha \t $val")
end

1 	 alpha 	 1.0e-199
2 	 alpha 	 1.0485513456954876
3 	 alpha 	 1.0e-199
4 	 alpha 	 1.0e-199
5 	 alpha 	 1.0e-199
6 	 alpha 	 1.0e-199
7 	 alpha 	 1.0e-199
8 	 alpha 	 1.0e-199
9 	 alpha 	 1.0e-199
10 	 alpha 	 1.0e-199
11 	 alpha 	 1.0e-199
12 	 alpha 	 1.0e-199
13 	 alpha 	 1.0e-199
14 	 alpha 	 1.0e-199
15 	 alpha 	 1.0e-199
16 	 alpha 	 1.0e-199
17 	 alpha 	 1.0e-199
18 	 alpha 	 1.0e-199
19 	 alpha 	 1.0e-199
20 	 alpha 	 1.0e-199
21 	 alpha 	 1.0e-199
22 	 alpha 	 1.0e-199
23 	 alpha 	 1.0e-199
24 	 alpha 	 1.0e-199
25 	 alpha 	 1.0e-199
26 	 alpha 	 1.0e-199
27 	 alpha 	 1.0e-199
28 	 alpha 	 1.0e-199
29 	 alpha 	 1.0e-199
30 	 alpha 	 1.0e-199
31 	 alpha 	 1.0e-199
32 	 alpha 	 1.0e-199
33 	 alpha 	 1.0e-199
34 	 alpha 	 1.0e-199
35 	 alpha 	 1.0e-199
36 	 alpha 	 1.0e-199
37 	 alpha 	 1.0e-199
38 	 alpha 	 1.0e-199
39 	 alpha 	 1.0e-199
40 	 alpha 	 1.0e-199
41 	 alpha 	 1.0e-199
42 	 alpha 	 1.0e-199
43 	 alpha 	 1.0e-199
44 	 alpha 	 1.0e-199
45 	 alpha 	 1.137249541668945


## Actual topic modeling on NIPS papers dataset(used in fragmentation project as well)

In [71]:
papers = CSV.read("papers.csv", DataFrame);

In [72]:
papers_txt = papers.paper_text;

In [73]:
stopwords = []
specialchars = ['!', '”', '#', '$', '%', '&', '’', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '>', '=', '@', '?', '[', ']', '^', '_', '{', '}', '|', '~']
open("stopwords.txt") do file
        for word in eachline(file)
            push!(stopwords, word)
        end
    end

In [140]:
function remove_stop_words(docs::Array, stopwords::Array)
    new_docs = []
    for line in docs
        doc = split(line)
        temp = []
        for word in doc
            if !(word in stopwords) 
                word = replace.(word, specialchars => "")
                if length(word) > 2 && tryparse(Float64, word) == nothing
                    push!(temp, lowercase(word))
                end
            end
        end
        push!(new_docs, temp)
    end
    return new_docs
end

remove_stop_words (generic function with 1 method)

In [141]:
new_paper_txt = remove_stop_words(papers_txt, stopwords);

In [142]:
corpus = DocumentSet()

DocumentSet(Any[], 0, 0, Dict{Any,Any}(), Any[])

In [143]:
documentset_readData(new_paper_txt, corpus)

In [144]:
corpus.vocab_count #Previous implementation value 478255

399426

In [145]:
wordPrior = Dirichlet(corpus.vocab_count, 0.01)
M = 30
alpha = [0.01 for i in 1:M];
topicPrior = Dirichlet(alpha);

In [146]:
lda = LDA(topicPrior, wordPrior);

In [147]:
samples = lda_sample(corpus.documents, lda);

In [148]:
val1 = [];
for m in 1:1
    for v in 1:size(corpus.reverse_vocabulary)[1]
        val = lda_wordPredict(m, v, lda)
        push!(val1, val)
    end
end

In [149]:
size(val1)

(399426,)

In [150]:
top_words = sortperm(val1, rev=true)[1:10]

10-element Array{Int64,1}:
  136
 1988
  172
 1995
   25
 2077
 8085
 2995
 1825
  355

In [151]:
for i in top_words
    println(corpus.reverse_vocabulary[i])
end

problem
optimization
solution
regression
the
regularization
convex
sparse
constraints
min


In [155]:
for m in 1:M
    val1 = []
    for v in 1:size(corpus.reverse_vocabulary)[1]
        val = lda_wordPredict(m, v, lda)
        push!(val1, val)
    end
    top_words = sortperm(val1, rev=true)[1:10]
    wrds = []
    for i in top_words
        push!(wrds, corpus.reverse_vocabulary[i])
    end
    println("Topic $m : $wrds")
end

Topic 1 : Any["problem", "optimization", "solution", "regression", "the", "regularization", "convex", "sparse", "constraints", "min"]
Topic 2 : Any["tree", "node", "nodes", "trees", "game", "the", "search", "strategy", "games", "cost"]
Topic 3 : Any["time", "state", "the", "dynamics", "process", "learning", "rate", "system", "one", "point"]
Topic 4 : Any["network", "networks", "the", "learning", "input", "training", "neural", "units", "output", "hidden"]
Topic 5 : Any["model", "latent", "data", "topic", "models", "the", "number", "process", "dirichlet", "topics"]
Topic 6 : Any["matrix", "algorithm", "matrices", "sparse", "the", "rank", "tensor", "data", "norm", "subspace"]
Topic 7 : Any["data", "the", "number", "time", "using", "performance", "method", "algorithm", "set", "accuracy"]
Topic 8 : Any["gradient", "algorithm", "optimization", "convergence", "algorithms", "stochastic", "methods", "method", "descent", "convex"]
Topic 9 : Any["graph", "set", "nodes", "graphs", "algorithm", "ed