# DocumentSet Class using mutable struct

In [1]:
using StatsBase

In [3]:
"""
    DocumentSet(documents, document_size, vocab_count, vocabulary, reverse_vocabulary)

Mutable container for a corpus whose words are replaced by integer codes.

# Fields
- `documents`: one entry per stored document; each entry is that document's list of codes.
- `document_size`: number of documents stored so far.
- `vocab_count`: number of distinct words seen so far (also the most recently issued code).
- `vocabulary`: word => code lookup.
- `reverse_vocabulary`: code => word lookup (the word with code `c` sits at index `c`).
"""
mutable struct DocumentSet
    documents          :: Array
    document_size      :: Int
    vocab_count        :: Int
    vocabulary         :: Dict{}
    reverse_vocabulary :: Array
end

"""
    documentset_readData(path::String, documentset_obj::DocumentSet)

Open the text file at `path` and pass every line, in order, to
`documentset_addDocument` together with `documentset_obj`. The file is closed when
reading finishes. Returns `nothing`.
"""
function documentset_readData(path::String, documentset_obj::DocumentSet)
    open(path, "r") do io
        foreach(eachline(io)) do text
            documentset_addDocument(text, documentset_obj)
        end
    end
end

"""
    documentset_readData(texts::Array, documentset_obj::DocumentSet)

Add every element of `texts` to `documentset_obj` as one document, in order, by
calling `documentset_addDocument` on each.

# Arguments
- `texts`: array of document strings (for example the result of `readlines`).
- `documentset_obj`: the corpus that receives the documents; mutated in place.

Returns `nothing`.
"""
function documentset_readData(texts::Array, documentset_obj::DocumentSet)
    for doc in texts
        # The corpus must be forwarded: documentset_addDocument has no
        # one-argument method, so omitting it raises a MethodError.
        documentset_addDocument(doc, documentset_obj)
    end
    return nothing
end

"""
    documentset_addDocument(line, documentset_obj::DocumentSet)

Split `line` on whitespace, convert each word to its integer code (issuing a new
code for words not yet in the vocabulary) and append the resulting code list to
`documentset_obj.documents`.

Returns `nothing` without touching the corpus when `line` is empty; otherwise
returns the updated `document_size`.
"""
function documentset_addDocument(line, documentset_obj::DocumentSet)
    isempty(line) && return nothing

    encoded = []
    for token in split(line)
        code = get!(documentset_obj.vocabulary, token) do
            # First sighting of this word: issue the next code and store the
            # word at the matching position of the reverse table.
            documentset_obj.vocab_count += 1
            push!(documentset_obj.reverse_vocabulary, token)
            documentset_obj.vocab_count
        end
        push!(encoded, code)
    end

    push!(documentset_obj.documents, encoded)
    documentset_obj.document_size += 1
    return documentset_obj.document_size
end

"""
    documentset_transform(line, documentset_obj::DocumentSet)

Encode `line` with the existing vocabulary of `documentset_obj` without modifying
it. Words that are not in the vocabulary are dropped. The surviving codes are
returned in ascending order.
"""
function documentset_transform(line, documentset_obj::DocumentSet)
    vocab = documentset_obj.vocabulary
    # -1 marks "word not in vocabulary"; such lookups are filtered out below.
    lookups = (get(vocab, token, -1) for token in split(line))
    return sort!(Any[code for code in lookups if code != -1])
end

"""
    documentset_sampleDocuments(numDocs, documentset_obj::DocumentSet)

Return `numDocs` documents drawn from `documentset_obj.documents` with `sample`,
without replacement.
"""
function documentset_sampleDocuments(numDocs, documentset_obj::DocumentSet)
    return sample(documentset_obj.documents, numDocs; replace=false)
end

"""
    documentset_OnlinesampleDocuments(numDocs, iter, documentset_obj::DocumentSet)

Return the `iter`-th consecutive batch of `numDocs` documents, i.e. the documents
at positions `(iter - 1) * numDocs + 1` through `iter * numDocs`. No range check is
done here; indexing past the stored documents raises the usual bounds error.
"""
function documentset_OnlinesampleDocuments(numDocs, iter, documentset_obj::DocumentSet)
    first_doc = (iter - 1) * numDocs + 1
    last_doc = iter * numDocs
    return documentset_obj.documents[first_doc:last_doc]
end

"""
    documentset_getTermFreq(documentset_obj::DocumentSet)

Count how many times each vocabulary code occurs across all stored documents.
Returns a vector of length `vocab_count` whose entry `c` is the number of
occurrences of code `c`.
"""
function documentset_getTermFreq(documentset_obj::DocumentSet)
    counts = zeros(Int, documentset_obj.vocab_count)
    for code in Iterators.flatten(documentset_obj.documents)
        counts[code] += 1
    end
    return counts
end

# Zero-argument constructor: an empty corpus (no documents, no vocabulary, counters at 0).
DocumentSet() = DocumentSet(Any[], 0, 0, Dict{Any,Any}(), Any[])

DocumentSet

## Class call and using data

In [12]:
# Build the corpus from a text file, one document per line.
# (Alternative: documentset_readData(readlines("dummy.txt"), corpus).)
corpus = DocumentSet()
documentset_readData("dummy.txt", corpus)
# V is the vocabulary size: the number of entries in the code => word table.
V = size(corpus.reverse_vocabulary, 1)

11

## Extra function testing

In [5]:
# Report how many documents have been added to the corpus.
println("Total docs: $(corpus.document_size)")

Total docs: 5


In [6]:
# Print the code => word table, then the occurrence count of each code (same order).
println(corpus.reverse_vocabulary)
println(documentset_getTermFreq(corpus))

Any["This", "is", "dummy", "line", "number", "1", "numebr", "2", "3", "4th", "5th"]
[3, 3, 5, 5, 1, 1, 2, 1, 1, 1, 1]


In [7]:
# Display the stored documents; each one is a list of vocabulary codes.
corpus.documents

5-element Array{Any,1}:
 Any[1, 2, 3, 4, 5, 6]
 Any[1, 2, 3, 4, 7, 8]
 Any[1, 2, 3, 4, 7, 9]
 Any[10, 3, 4]
 Any[11, 3, 4]

In [8]:
# Display the word => code mapping.
corpus.vocabulary

Dict{Any,Any} with 11 entries:
  "numebr" => 7
  "number" => 5
  "1"      => 6
  "is"     => 2
  "2"      => 8
  "dummy"  => 3
  "5th"    => 11
  "line"   => 4
  "This"   => 1
  "4th"    => 10
  "3"      => 9

In [9]:
# Encode a line with the existing vocabulary; unknown words are dropped and the codes come back sorted.
documentset_transform("5th dummy line", corpus)

3-element Array{Any,1}:
  3
  4
 11

In [10]:
# Fetch batch number 5 with a batch size of 1, i.e. the 5th stored document.
documentset_OnlinesampleDocuments(1,5, corpus)

1-element Array{Any,1}:
 Any[11, 3, 4]

# SymmetricDirichlet Class (not extending Dirichlet) using struct

In [13]:
"""
    SymmetricDirichlet(K, alpha, sumAlpha)

Immutable description of a Dirichlet prior in which all `K` components share a
single concentration value.

# Fields
- `K`: number of components, stored as `Int16`.
- `alpha`: the concentration value shared by every component, stored as `Float16`.
- `sumAlpha`: total concentration, stored as `Float16`.

Constructor arguments are converted to the field types, so a `K` outside the
`Int16` range is rejected with an `InexactError`.
"""
struct SymmetricDirichlet
    K        :: Int16
    alpha    :: Float16
    sumAlpha :: Float16
end

# Two-argument convenience constructor: the total concentration is derived as K * a.
function SymmetricDirichlet(K, a)
    return SymmetricDirichlet(K, a, K * a)
end

SymmetricDirichlet

## Class call

In [15]:
# M entries of 0.01: the initial parameter vector later handed to Dirichlet(alpha).
M = 100
alpha = fill(0.01, M);
# Symmetric prior with V components (V = vocabulary size), each with concentration 0.01.
wordPrior = SymmetricDirichlet(V, 0.01)

SymmetricDirichlet(11, Float16(0.01), Float16(0.11))

# Dirichlet Class using mutable struct

In [16]:
# Lower bound applied to Dirichlet parameters by dirichlet_optimizeParam and
# dirichlet_set. Declared `const` so the functions that read it see a fixed,
# concretely typed value instead of an untyped global.
# NOTE(review): Dirichlet stores its parameters as Float16, which cannot represent
# a value this small (it rounds to 0.0 on assignment) — confirm the intended floor.
const MINIMUM_PARAM = 10E-200

"""
    Dirichlet(K, alpha, sumAlpha)

Mutable Dirichlet parameter set with one concentration value per component.

# Fields
- `K`: number of components, stored as `Int16`.
- `alpha`: per-component concentration values, stored as `Float16`.
- `sumAlpha`: cached total of `alpha`, stored as `Float16`.
"""
mutable struct Dirichlet
    K        :: Int16
    alpha    :: Array{Float16}
    sumAlpha :: Float16
end

"""
    dirichlet_optimizeParam(Ck, ndkMax, C_, ndMax, numIteration, dirichlet_obj::Dirichlet)

Re-estimate the parameters of `dirichlet_obj` in place by applying a multiplicative
update `numIteration` times. In each pass every `alpha[k]` is multiplied by
`numer_k / denom`, clamped from below by `MINIMUM_PARAM`, and `sumAlpha` is then
recomputed from the updated `alpha`.

# Arguments
- `Ck`: `Ck[k][n]` is read for `n in 1:ndkMax[k]` — presumably the number of documents
  in which component `k` occurs exactly `n` times (confirm against the caller).
- `ndkMax`: `ndkMax[k]` is the largest index read from `Ck[k]`.
- `C_`: `C_[n]` is read for `n in 1:ndMax` — presumably the number of documents with
  exactly `n` tokens (confirm against the caller).
- `ndMax`: largest index read from `C_`.
- `numIteration`: number of update passes.
- `dirichlet_obj`: the parameter set to update; `alpha` and `sumAlpha` are mutated.

Returns `nothing`.
"""
function dirichlet_optimizeParam(Ck, ndkMax, C_, ndMax, numIteration, dirichlet_obj::Dirichlet)
    # S = sum over n in 1:nMax of C[n] * (sum over j in 1:n of 1 / (j - 1 + z)).
    # The accumulation runs in Float64: the stored parameters are Float16, and
    # accumulating in that type loses precision and overflows to Inf beyond 65504.
    function digammaRecurrence(nMax, C, z)
        z == 0 && return 0.0
        zf = Float64(z)
        R = 0.0
        S = 0.0
        for n in 1:nMax
            R += 1 / (n - 1 + zf)
            S += C[n] * R
        end
        return S
    end

    for _ in 1:numIteration
        denom = digammaRecurrence(ndMax, C_, dirichlet_obj.sumAlpha)
        # A zero denominator would turn every parameter into NaN/Inf; stop instead.
        denom == 0 && break

        for k in 1:dirichlet_obj.K
            # Use this object's own parameter, not a global named `alpha`.
            numer = digammaRecurrence(ndkMax[k], Ck[k], dirichlet_obj.alpha[k])
            dirichlet_obj.alpha[k] *= (numer / denom)
            dirichlet_obj.alpha[k] = max(dirichlet_obj.alpha[k], MINIMUM_PARAM)
        end
        dirichlet_obj.sumAlpha = sum(dirichlet_obj.alpha)
    end
    return nothing
end

"""
    dirichlet_set(param, dirichlet_obj::Dirichlet)

Overwrite the first `K` parameters of `dirichlet_obj` with the corresponding
entries of `param`, replacing any entry that is not greater than `MINIMUM_PARAM`
by `MINIMUM_PARAM`, and refresh the cached `sumAlpha` to match.

# Arguments
- `param`: indexable collection with at least `dirichlet_obj.K` entries.
- `dirichlet_obj`: the parameter set to update; mutated in place.

Returns `nothing`.
"""
function dirichlet_set(param, dirichlet_obj::Dirichlet)
    for k in 1:dirichlet_obj.K
        dirichlet_obj.alpha[k] = param[k] > MINIMUM_PARAM ? param[k] : MINIMUM_PARAM
    end
    # Keep the cached total in step with alpha; polya_p divides by it.
    dirichlet_obj.sumAlpha = sum(dirichlet_obj.alpha)
    return nothing
end

# Build a Dirichlet from a parameter array: K is its first dimension, sumAlpha its total.
function Dirichlet(param)
    return Dirichlet(size(param, 1), param, sum(param))
end

Dirichlet

## Function call

In [17]:
# Build the topic prior from the parameter vector `alpha`; the trailing `;` suppresses the cell output.
topicPrior = Dirichlet(alpha);

# Class Polya using mutable struct

In [21]:
"""
    Polya(K, dir, n, N)

Mutable count accumulator paired with a `Dirichlet` parameter set.

# Fields
- `K`: number of components, stored as `Int16`.
- `dir`: the `Dirichlet` whose `alpha` and `sumAlpha` are combined with the counts.
- `n`: per-component observation counts, stored as `Int16`.
- `N`: total number of observations, stored as `Int16`.
"""
mutable struct Polya
    K   :: Int16
    dir :: Dirichlet
    n   :: Array{Int16}
    N   :: Int16
end

"""
    polya_p(x::Int, polya_obj::Polya)

Return `(n[x] + alpha[x]) / (N + sumAlpha)` for component `x`, using the current
counts of `polya_obj` and the parameters of its `dir`. Nothing is modified.
"""
function polya_p(x::Int, polya_obj::Polya)
    prior = polya_obj.dir
    numerator = polya_obj.n[x] + prior.alpha[x]
    denominator = polya_obj.N + prior.sumAlpha
    return numerator / denominator
end

"""
    polya_p(X::Array, polya_obj::Polya)

Return the probability of observing the elements of `X` one after another: each
element's probability is taken with `polya_p` given the elements before it, and the
results are multiplied together.

The elements are temporarily recorded in `polya_obj` while the product is built
and are removed again before returning, so the counts end up as they started.

# Arguments
- `X`: array of component indices.
- `polya_obj`: the count accumulator to evaluate against.

# Returns
The product of the per-element probabilities (`1.0` for an empty `X`).
"""
function polya_p(X::Array, polya_obj::Polya)
    p = 1.0
    for x in X
        p *= polya_p(x, polya_obj)
        polya_observe(x, polya_obj)
    end
    # Undo the temporary observations made above.
    for x in X
        polya_forget(x, polya_obj)
    end
    # Without an explicit return the function yields `nothing` (the loop's value).
    return p
end
  
"""
    polya_observe(x, polya_obj::Polya)

Record one observation of component `x`: increment `n[x]` and the total `N`.
Returns the new total.
"""
function polya_observe(x, polya_obj::Polya)
    polya_obj.n[x] = polya_obj.n[x] + 1
    polya_obj.N = polya_obj.N + 1
end

"""
    polya_observe(X::Array, polya_obj::Polya)

Record one observation for every element of `X`, in order. Returns `nothing`.
"""
function polya_observe(X::Array, polya_obj::Polya)
    foreach(x -> polya_observe(x, polya_obj), X)
end

"""
    polya_forget(x, polya_obj::Polya)

Remove one observation of component `x`: decrement `n[x]` and the total `N`.
Returns the new total. No check is made that the counts stay non-negative.
"""
function polya_forget(x, polya_obj::Polya)
    polya_obj.n[x] = polya_obj.n[x] - 1
    polya_obj.N = polya_obj.N - 1
end

"""
    polya_forget(X::Array, polya_obj::Polya)

Remove one observation for every element of `X`, in order. Returns `nothing`.
"""
function polya_forget(X::Array, polya_obj::Polya)
    foreach(x -> polya_forget(x, polya_obj), X)
end

# Current observation count of component `k` in `polya_obj`.
polya_getCount(k, polya_obj::Polya) = polya_obj.n[k]

# Start a Polya over `param` with all K per-component counts and the total at zero.
function Polya(param::Dirichlet)
    return Polya(param.K, param, zeros(Int, param.K), 0)
end

Polya

In [22]:
"""
    Sampler_sample(p)

Draw an index of `p` at random, with probability proportional to the weight stored
at that index. The weights do not need to sum to one.

# Arguments
- `p`: non-empty indexable collection of non-negative weights.

# Returns
The selected index of `p`.

# Throws
- `ArgumentError` if `p` is empty.
"""
function Sampler_sample(p)
    isempty(p) && throw(ArgumentError("weight collection `p` must not be empty"))

    # u is uniform on [0, sum(p)); walk the weights until the running total passes u.
    u = rand() * sum(p)
    for i in eachindex(p)
        w = p[i]
        # Strict comparison so that an entry with zero weight is never selected here.
        u < w && return i
        u -= w
    end
    # Only reachable through floating-point round-off (or all-zero weights).
    return lastindex(p)
end

Sampler_sample (generic function with 1 method)

# Class LDA using mutable struct

In [24]:
"""
    LDA(numIteration, M, topicDir, wordPolya, X, topicPolya)

Mutable state of the LDA sampler.

# Fields
- `numIteration`: number of sampling sweeps run by `lda_sample`, stored as `Int16`.
- `M`: number of topics (initial topics are drawn from `1:M`), stored as `Int16`.
- `topicDir`: `Dirichlet` used for the per-document topic counts; re-estimated by
  `lda_sample` after every sweep.
- `wordPolya`: array of `Polya` objects; not read by the code visible in this file —
  presumably one per topic over the vocabulary (confirm).
- `X`: the documents most recently passed to `lda_sample`.
- `topicPolya`: array of `Polya` objects, one per document.
"""
mutable struct LDA
    numIteration :: Int16
    M            :: Int16
    topicDir     :: Dirichlet
    wordPolya    :: Array{Polya}
    X            :: Array
    topicPolya   :: Array{Polya}
end

"""
    lda_sample(docs, lda_obj::LDA)

Assign a topic to every token of every document in `docs` by repeated resampling
and return the final assignments.

Steps, as implemented below:
1. store `docs` in `lda_obj.X` and create one fresh `Polya` per document in
   `lda_obj.topicPolya`;
2. give every token a uniformly random topic from `1:lda_obj.M`;
3. for `lda_obj.numIteration` sweeps: take each token's topic out, redraw it with
   `Sampler_sample(lda_posterior(d, i))` and put it back; after each sweep rebuild the
   count histograms and call `dirichlet_optimizeParam` on `lda_obj.topicDir`.

# Arguments
- `docs`: array of documents, each an array of tokens.
- `lda_obj`: sampler state; `X`, `topicPolya` and `topicDir` are mutated.

# Returns
`samples`, where `samples[d][i]` is the topic assigned to token `i` of document `d`.

NOTE(review): `lda_addSample`, `lda_removeSample` and `lda_posterior` are not defined
in the visible part of this file. Their calls keep the original argument lists —
confirm their signatures (every other function here takes its object as last argument).
"""
function lda_sample(docs, lda_obj::LDA)
    lda_obj.X = docs
    D = length(docs)                 # `size` returns a tuple and cannot bound a range
    numTopics = Int(lda_obj.M)

    # One count accumulator per document, rebuilt on every call so that repeated
    # calls do not keep appending to the array.
    lda_obj.topicPolya = Polya[Polya(lda_obj.topicDir) for _ in 1:D]

    # Length of the longest document (0 when there are no documents).
    ndMax = D == 0 ? 0 : maximum(length, docs)

    # C_[n] = number of documents with exactly n tokens; empty documents are skipped
    # because there is no index 0.
    C_ = zeros(Int, ndMax)
    for doc in docs
        Nd = length(doc)
        Nd > 0 && (C_[Nd] += 1)
    end

    # Random initial topic for every token.
    samples = Vector{Vector{Int}}(undef, D)
    for d in 1:D
        Nd = length(docs[d])
        samples[d] = Vector{Int}(undef, Nd)
        for i in 1:Nd
            topic = rand(1:numTopics)
            samples[d][i] = topic
            lda_addSample(d, i, topic)
        end
    end

    for _ in 1:lda_obj.numIteration
        # One sweep: resample every token's topic given all the others.
        for d in 1:D
            for i in eachindex(samples[d])
                lda_removeSample(d, i, samples[d][i])
                samples[d][i] = Sampler_sample(lda_posterior(d, i))
                lda_addSample(d, i, samples[d][i])
            end
        end

        # Ck[m][n] = number of documents in which topic m has count n (n >= 1);
        # ndkMax[m] = largest such count (at least 1).
        # Assumes a topic's count within a document never exceeds that document's
        # length — TODO confirm against lda_addSample.
        ndkMax = ones(Int, numTopics)
        Ck = [zeros(Int, max(ndMax, 1)) for _ in 1:numTopics]
        for m in 1:numTopics
            for d in 1:D
                ndk = Int(polya_getCount(m, lda_obj.topicPolya[d]))
                ndk > 0 || continue      # a zero count has no histogram slot
                Ck[m][ndk] += 1
                ndkMax[m] = max(ndkMax[m], ndk)
            end
        end
        dirichlet_optimizeParam(Ck, ndkMax, C_, ndMax, 20, lda_obj.topicDir)
    end
    return samples
end

#LDA(topicPrior::Dirichlet, wordPrior::Dirichlet) = LDA(100, topicPrior.K, topicPrior, 

lda_sample (generic function with 1 method)