In [1]:
using Pkg
using Distances
using StatsBase
using DataStructures
using ProgressMeter
using JSON
Pkg.activate("TopicModels")
import TopicModels
using PyCall
using PyPlot

using LSHFunctions, LinearAlgebra, BenchmarkTools
#corpora = pyimport("gensim.corpora")
#ch = pyimport("gensim.models.coherencemodel")

[32m[1m Activating[22m[39m environment at `~/Documents/Study/Git_Personal/julia_bayes/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


In [2]:
#nn = pyimport("torch.nn")
np = pyimport("numpy")
torch = pyimport("torch")
#plt = pyimport("matplotlib.pyplot")

PyObject <module 'torch' from '/Users/khan/.julia/conda/3/lib/python3.8/site-packages/torch/__init__.py'>

## Sample Run on Dummy Data

In [2]:
corpus = TopicModels.readData("news-en.txt");

In [3]:
wordPrior = TopicModels.Dirichlet(12, 0.01)
M = 3
alpha = [0.01 for i in 1:M];
topicPrior = TopicModels.Dirichlet(alpha);

In [4]:
lda = TopicModels.LDA(topicPrior, wordPrior);

In [5]:
samples = TopicModels.lda_sample(corpus.documents, lda);

In [6]:
# 1st topic and top 5 words along with vocab proportion
words, proportions = TopicModels.lda_topicN(3, 4, corpus, lda);

In [7]:
println(words)
println(proportions)

Any["medal", "runner", "era", "culture"]
[0.39258, 0.19727, 0.19727, 0.19727]


 # Human in the Loop Topic Modeling with APIs

In [3]:
corpus, no_lemma_docs = TopicModels.preprocess("nips/papers_refined.csv", 1000); #took 1:35 minutes

In [4]:
#load lda object from saved json
#lda = TopicModels.loadLDA("nips/lda_obj.json"); #only takes few seconds 

#or train and save new lda model from below command, .
lda = TopicModels.train(corpus, 10);
#TopicModels.saveLDA(lda, "nips/lda_obj.json");

In [5]:
topics, proportions = TopicModels.show_topics(lda, corpus, 10);

Any["learning", "networks", "training", "image", "neural", "deep", "model", "network", "images", "layer"]
----------------------
Any["data", "learning", "model", "samples", "features", "classification", "function", "regression", "distribution", "feature"]
----------------------
Any["algorithm", "learning", "probability", "error", "set", "distribution", "number", "regret", "case", "one"]
----------------------
Any["graph", "algorithm", "nodes", "set", "node", "clustering", "data", "model", "pages", "algorithms"]
----------------------
Any["network", "neural", "networks", "training", "learning", "units", "system", "output", "input", "hidden"]
----------------------
Any["model", "inference", "models", "distribution", "latent", "variational", "log", "posterior", "data", "bayesian"]
----------------------
Any["matrix", "linear", "random", "kernel", "approximation", "error", "problem", "points", "number", "algorithm"]
----------------------
Any["algorithm", "optimization", "theorem", "functi

## Top equal number of docs for each topic(number of docs/number of topics)

In [5]:
topic_distributions = TopicModels.sortedTopDocsForTopics(lda, corpus);

In [8]:
lda.topicPolya[1]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.579974296129749, 0.5744191180525104, 0.32848648063939023, 0.3135048762874856, 0.3534344433827259, 0.4007397684235953, 0.32384814477454843, 0.393391052177399, 0.4892796297821862, 0.7773404107210861], 4.534418220370676), [0, 142, 1108, 21, 23, 18, 36, 996, 197, 244], 2785)

In [18]:
TopicModels.topicPredict(lda, 1, 10)

0.08774845680767072

In [7]:
fileKPDoc = "nips/kp/simplified_last_1000_keyphrases_clean.json"
fileEMB = "nips/kp/1000_docs_keyphrase_embedding.json"
fileSim = "nips/kp/1000_docs_kp_similarity_python.json"
fileKP = "nips/kp/keyphrases_from_python.json"
kp = TopicModels.load_keyphrase(fileKPDoc, fileEMB, fileSim, fileKP);
# Took 0:54 minutes

In [8]:
cluster_kp, docs_have = TopicModels.top_x_kp_of_topic_m(kp, topic_distributions, 5, 1);

In [9]:
println(sort(cluster_kp[4]))
print(sort(docs_have[4]))

OrderedDict{Any,Any}("machine learning" => Any["machine learning", "meta - learning", "parallel machine learning", "human machine learning", "neural programming", "neural logic programming", "computational intelligence", "training algorithms"])
Any[121, 128, 161, 180, 201, 205, 218, 294, 320, 390, 471, 603, 627, 645, 703]

In [10]:
lda.topicPolya[121]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.528132496660802), [1917, 60, 43, 146, 95, 103, 630, 23, 279, 0], 3296)

In [11]:
lda.topicPolya[645]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.528132496660802), [1305, 311, 282, 68, 249, 48, 39, 174, 16, 351], 2843)

In [12]:
TopicModels.apply_refinement(lda, corpus, "R_kp", 4, 1, kp); # Take 2:30 mints

In [14]:
lda.topicPolya[121]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [1.0e-7, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.149388855372093), [0, 236, 28, 511, 493, 237, 789, 41, 764, 197], 3296)

In [13]:
TopicModels.show_topics(lda, corpus, 10);

Any["networks", "image", "training", "learning", "neural", "deep", "network", "model", "images", "layer"]
----------------------
Any["data", "learning", "model", "samples", "classification", "features", "function", "training", "regression", "feature"]
----------------------
Any["algorithm", "learning", "probability", "error", "distribution", "set", "number", "regret", "case", "given"]
----------------------
Any["graph", "algorithm", "nodes", "set", "node", "clustering", "data", "model", "pages", "algorithms"]
----------------------
Any["network", "neural", "networks", "training", "learning", "input", "system", "output", "units", "one"]
----------------------
Any["model", "inference", "distribution", "models", "latent", "variational", "log", "posterior", "data", "gaussian"]
----------------------
Any["matrix", "linear", "random", "kernel", "algorithm", "error", "approximation", "problem", "points", "number"]
----------------------
Any["algorithm", "optimization", "theorem", "function", 

In [63]:
function addDoc(self, corpus, docs, topic)
    if typeof(docs) == Int64
        docs = [docs]
    end
    for doc_idx in docs
        param = copy(self.topicPolya[doc_idx].dir.alpha)
        difference = maximum(self.topicPolya[doc_idx].n) - self.topicPolya[doc_idx].n[topic]
        param[topic] = self.topicPolya[doc_idx].dir.alpha[topic] + difference
        for w in enumerate(corpus.documents[doc_idx])
            if self.Samples[doc_idx][w[1]]!=topic
                TopicModels.removeSample(self, doc_idx, w[1], self.Samples[doc_idx][w[1]])
                self.Samples[doc_idx][w[1]] = 0
            end
        end
        self.topicPolya[doc_idx].dir = TopicModels.Dirichlet(param)    
    end          
end

addDoc (generic function with 1 method)

In [64]:
println(sort(topic_distributions[1]))

[1, 3, 10, 14, 16, 17, 18, 22, 34, 38, 39, 45, 47, 49, 50, 52, 55, 81, 86, 88, 92, 121, 128, 148, 158, 161, 165, 166, 168, 179, 180, 190, 191, 194, 199, 201, 205, 211, 218, 223, 246, 272, 282, 294, 306, 314, 320, 321, 348, 354, 368, 369, 372, 377, 378, 390, 402, 412, 427, 433, 441, 442, 471, 482, 487, 498, 502, 510, 516, 517, 534, 540, 541, 554, 556, 557, 570, 587, 589, 592, 596, 597, 600, 603, 627, 629, 645, 652, 658, 666, 668, 671, 678, 681, 682, 691, 697, 698, 703, 855]


In [65]:
lda.topicPolya[5]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.528132496660802), [5, 504, 549, 156, 14, 55, 1033, 521, 0, 1], 2838)

In [66]:
addDoc(lda, corpus, 5, 1);

In [68]:
TopicModels.gibbsSampling(lda, corpus.documents, 20);

In [72]:
lda.topicPolya[5]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [1028.3787437412886, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 1032.5281324966606), [662, 270, 438, 119, 3, 0, 784, 558, 3, 1], 2838)

In [73]:
topic_distributions = TopicModels.sortedTopDocsForTopics(lda, corpus);

In [74]:
println(sort(topic_distributions[1]))

[1, 3, 5, 10, 14, 16, 17, 18, 22, 34, 38, 39, 40, 45, 47, 49, 50, 52, 55, 67, 81, 86, 88, 92, 95, 119, 143, 148, 158, 165, 166, 168, 179, 188, 190, 191, 194, 199, 211, 223, 246, 256, 272, 282, 306, 314, 321, 334, 348, 354, 368, 369, 372, 377, 378, 402, 412, 427, 433, 441, 442, 475, 482, 487, 498, 501, 502, 510, 515, 516, 517, 534, 540, 541, 543, 552, 554, 556, 557, 570, 587, 589, 592, 596, 597, 600, 621, 629, 642, 652, 658, 666, 668, 671, 678, 681, 682, 691, 697, 698]


In [75]:
println(sort(topic_distributions[2]))

[8, 13, 19, 21, 22, 29, 32, 36, 39, 44, 47, 54, 58, 63, 74, 80, 82, 100, 102, 110, 112, 113, 123, 128, 129, 137, 139, 169, 172, 193, 197, 206, 219, 226, 234, 235, 243, 256, 268, 271, 294, 303, 304, 311, 323, 326, 344, 355, 372, 381, 390, 391, 393, 401, 408, 431, 438, 442, 445, 449, 460, 477, 480, 517, 530, 535, 543, 547, 548, 555, 558, 561, 563, 571, 575, 584, 595, 624, 643, 649, 665, 666, 667, 684, 687, 695, 703, 717, 752, 783, 794, 841, 904, 923, 936, 954, 955, 971, 986, 989]


In [77]:
lda.topicPolya[9]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.528132496660802), [84, 48, 119, 125, 660, 3, 20, 5, 61, 1], 1126)

In [78]:
addDoc(lda, corpus, 9, 2);

In [80]:
TopicModels.gibbsSampling(lda, corpus.documents, 20);

In [81]:
lda.topicPolya[9]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 612.5164471077485, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 616.5281324966609), [52, 361, 41, 83, 531, 3, 13, 4, 38, 0], 1126)

In [83]:
topic_distributions = TopicModels.sortedTopDocsForTopics(lda, corpus);

In [84]:
println(sort(topic_distributions[2]))

[8, 9, 13, 19, 21, 22, 29, 32, 36, 39, 44, 47, 54, 58, 63, 74, 80, 82, 100, 102, 110, 112, 113, 123, 128, 129, 137, 139, 169, 172, 193, 206, 219, 226, 234, 235, 243, 256, 268, 271, 276, 294, 303, 304, 311, 320, 323, 326, 344, 355, 372, 381, 389, 390, 391, 393, 401, 408, 422, 428, 431, 438, 442, 445, 449, 460, 471, 480, 485, 504, 517, 535, 543, 547, 548, 553, 555, 558, 561, 563, 571, 584, 595, 624, 643, 649, 665, 666, 667, 684, 687, 695, 703, 728, 783, 841, 904, 936, 955, 989]


In [87]:
println(sort(topic_distributions[5]))

[64, 66, 109, 131, 153, 253, 383, 407, 418, 429, 462, 484, 507, 594, 616, 639, 661, 683, 714, 716, 724, 729, 731, 734, 735, 737, 741, 742, 743, 744, 746, 748, 754, 755, 760, 761, 764, 766, 771, 776, 778, 779, 781, 782, 786, 791, 793, 800, 802, 807, 808, 809, 813, 814, 815, 817, 819, 821, 822, 825, 828, 831, 838, 839, 844, 858, 864, 866, 867, 874, 877, 878, 880, 881, 885, 888, 889, 905, 907, 913, 917, 918, 922, 927, 933, 935, 941, 944, 949, 963, 965, 966, 968, 973, 975, 980, 984, 987, 988, 992]


In [89]:
lda.topicPolya[67]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 0.654889287463685, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 4.528132496660802), [1160, 605, 482, 1, 92, 484, 22, 275, 0, 35], 3156)

In [90]:
addDoc(lda, corpus, 67, 5);

In [92]:
TopicModels.gibbsSampling(lda, corpus.documents, 20);

In [93]:
lda.topicPolya[67]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.37874374128870947, 0.5164471077484565, 0.6415088998692902, 0.35160601036829925, 1068.6548892874637, 0.36340530184332687, 0.5354276647937665, 0.44829370150100706, 0.37649809841356136, 0.2613126833707], 1072.5281324966609), [703, 452, 377, 0, 1059, 281, 20, 249, 0, 15], 3156)

In [94]:
topic_distributions = TopicModels.sortedTopDocsForTopics(lda, corpus);
println(sort(topic_distributions[5]))

[64, 66, 67, 109, 153, 253, 383, 407, 418, 429, 462, 484, 507, 594, 616, 639, 661, 683, 712, 714, 724, 729, 731, 734, 735, 737, 741, 742, 743, 744, 746, 748, 754, 760, 764, 766, 771, 776, 778, 779, 781, 782, 786, 791, 793, 800, 802, 804, 807, 808, 809, 813, 814, 815, 817, 819, 821, 822, 825, 828, 831, 838, 839, 844, 854, 858, 864, 866, 867, 874, 877, 878, 880, 881, 885, 888, 889, 896, 905, 907, 913, 917, 918, 922, 927, 935, 941, 944, 949, 957, 963, 965, 966, 968, 973, 975, 984, 987, 988, 992]


# Save LDA object

In [229]:
lda_dict = Dict()
lda_dict["numIteration"] = lda.numIteration
lda_dict["M"] = lda.M
lda_dict["topicDir_param"] = lda.topicDir.alpha
lda_dict["wordPolya_n"] = [i.n for i in lda.wordPolya]
lda_dict["X"] = lda.X
lda_dict["topicPolya_n"] = [i.n for i in lda.topicPolya]
lda_dict["Samples"] = lda.Samples
lda_json_string = JSON.json(lda_dict)

open("lda_obj.json","w") do f 
    write(f, lda_json_string) 
end

18214193

In [None]:
lda1_raw = JSON.parsefile("lda_obj.json");
numIteration = lda1_raw["numIteration"]
M = lda1_raw["M"]
topicDir = TopicModels.Dirichlet(lda1_raw["topicDir_param"])
wordPolya = [TopicModels.Polya(TopicModels.Dirichlet(length(i), 0.01), i) for i in lda1_raw["wordPolya_n"]]
X = lda1_raw["X"]
topicPolya = [TopicModels.Polya(topicDir, i) for i in lda1_raw["topicPolya_n"]]
Samples = lda1_raw["Samples"]
lda_obj = TopicModels.LDA(numIteration, M, topicDir, wordPolya, X, topicPolya, Samples);

# Topic Coherence, add and remove word refinement

In [6]:
TopicModels.topic_coherence(corpus, topics, true)

-2.7308979585130184

In [7]:
TopicModels.apply_refinement(lda, corpus, "remove", "model", 1);

In [8]:
topics, proportions = TopicModels.show_topics(corpus, lda, 10);

Any["variable", "causal", "data", "group", "test", "individual", "distribution", "information", "outcome", "hypothesis"]
[0.01333, 0.00915, 0.00838, 0.00817, 0.00716, 0.00677, 0.00609, 0.00606, 0.00596, 0.00581]
----------------------
Any["algorithm", "time", "problem", "one", "number", "also", "result", "value", "given", "two"]
[0.03303, 0.01766, 0.01364, 0.01102, 0.01081, 0.00674, 0.00599, 0.00536, 0.00536, 0.00493]
----------------------
Any["image", "network", "model", "convolutional", "training", "adversarial", "learning", "deep", "sample", "loss"]
[0.02886, 0.017, 0.00936, 0.00924, 0.00892, 0.00814, 0.00783, 0.0075, 0.00735, 0.00707]
----------------------
Any["error", "memory", "noise", "data", "distributed", "block", "bit", "vector", "performance", "code"]
[0.01721, 0.01412, 0.01332, 0.00967, 0.00902, 0.00875, 0.00864, 0.00794, 0.00773, 0.00715]
----------------------
Any["network", "unit", "neural", "input", "weight", "output", "learning", "hidden", "training", "state"]
[0.058

In [9]:
TopicModels.topic_coherence(corpus, topics, true)

-2.6578622641487515

In [None]:
TopicModels.apply_refinement(corpus, lda, "add", "node", 2);

In [None]:
TopicModels.show_topics(corpus, lda, 10);

## Find the top topic for each doc

In [None]:
function topic_of_each_doc(lda, corpus)
    top_topic_for_each_doc = []
    for i in 1:corpus.document_size
        top_val = 0
        top_topic = 0
        for j in 1:lda.M
            v = TopicModels.lda_topicPredict(i, j, lda)
            if v>=top_val
                top_val = v
                top_topic = j
            end
        end
        push!(top_topic_for_each_doc, top_topic)
    end
    return top_topic_for_each_doc
end
#top_topic_for_each_doc = topic_of_each_doc(corpus, lda);

In [37]:
top_topic_for_each_doc

999-element Array{Any,1}:
  2
  6
  2
  1
  6
  4
  9
  1
  6
  5
  1
  5
  5
  ⋮
  9
  6
 10
  6
 10
 10
 10
 10
  6
 10
 10
  5

In [38]:
function view_top_docs(top_topic_for_each_doc, document_file, topic)
    papers = CSV.read(document_file, DataFrame);
    titles = papers.title
    titles = titles[length(titles)-corpus.document_size+1:length(titles)]
    for (idx, t) in enumerate(top_topic_for_each_doc)
        if t == topic
            #println(idx, " " , titles[idx])
            print(idx, ", ")
        end
    end
end

view_top_docs (generic function with 1 method)

In [32]:
using CSV, DataFrames
view_top_docs(top_topic_for_each_doc, "papers.csv", 1)

14, 15, 24, 51, 64, 70, 76, 79, 

# HLTM remove document implementation

In [13]:
corpus, no_lemma_docs = TopicModels.preprocess("papers.csv", 20);

In [14]:
lda = TopicModels.train(corpus, 3);

In [15]:
topics, proportions = TopicModels.show_topics(corpus, lda, 5);

Any["network", "input", "networks", "training", "learning"]
[0.02076, 0.01477, 0.01446, 0.01322, 0.01085]
----------------------
Any["neural", "algorithm", "data", "case", "weight"]
[0.01855, 0.01428, 0.01418, 0.01023, 0.00927]
----------------------
Any["state", "model", "learning", "models", "number"]
[0.01429, 0.01354, 0.01193, 0.01107, 0.00774]
----------------------


In [16]:
lda.wordPolya

3-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [7, 36, 8, 10, 14, 128, 17, 7, 0, 0  …  1, 0, 0, 0, 0, 2, 0, 0, 0, 1], 9628)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 13, 0, 0, 0, 7  …  0, 0, 0, 0, 0, 0, 1, 0, 0, 0], 9330)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 0, 0, 0, 6, 0  …  0, 1, 1, 2, 2, 0, 0, 1, 1, 0], 9252)

In [17]:
lda.topicPolya

20-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [720, 398, 334], 1452)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [111, 370, 1065], 1546)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [1325, 271, 214], 1810)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [963, 138, 561], 1662)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [102, 250, 128], 480)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [559, 274, 422], 1255)
 TopicModels.Polya(3, TopicModels.Dirichle

In [18]:
lda.topicDir

TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845)

In [19]:
lda.Samples

20-element Array{Any,1}:
 Any[1, 1, 1, 1, 1, 1, 1, 1, 3, 2  …  1, 1, 3, 1, 1, 3, 3, 3, 1, 2]
 Any[3, 3, 3, 3, 3, 2, 3, 3, 3, 3  …  2, 2, 3, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 1, 1, 2, 1, 1, 1, 1, 1, 1  …  3, 1, 1, 1, 1, 1, 1, 1, 3, 1]
 Any[1, 1, 3, 1, 1, 1, 1, 2, 1, 2  …  3, 3, 1, 1, 2, 1, 3, 1, 1, 3]
 Any[2, 2, 3, 2, 2, 2, 3, 3, 1, 2  …  1, 1, 3, 3, 1, 2, 3, 3, 2, 1]
 Any[1, 3, 2, 1, 2, 1, 1, 1, 3, 1  …  2, 3, 1, 3, 3, 2, 3, 3, 2, 3]
 Any[1, 1, 1, 1, 3, 1, 3, 3, 2, 1  …  3, 1, 3, 1, 3, 1, 1, 1, 3, 3]
 Any[1, 1, 1, 2, 2, 2, 2, 2, 1, 1  …  1, 2, 2, 2, 1, 1, 2, 1, 2, 1]
 Any[2, 2, 2, 3, 2, 3, 2, 3, 2, 2  …  2, 3, 1, 2, 3, 1, 1, 3, 2, 2]
 Any[2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  1, 1, 3, 2, 3, 3, 2, 3, 2, 2]
 Any[2, 1, 2, 2, 1, 1, 2, 3, 1, 3  …  1, 1, 1, 1, 3, 1, 2, 2, 1, 2]
 Any[2, 2, 2, 3, 2, 1, 2, 3, 2, 2  …  2, 3, 2, 2, 3, 1, 3, 2, 3, 3]
 Any[3, 3, 3, 2, 3, 3, 3, 1, 3, 1  …  2, 2, 1, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 3, 1, 1, 1, 1, 1, 3, 1, 1  …  1, 1, 3, 1, 3, 3, 1, 1, 1, 1]
 Any[3, 3, 3, 3, 1, 3, 

In [20]:
TopicModels.apply_refinement(lda, corpus, "R_D", 1, 1);

In [21]:
lda.wordPolya

3-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 39, 0, 0, 0, 0  …  1, 0, 0, 0, 2, 2, 0, 0, 1, 1], 8005)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 102, 0, 0, 0, 5  …  0, 0, 1, 0, 0, 0, 0, 1, 0, 0], 9991)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [7, 36, 8, 10, 14, 0, 17, 7, 6, 2  …  0, 1, 0, 2, 0, 0, 1, 0, 0, 0], 10214)

In [22]:
lda.topicPolya

20-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [1.0e-7, 0.6433493639156528, 0.6922273273374719], 1.3355767912531247), [0, 647, 805], 1452)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [48, 355, 1143], 1546)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [1291, 264, 255], 1810)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [944, 131, 587], 1662)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [78, 235, 167], 480)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [482, 283, 490], 1255)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.48669688

In [23]:
lda.Samples

20-element Array{Any,1}:
 Any[3, 3, 3, 3, 3, 2, 3, 3, 3, 2  …  3, 2, 2, 2, 2, 3, 3, 2, 2, 2]
 Any[3, 2, 3, 3, 3, 3, 3, 3, 3, 3  …  2, 2, 3, 3, 2, 3, 3, 2, 3, 3]
 Any[1, 1, 1, 2, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 3, 2, 3, 3]
 Any[2, 1, 1, 1, 1, 3, 1, 1, 1, 3  …  3, 3, 1, 1, 2, 1, 1, 1, 1, 3]
 Any[2, 2, 1, 2, 2, 2, 3, 2, 3, 2  …  1, 1, 3, 3, 1, 2, 3, 3, 2, 2]
 Any[1, 3, 2, 1, 2, 3, 3, 1, 3, 3  …  2, 3, 3, 3, 3, 2, 3, 3, 2, 1]
 Any[1, 1, 1, 1, 3, 1, 3, 3, 2, 3  …  1, 2, 3, 1, 3, 1, 1, 1, 3, 3]
 Any[1, 1, 2, 2, 1, 2, 2, 2, 1, 1  …  3, 2, 1, 2, 1, 1, 2, 1, 2, 1]
 Any[2, 2, 2, 2, 2, 1, 2, 3, 2, 2  …  2, 2, 3, 2, 3, 1, 3, 3, 2, 2]
 Any[2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  1, 2, 3, 2, 3, 3, 2, 1, 2, 2]
 Any[1, 1, 2, 2, 1, 1, 2, 1, 1, 1  …  2, 1, 1, 2, 3, 1, 2, 2, 1, 1]
 Any[2, 2, 2, 2, 2, 1, 2, 3, 2, 2  …  2, 3, 2, 2, 3, 2, 3, 2, 3, 2]
 Any[3, 3, 3, 2, 3, 3, 3, 2, 3, 2  …  2, 2, 3, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 3, 3, 1, 1, 1, 1, 3, 1, 1  …  1, 1, 3, 1, 1, 3, 1, 1, 1, 1]
 Any[3, 3, 3, 3, 3, 3, 

# Automatic Topic Labeling Experiment

In [1]:
using Pkg
using Distances
using StatsBase
using DataStructures
using ProgressMeter
Pkg.activate("TopicModels")
import TopicModels
#using PyCall

#corpora = pyimport("gensim.corpora")
#ch = pyimport("gensim.models.coherencemodel")

[32m[1m Activating[22m[39m environment at `~/Documents/Study/Git_Personal/julia_bayes/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


In [2]:
corpus, no_lemma_docs = TopicModels.preprocess("papers.csv", 100);

In [3]:
candidate_label_distribution = TopicModels.train_phrase_model(no_lemma_docs, corpus.vocabulary, 10, true);

In [4]:
lda = TopicModels.train(corpus, 10);

In [5]:
topics, proportions = TopicModels.show_topics(corpus, lda, corpus.vocab_count, false);

In [6]:
for t in topics
    println(t[1:10])
end

Any["input", "figure", "current", "motion", "direction", "analog", "single", "voltage", "synapses", "layer"]
Any["state", "learning", "algorithm", "states", "probability", "function", "convergence", "value", "algorithms", "process"]
Any["network", "neural", "units", "networks", "output", "training", "control", "figure", "architecture", "model"]
Any["model", "system", "two", "field", "one", "visual", "weights", "time", "first", "figure"]
Any["recognition", "training", "distance", "input", "information", "used", "pattern", "figure", "performance", "test"]
Any["data", "algorithm", "problem", "set", "models", "new", "given", "mixture", "classification", "points"]
Any["image", "network", "networks", "mlp", "features", "road", "images", "model", "objects", "layer"]
Any["learning", "space", "generalization", "case", "training", "distribution", "examples", "sequence", "error", "matrix"]
Any["memory", "neurons", "learning", "computational", "model", "activation", "tasks", "neuron", "synaptic", 

In [7]:
#New code
top_3_bi_tri_labels_only = TopicModels.label_ranking(candidate_label_distribution, topics, proportions)

[32mComputing...100%|███████████████████████████████████████| Time: 0:44:04[39m


Dict{Any,Any} with 10 entries:
  7  => Array{Any,1}[["hidden_layer", 2.78953, [0.00218779, 0.0116524, 0.010924…
  4  => Array{Any,1}[["can_be", 2.56588, [0.00419746, 0.00339036, 0.00328274, 0…
  9  => Array{Any,1}[["it_is", 2.80177, [0.00166972, 0.000501499, 0.00517437, 0…
  10 => Array{Any,1}[["can_be", 1.9869, [0.000646218, 0.00269087, 0.00172235, 0…
  2  => Array{Any,1}[["can_be", 2.29343, [0.00258326, 0.00559643, 0.0037132, 0.…
  3  => Array{Any,1}[["neural_network", 2.24291, [0.00704973, 0.00472876, 0.001…
  5  => Array{Any,1}[["training_data", 2.66596, [0.00135389, 0.00968897, 0.0006…
  8  => Array{Any,1}[["can_be", 2.3162, [0.00559643, 0.00172235, 0.00139951, 0.…
  6  => Array{Any,1}[["can_be", 2.27858, [0.00382081, 0.0037132, 0.00349797, 0.…
  1  => Array{Any,1}[["has_been", 2.75192, [0.0031105, 0.000865275, 0.00328321,…

In [8]:
# For comparing all words of topic distributions with base_count_kl = 0.01
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_bi_tri_labels_only[pos]])
    #println(i[1:10]) 
end

["has_been", "can_be", "such_as"]
["can_be", "value_function", "note_that"]
["neural_network", "error_between", "after_training"]
["can_be", "is_not", "has_been"]
["training_data", "this_is", "can_be"]
["can_be", "it_is", "em_algorithm"]
["hidden_layer", "video_camera", "input_features"]
["can_be", "we_have", "it_is"]
["it_is", "such_as", "biological_neural"]
["can_be", "it_is", "this_is"]


## For comparing top n words of topic distributions

In [25]:
# For comparing top 1000 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["procedureoptimal", "steadily", "glennygedacuk"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["validityinterval", "breastcancerwisc", "uel"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "irrespective", "distort"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["qlg", "mbl", "vgo"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "limonenelinalool"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["jointangle", "cameradriven", "orderly"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["gos", "ilx", "bkkx"]
Any["data", "problem", "given", "one", "function", "variance", "paper", "

In [19]:
# For comparing top 100 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["openversusclosed", "procedureoptimal", "lksaulopsychemitedu"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["chowdhury", "afscsprojectconnectbench", "validityinterval"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "architecuture", "neuristique"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["buccleuch", "qlg", "mbl"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "norberto"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["systvol", "bartlett", "rethworcementlearmng"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["nijmegen", "psycholinguistik", "aonori"]
Any["data", "pro

In [16]:
# For comparing top 50 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["procedureoptimal", "openversusclosed", "glennygedacuk"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["validityinterval", "chowdhury", "afscsprojectconnectbench"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "architecuture", "neuristique"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["qlg", "buccleuch", "ioeqo"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "norberto"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["cameradriven", "sloppy", "recognizable"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["gos", "nijmegen", "psycholinguistik"]
Any["data", "problem", "give

## Word count in topic labels

In [83]:
sort(collect(labels_freq["coronary_artery_bypass"]), by=x->x[2], rev=true)

278-element Array{Pair{Any,Any},1}:
           "mlp" => 21
          "risk" => 15
    "confidence" => 14
     "bootstrap" => 12
           "vol" => 11
       "surgery" => 9
    "operations" => 8
     "mortality" => 8
      "networks" => 8
       "failure" => 7
 "complications" => 7
       "history" => 7
         "renal" => 7
                 ⋮
       "factors" => 1
  "interactions" => 1
       "strokes" => 1
     "committee" => 1
        "number" => 1
      "maintain" => 1
         "block" => 1
       "reflect" => 1
       "summary" => 1
       "average" => 1
      "spending" => 1
   "performance" => 1

In [42]:
labels_freq

Dict{Any,Any} with 15739 entries:
  "oblique"     => Dict{Any,Any}("points"=>5,"jlkl"=>2,"translation"=>2,"sekule…
  "dev"         => Dict{Any,Any}("fpxsftvg"=>1,"fprr"=>1,"neurophysiol"=>1,"res…
  "cambrdge"    => Dict{Any,Any}("proceedings"=>1,"learningfrom"=>1,"tesauro"=>…
  "yjk"         => Dict{Any,Any}("corresponding"=>1,"rotation"=>1,"points"=>2,"…
  "null"        => Dict{Any,Any}("regions"=>2,"uext"=>1,"oblique"=>1,"excitatio…
  "inflowbased" => Dict{Any,Any}("choices"=>1,"accommodate"=>1,"optimal"=>1,"in…
  "ztt"         => Dict{Any,Any}("qlearning"=>1,"presented"=>2,"directly"=>1,"m…
  "iaan"        => Dict{Any,Any}("want"=>2,"elog"=>2,"emax"=>2,"holds"=>3,"fini…
  "subfeature"  => Dict{Any,Any}("tasks"=>1,"due"=>1,"indicate"=>1,"relevant"=>…
  "rises"       => Dict{Any,Any}("constant"=>3,"functional"=>1,"los"=>2,"equal"…
  "hampshire"   => Dict{Any,Any}("constant"=>1,"action"=>1,"equal"=>1,"data"=>1…
  "dzfk"        => Dict{Any,Any}("fpxsftvg"=>1,"dem"=>1,"fprr"=>1,"kxt"=>1,

In [84]:
corpus.vocabulary

Dict{Any,Any} with 14707 entries:
  "oblique"     => 7247
  "dev"         => 16
  "cambrdge"    => 3145
  "yjk"         => 7246
  "null"        => 1728
  "inflowbased" => 1831
  "ztt"         => 4061
  "iaan"        => 6949
  "subfeature"  => 10738
  "rises"       => 6074
  "hampshire"   => 11903
  "dzfk"        => 8
  "vapnik"      => 3387
  "progression" => 8512
  "neumann"     => 5360
  "fram"        => 10918
  "gathered"    => 7052
  "eeitutery"   => 7290
  "arborize"    => 10152
  "november"    => 6778
  "stress"      => 5963
  "zqm"         => 8291
  "rectified"   => 6644
  "obey"        => 2264
  "methods"     => 818
  ⋮             => ⋮

In [65]:
s = 0
for (phrase, score) in phrase_model_bigram.find_phrases(bigrams_docs)
    println(phrase)
    s+=1
end
println(s)

input_output
coronary_artery_bypass
neural_information_processing_systems
i_i_i
using_neural_networks
as_shown_in_figure
m__m
can_be_obtained
networks_of_spiking_neurons
we_assume_that
i__i_
this_paper_we
this_can_be
can_be_derived
learning_from_examples
it_is_possible
editors_advances_in_neural_information
i_i_i_i
cambridge_ma
office_of_naval_research
city_block_length
can_be_viewed_as
radial_basis_function
where_n
teacher_space_entropy
we_then
output_is
is_defined_as
that_can_be
we_find_that
et_al_eds
san_mateo_ca
can_be_used
modified_actorcritic_algorithm
hierarchical_mixtures_of_experts
which_can_be
training_algorithms
it_is_not
et_al
all_other
processing_systems_san_mateo
network_is
processing_systems_morgan_kaufmann
will_not_be
m_is
pittsburgh_pa
where_is
elastic_input_field
eds_advances_in_neural_information
it_can_be_shown
l_d
it_does_not
can_be_found
here_is
carnegie_mellon_university
way_that
it_may_be
information_processing_systems
terrence_j_sejnowski
there_is_no
receiver_o

## Part of Speech related random commands

In [35]:
a= nltk.pos_tag(docs[7143])

17802-element Array{Tuple{String,String},1}:
 ("A", "DT")
 ("n", "JJ")
 (" ", "NN")
 ("A", "NNP")
 ("l", "NN")
 ("t", "NN")
 ("e", "NN")
 ("r", "NN")
 ("n", "IN")
 ("a", "DT")
 ("t", "NN")
 ("i", "NN")
 ("v", "VBP")
 ⋮
 ("p", "JJ")
 ("u", "JJ")
 ("t", "NN")
 ("a", "DT")
 ("t", "NN")
 ("i", "NN")
 ("o", "VBP")
 ("n", "NN")
 (".", ".")
 ("\n", "CC")
 ("\n", "JJ")
 ("\f", "NN")

In [44]:
a[1][1]

"A"

In [3]:
for i in a
end

LoadError: UndefVarError: a not defined

In [38]:
using PyCall
nltk = pyimport("nltk")

PyObject <module 'nltk' from '/Users/khan/.julia/conda/3/lib/python3.8/site-packages/nltk/__init__.py'>

In [7]:
a = nltk.stem.WordNetLemmatizer()

PyObject <WordNetLemmatizer>

In [9]:
a.lemmatize("Word", pos="")

"Word"

In [8]:
using DataStructures
wn = nltk.corpus.wordnet
t_map = Dict('J' => wn.ADJ,'V' => wn.VERB,'R' => wn.ADV)

tag_map = DefaultDict(wn.NOUN, t_map)

LoadError: UndefVarError: nltk not defined

In [33]:
tag_map['Z']

"n"

In [9]:
dd = DefaultDict(1) 
d = Dict('a'=>1, 'b'=>2)

Dict{Char,Int64} with 2 entries:
  'a' => 1
  'b' => 2

In [10]:
dd = DefaultDict(0, d)

DefaultDict{Char,Int64,Int64} with 2 entries:
  'a' => 1
  'b' => 2

In [15]:
dd['A']

0