In [1]:
using Pkg
using Distances
using StatsBase
using DataStructures
using ProgressMeter
using JSON
Pkg.activate("TopicModels")
import TopicModels
using PyCall
using PyPlot

using LSHFunctions, LinearAlgebra, BenchmarkTools
#corpora = pyimport("gensim.corpora")
#ch = pyimport("gensim.models.coherencemodel")

[32m[1m Activating[22m[39m environment at `~/Documents/Study/Git_Personal/julia_bayes/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


In [2]:
#nn = pyimport("torch.nn")
np = pyimport("numpy")
torch = pyimport("torch")
#plt = pyimport("matplotlib.pyplot")

PyObject <module 'torch' from '/Users/khan/.julia/conda/3/lib/python3.8/site-packages/torch/__init__.py'>

## Sample Run on Dummy Data

In [2]:
corpus = TopicModels.readData("news-en.txt");

In [3]:
wordPrior = TopicModels.Dirichlet(12, 0.01)
M = 3
alpha = [0.01 for i in 1:M];
topicPrior = TopicModels.Dirichlet(alpha);

In [4]:
lda = TopicModels.LDA(topicPrior, wordPrior);

In [5]:
samples = TopicModels.lda_sample(corpus.documents, lda);

In [6]:
# Top 4 words of one topic (presumably topic 3), along with their vocabulary proportions
words, proportions = TopicModels.lda_topicN(3, 4, corpus, lda);

In [7]:
println(words)
println(proportions)

Any["medal", "runner", "era", "culture"]
[0.39258, 0.19727, 0.19727, 0.19727]


# Human-in-the-Loop Topic Modeling with APIs

In [3]:
corpus, no_lemma_docs = TopicModels.preprocess("papers.csv", 1000); #took 1:35 minutes

In [151]:
#load lda object from saved json
lda = TopicModels.loadLDA("lda_obj.json"); #only takes few seconds 

# Or train a new LDA model with the command below (uncomment to run).
#lda = TopicModels.train(corpus, 10);


In [152]:
topics, proportions = TopicModels.show_topics(lda, corpus, 10);

Any["data", "kernel", "features", "method", "approach", "methods", "point", "space", "points", "learning"]
----------------------
Any["model", "system", "figure", "neural", "network", "time", "neurons", "input", "activity", "units"]
----------------------
Any["model", "graph", "set", "algorithm", "nodes", "node", "data", "models", "time", "number"]
----------------------
Any["learning", "state", "policy", "reward", "action", "reinforcement", "time", "function", "value", "states"]
----------------------
Any["learning", "image", "training", "images", "data", "loss", "adversarial", "model", "generative", "samples"]
----------------------
Any["matrix", "algorithm", "problem", "clustering", "algorithms", "approximation", "time", "let", "pages", "one"]
----------------------
Any["model", "models", "deep", "inference", "neural", "training", "variational", "parameters", "learning", "gradient"]
----------------------
Any["network", "networks", "neural", "training", "input", "layer", "output", "

## Top documents per topic: an equal number for each topic (number of docs / number of topics)

In [6]:
topic_distributions = TopicModels.sortedTopDocsForTopics(lda, corpus);

In [7]:
println(topic_distributions[1])

[653, 569, 452, 236, 376, 112, 667, 66, 869, 728, 129, 937, 796, 631, 768, 252, 393, 2, 382, 432, 501, 356, 102, 885, 219, 873, 816, 684, 89, 775, 960, 560, 235, 565, 265, 449, 719, 93, 469, 54, 880, 444, 44, 261, 607, 74, 455, 594, 480, 325, 714, 699, 961, 687, 212, 800, 936, 619, 371, 792, 979, 904, 754, 923, 977, 804, 133, 695, 257, 237, 360, 830, 324, 932, 964, 504, 652, 929, 266, 693, 819, 420, 680, 80, 144, 916, 899, 147, 919, 322, 605, 628, 561, 123, 724, 951, 90, 515, 301, 945]


In [8]:
fileKPDoc = "kp/simplified_last_1000_keyphrases_clean.json"
fileEMB = "kp/1000_docs_keyphrase_embedding.json"
fileSim = "kp/1000_docs_kp_similarity_python.json"
fileKP = "kp/keyphrases_from_python.json"
kp = TopicModels.load_keyphrase(fileKPDoc, fileEMB, fileSim, fileKP)
# Took 0:54 minutes

Total keyphrases are: 1274
Unique keyphrases are: 1096


In [10]:
all_keyphrases_1, documentwise_keyphrases_ls_1 = TopicModels.top_keyphrases_of_topic(kp, topic_distributions, 1);


Total keyphrases are: 1274
Unique keyphrases are: 1096


In [40]:
all_keyphrases_1_ls = [i.first for i in all_keyphrases_1];

In [187]:
"""
    keyphrase_cluster(kp, topic_doc_kp, sim_threshold=0.85, max_kp_count=5)

Greedily group the keyphrases of one topic into at most `max_kp_count` clusters.

The keyphrases are visited in the order given by `topic_doc_kp`. Every keyphrase
that has not yet been placed in a cluster seeds a new cluster containing all
keyphrases of the same topic whose similarity to the seed is strictly greater
than `sim_threshold`.

# Arguments
- `kp`: object with `keyphraseSimilarity` (indexable by keyphrase, yielding a
  collection of similarity scores) and `keyphrasesOnly` (indexable by the
  positions of those scores).
- `topic_doc_kp`: iterable of items with a `.first` field holding a keyphrase
  (e.g. `keyphrase => count` pairs).
- `sim_threshold`: similarity a candidate must exceed to join a cluster.
- `max_kp_count`: maximum number of clusters to build; values `<= 0` yield an
  empty result.

# Returns
A vector of one-entry `Dict`s, each mapping a seed keyphrase to the vector of
keyphrases clustered with it.
"""
function keyphrase_cluster(kp, topic_doc_kp, sim_threshold=0.85, max_kp_count=5)
    kp_topic_ls = [entry.first for entry in topic_doc_kp]
    # Sets give constant-time membership tests instead of scanning vectors.
    topic_kps = Set(kp_topic_ls)
    clustered = Set()
    cluster_kp = []
    for seed in kp_topic_ls
        # Check the limit before building a cluster so that a limit of 0
        # really produces no clusters.
        length(cluster_kp) >= max_kp_count && break
        seed in clustered && continue
        members = []
        for j in findall(>(sim_threshold), kp.keyphraseSimilarity[seed])
            candidate = kp.keyphrasesOnly[j]
            if candidate in topic_kps
                push!(members, candidate)
                push!(clustered, candidate)
            end
        end
        push!(cluster_kp, Dict{Any,Any}(seed => members))
    end
    return cluster_kp
end

keyphrase_cluster (generic function with 3 methods)

In [169]:
cluster_kp_1 = keyphrase_cluster(kp, all_keyphrases_1);

In [82]:
# Ad-hoc, top-level version of the keyphrase clustering for topic 1.
# Each keyphrase that is not yet in a cluster seeds a one-entry Dict mapping it
# to the keyphrases of `all_keyphrases_1_ls` whose similarity to the seed is
# greater than `sim_threshold`.
# `cluster_kp` collects the cluster Dicts, `all_cluster_kps` remembers every
# keyphrase already placed in a cluster, and `kp_c` counts the clusters built.
cluster_kp = []
all_cluster_kps = []
kp_c = 0
sim_threshold = 0.85
for (idx, i) in enumerate(all_keyphrases_1_ls)
    if !(i in all_cluster_kps)
        temp_kp_ls = kp.keyphraseSimilarity[i]
        temp_dict = Dict()
        temp_dict[i] = []
        # Positions whose similarity exceeds the threshold; each position is
        # used to look up the corresponding keyphrase in `kp.keyphrasesOnly`.
        for j in findall(x->x==1, ifelse.(temp_kp_ls.>sim_threshold, true, false))
            # Keep only candidates that belong to this topic's keyphrase list.
            if kp.keyphrasesOnly[j] in all_keyphrases_1_ls
                push!(temp_dict[i], kp.keyphrasesOnly[j])
                push!(all_cluster_kps, kp.keyphrasesOnly[j])
            end
        end
        push!(cluster_kp, temp_dict)
        kp_c+=1
        #print(i)
    end
    # NOTE(review): `kp_c` starts at 0 and the loop stops once it exceeds 5, so
    # up to six clusters are built here, whereas `keyphrase_cluster` with its
    # default `max_kp_count=5` builds at most five -- confirm which is intended.
    if kp_c>5
        break
    end
        
end

In [177]:
print(cluster_kp[4])

Dict{Any,Any}("gaussian processes" => Any["gaussian processes", "gaussian process", "stochastic processes", "gaussian process regression", "nonlinearity", "of gaussian processes", "mixtures of gaussian distributions", "stochastic systems", "latent variable multiple output gaussian processes", "gaussian mixture model", "nondegenerate gaussian processes"])

In [186]:
print(cluster_kp_1[3])

Dict{Any,Any}("kernel methods" => Any["kernel methods", "functional algorithm", "kernel regression", "kernel machines", "kernel approximation", "kernel x free methods", "kernel functions", "kernel dimension reduction"])

In [153]:
# For every cluster in `cluster_kp`, collect the ids of the topic-1 documents
# that contain at least one of the cluster's keyphrases.
# `docs_have[c]` ends up holding the document ids for the c-th cluster.
docs_have = []
for i in cluster_kp
    temp_docs_have = []
    # Each cluster Dict has a single entry, so the first value is its member list.
    for j in collect(values(i))[1]
        # Position `k_idx` in `documentwise_keyphrases_ls_1` is paired with the
        # same position in `topic_distributions[1]` to obtain the document id
        # (assumes both lists are in the same document order -- TODO confirm).
        for (k_idx, k) in enumerate(documentwise_keyphrases_ls_1)
            # Record each document at most once per cluster.
            if j in k && !(topic_distributions[1][k_idx] in temp_docs_have)
                push!(temp_docs_have, topic_distributions[1][k_idx])
            end
        end
    end
    push!(docs_have, copy(temp_docs_have))
end   

In [154]:
print(sort(docs_have[3]))

Any[102, 129, 219, 252, 356, 360, 371, 376, 607, 631, 667, 680, 695]

In [155]:
lda.topicPolya[5]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [0.6613950891749237, 0.3910087579712004, 0.41621724469367627, 0.3092807128687573, 0.3218985595874629, 0.43918174997401005, 0.35933131008920305, 0.5646429362355259, 0.6561032591272069, 0.43943293662376515], 4.5584925563457315), [265, 10, 439, 0, 3, 212, 73, 6, 1494, 348], 2850)

In [156]:
lda.wordPolya[1].n[3180]

2683

In [157]:
TopicModels.apply_refinement(lda, corpus, "R_D", docs_have[3], 1); # Take 2:30 mints

In [158]:
lda.topicPolya[102]

TopicModels.Polya(10, TopicModels.Dirichlet(10, [1.0e-7, 0.3910087579712004, 0.41621724469367627, 0.3092807128687573, 0.3218985595874629, 0.43918174997401005, 0.35933131008920305, 0.5646429362355259, 0.6561032591272069, 0.43943293662376515], 3.8970975671708077), [0, 217, 106, 37, 645, 173, 176, 124, 1621, 15], 3114)

In [None]:
Any["data", "kernel", "features", "method", "approach", "methods", "point", "space", "points", "learning"]

In [159]:
TopicModels.show_topics(lda, corpus, 10);

Any["data", "point", "approach", "method", "linear", "space", "points", "used", "methods", "features"]
----------------------
Any["model", "system", "figure", "time", "neurons", "neural", "network", "input", "activity", "information"]
----------------------
Any["model", "graph", "nodes", "set", "algorithm", "node", "data", "time", "models", "tree"]
----------------------
Any["learning", "policy", "state", "reward", "action", "reinforcement", "time", "function", "value", "states"]
----------------------
Any["learning", "image", "training", "images", "data", "model", "loss", "adversarial", "generative", "samples"]
----------------------
Any["matrix", "algorithm", "clustering", "problem", "algorithms", "approximation", "time", "random", "number", "let"]
----------------------
Any["model", "models", "deep", "neural", "inference", "learning", "training", "variational", "parameters", "data"]
----------------------
Any["network", "networks", "neural", "training", "input", "layer", "output", "

In [160]:
lda.wordPolya[1].n[3180]

410

In [180]:
println(topic_distributions[2])

[894, 767, 273, 451, 583, 810, 751, 220, 726, 972, 740, 175, 832, 862, 142, 745, 962, 780, 928, 900, 787, 762, 887, 440, 826, 120, 886, 718, 843, 996, 735, 76, 901, 847, 176, 672, 736, 506, 153, 991, 715, 753, 65, 926, 362, 879, 883, 853, 694, 131, 938, 820, 943, 739, 818, 998, 770, 934, 953, 861, 769, 835, 946, 807, 777, 164, 974, 978, 822, 945, 462, 967, 906, 539, 999, 394, 933, 350, 942, 87, 764, 994, 940, 711, 789, 713, 871, 956, 925, 788, 231, 881, 993, 888, 773, 969, 924, 897, 808, 318]


In [181]:
all_keyphrases_2, documentwise_keyphrases_ls_2 = TopicModels.top_keyphrases_of_topic(kp, topic_distributions, 2);


Total keyphrases are: 1307
Unique keyphrases are: 1121


In [188]:
cluster_kp_2 = keyphrase_cluster(kp, all_keyphrases_2);

In [189]:
[keys(i) for i in cluster_kp_2]

5-element Array{Base.KeySet{Any,Dict{Any,Any}},1}:
 ["neural networks"]
 ["learning"]
 ["associative memory"]
 ["simulation"]
 ["algorithms"]

In [190]:
# For every cluster in `cluster_kp_2`, collect the ids of the topic-2 documents
# that contain at least one of the cluster's keyphrases.
# `docs_have_2[c]` ends up holding the document ids for the c-th cluster.
docs_have_2 = []
for i in cluster_kp_2
    temp_docs_have = []
    # Each cluster Dict has a single entry, so the first value is its member list.
    for j in collect(values(i))[1]
        # Position `k_idx` in `documentwise_keyphrases_ls_2` is paired with the
        # same position in `topic_distributions[2]` to obtain the document id
        # (assumes both lists are in the same document order -- TODO confirm).
        for (k_idx, k) in enumerate(documentwise_keyphrases_ls_2)
            # Record each document at most once per cluster.
            if j in k && !(topic_distributions[2][k_idx] in temp_docs_have)
                push!(temp_docs_have, topic_distributions[2][k_idx])
            end
        end
    end
    push!(docs_have_2, copy(temp_docs_have))
end   

In [198]:
print(sort(docs_have_2[1]))

Any[65, 76, 131, 142, 153, 164, 175, 220, 318, 394, 451, 462, 672, 711, 713, 736, 739, 751, 764, 767, 770, 777, 787, 808, 832, 847, 871, 881, 888, 897, 901, 925, 933, 934, 940, 953, 956, 969, 974, 991, 993, 998]

In [193]:
TopicModels.apply_refinement(lda, corpus, "R_D", docs_have_2[1], 2); # Take 2:30 mints

In [None]:
Any["model", "system", "figure", "time", "neurons", "neural", "network", "input", "activity", "information"]

In [194]:
TopicModels.show_topics(lda, corpus, 10);

Any["data", "point", "figure", "linear", "approach", "method", "space", "points", "set", "features"]
----------------------
Any["model", "system", "time", "neural", "neurons", "information", "learning", "figure", "activity", "noise"]
----------------------
Any["model", "graph", "nodes", "set", "algorithm", "node", "data", "time", "models", "tree"]
----------------------
Any["learning", "state", "policy", "reward", "action", "reinforcement", "control", "time", "value", "function"]
----------------------
Any["learning", "image", "training", "images", "data", "model", "loss", "adversarial", "generative", "classification"]
----------------------
Any["matrix", "algorithm", "clustering", "problem", "algorithms", "approximation", "time", "random", "pages", "number"]
----------------------
Any["model", "models", "deep", "neural", "inference", "training", "learning", "parameters", "variational", "gradient"]
----------------------
Any["network", "networks", "neural", "input", "training", "output

# Save LDA object

In [229]:
# Serialize the trained `lda` object to "lda_obj.json".
# Only plain fields are stored: the iteration count, `M`, the alpha vector of
# the topic Dirichlet, `X`, `Samples`, and the count vectors (`n`) of every
# word and topic Polya. The Dirichlet priors attached to the individual Polya
# objects are not written out.
lda_dict = Dict()
lda_dict["numIteration"] = lda.numIteration
lda_dict["M"] = lda.M
lda_dict["topicDir_param"] = lda.topicDir.alpha
lda_dict["wordPolya_n"] = [i.n for i in lda.wordPolya]
lda_dict["X"] = lda.X
lda_dict["topicPolya_n"] = [i.n for i in lda.topicPolya]
lda_dict["Samples"] = lda.Samples
lda_json_string = JSON.json(lda_dict)

# The do-block closes the file automatically; the cell's value is the number
# of bytes written.
open("lda_obj.json","w") do f 
    write(f, lda_json_string) 
end

18214193

In [None]:
# Rebuild an LDA object from the JSON file written by the save cell above.
lda1_raw = JSON.parsefile("lda_obj.json");
numIteration = lda1_raw["numIteration"]
M = lda1_raw["M"]
topicDir = TopicModels.Dirichlet(lda1_raw["topicDir_param"])
# The word prior is not stored in the file, so a symmetric Dirichlet with
# concentration 0.01 is recreated for each topic from the length of its counts.
wordPolya = [TopicModels.Polya(TopicModels.Dirichlet(length(i), 0.01), i) for i in lda1_raw["wordPolya_n"]]
X = lda1_raw["X"]
# NOTE(review): every document gets the same `topicDir` prior here. The output
# shown after an "R_D" refinement has a document whose Dirichlet differs from
# the others, so such per-document priors appear to be lost on reload -- confirm.
topicPolya = [TopicModels.Polya(topicDir, i) for i in lda1_raw["topicPolya_n"]]
Samples = lda1_raw["Samples"]
lda_obj = TopicModels.LDA(numIteration, M, topicDir, wordPolya, X, topicPolya, Samples);

# Topic coherence, and the add-word / remove-word refinements

In [6]:
TopicModels.topic_coherence(corpus, topics, true)

-2.7308979585130184

In [7]:
TopicModels.apply_refinement(lda, corpus, "remove", "model", 1);

In [8]:
topics, proportions = TopicModels.show_topics(corpus, lda, 10);

Any["variable", "causal", "data", "group", "test", "individual", "distribution", "information", "outcome", "hypothesis"]
[0.01333, 0.00915, 0.00838, 0.00817, 0.00716, 0.00677, 0.00609, 0.00606, 0.00596, 0.00581]
----------------------
Any["algorithm", "time", "problem", "one", "number", "also", "result", "value", "given", "two"]
[0.03303, 0.01766, 0.01364, 0.01102, 0.01081, 0.00674, 0.00599, 0.00536, 0.00536, 0.00493]
----------------------
Any["image", "network", "model", "convolutional", "training", "adversarial", "learning", "deep", "sample", "loss"]
[0.02886, 0.017, 0.00936, 0.00924, 0.00892, 0.00814, 0.00783, 0.0075, 0.00735, 0.00707]
----------------------
Any["error", "memory", "noise", "data", "distributed", "block", "bit", "vector", "performance", "code"]
[0.01721, 0.01412, 0.01332, 0.00967, 0.00902, 0.00875, 0.00864, 0.00794, 0.00773, 0.00715]
----------------------
Any["network", "unit", "neural", "input", "weight", "output", "learning", "hidden", "training", "state"]
[0.058

In [9]:
TopicModels.topic_coherence(corpus, topics, true)

-2.6578622641487515

In [None]:
TopicModels.apply_refinement(corpus, lda, "add", "node", 2);

In [None]:
TopicModels.show_topics(corpus, lda, 10);

## Find the top topic for each doc

In [36]:
"""
    topic_of_each_doc(corpus, lda)

Return, for every document `1:corpus.document_size`, the topic in `1:lda.M`
with the largest `TopicModels.lda_topicPredict(doc, topic, lda)` score.

Ties go to the highest-numbered topic, and a document whose scores are all
below zero is assigned topic `0`. The result is an untyped vector with one
entry per document.
"""
function topic_of_each_doc(corpus, lda)
    dominant_topics = []
    for doc in 1:corpus.document_size
        best_score, best_topic = 0, 0
        for topic in 1:lda.M
            score = TopicModels.lda_topicPredict(doc, topic, lda)
            # `>=` lets a later topic replace an earlier one with an equal score.
            score >= best_score || continue
            best_score, best_topic = score, topic
        end
        push!(dominant_topics, best_topic)
    end
    return dominant_topics
end
top_topic_for_each_doc = topic_of_each_doc(corpus, lda);

In [37]:
top_topic_for_each_doc

999-element Array{Any,1}:
  2
  6
  2
  1
  6
  4
  9
  1
  6
  5
  1
  5
  5
  ⋮
  9
  6
 10
  6
 10
 10
 10
 10
  6
 10
 10
  5

In [38]:
"""
    view_top_docs(top_topic_for_each_doc, document_file, topic;
                  show_titles=false, io=stdout)

Print the positions of the documents whose entry in `top_topic_for_each_doc`
equals `topic`.

# Arguments
- `top_topic_for_each_doc`: collection with one topic entry per document.
- `document_file`: path of a CSV file with a `title` column. It is only read
  when `show_titles` is `true`.
- `topic`: topic to filter on (compared with `==`).

# Keywords
- `show_titles=false`: when `true`, print one `index title` line per matching
  document instead of the compact `index, ` list. Titles are taken from the
  last `length(top_topic_for_each_doc)` rows of `document_file`. Requires
  `CSV` and `DataFrames` to be loaded.
- `io=stdout`: stream the output is written to.

# Returns
`nothing`.

# Throws
- `ArgumentError` if `show_titles` is `true` and the CSV has fewer titles than
  there are documents.
"""
function view_top_docs(top_topic_for_each_doc, document_file, topic;
                       show_titles=false, io=stdout)
    titles = nothing
    if show_titles
        # Only pay for reading the CSV when the titles are actually printed.
        all_titles = CSV.read(document_file, DataFrame).title
        n_docs = length(top_topic_for_each_doc)
        n_docs <= length(all_titles) || throw(ArgumentError(
            "$(document_file) has $(length(all_titles)) titles, need at least $(n_docs)"))
        # The documents correspond to the last `n_docs` rows of the file.
        titles = all_titles[end-n_docs+1:end]
    end
    for (idx, t) in enumerate(top_topic_for_each_doc)
        t == topic || continue
        if show_titles
            println(io, idx, " ", titles[idx])
        else
            print(io, idx, ", ")
        end
    end
    return nothing
end

view_top_docs (generic function with 1 method)

In [32]:
using CSV, DataFrames
view_top_docs(top_topic_for_each_doc, "papers.csv", 1)

14, 15, 24, 51, 64, 70, 76, 79, 

# HLTM: remove-document refinement (`R_D`) implementation

In [13]:
corpus, no_lemma_docs = TopicModels.preprocess("papers.csv", 20);

In [14]:
lda = TopicModels.train(corpus, 3);

In [15]:
topics, proportions = TopicModels.show_topics(corpus, lda, 5);

Any["network", "input", "networks", "training", "learning"]
[0.02076, 0.01477, 0.01446, 0.01322, 0.01085]
----------------------
Any["neural", "algorithm", "data", "case", "weight"]
[0.01855, 0.01428, 0.01418, 0.01023, 0.00927]
----------------------
Any["state", "model", "learning", "models", "number"]
[0.01429, 0.01354, 0.01193, 0.01107, 0.00774]
----------------------


In [16]:
lda.wordPolya

3-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [7, 36, 8, 10, 14, 128, 17, 7, 0, 0  …  1, 0, 0, 0, 0, 2, 0, 0, 0, 1], 9628)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 13, 0, 0, 0, 7  …  0, 0, 0, 0, 0, 0, 1, 0, 0, 0], 9330)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 0, 0, 0, 6, 0  …  0, 1, 1, 2, 2, 0, 0, 1, 1, 0], 9252)

In [17]:
lda.topicPolya

20-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [720, 398, 334], 1452)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [111, 370, 1065], 1546)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [1325, 271, 214], 1810)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [963, 138, 561], 1662)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [102, 250, 128], 480)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845), [559, 274, 422], 1255)
 TopicModels.Polya(3, TopicModels.Dirichle

In [18]:
lda.topicDir

TopicModels.Dirichlet(3, [0.6906504672532601, 0.6433493639156528, 0.6922273273374719], 2.0262271585063845)

In [19]:
lda.Samples

20-element Array{Any,1}:
 Any[1, 1, 1, 1, 1, 1, 1, 1, 3, 2  …  1, 1, 3, 1, 1, 3, 3, 3, 1, 2]
 Any[3, 3, 3, 3, 3, 2, 3, 3, 3, 3  …  2, 2, 3, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 1, 1, 2, 1, 1, 1, 1, 1, 1  …  3, 1, 1, 1, 1, 1, 1, 1, 3, 1]
 Any[1, 1, 3, 1, 1, 1, 1, 2, 1, 2  …  3, 3, 1, 1, 2, 1, 3, 1, 1, 3]
 Any[2, 2, 3, 2, 2, 2, 3, 3, 1, 2  …  1, 1, 3, 3, 1, 2, 3, 3, 2, 1]
 Any[1, 3, 2, 1, 2, 1, 1, 1, 3, 1  …  2, 3, 1, 3, 3, 2, 3, 3, 2, 3]
 Any[1, 1, 1, 1, 3, 1, 3, 3, 2, 1  …  3, 1, 3, 1, 3, 1, 1, 1, 3, 3]
 Any[1, 1, 1, 2, 2, 2, 2, 2, 1, 1  …  1, 2, 2, 2, 1, 1, 2, 1, 2, 1]
 Any[2, 2, 2, 3, 2, 3, 2, 3, 2, 2  …  2, 3, 1, 2, 3, 1, 1, 3, 2, 2]
 Any[2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  1, 1, 3, 2, 3, 3, 2, 3, 2, 2]
 Any[2, 1, 2, 2, 1, 1, 2, 3, 1, 3  …  1, 1, 1, 1, 3, 1, 2, 2, 1, 2]
 Any[2, 2, 2, 3, 2, 1, 2, 3, 2, 2  …  2, 3, 2, 2, 3, 1, 3, 2, 3, 3]
 Any[3, 3, 3, 2, 3, 3, 3, 1, 3, 1  …  2, 2, 1, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 3, 1, 1, 1, 1, 1, 3, 1, 1  …  1, 1, 3, 1, 3, 3, 1, 1, 1, 1]
 Any[3, 3, 3, 3, 1, 3, 

In [20]:
TopicModels.apply_refinement(lda, corpus, "R_D", 1, 1);

In [21]:
lda.wordPolya

3-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 39, 0, 0, 0, 0  …  1, 0, 0, 0, 2, 2, 0, 0, 1, 1], 8005)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [0, 0, 0, 0, 0, 102, 0, 0, 0, 5  …  0, 0, 1, 0, 0, 0, 0, 1, 0, 0], 9991)
 TopicModels.Polya(5280, TopicModels.Dirichlet(5280, [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01  …  0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], 52.800000000000004), [7, 36, 8, 10, 14, 0, 17, 7, 6, 2  …  0, 1, 0, 2, 0, 0, 1, 0, 0, 0], 10214)

In [22]:
lda.topicPolya

20-element Array{TopicModels.Polya,1}:
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [1.0e-7, 0.6433493639156528, 0.6922273273374719], 1.3355767912531247), [0, 647, 805], 1452)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [48, 355, 1143], 1546)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [1291, 264, 255], 1810)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [944, 131, 587], 1662)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [78, 235, 167], 480)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.4866968873359249, 0.5891369773347052, 0.6502990374700215], 1.7261329021406515), [482, 283, 490], 1255)
 TopicModels.Polya(3, TopicModels.Dirichlet(3, [0.48669688

In [23]:
lda.Samples

20-element Array{Any,1}:
 Any[3, 3, 3, 3, 3, 2, 3, 3, 3, 2  …  3, 2, 2, 2, 2, 3, 3, 2, 2, 2]
 Any[3, 2, 3, 3, 3, 3, 3, 3, 3, 3  …  2, 2, 3, 3, 2, 3, 3, 2, 3, 3]
 Any[1, 1, 1, 2, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 3, 2, 3, 3]
 Any[2, 1, 1, 1, 1, 3, 1, 1, 1, 3  …  3, 3, 1, 1, 2, 1, 1, 1, 1, 3]
 Any[2, 2, 1, 2, 2, 2, 3, 2, 3, 2  …  1, 1, 3, 3, 1, 2, 3, 3, 2, 2]
 Any[1, 3, 2, 1, 2, 3, 3, 1, 3, 3  …  2, 3, 3, 3, 3, 2, 3, 3, 2, 1]
 Any[1, 1, 1, 1, 3, 1, 3, 3, 2, 3  …  1, 2, 3, 1, 3, 1, 1, 1, 3, 3]
 Any[1, 1, 2, 2, 1, 2, 2, 2, 1, 1  …  3, 2, 1, 2, 1, 1, 2, 1, 2, 1]
 Any[2, 2, 2, 2, 2, 1, 2, 3, 2, 2  …  2, 2, 3, 2, 3, 1, 3, 3, 2, 2]
 Any[2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  1, 2, 3, 2, 3, 3, 2, 1, 2, 2]
 Any[1, 1, 2, 2, 1, 1, 2, 1, 1, 1  …  2, 1, 1, 2, 3, 1, 2, 2, 1, 1]
 Any[2, 2, 2, 2, 2, 1, 2, 3, 2, 2  …  2, 3, 2, 2, 3, 2, 3, 2, 3, 2]
 Any[3, 3, 3, 2, 3, 3, 3, 2, 3, 2  …  2, 2, 3, 3, 2, 3, 2, 3, 3, 3]
 Any[1, 3, 3, 1, 1, 1, 1, 3, 1, 1  …  1, 1, 3, 1, 1, 3, 1, 1, 1, 1]
 Any[3, 3, 3, 3, 3, 3, 

# Automatic Topic Labeling Experiment

In [1]:
using Pkg
using Distances
using StatsBase
using DataStructures
using ProgressMeter
Pkg.activate("TopicModels")
import TopicModels
#using PyCall

#corpora = pyimport("gensim.corpora")
#ch = pyimport("gensim.models.coherencemodel")

[32m[1m Activating[22m[39m environment at `~/Documents/Study/Git_Personal/julia_bayes/TopicModels/Project.toml`
┌ Info: Precompiling TopicModels [cfcb1801-bb54-4f1b-8249-336c042d2c46]
└ @ Base loading.jl:1278


In [2]:
corpus, no_lemma_docs = TopicModels.preprocess("papers.csv", 100);

In [3]:
candidate_label_distribution = TopicModels.train_phrase_model(no_lemma_docs, corpus.vocabulary, 10, true);

In [4]:
lda = TopicModels.train(corpus, 10);

In [5]:
topics, proportions = TopicModels.show_topics(corpus, lda, corpus.vocab_count, false);

In [6]:
for t in topics
    println(t[1:10])
end

Any["input", "figure", "current", "motion", "direction", "analog", "single", "voltage", "synapses", "layer"]
Any["state", "learning", "algorithm", "states", "probability", "function", "convergence", "value", "algorithms", "process"]
Any["network", "neural", "units", "networks", "output", "training", "control", "figure", "architecture", "model"]
Any["model", "system", "two", "field", "one", "visual", "weights", "time", "first", "figure"]
Any["recognition", "training", "distance", "input", "information", "used", "pattern", "figure", "performance", "test"]
Any["data", "algorithm", "problem", "set", "models", "new", "given", "mixture", "classification", "points"]
Any["image", "network", "networks", "mlp", "features", "road", "images", "model", "objects", "layer"]
Any["learning", "space", "generalization", "case", "training", "distribution", "examples", "sequence", "error", "matrix"]
Any["memory", "neurons", "learning", "computational", "model", "activation", "tasks", "neuron", "synaptic", 

In [7]:
#New code
top_3_bi_tri_labels_only = TopicModels.label_ranking(candidate_label_distribution, topics, proportions)

[32mComputing...100%|███████████████████████████████████████| Time: 0:44:04[39m


Dict{Any,Any} with 10 entries:
  7  => Array{Any,1}[["hidden_layer", 2.78953, [0.00218779, 0.0116524, 0.010924…
  4  => Array{Any,1}[["can_be", 2.56588, [0.00419746, 0.00339036, 0.00328274, 0…
  9  => Array{Any,1}[["it_is", 2.80177, [0.00166972, 0.000501499, 0.00517437, 0…
  10 => Array{Any,1}[["can_be", 1.9869, [0.000646218, 0.00269087, 0.00172235, 0…
  2  => Array{Any,1}[["can_be", 2.29343, [0.00258326, 0.00559643, 0.0037132, 0.…
  3  => Array{Any,1}[["neural_network", 2.24291, [0.00704973, 0.00472876, 0.001…
  5  => Array{Any,1}[["training_data", 2.66596, [0.00135389, 0.00968897, 0.0006…
  8  => Array{Any,1}[["can_be", 2.3162, [0.00559643, 0.00172235, 0.00139951, 0.…
  6  => Array{Any,1}[["can_be", 2.27858, [0.00382081, 0.0037132, 0.00349797, 0.…
  1  => Array{Any,1}[["has_been", 2.75192, [0.0031105, 0.000865275, 0.00328321,…

In [8]:
# For comparing all words of topic distributions with base_count_kl = 0.01
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_bi_tri_labels_only[pos]])
    #println(i[1:10]) 
end

["has_been", "can_be", "such_as"]
["can_be", "value_function", "note_that"]
["neural_network", "error_between", "after_training"]
["can_be", "is_not", "has_been"]
["training_data", "this_is", "can_be"]
["can_be", "it_is", "em_algorithm"]
["hidden_layer", "video_camera", "input_features"]
["can_be", "we_have", "it_is"]
["it_is", "such_as", "biological_neural"]
["can_be", "it_is", "this_is"]


## For comparing top n words of topic distributions

In [25]:
# For comparing top 1000 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["procedureoptimal", "steadily", "glennygedacuk"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["validityinterval", "breastcancerwisc", "uel"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "irrespective", "distort"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["qlg", "mbl", "vgo"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "limonenelinalool"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["jointangle", "cameradriven", "orderly"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["gos", "ilx", "bkkx"]
Any["data", "problem", "given", "one", "function", "variance", "paper", "

In [19]:
# For comparing top 100 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["openversusclosed", "procedureoptimal", "lksaulopsychemitedu"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["chowdhury", "afscsprojectconnectbench", "validityinterval"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "architecuture", "neuristique"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["buccleuch", "qlg", "mbl"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "norberto"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["systvol", "bartlett", "rethworcementlearmng"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["nijmegen", "psycholinguistik", "aonori"]
Any["data", "pro

In [16]:
# For comparing top 50 words of topic distributions
for (pos, i) in enumerate(topics)
    println([l[1] for l in top_3_labels[pos]])
    println(i[1:10])
end

["procedureoptimal", "openversusclosed", "glennygedacuk"]
Any["figure", "however", "examples", "noise", "case", "point", "equation", "approximation", "rule", "given"]
["validityinterval", "chowdhury", "afscsprojectconnectbench"]
Any["networks", "rules", "neural", "units", "network", "mlp", "hidden", "rule", "training", "prediction"]
["datadriven", "architecuture", "neuristique"]
Any["image", "tasks", "object", "algorithm", "images", "one", "objects", "models", "view", "learn"]
["qlg", "buccleuch", "ioeqo"]
Any["wta", "input", "current", "voltage", "gate", "inversion", "source", "floating", "correlogram", "connectivity"]
["anatolviical", "ity", "norberto"]
Any["model", "models", "activation", "inhibitory", "output", "receptor", "layer", "excitatory", "module", "cell"]
["cameradriven", "sloppy", "recognizable"]
Any["units", "memory", "unit", "patterns", "number", "pattern", "representation", "region", "joint", "active"]
["gos", "nijmegen", "psycholinguistik"]
Any["data", "problem", "give

## Word count in topic labels

In [83]:
sort(collect(labels_freq["coronary_artery_bypass"]), by=x->x[2], rev=true)

278-element Array{Pair{Any,Any},1}:
           "mlp" => 21
          "risk" => 15
    "confidence" => 14
     "bootstrap" => 12
           "vol" => 11
       "surgery" => 9
    "operations" => 8
     "mortality" => 8
      "networks" => 8
       "failure" => 7
 "complications" => 7
       "history" => 7
         "renal" => 7
                 ⋮
       "factors" => 1
  "interactions" => 1
       "strokes" => 1
     "committee" => 1
        "number" => 1
      "maintain" => 1
         "block" => 1
       "reflect" => 1
       "summary" => 1
       "average" => 1
      "spending" => 1
   "performance" => 1

In [42]:
labels_freq

Dict{Any,Any} with 15739 entries:
  "oblique"     => Dict{Any,Any}("points"=>5,"jlkl"=>2,"translation"=>2,"sekule…
  "dev"         => Dict{Any,Any}("fpxsftvg"=>1,"fprr"=>1,"neurophysiol"=>1,"res…
  "cambrdge"    => Dict{Any,Any}("proceedings"=>1,"learningfrom"=>1,"tesauro"=>…
  "yjk"         => Dict{Any,Any}("corresponding"=>1,"rotation"=>1,"points"=>2,"…
  "null"        => Dict{Any,Any}("regions"=>2,"uext"=>1,"oblique"=>1,"excitatio…
  "inflowbased" => Dict{Any,Any}("choices"=>1,"accommodate"=>1,"optimal"=>1,"in…
  "ztt"         => Dict{Any,Any}("qlearning"=>1,"presented"=>2,"directly"=>1,"m…
  "iaan"        => Dict{Any,Any}("want"=>2,"elog"=>2,"emax"=>2,"holds"=>3,"fini…
  "subfeature"  => Dict{Any,Any}("tasks"=>1,"due"=>1,"indicate"=>1,"relevant"=>…
  "rises"       => Dict{Any,Any}("constant"=>3,"functional"=>1,"los"=>2,"equal"…
  "hampshire"   => Dict{Any,Any}("constant"=>1,"action"=>1,"equal"=>1,"data"=>1…
  "dzfk"        => Dict{Any,Any}("fpxsftvg"=>1,"dem"=>1,"fprr"=>1,"kxt"=>1,

In [84]:
corpus.vocabulary

Dict{Any,Any} with 14707 entries:
  "oblique"     => 7247
  "dev"         => 16
  "cambrdge"    => 3145
  "yjk"         => 7246
  "null"        => 1728
  "inflowbased" => 1831
  "ztt"         => 4061
  "iaan"        => 6949
  "subfeature"  => 10738
  "rises"       => 6074
  "hampshire"   => 11903
  "dzfk"        => 8
  "vapnik"      => 3387
  "progression" => 8512
  "neumann"     => 5360
  "fram"        => 10918
  "gathered"    => 7052
  "eeitutery"   => 7290
  "arborize"    => 10152
  "november"    => 6778
  "stress"      => 5963
  "zqm"         => 8291
  "rectified"   => 6644
  "obey"        => 2264
  "methods"     => 818
  ⋮             => ⋮

In [65]:
# Print every phrase returned by `phrase_model_bigram.find_phrases(bigrams_docs)`
# and then the total number of phrases. The `score` of each pair is unused.
s = 0
for (phrase, score) in phrase_model_bigram.find_phrases(bigrams_docs)
    println(phrase)
    s+=1
end
println(s)

input_output
coronary_artery_bypass
neural_information_processing_systems
i_i_i
using_neural_networks
as_shown_in_figure
m__m
can_be_obtained
networks_of_spiking_neurons
we_assume_that
i__i_
this_paper_we
this_can_be
can_be_derived
learning_from_examples
it_is_possible
editors_advances_in_neural_information
i_i_i_i
cambridge_ma
office_of_naval_research
city_block_length
can_be_viewed_as
radial_basis_function
where_n
teacher_space_entropy
we_then
output_is
is_defined_as
that_can_be
we_find_that
et_al_eds
san_mateo_ca
can_be_used
modified_actorcritic_algorithm
hierarchical_mixtures_of_experts
which_can_be
training_algorithms
it_is_not
et_al
all_other
processing_systems_san_mateo
network_is
processing_systems_morgan_kaufmann
will_not_be
m_is
pittsburgh_pa
where_is
elastic_input_field
eds_advances_in_neural_information
it_can_be_shown
l_d
it_does_not
can_be_found
here_is
carnegie_mellon_university
way_that
it_may_be
information_processing_systems
terrence_j_sejnowski
there_is_no
receiver_o

## Miscellaneous part-of-speech (POS) tagging experiments

In [35]:
a= nltk.pos_tag(docs[7143])

17802-element Array{Tuple{String,String},1}:
 ("A", "DT")
 ("n", "JJ")
 (" ", "NN")
 ("A", "NNP")
 ("l", "NN")
 ("t", "NN")
 ("e", "NN")
 ("r", "NN")
 ("n", "IN")
 ("a", "DT")
 ("t", "NN")
 ("i", "NN")
 ("v", "VBP")
 ⋮
 ("p", "JJ")
 ("u", "JJ")
 ("t", "NN")
 ("a", "DT")
 ("t", "NN")
 ("i", "NN")
 ("o", "VBP")
 ("n", "NN")
 (".", ".")
 ("\n", "CC")
 ("\n", "JJ")
 ("\f", "NN")

In [44]:
a[1][1]

"A"

In [3]:
for i in a
end

LoadError: UndefVarError: a not defined

In [38]:
using PyCall
nltk = pyimport("nltk")

PyObject <module 'nltk' from '/Users/khan/.julia/conda/3/lib/python3.8/site-packages/nltk/__init__.py'>

In [7]:
a = nltk.stem.WordNetLemmatizer()

PyObject <WordNetLemmatizer>

In [9]:
a.lemmatize("Word", pos="")

"Word"

In [8]:
# Build a lookup from a tag character to a WordNet part-of-speech constant.
# Requires `nltk` to have been imported through PyCall beforehand; running this
# cell without it raises `UndefVarError: nltk not defined`.
using DataStructures
wn = nltk.corpus.wordnet
# 'J' -> adjective, 'V' -> verb, 'R' -> adverb
# (presumably keyed by the first character of a POS tag -- TODO confirm).
t_map = Dict('J' => wn.ADJ,'V' => wn.VERB,'R' => wn.ADV)

# Any character not present in `t_map` falls back to the noun constant.
tag_map = DefaultDict(wn.NOUN, t_map)

LoadError: UndefVarError: nltk not defined

In [33]:
tag_map['Z']

"n"

In [9]:
dd = DefaultDict(1) 
d = Dict('a'=>1, 'b'=>2)

Dict{Char,Int64} with 2 entries:
  'a' => 1
  'b' => 2

In [10]:
dd = DefaultDict(0, d)

DefaultDict{Char,Int64,Int64} with 2 entries:
  'a' => 1
  'b' => 2

In [15]:
dd['A']

0