In [1]:
# imports needed and logging
import os
import sys
import logging
from time import time
import pickle
import io
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import gensim 
from gensim import corpora, models, similarities, utils
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import MmCorpus, Dictionary
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis.gensim
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.text import Text



 
logging.basicConfig(format= '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
#lemmatize for verbs
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        lemma = word
    return lemma

#lemmatize for nouns
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [3]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [4]:
#clean texts
def clean_txt(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 2]
    tokens = [get_lemma(token) for token in tokens]
    tokens = [get_lemma2(token) for token in tokens]
    return tokens

In [5]:
def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = io.open(os.path.join(root, file), encoding='utf=8', errors='ignore').read() # read the entire document, as one big string
            x = clean_txt(document) # or whatever tokenization suits
            yield x


In [6]:
class MyCorpus(object):
    
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=2, keep_n=30000) # check API docs for pruning params 
        
    def __len__(self):
        count = 0
        for root, dirs, files in os.walk(self.top_dir):
            for file in filter(lambda file: file.endswith('.txt'), files):
                count += 1
        self._data_len = int(count)
        return self._data_len
        
        
    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)


In [7]:
#returns corpus, serialized corpus, and dict. args: top_dir, corpus_name
#saves serialized corpus mm and dictionary as .dict
#both args in ''

def get_corpus_dict(top_dir, corpus_name):
    corpus = MyCorpus(top_dir)
    #save corpus
    pickle.dump(corpus, open(corpus_name + '.pkl', 'wb'))
    #save dictionary
    dictionary = corpus.dictionary
    dictionary.save(corpus_name +'_dictionary.dict')
    new_corpus = [vector for vector in iter(corpus)]
    corpora.MmCorpus.serialize(corpus_name+'_serialized.mm', new_corpus)
    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token
    return corpus, new_corpus, dictionary

In [22]:
def lda_model(corpus,dictionary, num_passes, num_topics, chunksize):
    start = time()
    print('makin model')
    LDA = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, passes=num_passes, num_topics = num_topics, chunksize=chunksize)
    #print time
    print ('used: {:.2f}s'.format(time()-start))
    return LDA

In [9]:
def print_topics(model, num_words):
    topics = model.print_topics(num_words=num_words)
    for topic in topics:
        print(topic)

In [10]:
#print nice df of topics
def get_topics(corpus_name, model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20)
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    topics = pd.DataFrame(word_dict)
    print(topics)
    topics.to_csv(corpus_name+'_topics.csv')

In [11]:
def make_ldavis(model, serial_corpus, dictionary, corpus_name):
    pyLDAvis.enable_notebook()
    data = pyLDAvis.gensim.prepare(model, serial_corpus, dictionary)
    pyLDAvis.save_html(data, corpus_name+'_lda.html')

In [38]:
#topic coherence - human interpretability of topic model using cv coherence
# arguments: dictionary = Gensim dictionary, corpus =  Gensim corpus, limit = max num topics
# Returns: lm_list = List of LDA topic models and c_v  = Coherence values corresponding to LDA model with respective num_topics

def evaluate_graph(dictionary, corpus, num_passes, chunksize, limit):
    c_v = []
    lm_list = []
    for num_topics in range(1, limit):
        lm = lda_model(corpus=corpus, dictionary=dictionary, num_passes = num_passes, num_topics = num_topics, chunksize=chunksize)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        c_v.append(cm.get_coherence())
        
    # Show graph
    x = range(1, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v"), loc='best')
    fig1 = plt.gcf()
    plt.show()
    plt.draw
    fig1.savefig('coherence_plot.png')
    return lm_list, c_v

#get everything for pro-texts
pro_cor, pro_cor_serial, pro_dict = get_corpus_dict('C:/Users/kec52/Documents/BLtext_research/proVaxTextLib/proTexts', 'pro_corpus')

In [30]:
pro_cor = pickle.load(open('C:/Users/kec52/Documents/BLtext_research/pro_corpus.pkl', 'rb'))
pro_serial_corpus = MmCorpus(datapath('C:/Users/kec52/Documents/BLtext_research/pro_corpus_serialized.mm'))
pro_dict = Dictionary.load('C:/Users/kec52/Documents/BLtext_research/pro_corpus_dictionary.dict')

2018-09-04 22:47:00,020 : INFO : loaded corpus index from C:/Users/kec52/Documents/BLtext_research/pro_corpus_serialized.mm.index
2018-09-04 22:47:00,020 : INFO : initializing cython corpus reader from C:/Users/kec52/Documents/BLtext_research/pro_corpus_serialized.mm
2018-09-04 22:47:00,020 : INFO : accepted corpus with 16 documents, 7219 features, 24605 non-zero entries
2018-09-04 22:47:00,024 : INFO : loading Dictionary object from C:/Users/kec52/Documents/BLtext_research/pro_corpus_dictionary.dict
2018-09-04 22:47:00,028 : INFO : loaded C:/Users/kec52/Documents/BLtext_research/pro_corpus_dictionary.dict


In [31]:
evaluate_graph(pro_dict, pro_cor, 10, 100, 11)

TypeError: lda_model() got an unexpected keyword argument 'passes'

In [32]:
pro_LDA = lda_model(pro_cor, pro_dict,30, 4,100)

2018-09-04 22:47:01,155 : INFO : using symmetric alpha at 0.25
2018-09-04 22:47:01,159 : INFO : using symmetric eta at 0.25
2018-09-04 22:47:01,163 : INFO : using serial LDA version on this node
2018-09-04 22:47:01,167 : INFO : running online (multi-pass) LDA training, 4 topics, 30 passes over the supplied corpus of 16 documents, updating model once every 16 documents, evaluating perplexity every 16 documents, iterating 50x with a convergence threshold of 0.001000


makin model


2018-09-04 22:47:01,934 : INFO : -9.549 per-word bound, 749.2 perplexity estimate based on a held-out corpus of 16 documents with 76562 words
2018-09-04 22:47:01,934 : INFO : PROGRESS: pass 0, at document #16/16
2018-09-04 22:47:01,990 : INFO : topic #0 (0.250): 0.017*"sheffield" + 0.011*"cent" + 0.006*"unvaccinated" + 0.006*"smallpox" + 0.005*"borough" + 0.005*"aged" + 0.005*"revaccinated" + 0.004*"pitted" + 0.004*"birch" + 0.004*"houses"
2018-09-04 22:47:01,994 : INFO : topic #1 (0.250): 0.024*"cent" + 0.019*"sheffield" + 0.012*"unvaccinated" + 0.009*"borough" + 0.008*"aged" + 0.008*"sub" + 0.007*"revaccinated" + 0.007*"birch" + 0.005*"houses" + 0.004*"census"
2018-09-04 22:47:01,994 : INFO : topic #2 (0.250): 0.014*"sheffield" + 0.010*"cent" + 0.006*"smallpox" + 0.006*"unvaccinated" + 0.004*"sub" + 0.004*"revaccinated" + 0.004*"aged" + 0.004*"houses" + 0.003*"pitted" + 0.003*"census"
2018-09-04 22:47:01,994 : INFO : topic #3 (0.250): 0.013*"sheffield" + 0.011*"cent" + 0.005*"aged" +

2018-09-04 22:47:07,079 : INFO : topic diff=0.130925, rho=0.353553
2018-09-04 22:47:07,838 : INFO : -7.862 per-word bound, 232.6 perplexity estimate based on a held-out corpus of 16 documents with 76562 words
2018-09-04 22:47:07,838 : INFO : PROGRESS: pass 7, at document #16/16
2018-09-04 22:47:07,886 : INFO : topic #0 (0.250): 0.012*"red" + 0.012*"punctures" + 0.008*"marks" + 0.006*"row" + 0.006*"fame" + 0.006*"brown" + 0.005*"cafes" + 0.005*"pimples" + 0.005*"areola" + 0.005*"smallpox"
2018-09-04 22:47:07,890 : INFO : topic #1 (0.250): 0.049*"sheffield" + 0.043*"cent" + 0.022*"unvaccinated" + 0.018*"borough" + 0.016*"revaccinated" + 0.016*"aged" + 0.014*"sub" + 0.012*"houses" + 0.011*"pitted" + 0.010*"census"
2018-09-04 22:47:07,890 : INFO : topic #2 (0.250): 0.017*"smallpox" + 0.004*"cowpox" + 0.003*"vesicle" + 0.003*"protective" + 0.003*"virus" + 0.003*"opinions" + 0.003*"modified" + 0.003*"doctrine" + 0.003*"ditto" + 0.002*"specific"
2018-09-04 22:47:07,890 : INFO : topic #3 (0.25

2018-09-04 22:47:12,730 : INFO : topic diff=0.028048, rho=0.258199
2018-09-04 22:47:13,473 : INFO : -7.840 per-word bound, 229.2 perplexity estimate based on a held-out corpus of 16 documents with 76562 words
2018-09-04 22:47:13,473 : INFO : PROGRESS: pass 14, at document #16/16
2018-09-04 22:47:13,513 : INFO : topic #0 (0.250): 0.013*"red" + 0.012*"punctures" + 0.009*"marks" + 0.007*"fame" + 0.007*"row" + 0.007*"brown" + 0.006*"cafes" + 0.006*"pimples" + 0.006*"areola" + 0.005*"smallpox"
2018-09-04 22:47:13,513 : INFO : topic #1 (0.250): 0.049*"sheffield" + 0.044*"cent" + 0.022*"unvaccinated" + 0.018*"borough" + 0.016*"revaccinated" + 0.016*"aged" + 0.014*"sub" + 0.012*"houses" + 0.011*"pitted" + 0.010*"census"
2018-09-04 22:47:13,513 : INFO : topic #2 (0.250): 0.017*"smallpox" + 0.004*"modified" + 0.004*"cowpox" + 0.004*"protective" + 0.004*"vesicle" + 0.003*"virus" + 0.003*"opinions" + 0.003*"doctrine" + 0.003*"ditto" + 0.002*"specific"
2018-09-04 22:47:13,517 : INFO : topic #3 (0.2

2018-09-04 22:47:18,234 : INFO : topic diff=0.011274, rho=0.213201
2018-09-04 22:47:18,973 : INFO : -7.833 per-word bound, 228.1 perplexity estimate based on a held-out corpus of 16 documents with 76562 words
2018-09-04 22:47:18,973 : INFO : PROGRESS: pass 21, at document #16/16
2018-09-04 22:47:19,013 : INFO : topic #0 (0.250): 0.013*"red" + 0.012*"punctures" + 0.009*"marks" + 0.007*"fame" + 0.007*"row" + 0.007*"brown" + 0.006*"cafes" + 0.006*"pimples" + 0.006*"areola" + 0.005*"smallpox"
2018-09-04 22:47:19,013 : INFO : topic #1 (0.250): 0.049*"sheffield" + 0.044*"cent" + 0.022*"unvaccinated" + 0.018*"borough" + 0.016*"revaccinated" + 0.016*"aged" + 0.014*"sub" + 0.012*"houses" + 0.011*"pitted" + 0.010*"census"
2018-09-04 22:47:19,013 : INFO : topic #2 (0.250): 0.016*"smallpox" + 0.005*"modified" + 0.004*"protective" + 0.004*"cowpox" + 0.004*"vesicle" + 0.003*"virus" + 0.003*"doctrine" + 0.003*"opinions" + 0.002*"ditto" + 0.002*"specific"
2018-09-04 22:47:19,017 : INFO : topic #3 (0.2

2018-09-04 22:47:23,751 : INFO : topic diff=0.006307, rho=0.185695
2018-09-04 22:47:24,502 : INFO : -7.830 per-word bound, 227.5 perplexity estimate based on a held-out corpus of 16 documents with 76562 words
2018-09-04 22:47:24,506 : INFO : PROGRESS: pass 28, at document #16/16
2018-09-04 22:47:24,542 : INFO : topic #0 (0.250): 0.013*"red" + 0.012*"punctures" + 0.009*"marks" + 0.007*"fame" + 0.007*"row" + 0.007*"brown" + 0.006*"cafes" + 0.006*"pimples" + 0.006*"areola" + 0.005*"smallpox"
2018-09-04 22:47:24,542 : INFO : topic #1 (0.250): 0.049*"sheffield" + 0.044*"cent" + 0.022*"unvaccinated" + 0.018*"borough" + 0.016*"revaccinated" + 0.016*"aged" + 0.014*"sub" + 0.012*"houses" + 0.011*"pitted" + 0.010*"census"
2018-09-04 22:47:24,542 : INFO : topic #2 (0.250): 0.016*"smallpox" + 0.005*"modified" + 0.004*"protective" + 0.004*"cowpox" + 0.004*"vesicle" + 0.003*"virus" + 0.003*"doctrine" + 0.003*"opinions" + 0.002*"ditto" + 0.002*"specific"
2018-09-04 22:47:24,546 : INFO : topic #3 (0.2

used: 24.19s


In [33]:
print_topics(pro_LDA, 4)

2018-09-04 22:47:25,345 : INFO : topic #0 (0.250): 0.013*"red" + 0.012*"punctures" + 0.009*"marks" + 0.007*"fame"
2018-09-04 22:47:25,349 : INFO : topic #1 (0.250): 0.049*"sheffield" + 0.044*"cent" + 0.022*"unvaccinated" + 0.018*"borough"
2018-09-04 22:47:25,349 : INFO : topic #2 (0.250): 0.016*"smallpox" + 0.005*"modified" + 0.004*"protective" + 0.004*"cowpox"
2018-09-04 22:47:25,353 : INFO : topic #3 (0.250): 0.014*"birch" + 0.003*"answer" + 0.003*"pamphlet" + 0.003*"esq"


(0, '0.013*"red" + 0.012*"punctures" + 0.009*"marks" + 0.007*"fame"')
(1, '0.049*"sheffield" + 0.044*"cent" + 0.022*"unvaccinated" + 0.018*"borough"')
(2, '0.016*"smallpox" + 0.005*"modified" + 0.004*"protective" + 0.004*"cowpox"')
(3, '0.014*"birch" + 0.003*"answer" + 0.003*"pamphlet" + 0.003*"esq"')


topics = get_topics('pro_corpus_with_4TEST',pro_LDA,4)
print(topics)

#make viz
make_ldavis(pro_LDA, pro_cor_serial, pro_dict, 'pro_corpus4TEST')

In [34]:
#for anti
anti_cor, anti_cor_serial, anti_dict = get_corpus_dict('C:/Users/kec52/Documents/BLtext_research/antiVaxTextLib/antiTexts', 'anti_corpus')

2018-09-04 22:47:25,429 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-09-04 22:47:26,535 : INFO : built Dictionary(32071 unique tokens: ['aban', 'abandon', 'abandoned', 'abated', 'abili']...) from 18 documents (total 355355 corpus positions)
2018-09-04 22:47:26,579 : INFO : discarding 20173 tokens: [('abandon', 10), ('abili', 1), ('able', 15), ('abra', 1), ('abroad', 10), ('absolute', 14), ('absolutely', 13), ('absorbent', 1), ('accidental', 10), ('accord', 11)]...
2018-09-04 22:47:26,579 : INFO : keeping 11898 tokens which were in no less than 2 and no more than 9 (=50.0%) documents
2018-09-04 22:47:26,595 : INFO : resulting dictionary: Dictionary(11898 unique tokens: ['aban', 'abandoned', 'abated', 'abilities', 'ablest']...)
2018-09-04 22:47:26,611 : INFO : saving Dictionary object under anti_corpus_dictionary.dict, separately None
2018-09-04 22:47:26,619 : INFO : saved anti_corpus_dictionary.dict
2018-09-04 22:47:27,666 : INFO : storing corpus in Matrix Market 

In [35]:
LDA = lda_model(anti_cor, anti_dict, 50, 6, 100)

2018-09-04 22:47:27,746 : INFO : using symmetric alpha at 0.16666666666666666
2018-09-04 22:47:27,750 : INFO : using symmetric eta at 0.16666666666666666
2018-09-04 22:47:27,754 : INFO : using serial LDA version on this node
2018-09-04 22:47:27,762 : INFO : running online (multi-pass) LDA training, 6 topics, 50 passes over the supplied corpus of 18 documents, updating model once every 18 documents, evaluating perplexity every 18 documents, iterating 50x with a convergence threshold of 0.001000


makin model


2018-09-04 22:47:29,064 : INFO : -10.330 per-word bound, 1287.2 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:47:29,064 : INFO : PROGRESS: pass 0, at document #18/18
2018-09-04 22:47:29,152 : INFO : topic #4 (0.167): 0.024*"smallpox" + 0.010*"cowpox" + 0.006*"leicester" + 0.003*"table" + 0.002*"calf" + 0.002*"commission" + 0.002*"pearson" + 0.002*"variolation" + 0.001*"fatality" + 0.001*"baron"
2018-09-04 22:47:29,152 : INFO : topic #0 (0.167): 0.025*"smallpox" + 0.011*"cowpox" + 0.010*"leicester" + 0.003*"table" + 0.002*"commission" + 0.002*"calf" + 0.002*"fatality" + 0.001*"variolation" + 0.001*"pearson" + 0.001*"walker"
2018-09-04 22:47:29,156 : INFO : topic #5 (0.167): 0.021*"smallpox" + 0.016*"leicester" + 0.007*"cowpox" + 0.005*"table" + 0.003*"fatality" + 0.002*"commission" + 0.001*"calf" + 0.001*"pearson" + 0.001*"toxin" + 0.001*"claim"
2018-09-04 22:47:29,156 : INFO : topic #2 (0.167): 0.012*"smallpox" + 0.011*"leicester" + 0.00

2018-09-04 22:47:36,186 : INFO : topic #3 (0.167): 0.003*"smallpox" + 0.002*"humours" + 0.002*"organic" + 0.002*"pus" + 0.001*"god" + 0.001*"humour" + 0.001*"taints" + 0.001*"company" + 0.001*"philosophy" + 0.001*"german"
2018-09-04 22:47:36,186 : INFO : topic diff=0.276095, rho=0.377964
2018-09-04 22:47:37,489 : INFO : -8.832 per-word bound, 455.8 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:47:37,489 : INFO : PROGRESS: pass 6, at document #18/18
2018-09-04 22:47:37,557 : INFO : topic #1 (0.167): 0.003*"spurious" + 0.003*"genuine" + 0.002*"circumstance" + 0.002*"failures" + 0.002*"birch" + 0.002*"baron" + 0.002*"assertions" + 0.002*"candour" + 0.002*"scrophula" + 0.001*"uniformly"
2018-09-04 22:47:37,557 : INFO : topic #5 (0.167): 0.038*"leicester" + 0.011*"table" + 0.007*"commission" + 0.005*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"serum" + 0.003*"diphtheria" + 0.002*"tuberculosis"
2018-09-04 22:47:37,56

2018-09-04 22:47:44,431 : INFO : topic #5 (0.167): 0.039*"leicester" + 0.012*"table" + 0.007*"commission" + 0.005*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"serum" + 0.003*"diphtheria" + 0.002*"tuberculosis"
2018-09-04 22:47:44,435 : INFO : topic #4 (0.167): 0.035*"smallpox" + 0.006*"talk" + 0.004*"inquirer" + 0.003*"hygiene" + 0.002*"germany" + 0.002*"hat" + 0.002*"switzerland" + 0.002*"german" + 0.002*"cowpox" + 0.002*"sore"
2018-09-04 22:47:44,435 : INFO : topic diff=0.050889, rho=0.277350
2018-09-04 22:47:45,745 : INFO : -8.804 per-word bound, 446.9 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:47:45,745 : INFO : PROGRESS: pass 12, at document #18/18
2018-09-04 22:47:45,813 : INFO : topic #1 (0.167): 0.003*"spurious" + 0.003*"genuine" + 0.002*"circumstance" + 0.002*"failures" + 0.002*"birch" + 0.002*"baron" + 0.002*"assertions" + 0.002*"candour" + 0.002*"scrophula" + 0.001*"uniformly"
2018-09-04 22:47:45,

2018-09-04 22:47:52,689 : INFO : topic #3 (0.167): 0.002*"humours" + 0.002*"pus" + 0.002*"organic" + 0.002*"smau" + 0.002*"god" + 0.002*"german" + 0.002*"company" + 0.002*"taints" + 0.001*"philosophy" + 0.001*"humour"
2018-09-04 22:47:52,689 : INFO : topic #1 (0.167): 0.003*"spurious" + 0.003*"genuine" + 0.002*"circumstance" + 0.002*"failures" + 0.002*"birch" + 0.002*"baron" + 0.002*"assertions" + 0.002*"candour" + 0.002*"scrophula" + 0.001*"uniformly"
2018-09-04 22:47:52,693 : INFO : topic #0 (0.167): 0.042*"smallpox" + 0.019*"cowpox" + 0.003*"pearson" + 0.003*"variolation" + 0.002*"walker" + 0.002*"baron" + 0.002*"spurious" + 0.002*"god" + 0.001*"claim" + 0.001*"woodville"
2018-09-04 22:47:52,693 : INFO : topic diff=0.011972, rho=0.229416
2018-09-04 22:47:54,015 : INFO : -8.801 per-word bound, 446.1 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:47:54,019 : INFO : PROGRESS: pass 18, at document #18/18
2018-09-04 22:47:54,083 : INFO : to

2018-09-04 22:48:00,926 : INFO : topic #0 (0.167): 0.042*"smallpox" + 0.019*"cowpox" + 0.003*"pearson" + 0.003*"variolation" + 0.002*"walker" + 0.002*"baron" + 0.002*"spurious" + 0.002*"god" + 0.001*"claim" + 0.001*"woodville"
2018-09-04 22:48:00,926 : INFO : topic #3 (0.167): 0.002*"humours" + 0.002*"pus" + 0.002*"organic" + 0.002*"smau" + 0.002*"god" + 0.002*"german" + 0.002*"company" + 0.002*"taints" + 0.001*"philosophy" + 0.001*"humour"
2018-09-04 22:48:00,926 : INFO : topic #2 (0.167): 0.010*"areola" + 0.009*"pustule" + 0.004*"vesicles" + 0.004*"phenomena" + 0.004*"bryce" + 0.004*"inflammation" + 0.004*"affection" + 0.003*"pus" + 0.003*"scab" + 0.003*"oldest"
2018-09-04 22:48:00,930 : INFO : topic #5 (0.167): 0.040*"leicester" + 0.012*"table" + 0.007*"commission" + 0.005*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"serum" + 0.003*"diphtheria" + 0.002*"tuberculosis"
2018-09-04 22:48:00,930 : INFO : topic diff=0.004979, rho=0.200000
2018-09-04 22:48:02,264

2018-09-04 22:48:09,155 : INFO : PROGRESS: pass 29, at document #18/18
2018-09-04 22:48:09,223 : INFO : topic #4 (0.167): 0.035*"smallpox" + 0.006*"talk" + 0.004*"inquirer" + 0.003*"hygiene" + 0.002*"germany" + 0.002*"switzerland" + 0.002*"hat" + 0.002*"german" + 0.002*"sore" + 0.002*"dresden"
2018-09-04 22:48:09,227 : INFO : topic #2 (0.167): 0.010*"areola" + 0.009*"pustule" + 0.004*"vesicles" + 0.004*"bryce" + 0.004*"phenomena" + 0.004*"inflammation" + 0.004*"affection" + 0.004*"pus" + 0.003*"scab" + 0.003*"oldest"
2018-09-04 22:48:09,227 : INFO : topic #3 (0.167): 0.002*"humours" + 0.002*"pus" + 0.002*"organic" + 0.002*"smau" + 0.002*"god" + 0.002*"german" + 0.002*"taints" + 0.002*"company" + 0.001*"philosophy" + 0.001*"humour"
2018-09-04 22:48:09,227 : INFO : topic #5 (0.167): 0.040*"leicester" + 0.012*"table" + 0.007*"commission" + 0.005*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"diphtheria" + 0.003*"serum" + 0.002*"tuberculosis"
2018-09-04 22:48:09,22

2018-09-04 22:48:16,146 : INFO : topic diff=0.003182, rho=0.166667
2018-09-04 22:48:17,460 : INFO : -8.798 per-word bound, 445.2 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:48:17,460 : INFO : PROGRESS: pass 35, at document #18/18
2018-09-04 22:48:17,528 : INFO : topic #4 (0.167): 0.035*"smallpox" + 0.006*"talk" + 0.004*"inquirer" + 0.003*"hygiene" + 0.002*"germany" + 0.002*"switzerland" + 0.002*"german" + 0.002*"hat" + 0.002*"sore" + 0.002*"dresden"
2018-09-04 22:48:17,528 : INFO : topic #0 (0.167): 0.041*"smallpox" + 0.019*"cowpox" + 0.003*"pearson" + 0.003*"variolation" + 0.002*"walker" + 0.002*"baron" + 0.002*"spurious" + 0.002*"god" + 0.001*"claim" + 0.001*"woodville"
2018-09-04 22:48:17,528 : INFO : topic #5 (0.167): 0.040*"leicester" + 0.012*"table" + 0.007*"commission" + 0.006*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"diphtheria" + 0.003*"serum" + 0.002*"tuberculosis"
2018-09-04 22:48:17,532 : INFO 

2018-09-04 22:48:24,474 : INFO : topic #3 (0.167): 0.002*"humours" + 0.002*"organic" + 0.002*"pus" + 0.002*"smau" + 0.002*"god" + 0.002*"taints" + 0.002*"german" + 0.002*"company" + 0.001*"philosophy" + 0.001*"humour"
2018-09-04 22:48:24,478 : INFO : topic diff=0.003037, rho=0.154303
2018-09-04 22:48:25,781 : INFO : -8.797 per-word bound, 444.9 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:48:25,781 : INFO : PROGRESS: pass 41, at document #18/18
2018-09-04 22:48:25,845 : INFO : topic #3 (0.167): 0.002*"humours" + 0.002*"organic" + 0.002*"pus" + 0.002*"smau" + 0.002*"god" + 0.002*"taints" + 0.002*"german" + 0.002*"company" + 0.001*"philosophy" + 0.001*"humour"
2018-09-04 22:48:25,849 : INFO : topic #0 (0.167): 0.041*"smallpox" + 0.019*"cowpox" + 0.003*"pearson" + 0.003*"variolation" + 0.002*"walker" + 0.002*"baron" + 0.002*"spurious" + 0.002*"god" + 0.001*"claim" + 0.001*"establishment"
2018-09-04 22:48:25,849 : INFO : topic #5 (0.167): 0

2018-09-04 22:48:32,787 : INFO : topic #2 (0.167): 0.010*"areola" + 0.009*"pustule" + 0.004*"vesicles" + 0.004*"bryce" + 0.004*"phenomena" + 0.004*"inflammation" + 0.004*"affection" + 0.004*"pus" + 0.004*"scab" + 0.003*"oldest"
2018-09-04 22:48:32,787 : INFO : topic #5 (0.167): 0.040*"leicester" + 0.012*"table" + 0.007*"commission" + 0.006*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"diphtheria" + 0.003*"serum" + 0.002*"tuberculosis"
2018-09-04 22:48:32,791 : INFO : topic diff=0.002921, rho=0.144338
2018-09-04 22:48:34,110 : INFO : -8.796 per-word bound, 444.6 perplexity estimate based on a held-out corpus of 18 documents with 123048 words
2018-09-04 22:48:34,110 : INFO : PROGRESS: pass 47, at document #18/18
2018-09-04 22:48:34,177 : INFO : topic #5 (0.167): 0.040*"leicester" + 0.012*"table" + 0.007*"commission" + 0.006*"fatality" + 0.004*"toxin" + 0.004*"calf" + 0.003*"percentage" + 0.003*"diphtheria" + 0.003*"serum" + 0.002*"tuberculosis"
2018-09-04 22:48:

used: 69.20s


print_topics(LDA, 6)

anti_topics = get_topics('anti_corpus_w6TEST',LDA,6)
print(anti_topics)


#make viz
make_ldavis(LDA, anti_cor_serial, anti_dict, 'anti_corpusw6')

In [36]:
#get everything for full corpora
both_cor, both_cor_serial, both_dict = get_corpus_dict('C:/Users/kec52/Documents/BLtext_research/bothTexts', 'both_corpus')

2018-09-04 22:48:37,038 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-09-04 22:48:38,771 : INFO : built Dictionary(44491 unique tokens: ['aad', 'abashed', 'aberdeen', 'abilities', 'ability']...) from 34 documents (total 549103 corpus positions)
2018-09-04 22:48:38,827 : INFO : discarding 28213 tokens: [('abashed', 1), ('able', 28), ('abroad', 20), ('absolutely', 23), ('abvantages', 1), ('acapulco', 1), ('accept', 18), ('accessary', 1), ('according', 26), ('accordingly', 18)]...
2018-09-04 22:48:38,827 : INFO : keeping 16278 tokens which were in no less than 2 and no more than 17 (=50.0%) documents
2018-09-04 22:48:38,855 : INFO : resulting dictionary: Dictionary(16278 unique tokens: ['aad', 'aberdeen', 'abilities', 'ability', 'abire']...)
2018-09-04 22:48:38,883 : INFO : saving Dictionary object under both_corpus_dictionary.dict, separately None
2018-09-04 22:48:38,891 : INFO : saved both_corpus_dictionary.dict
2018-09-04 22:48:40,517 : INFO : storing corpus in Ma

In [39]:
evaluate_graph(both_cor, both_dict, 50, 1000, 20)

makin model


AttributeError: 'MyCorpus' object has no attribute 'keys'

In [27]:
bothLDA = lda_model(both_cor, both_dict, 20, 10, 100)

2018-09-04 22:44:17,212 : INFO : using symmetric alpha at 0.1
2018-09-04 22:44:17,216 : INFO : using symmetric eta at 0.1
2018-09-04 22:44:17,220 : INFO : using serial LDA version on this node
2018-09-04 22:44:17,244 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 34 documents, updating model once every 34 documents, evaluating perplexity every 34 documents, iterating 50x with a convergence threshold of 0.001000


makin model


2018-09-04 22:44:19,317 : INFO : -11.053 per-word bound, 2124.0 perplexity estimate based on a held-out corpus of 34 documents with 218978 words
2018-09-04 22:44:19,317 : INFO : PROGRESS: pass 0, at document #34/34
2018-09-04 22:44:19,473 : INFO : topic #3 (0.100): 0.011*"smallpox" + 0.009*"leicester" + 0.004*"cowpox" + 0.003*"sheffield" + 0.002*"guardians" + 0.002*"borough" + 0.002*"million" + 0.002*"birch" + 0.002*"fatality" + 0.002*"red"
2018-09-04 22:44:19,473 : INFO : topic #9 (0.100): 0.019*"smallpox" + 0.006*"cowpox" + 0.005*"sheffield" + 0.003*"borough" + 0.003*"leicester" + 0.003*"aged" + 0.002*"revaccinated" + 0.002*"pitted" + 0.001*"birch" + 0.001*"epidemics"
2018-09-04 22:44:19,477 : INFO : topic #7 (0.100): 0.015*"smallpox" + 0.013*"sheffield" + 0.006*"cowpox" + 0.003*"revaccinated" + 0.003*"leicester" + 0.003*"aged" + 0.003*"borough" + 0.002*"census" + 0.002*"pitted" + 0.002*"west"
2018-09-04 22:44:19,477 : INFO : topic #4 (0.100): 0.014*"smallpox" + 0.007*"cowpox" + 0.00

2018-09-04 22:44:31,449 : INFO : topic #5 (0.100): 0.015*"birch" + 0.009*"ringwood" + 0.004*"pamphlet" + 0.003*"failures" + 0.003*"ring" + 0.003*"adams" + 0.003*"governors" + 0.002*"conduct" + 0.002*"dear" + 0.002*"westcott"
2018-09-04 22:44:31,449 : INFO : topic diff=0.532284, rho=0.377964
2018-09-04 22:44:33,502 : INFO : -8.882 per-word bound, 471.9 perplexity estimate based on a held-out corpus of 34 documents with 218978 words
2018-09-04 22:44:33,502 : INFO : PROGRESS: pass 6, at document #34/34
2018-09-04 22:44:33,630 : INFO : topic #6 (0.100): 0.011*"smallpox" + 0.003*"calf" + 0.003*"god" + 0.002*"teeth" + 0.002*"poisoning" + 0.002*"syphilitic" + 0.002*"figures" + 0.002*"prison" + 0.001*"doctors" + 0.001*"registrar"
2018-09-04 22:44:33,630 : INFO : topic #9 (0.100): 0.028*"smallpox" + 0.005*"talk" + 0.003*"inquirer" + 0.003*"hygiene" + 0.002*"german" + 0.002*"doctors" + 0.002*"congress" + 0.002*"germany" + 0.002*"cowpox" + 0.002*"sanitation"
2018-09-04 22:44:33,634 : INFO : topic

KeyboardInterrupt: 

In [None]:
print_topics(bothLDA, 10)

both_topics = get_topics('both_corpus_w9TEST', bothLDA, 9)
print(both_topics)

#make viz for both
make_ldavis(bothLDA, both_cor_serial, both_dict, 'both_corpus_9TEST')

corpus = pickle.load(open('C:/Users/kec52/Documents/BLtext_research/both_corpus.pkl', 'rb'))
serial_corpus = MmCorpus(datapath('C:/Users/kec52/Documents/BLtext_research/tmp/both_corpus_serialized.mm'))
dictionary = Dictionary.load('C:/Users/kec52/Documents/BLtext_research/both_corpus_dictionary.dict')

lda = lda_model(corpus,dictionary, 50, 9)

print_topics(lda, 10)

get_topics('test_both2TEST', lda, 9)

make_ldavis(lda, serialized_corpus, dictionary, 'both_test_again2')