In [13]:
# imports needed and logging
import os
import sys
import gensim 
import logging
import pickle
from helper import *
import warnings
from gensim.corpora.textcorpus import TextCorpus, TextDirectoryCorpus
from gensim import corpora, models, similarities, utils
from gensim.test.utils import datapath
from gensim.utils import smart_open, simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis.gensim
import io
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
from time import time
from collections import defaultdict

 
# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(format= '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [14]:
#how to lemmatize
def get_lemma(word):
    """Return ``wn.morphy(word)``, or ``word`` itself when morphy yields ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [15]:
def tokenize(text):
    """Tokenize ``text`` with ``simple_preprocess`` and drop tokens found in ``STOPWORDS``.

    Returns:
        list: the surviving tokens, in the order ``simple_preprocess`` produced them.
    """
    kept = []
    for candidate in simple_preprocess(text):
        if candidate in STOPWORDS:
            continue
        kept.append(candidate)
    return kept

In [16]:
#clean texts
def clean_txt(text):
    """Turn raw ``text`` into a list of lemmatized tokens.

    The text is tokenized with ``tokenize``; tokens of two characters or
    fewer are discarded, and each remaining token is passed through
    ``get_lemma2``. The length check is applied to the token *before*
    lemmatization.
    """
    return [get_lemma2(tok) for tok in tokenize(text) if len(tok) > 2]

In [17]:
def iter_documents(top_directory):
    """Iterate over all documents, yielding one document (a list of tokens) at a time.

    Walks ``top_directory`` recursively with ``os.walk`` and, for every file
    whose name ends in ``.txt``, reads the whole file as UTF-8 text, passes
    it through ``clean_txt`` and yields the resulting token list.

    As a progress indicator, the file name and a running 1-based count are
    printed for each document before it is read.

    Args:
        top_directory: root directory to search for ``.txt`` files.

    Yields:
        list: the tokens returned by ``clean_txt`` for one file.
    """
    count = 0
    for root, _dirs, files in os.walk(top_directory):
        for file in files:
            if not file.endswith('.txt'):
                continue
            print(file)
            count += 1
            print(count)
            # Read the entire document as one big string; the ``with`` block
            # closes the handle right away instead of leaving it to the
            # garbage collector.
            with io.open(os.path.join(root, file), encoding='utf-8') as handle:
                document = handle.read()
            yield clean_txt(document)  # or whatever tokenization suits you


In [18]:
#print nice df of topics
def get_topics(model, num_topics, topn=20):
    """Collect the top words of each topic into a DataFrame, one column per topic.

    Args:
        model: object exposing ``show_topic(topic_id, topn=...)`` that returns
            a sequence of entries whose first element is the word.
        num_topics (int): number of topics to tabulate; topic ids
            ``0 .. num_topics - 1`` are queried.
        topn (int): how many words to request per topic. Defaults to 20.

    Returns:
        pandas.DataFrame: columns are labelled ``'Topic # 01'``,
        ``'Topic # 02'``, ... (1-based, zero-padded to two digits) and each
        column holds that topic's words in the order ``show_topic`` returned
        them.
    """
    topic_words = {}
    for topic_id in range(num_topics):
        label = 'Topic # ' + '{:02d}'.format(topic_id + 1)
        entries = model.show_topic(topic_id, topn=topn)
        # Keep only the word; the second element (the weight) is dropped.
        topic_words[label] = [entry[0] for entry in entries]
    return pd.DataFrame(topic_words)

In [19]:
class MyCorpus(object):
    """Streamed bag-of-words corpus over the documents found under a directory.

    Construction makes one full pass over ``iter_documents(top_dir)`` to
    build a gensim ``Dictionary``, then prunes it with ``filter_extremes``.
    Every iteration over the instance makes another full pass over the
    files and yields one ``doc2bow`` vector per document, so documents are
    never all held in memory at once.

    Attributes are documented inline below.
    """

    def __init__(self, top_dir):
        # Root directory that is re-walked on every iteration.
        self.top_dir = top_dir
        # Vocabulary built from a first pass over all documents.
        vocabulary = gensim.corpora.Dictionary(iter_documents(top_dir))
        # Prune the vocabulary; see gensim's filter_extremes docs for the
        # exact meaning of no_below / keep_n (and the defaults left unset).
        vocabulary.filter_extremes(no_below=2, keep_n=30000)
        self.dictionary = vocabulary

    def __iter__(self):
        for doc_tokens in iter_documents(self.top_dir):
            bow_vector = self.dictionary.doc2bow(doc_tokens)
            yield bow_vector


# Make the corpus. Constructing MyCorpus walks the directory once to build
# and prune its dictionary (see MyCorpus.__init__).
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine without editing it.
pro_corpus = MyCorpus('C:/Users/kec52/Documents/BLtext_research/proVaxTextLib/proTexts') # corpus object; its .dictionary is built here

In [22]:
def get_corpus_dict(top_dir, corpus_name):
    """Build a corpus from ``top_dir``, persist it, and return it with its dictionary.

    Args:
        top_dir (str): directory passed to ``MyCorpus``.
        corpus_name (str): prefix used for the files written to disk.

    Returns:
        tuple: ``(new_corpus, dictionary)`` where ``new_corpus`` is the list
        of bag-of-words vectors produced by iterating the ``MyCorpus`` and
        ``dictionary`` is that corpus's ``dictionary`` attribute.

    Side effects:
        * saves the dictionary to ``<corpus_name>_dictionary.dict`` in the
          current working directory;
        * writes the corpus in Matrix Market format to
          ``tmp/<corpus_name>_serialized.mm`` (``tmp/`` is created if needed);
        * prints the dictionary and its ``token2id`` mapping.
    """
    corpus = MyCorpus(top_dir)
    # Save dictionary.
    dictionary = corpus.dictionary
    dictionary.save(corpus_name +'_dictionary.dict')
    print(dictionary)
    print(dictionary.token2id)
    # Materialise the streamed corpus; this is a second pass over the files.
    new_corpus = list(corpus)
    # Serialization fails if the target directory is missing, so create it.
    os.makedirs('tmp', exist_ok=True)
    corpora.MmCorpus.serialize('tmp/'+corpus_name+'_serialized.mm', new_corpus)
    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token
    return new_corpus, dictionary

In [23]:
# Build the small test corpus: x is the list of bag-of-words vectors,
# y is the dictionary returned alongside it by get_corpus_dict.
x,y = get_corpus_dict('C:/Users/kec52/Documents/BLtext_research/proVaxTextLib/testingtexts', 'testCor')

help.txt
1
help1.txt
2
help2.txt
3
help3.txt
4
help4.txt
5
help5.txt
6
help6.txt
7
help7.txt
8
help8.txt
9
Dictionary(10 unique tokens: ['human', 'interface', 'response', 'survey', 'time']...)
{'human': 0, 'interface': 1, 'response': 2, 'survey': 3, 'time': 4, 'user': 5, 'eps': 6, 'tree': 7, 'graph': 8, 'minor': 9}
help.txt
1
help1.txt
2
help2.txt
3
help3.txt
4
help4.txt
5
help5.txt
6
help6.txt
7
help7.txt
8
help8.txt
9


In [77]:
# Grab the dictionary of the test corpus and print it (saving is commented out).
# NOTE(review): `test_corpus` is not defined in any visible cell; this cell
# appears to rely on leftover kernel state. Confirm where it comes from.
dictionary = test_corpus.dictionary
#dictionary.save(corpus_name +'_dictionary.dict')
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [87]:
# Show the token -> integer id mapping held by `dictionary`.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [101]:
# Materialise the test corpus as a list of bag-of-words vectors, write it in
# Matrix Market format under tmp/, and print the vectors.
# NOTE(review): `test_corpus` is not defined in any visible cell, and the
# tmp/ directory must already exist for the serialize call to succeed.
corpus_name = 'new_corpus'
new_corpus = [vector for vector in iter(test_corpus)]
corpora.MmCorpus.serialize('tmp/'+corpus_name+'_serialized.mm', new_corpus)
print(new_corpus)

help.txt
1
help1.txt
2
help2.txt
3
help3.txt
4
help4.txt
5
help5.txt
6
help6.txt
7
help7.txt
8
help8.txt
9
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [104]:
# Round-trip check: reopen the serialized corpus from disk, print its summary,
# then print every document it contains.
new_test = corpora.MmCorpus('tmp/new_corpus_serialized.mm')
print(new_test)
print(list(new_test))

MmCorpus(9 documents, 12 features, 28 non-zero entries)
[[(0, 1.0), (1, 1.0), (2, 1.0)], [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)], [(2, 1.0), (5, 1.0), (7, 1.0), (8, 1.0)], [(1, 1.0), (5, 2.0), (8, 1.0)], [(3, 1.0), (6, 1.0), (7, 1.0)], [(9, 1.0)], [(9, 1.0), (10, 1.0)], [(9, 1.0), (10, 1.0), (11, 1.0)], [(4, 1.0), (10, 1.0), (11, 1.0)]]


In [105]:
# Building reverse index: copy every (token, id) pair of token2id into
# id2token so ids can be mapped back to tokens.
for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token


In [24]:
# Fit a 5-topic LDA model on the test corpus (x) built by get_corpus_dict.
# id2word=y ties the model's term ids to the same dictionary that is handed
# to pyLDAvis below, so topics are reported as words rather than bare ids.
# random_state pins the RNG so re-running the cell reproduces the same topics.
lda = gensim.models.LdaModel(corpus=x, id2word=y, num_topics=5, random_state=0)
data = pyLDAvis.gensim.prepare(lda, x, y)

# Write the visualisation to a standalone HTML file in the working directory.
pyLDAvis.save_html(data, 'index_lda.html')

In [33]:
# Persist the pro-vaccination corpus in three forms, all prefixed with corpus_name:
#   <name>_SerializedCorpus.mm  - Matrix Market serialization of the streamed corpus
#   <name>.pkl                  - pickle of the MyCorpus object itself
#   <name>_dictionary.dict      - the corpus dictionary
corpus_name = 'pro_corpus'
# Serializing iterates pro_corpus, which re-reads every document from disk.
corpora.MmCorpus.serialize(corpus_name +'_SerializedCorpus.mm', pro_corpus)
dictionary = pro_corpus.dictionary
# Use a context manager so the pickle file is flushed and closed immediately.
with open(corpus_name + '.pkl', 'wb') as pickle_file:
    pickle.dump(pro_corpus, pickle_file)
dictionary.save(corpus_name +'_dictionary.dict')

1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the state and progress of vaccination in Bengal.txt
3
1808, Hints for the consideration of Parliament.txt
4
1809, Report of the surgeons of the Edinburgh Vaccine Institution.txt
5
1828, Observations on cowpox.txt
6
1829, The question of vaccination popularly considered.txt
7
1830, A cottage dialogue on vaccination.txt
8
1839, Vaccination and re-vaccination.txt
9
1857, On the protective and modifying powers of vaccination.txt
10
1859, Vaccination.txt
11
1869, Compulsory vaccination.txt
12
1869, Is vaccination injurious.txt
13
1888, A pamphlet on vaccination.txt
14
1889, Report on an epidemic of small-pox at Sheffield.txt
15
1899, Vaccination Burke'd.txt
16


In [34]:
# Hyper-parameters for the topic models below.
npasses = 30  # passed as `passes` to the LDA model
ntopics = 5   # number of topics to fit and to tabulate with get_topics
nterms = 10   # NOTE(review): not referenced by any visible cell

In [43]:
# Train LDA on the streamed pro corpus and report the wall-clock time taken.
# pro_corpus re-reads the files each time it is iterated, which is why the
# file names are printed repeatedly while this cell runs.
start = time()
#create object of model w/gensim
LDA = gensim.models.ldamodel.LdaModel
pro_lda_model = LDA(pro_corpus, id2word=pro_corpus.dictionary, passes=npasses, num_topics=ntopics)
#print time
print ('used: {:.2f}s'.format(time()-start))

1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the state and progress of vaccination in Bengal.txt
3
1808, Hints for the consideration of Parliament.txt
4
1809, Report of the surgeons of the Edinburgh Vaccine Institution.txt
5
1828, Observations on cowpox.txt
6
1829, The question of vaccination popularly considered.txt
7
1830, A cottage dialogue on vaccination.txt
8
1839, Vaccination and re-vaccination.txt
9
1857, On the protective and modifying powers of vaccination.txt
10
1859, Vaccination.txt
11
1869, Compulsory vaccination.txt
12
1869, Is vaccination injurious.txt
13
1888, A pamphlet on vaccination.txt
14
1889, Report on an epidemic of small-pox at Sheffield.txt
15
1899, Vaccination Burke'd.txt
16
1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the state and progress of vaccination in Bengal.txt
3
1808, Hints for the consideration of Parliament.

1889, Report on an epidemic of small-pox at Sheffield.txt
15
1899, Vaccination Burke'd.txt
16
1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the state and progress of vaccination in Bengal.txt
3
1808, Hints for the consideration of Parliament.txt
4
1809, Report of the surgeons of the Edinburgh Vaccine Institution.txt
5
1828, Observations on cowpox.txt
6
1829, The question of vaccination popularly considered.txt
7
1830, A cottage dialogue on vaccination.txt
8
1839, Vaccination and re-vaccination.txt
9
1857, On the protective and modifying powers of vaccination.txt
10
1859, Vaccination.txt
11
1869, Compulsory vaccination.txt
12
1869, Is vaccination injurious.txt
13
1888, A pamphlet on vaccination.txt
14
1889, Report on an epidemic of small-pox at Sheffield.txt
15
1899, Vaccination Burke'd.txt
16
1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the stat

1829, The question of vaccination popularly considered.txt
7
1830, A cottage dialogue on vaccination.txt
8
1839, Vaccination and re-vaccination.txt
9
1857, On the protective and modifying powers of vaccination.txt
10
1859, Vaccination.txt
11
1869, Compulsory vaccination.txt
12
1869, Is vaccination injurious.txt
13
1888, A pamphlet on vaccination.txt
14
1889, Report on an epidemic of small-pox at Sheffield.txt
15
1899, Vaccination Burke'd.txt
16
1807, A Rowland for an Olivier.txt
1
1807, Report of the Royal College of Physicians of London.txt
2
1807, Report on the state and progress of vaccination in Bengal.txt
3
1808, Hints for the consideration of Parliament.txt
4
1809, Report of the surgeons of the Edinburgh Vaccine Institution.txt
5
1828, Observations on cowpox.txt
6
1829, The question of vaccination popularly considered.txt
7
1830, A cottage dialogue on vaccination.txt
8
1839, Vaccination and re-vaccination.txt
9
1857, On the protective and modifying powers of vaccination.txt
10
18

In [44]:
# Print each topic as an (id, "weight*word + ...") pair, four words per topic.
topics = pro_lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.013*"smallpox" + 0.010*"puncture" + 0.005*"pimple" + 0.005*"marks"')
(1, '0.000*"sheffield" + 0.000*"puncture" + 0.000*"pimple" + 0.000*"scab"')
(2, '0.049*"sheffield" + 0.022*"unvaccinated" + 0.018*"borough" + 0.016*"revaccinated"')
(3, '0.004*"syphilis" + 0.004*"modify" + 0.004*"doctor" + 0.004*"protective"')
(4, '0.015*"birch" + 0.006*"ringwood" + 0.003*"pamphlet" + 0.003*"jennerian"')


In [45]:
# Save the trained model to '<corpus_name>.gensim' in the working directory.
pro_lda_model.save( corpus_name + '.gensim')

In [46]:
# Tabulate the top words of each of the `ntopics` LDA topics, one column per topic.
pro_lda_topics = get_topics(pro_lda_model, ntopics)

In [47]:
# Display the topic table.
pro_lda_topics

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,smallpox,sheffield,sheffield,syphilis,birch
1,puncture,puncture,unvaccinated,modify,ringwood
2,pimple,pimple,borough,doctor,pamphlet
3,marks,scab,revaccinated,protective,jennerian
4,pustule,marks,pit,ireland,governor
5,brown,brown,census,marks,moseley
6,scab,borough,respectively,million,conduct
7,areola,upper,workhouse,marson,committee
8,cowpox,unvaccinated,ecclesall,simply,publication
9,cafe,inftitution,barry,phyficians,adams


In [48]:
# Sanity check: True if any cell of the topic table is missing.
pro_lda_topics.isnull().values.any()

False

In [49]:
# Export the topic table to proTopics.csv in the working directory.
pro_lda_topics.to_csv('proTopics.csv')

In [21]:
# Reload the artifacts written by the save cells earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only load pickles you created yourself.
with open('pro_corpus.pkl', 'rb') as pickle_file:
    corpus = pickle.load(pickle_file)
SerializedCorpus = corpora.MmCorpus(corpus_name +'_SerializedCorpus.mm')
# The model was saved as corpus_name + '.gensim' (with corpus_name = 'pro_corpus'),
# so load that same file rather than a differently named one.
lda = gensim.models.ldamodel.LdaModel.load('pro_corpus.gensim')

In [23]:
# Turn on pyLDAvis's notebook integration so visualisations can be shown inline.
pyLDAvis.enable_notebook()

In [24]:
# Prepare and display the pyLDAvis view of the reloaded model; sort_topics=False
# is passed so pyLDAvis is asked not to reorder the topics.
# NOTE(review): the recorded run of this cell raised
#   "ValidationError: Not all rows (distributions) in topic_term_dists sum to 1."
# Presumably `lda`, `SerializedCorpus` and `dictionary` do not all share the same
# vocabulary; `dictionary` is whatever an earlier cell last assigned. Confirm.
lda_display = pyLDAvis.gensim.prepare(lda, SerializedCorpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

# Fit an LSI model on the same streamed corpus and dictionary used for LDA.
pro_lsi_model = models.LsiModel(corpus=pro_corpus,id2word=pro_corpus.dictionary)

# Tabulate the top words of the first `ntopics` LSI topics.
get_topics(pro_lsi_model, ntopics)