In [None]:
# imports needed and logging
import os
import sys
import logging
from time import time
import pickle
import io
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import gensim 
from gensim import corpora, models, similarities, utils
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import MmCorpus, Dictionary
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis.gensim
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.text import Text



 
# Configure the root logger once for the notebook: every record at INFO level
# or above is emitted as "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# WordNet base-form lookup
def get_lemma(word):
    """Return the base form that WordNet's ``morphy`` finds for ``word``.

    When ``morphy`` returns ``None`` the word is returned unchanged.
    """
    # NOTE(review): no part of speech is passed to morphy, so the lookup is
    # not visibly restricted to verbs -- confirm the intended behaviour.
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# WordNetLemmatizer-based lemmatization
def get_lemma2(word):
    """Lemmatize ``word`` with a newly created ``WordNetLemmatizer``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's own default applies (presumably nouns -- confirm against the
    NLTK documentation).
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [None]:
def tokenize(text):
    """Tokenize ``text`` with ``simple_preprocess`` and drop stopwords.

    Returns a list holding, in their original order, the tokens that are not
    members of ``STOPWORDS``.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [None]:
# text cleaning
def clean_txt(text):
    """Tokenize ``text`` and keep only the tokens longer than two characters.

    Returns the surviving tokens as a list, in their original order.
    """
    # Optional lemmatization step (currently disabled): get_lemma or
    # get_lemma2 could be applied to each surviving token here.
    return list(filter(lambda tok: len(tok) > 2, tokenize(text)))

In [None]:
def iter_documents(top_directory):
    """Yield one tokenized document per ``.txt`` file found under ``top_directory``.

    The directory tree is walked recursively with ``os.walk``. Each file whose
    name ends in ``.txt`` is read in full as UTF-8 text (undecodable bytes are
    ignored) and passed through ``clean_txt``.

    Args:
        top_directory: root directory to search for ``.txt`` files.

    Yields:
        The return value of ``clean_txt`` for each file's contents.
    """
    for root, _dirs, files in os.walk(top_directory):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            path = os.path.join(root, filename)
            # Close the handle as soon as the file has been read instead of
            # relying on garbage collection to release it.
            with io.open(path, encoding='utf-8', errors='ignore') as handle:
                document = handle.read()  # the entire document as one string
            yield clean_txt(document)  # or whatever tokenization suits


In [None]:
class MyCorpus(object):
    """Streaming bag-of-words corpus over the ``.txt`` files below a directory.

    The dictionary is built once at construction time; documents are re-read
    from disk through ``iter_documents`` on every iteration.
    """

    def __init__(self, top_dir):
        """Build the dictionary from the documents under ``top_dir`` and prune it."""
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        # check API docs for pruning params
        self.dictionary.filter_extremes(no_below=2, keep_n=30000)

    def __len__(self):
        """Count the ``.txt`` files under ``top_dir``.

        The directory tree is walked on every call; the count is also stored
        on the instance as ``_data_len``.
        """
        self._data_len = sum(
            1
            for _root, _dirs, names in os.walk(self.top_dir)
            for name in names
            if name.endswith('.txt')
        )
        return self._data_len

    def __iter__(self):
        """Yield the dictionary's ``doc2bow`` output for each document in turn."""
        for doc_tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(doc_tokens)


In [None]:
def get_corpus_dict(top_dir, corpus_name):
    """Build a corpus from ``top_dir`` and persist it under the ``corpus_name`` prefix.

    Files written (each name is ``corpus_name`` plus a suffix):
        ``.pkl``              the pickled ``MyCorpus`` object
        ``_dictionary.dict``  the dictionary, via its ``save`` method
        ``_serialized.mm``    the bag-of-words vectors, via ``MmCorpus.serialize``

    Args:
        top_dir: directory handed to ``MyCorpus`` (a string path).
        corpus_name: string prefix for the output files.

    Returns:
        ``(corpus, new_corpus, dictionary)`` where ``new_corpus`` is the list
        of vectors obtained by iterating ``corpus`` once and ``dictionary`` is
        ``corpus.dictionary`` with its ``id2token`` reverse index filled in.
    """
    corpus = MyCorpus(top_dir)
    # Save the corpus object; the context manager guarantees the pickle file
    # is flushed and closed even if dumping fails.
    with open(corpus_name + '.pkl', 'wb') as pickle_file:
        pickle.dump(corpus, pickle_file)
    # Save the dictionary.
    dictionary = corpus.dictionary
    dictionary.save(corpus_name + '_dictionary.dict')
    # Wrap in iter() so list() does not ask the corpus for its length first
    # (MyCorpus.__len__ walks the directory tree).
    new_corpus = list(iter(corpus))
    corpora.MmCorpus.serialize(corpus_name + '_serialized.mm', new_corpus)
    # Build the reverse (id -> token) index. This happens after the pickle and
    # the dictionary file were written, so only the in-memory object gets it.
    for token, uid in dictionary.token2id.items():
        dictionary.id2token[uid] = token
    return corpus, new_corpus, dictionary

In [None]:
def lda_model(corpus, dictionary, num_passes, num_topics, random_state=None):
    """Train a gensim ``LdaModel`` and print how long training took.

    Args:
        corpus: bag-of-words corpus passed to ``LdaModel``.
        dictionary: passed to ``LdaModel`` as ``id2word``.
        num_passes: passed to ``LdaModel`` as ``passes``.
        num_topics: number of topics to fit.
        random_state: optional seed forwarded to ``LdaModel`` so that a run can
            be reproduced; the default ``None`` leaves seeding to gensim.

    Returns:
        The trained ``LdaModel`` instance.
    """
    start = time()
    model = gensim.models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        passes=num_passes,
        num_topics=num_topics,
        random_state=random_state,
    )
    # Report the elapsed wall-clock time in seconds.
    print('used: {:.2f}s'.format(time() - start))
    return model

In [None]:
def print_topics(model, num_words):
    """Print each entry returned by ``model.print_topics`` on its own line.

    ``num_words`` is forwarded to ``model.print_topics`` as a keyword argument.
    """
    for entry in model.print_topics(num_words=num_words):
        print(entry)

In [None]:
# print nice df of topics
def get_topics(corpus_name, model, num_topics, topn=20):
    """Print the top words of each topic as a DataFrame and save it as CSV.

    One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...;
    each column holds the first element of every entry returned by
    ``model.show_topic``. The frame is printed and written to
    ``corpus_name + '_topics.csv'``.

    Args:
        corpus_name: string prefix for the CSV file name.
        model: object exposing ``show_topic(topic_id, topn=...)``.
        num_topics: number of topics to tabulate (ids ``0 .. num_topics - 1``).
        topn: how many words to request per topic (default 20).

    Returns:
        None.
    """
    # NOTE(review): get_topics is defined again in a later cell; after a full
    # run that later definition is the one in effect. One cell can be removed.
    word_dict = {}
    for topic_id in range(num_topics):
        top_terms = model.show_topic(topic_id, topn=topn)
        column = 'Topic # ' + '{:02d}'.format(topic_id + 1)
        # Keep only the word from each entry, dropping its weight.
        word_dict[column] = [entry[0] for entry in top_terms]
    topics = pd.DataFrame(word_dict)
    print(topics)
    topics.to_csv(corpus_name + '_topics.csv')

In [None]:
# print nice df of topics
def get_topics(corpus_name, model, num_topics, topn=20):
    """Print the top words of each topic as a DataFrame and save it as CSV.

    One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...;
    each column holds the first element of every entry returned by
    ``model.show_topic``. The frame is printed and written to
    ``corpus_name + '_topics.csv'``.

    Args:
        corpus_name: string prefix for the CSV file name.
        model: object exposing ``show_topic(topic_id, topn=...)``.
        num_topics: number of topics to tabulate (ids ``0 .. num_topics - 1``).
        topn: how many words to request per topic (default 20).

    Returns:
        None.
    """
    # NOTE(review): an earlier cell also defines get_topics; this later
    # definition replaces it. One of the two cells can be removed.
    word_dict = {}
    for topic_id in range(num_topics):
        top_terms = model.show_topic(topic_id, topn=topn)
        column = 'Topic # ' + '{:02d}'.format(topic_id + 1)
        # Keep only the word from each entry, dropping its weight.
        word_dict[column] = [entry[0] for entry in top_terms]
    topics = pd.DataFrame(word_dict)
    print(topics)
    topics.to_csv(corpus_name + '_topics.csv')

In [None]:
def make_ldavis(model, serial_corpus, dictionary, corpus_name):
    """Prepare a pyLDAvis visualisation of ``model`` and save it as HTML.

    Notebook display is enabled first; the prepared data is then written to
    ``corpus_name + '_lda.html'``. Nothing is returned.
    """
    pyLDAvis.enable_notebook()
    pyLDAvis.save_html(
        pyLDAvis.gensim.prepare(model, serial_corpus, dictionary),
        corpus_name + '_lda.html',
    )

In [None]:
def evaluate_graph(dictionary, corpus, limit, passes=30):
    """Train one LDA model per topic count and plot each model's coherence.

    A model is trained with ``lda_model`` for every ``num_topics`` in
    ``range(1, limit)`` (so ``limit`` itself is excluded) and scored with
    gensim's ``CoherenceModel`` using the ``u_mass`` measure. The scores are
    then plotted against the number of topics.

    Args:
        dictionary: gensim dictionary.
        corpus: gensim corpus.
        limit: exclusive upper bound on the number of topics to try.
        passes: training passes used for every model (default 30).

    Returns:
        ``(lm_list, c_v)``: the trained models and the coherence value of each,
        both ordered by increasing ``num_topics``.
    """
    coherence_measure = 'u_mass'
    topic_counts = list(range(1, limit))
    lm_list = []
    c_v = []  # coherence scores; the name is historical, the values are u_mass
    for num_topics in topic_counts:
        lm = lda_model(corpus, dictionary, passes, num_topics)
        lm_list.append(lm)
        cm = CoherenceModel(
            model=lm,
            corpus=corpus,
            dictionary=dictionary,
            coherence=coherence_measure,
        )
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # Pass a list of labels: a bare string would be iterated character by
    # character, labelling the line with only its first letter.
    plt.legend([coherence_measure], loc='best')
    plt.show()

    return lm_list, c_v