### Imports and Setup

In [1]:
#import necessary packages
import pandas as pd
import gensim as gs
import pyLDAvis
import pyLDAvis.gensim
import string

from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
from collections import OrderedDict

import warnings
#silence warnings from here on; warnings raised while running the imports above
#have already been emitted by the time this filter is installed
warnings.filterwarnings('ignore')

#let pandas show up to 500 rows before truncating displayed output
pd.set_option('display.max_rows', 500)

#render matplotlib figures inline in the notebook output
%matplotlib inline
#switch pyLDAvis to notebook mode so its visualizations display in cell output
pyLDAvis.enable_notebook()

  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import defaultdict, deque, Sequence
  from collections import Hashable
  'Matplotlib is building the font cache using fc-list. '
  formatvalue=lambda value: "")[1:-1]


### Processing Functions

In [2]:
def text_preprocessor(dataframe, text_column):
    """Preprocess a text column of a Pandas dataframe, in place, for model building.

    Each value of ``dataframe[text_column]`` is replaced by a list of tokens
    produced by these steps, in order:

    1. join the text's lines with single spaces
    2. delete ASCII punctuation and digits
    3. lowercase and tokenize with ``word_tokenize``
    4. drop English stopwords and the number words 'one'..'ten'
    5. lemmatize each remaining token with ``WordNetLemmatizer``

    Args:
        dataframe: the dataframe to modify. It is mutated; nothing is returned.
        text_column: name of the column holding the raw text strings.
    """
    #characters that are deleted outright from every document
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    #set up the stopwords as a set so membership tests are constant time
    stop = set(stopwords.words('english'))
    stop.update(['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'])

    #punctuation is deleted before stopwords are filtered, so a stopword such as
    #"don't" reaches the filter as "dont"; add the stripped forms so they match too
    stop.update([word.translate(strip_table) for word in stop])

    #build the lemmatizer once instead of once per token
    lemmatizer = WordNetLemmatizer()

    def preprocess(text):
        #remove the line breaks, then the punctuation and digits
        flattened = ' '.join(text.splitlines()).translate(strip_table)

        #tokenize, remove stopwords and lemmatize the results
        tokens = word_tokenize(flattened.lower())
        return [lemmatizer.lemmatize(token) for token in tokens if token not in stop]

    dataframe[text_column] = dataframe[text_column].apply(preprocess)

In [3]:
#create dictionary to show outputs of each topic on a bar graph
def graph_topics(model, corpus, documents, num_topics,normalize=False, num_words=5, figsize=(8,8)):
    """Visualizes the top represented topics in the corpus as a horizontal bar chart.

    A topic is counted once for every document in which
    ``model.get_document_topics`` lists it. Bars are labelled with the topic's
    top words (from ``get_top_terms``) and sorted from least to most represented.

    Args:
        model: trained topic model providing ``get_document_topics`` and ``show_topics``.
        corpus: bag-of-words corpus; expected to line up one-to-one with ``documents``.
        documents: the documents behind ``corpus``; only their count is used.
        num_topics: number of topics to request labels for.
        normalize: if truthy, plot counts divided by the number of documents
            instead of raw document counts.
        num_words: number of top words used in each bar label.
        figsize: size of the matplotlib figure.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    num_documents = len(documents)

    #map "Topic <id>" to that topic's top words
    topics, top_words = get_top_terms(model, num_topics=num_topics, num_words=num_words)
    labels = {topic: str(words) for topic, words in zip(topics, top_words)}

    #topic distribution by document; the transformed corpus is built once and
    #only the first len(documents) entries are read
    topic_counter = {}
    for _, doc_topics in zip(range(num_documents), model.get_document_topics(corpus)):
        for topic, _percent in doc_topics:
            topic_key = "Topic %s" % topic
            topic_counter[topic_key] = topic_counter.get(topic_key, 0) + 1

    #relabel with the topn words from each topic. pairs are kept in a list so two
    #topics with identical top words stay separate bars, and a topic without a
    #label falls back to its "Topic <id>" name instead of raising KeyError
    bars = [(labels.get(key, key), count) for key, count in topic_counter.items()]

    #choose whether to display by count in # of documents or by % of corpus
    if normalize:
        bars = [(label, count / float(num_documents)) for label, count in bars]

    #sort ascending for better output
    bars.sort(key=lambda bar: bar[1])

    #visualize the results
    positions = range(len(bars))
    plt.figure(figsize=figsize)
    plt.barh(positions, [value for _label, value in bars], align='center')
    plt.yticks(positions, [label for label, _value in bars])
    plt.xlabel('Topic Representation in Total Documents')
    plt.ylabel('TopN Words from Each Topic')
    plt.title('Topics in Documents')
    plt.grid(axis='x')

In [4]:
def get_top_terms(model, num_topics, num_words=5):
    """Builds parallel lists of topic names and their top words from ``model.show_topics()``.

    Args:
        model: topic model whose ``show_topics(..., formatted=False)`` yields
            ``(topic_id, [(word, weight), ...])`` pairs.
        num_topics: number of topics to request from the model.
        num_words: number of top words to keep per topic.

    Returns:
        A tuple ``(topics, topic_words)``. ``topics`` holds names of the form
        ``'Topic <id>'``; ``topic_words`` holds the matching top words joined
        with ``', '``, e.g. ``'food, good, place'``.
    """
    topics = []
    topic_words = []

    for topic_id, terms in model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False):
        topics.append('Topic %s' % topic_id)
        #join the words directly; the old str(list).translate(None, ...) form only works on Python 2
        topic_words.append(', '.join(word for word, _weight in terms))

    return topics, topic_words

In [5]:
def compute_best_model(dictionary, corpus, texts, start=1, stop=21, increment=2, passes=5, 
                           iterations=5, workers=3, visualize=True, random_state=None):
    """Trains one LDA model per candidate topic count and returns the most coherent one.

    A model is trained for every topic count in ``range(start, stop, increment)``
    and scored with 'u_mass' coherence; the model with the highest score is
    returned (the first one wins on ties).

    Args:
        dictionary: gensim dictionary passed as ``id2word`` and to the coherence model.
        corpus: bag-of-words corpus to train on.
        texts: tokenized documents handed to the coherence model.
        start, stop, increment: define the candidate topic counts.
        passes, iterations, workers: forwarded to ``LdaMulticore``.
        visualize: if truthy, plot coherence against the number of topics.
        random_state: forwarded to ``LdaMulticore``; set it to make runs
            repeatable. The default ``None`` keeps the previous unseeded behavior.

    Returns:
        The trained model with the highest coherence.

    Raises:
        ValueError: if the range of topic counts is empty.
    """
    topic_counts = list(range(start, stop, increment))
    if not topic_counts:
        raise ValueError("no topic counts to evaluate: range(%s, %s, %s) is empty"
                         % (start, stop, increment))

    model_list = []
    coherence_list = []

    #compute the models
    for num_topics in topic_counts:
        model = models.ldamulticore.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                                                 passes=passes, iterations=iterations, workers=workers,
                                                 random_state=random_state)
        coherence = models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        model_list.append(model)
        coherence_list.append(coherence.get_coherence())

    #find the best model from the computed models
    best_model = model_list[coherence_list.index(max(coherence_list))]

    #optional visualization of coherence across all candidate topic counts
    if visualize:
        plt.figure(figsize=(8,8))
        plt.plot(topic_counts, coherence_list, color='r')
        plt.xlabel('Number of Topics')
        plt.ylabel('Coherence Value')
        plt.grid()

    return best_model

### Main

In [6]:
#load yelp.csv from the working directory; the file is available at
#https://raw.githubusercontent.com/justmarkham/DAT7/master/data/yelp.csv
data = pd.read_csv('yelp.csv')

#useful for a later time, but not for now
# #find all the restaurants that have at least 10 reviews
# restaurants = data.groupby('business_id').count().sort_values('stars', ascending=False)
# restaurants = restaurants[restaurants.stars >= 10]

# #filter the main dataset for restaurants that have at least 10 reviews each
# filtered = data[data['business_id'].isin(restaurants.index)]
# filtered.head(2)

#preprocess the 'text' column; this overwrites data['text'] in place and returns nothing
text_preprocessor(data, 'text')

In [None]:
#build a total dictionary of all words in all documents
dictionary = corpora.Dictionary(data.text)

#build a per-document numerical representation of the dataset
corpus = [dictionary.doc2bow(doc) for doc in data.text]

#build and train the LDA model
ldamodel = models.ldamulticore.LdaMulticore(corpus, id2word=dictionary, 
                                            num_topics=20, passes=10, iterations=20,
                                            workers=3)

#summarize the dictionary. a bare expression is only displayed when it is the
#last line of a cell, and the full token2id mapping is too large to read
print("corpus positions:", dictionary.num_pos)
print("unique tokens:", len(dictionary.token2id))

### Analyzing the Model Output

In [None]:
#print the top terms of the first five topics, one blank line after each.
#pyLDAvis shows the same information interactively
for topic_id in range(5):
    print("Topic #: ", topic_id)
    print(ldamodel.get_topic_terms(topic_id), end='\n\n')

In [None]:
#display five of the model's topics
ldamodel.show_topics(num_topics=5)

In [None]:
#rank the topics by coherence. the bag-of-words corpus built above is named
#`corpus`; `bow` is never defined and raised NameError on a fresh kernel
ldamodel.top_topics(corpus=corpus, dictionary=dictionary)

### Visualize Model Outputs

In [None]:
#chart how often each topic appears across the documents, as a fraction of all documents
#NOTE(review): num_topics=50 is larger than the num_topics=20 the model above was trained with - confirm this is intended
graph_topics(model=ldamodel, corpus=corpus, documents=data.text, num_topics=50, normalize=True)

In [None]:
#train models with 5, 10, ..., 65 topics and keep the one with the highest coherence.
#this rebinds ldamodel, replacing the model trained earlier in the notebook
ldamodel = compute_best_model(dictionary=dictionary, corpus=corpus, texts=data.text, start=5, stop=70, 
                              increment=5, visualize=True)

In [None]:
#train a 10-topic model (this rebinds ldamodel) and explore it with pyLDAvis.
#the bag-of-words corpus is named `corpus`; `bow` is never defined in this notebook
ldamodel = models.ldamulticore.LdaMulticore(corpus, id2word=dictionary, num_topics=10, passes=10, iterations=10, workers=3)

pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## Appendix

In [None]:
#print the topic distribution of the first five documents, separated by blank lines
for i in range(5):
    print(ldamodel.get_document_topics(corpus)[i])
    #a bare `print` does nothing on Python 3; call it to emit the blank line
    print()

In [None]:
#display the first entry returned by print_topics(3)
ldamodel.print_topics(3)[0]

In [None]:
#regex to revisit later

#     dataframe[text_column].replace("[!@#$'%^:;,""&*)(-+.{|}[\]=<>?/\\\~`]", ' ', regex=True, inplace=True)
#     dataframe[text_column].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]', ' ', regex=True, inplace=True)    