In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os

In [5]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import gensim, logging, io, os
# Configure the root logger at INFO level so that INFO-level training messages are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Instantiating a FileHandler opens training_output.log in append mode, creating the file if it is missing.
# NOTE(review): this handler is given no level/formatter and is not attached to any logger in this cell.
handler = logging.FileHandler('training_output.log')


# The handler above is something I needed with respect to logging.
# Gensim performs various calculations while training the LDA model that I am using, but the only way to see them
# is in the logging outputs.
# Specifically, I need to capture the perplexity values during training to verify that perplexity is decreasing.
# This metric is needed to compare models and to do hyperparameter tuning.


# The following blog post was helpful to me in figuring out how to make the log handler I needed.
# https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/

In [3]:
# For the LDA model, I'd like to filter out the most frequently used tokens in the corpus (the top 3 in run_lda, the top 10 in tune_lda)
# For most products, the most frequent tokens refer to the product itself in a generic way and also include what we might consider stop words,
# which I didn't want to remove earlier because they are needed for part-of-speech tagging
# (e.g., "product," "coconut," "oil" as well as "have," "be," etc.)
# this helper function is a slight modification of one of gensim's built-in methods -
# I don't want to remove the codewords I've inserted, and those also end up in the top 3 quite often

def remove_freq(dictionary, n, protected=None):
    """
    Remove the n most frequent tokens (ranked by document frequency) from a dictionary, in place.

    This is a slight modification of gensim's own most-frequent-token filter: tokens listed
    in `protected` are never candidates for removal, even when they are among the most frequent.

    dictionary: gensim-style dictionary exposing token2id, dfs, __getitem__ and filter_tokens
    n: how many of the most frequent, non-protected tokens to drop
    protected: optional iterable of tokens that must be kept; defaults to the review
        codewords GOODREVIEW, BADREVIEW, VGOODREVIEW and VBADREVIEW
    returns None (the dictionary is modified in place)
    """
    logger = logging.getLogger('gensim.corpora.dictionary')
    if protected is None:
        protected = ('GOODREVIEW', 'BADREVIEW', 'VGOODREVIEW', 'VBADREVIEW')
    save = set(protected)
    candidate_ids = [idx for idx in dictionary.token2id.values() if dictionary[idx] not in save]
    # Ids with no document-frequency entry rank as 0 (same default as the logging line below)
    # instead of yielding None, which cannot be ordered against ints on Python 3.
    candidate_ids.sort(key=lambda idx: dictionary.dfs.get(idx, 0), reverse=True)
    most_frequent_ids = candidate_ids[:n]
    most_frequent_words = [(dictionary[idx], dictionary.dfs.get(idx, 0)) for idx in most_frequent_ids]
    logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

    # do the actual filtering; filter_tokens is expected to rebuild the dictionary to remove gaps in ids
    dictionary.filter_tokens(bad_ids=most_frequent_ids)
    logger.info("resulting dictionary: %s", dictionary)

# the next few helper functions deal with extracting metrics of interest from gensim's logger, 
# which are being dumped into a log file as training runs
# I am capturing bounds, perplexity, and per-word topic differences
    
    
import itertools
# thanks to these SO answers https://stackoverflow.com/questions/6213063/python-read-next
# and https://stackoverflow.com/questions/5434891/iterate-a-list-as-pair-current-next-in-python
# for showing a way to deal with lines of a file in groups of three
def threes(iterator):
    """
    Yield overlapping triples of consecutive items from an iterable.

    s -> (s0, s1, s2), (s1, s2, s3), (s2, s3, s4), ...
    Fewer than three items produce no triples.
    """
    streams = itertools.tee(iterator, 3)
    # Stagger the copies up front: copy k is advanced k items ahead of copy 0.
    for offset, stream in enumerate(streams):
        for _ in range(offset):
            next(stream, None)
    return zip(*streams)

def capture_logs():
    """
    Capture the per-word bound, perplexity, and topic difference values from the training log.

    Reads training_output.log from the working directory, keeps only the lines mentioning
    'topic diff', 'per-word' or 'PROGRESS', and walks them in overlapping groups of three.
    Each PROGRESS line supplies the pass number; the metrics found in the one or two lines
    that follow it are stored under that pass (later values for the same pass overwrite
    earlier ones).

    returns (bounds, perplex, diff): three dicts keyed by the integer pass number
    raises an OSError subclass if the log file cannot be opened
    """
    def parse_evaluation(text):
        # expected shape: "<bound> per-word bound, <perplexity> perplexity estimate ..."
        bound = float(re.search(r'.\d*\.\d* per', text).group(0).split()[0])
        perplexity = float(re.search(r'\d*\.\d perplexity', text).group(0).split()[0])
        return bound, perplexity

    relevant_lines = []
    with open("training_output.log", 'r') as f:
        for line in f:
            if 'topic diff' in line or 'per-word' in line or 'PROGRESS' in line:
                relevant_lines.append(line)

    bounds = {}
    perplex = {}
    diff = {}
    # Pad with two blank entries so that a PROGRESS line sitting in the last two positions
    # still starts a group; otherwise the final topic diff of the run is silently dropped.
    for first, second, third in threes(relevant_lines + ['', '']):
        if 'PROGRESS' not in first:
            continue
        pass_val = int(re.search(r'\d*, at', first).group(0).split(',')[0])
        if 'topic diff' in second:
            diff[pass_val] = float(re.search(r'\d*\.\d*', second).group(0).split()[0])
        # the evaluation line may show up in the second or the third line of the group;
        # the line text is kept in its own variable so parsing never sees a float
        for candidate in (second, third):
            if 'per-word' in candidate:
                bounds[pass_val], perplex[pass_val] = parse_evaluation(candidate)
    return bounds, perplex, diff

def perplexity_decreasing(perplex):
    """
    Check whether the perplexity decreased between the last two training passes.

    perplex: dict mapping pass number -> perplexity
    returns True when the perplexity of the highest pass is strictly lower than that of the
        pass before it; False otherwise, including when fewer than two passes were
        recorded (nothing to compare)
    """
    if len(perplex) < 2:
        return False
    previous_pass, last_pass = sorted(perplex)[-2:]
    return bool(perplex[previous_pass] > perplex[last_pass])

In [4]:
# this function trains an LDA model for a single product,
# constructing any number of topics over any number of passes
def run_lda(product, n_topics, n_passes, texts, save_path):
    """
    This function trains an LDA model for a single product,
    constructing any number of topics over any number of training passes.
    product: the string product ID (matched against df['ProductId'])
    n_topics: number of topics desired
    n_passes: number of training passes to make
    texts: the corpus to use (column name from the main dataframe)
    save_path: where to save the model outputs (directory relative to the working directory)
    returns a one-row dataframe, indexed by product, with the results of the training

    Relies on the notebook-level names df and review_counts and on the helpers
    remove_freq, capture_logs and perplexity_decreasing.
    """
    log_file = 'training_output.log'
    logger = logging.getLogger('gensim.models.ldamodel')
    # Detach file handlers that earlier runs left on this logger for the same log file, so
    # they neither leak open files nor write duplicate lines into the fresh log.
    log_target = os.path.abspath(log_file)
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler) and stale.baseFilename == log_target:
            logger.removeHandler(stale)
            stale.close()

    # Make sure the output directory exists before spending time on training.
    model_dir = './{}'.format(save_path)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    data = df[df['ProductId']==product]
    tokenized = data[texts].str.split()
    dictionary = corpora.Dictionary(tokenized)
    remove_freq(dictionary, 3)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    mm = corpora.MmCorpus('/tmp/corpus.mm')
    # NOTE(review): true division makes this a float on Python 3; left unchanged so that
    # training results stay identical - confirm whether an integer chunk size is wanted.
    chunk_size = review_counts[product]/3

    # mode='w' starts the log empty (and works when the file does not exist yet), so
    # capture_logs only sees this training run.
    handler = logging.FileHandler(log_file, mode='w')
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s : %(levelname)s - %(message)s'))
    logger.addHandler(handler)
    try:
        lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=chunk_size, passes=n_passes)
    finally:
        # always detach and close, even if training fails
        logger.removeHandler(handler)
        handler.close()

    bounds, perplex, diff = capture_logs()
    results = pd.DataFrame(index=[product], data={'num_topics': n_topics, 'chunk': chunk_size, 'passes': n_passes})
    results['per-word bounds'] = [bounds]
    results['perplexity'] = [perplex]
    results['topic diff'] = [diff]
    # Each metric is looked up under its own highest recorded pass; NaN when nothing was logged.
    results['final perplexity'] = perplex[max(perplex)] if perplex else float('nan')
    results['final topic diff'] = diff[max(diff)] if diff else float('nan')
    # Call the helper (the bare function object is always truthy) and always write the same
    # column; fewer than two recorded passes means there is nothing to compare.
    results['perplexity decreasing'] = len(perplex) >= 2 and bool(perplexity_decreasing(perplex))
    for n in range(n_topics):
        results['topic {}'.format(n)] = [lda.show_topic(n, 20)]
    lda.save('./{}/{}'.format(save_path, product))
    lda.clear()
    return results

In [6]:
# this function tunes an LDA model for a single product, by grid searching over various numbers of topics,
# over different numbers of passes, using a specified text set (cleaned, coded, valence coded, etc.)
from shutil import copyfile
from sys import exit
from gensim.models.coherencemodel import CoherenceModel


def tune_lda(product, n_topics, n_passes, input_text, save_path):
    """
    Tune an LDA model for a single product by grid searching over numbers of topics and passes.

    product: the string product ID (matched against df['ProductId'])
    n_topics: iterable of topic counts to try
    n_passes: iterable of pass counts to try for every topic count
    input_text: the corpus to use (column name from the main dataframe)
    save_path: directory (relative to the working directory) where every trained model is
        saved as <product>_<topics>_<passes>
    returns a dataframe with one row per (topics, passes) combination, holding the training
        metrics, the top 20 terms of each topic and the c_v coherence

    Relies on the notebook-level names df and review_counts and on the helpers
    remove_freq, capture_logs and perplexity_decreasing.
    """
    log_file = 'training_output.log'
    logger = logging.getLogger('gensim.models.ldamodel')
    formatter = logging.Formatter('%(asctime)s : %(levelname)s - %(message)s')
    # Detach file handlers that earlier runs left on this logger for the same log file, so
    # they neither leak open files nor write duplicate lines into the fresh log.
    log_target = os.path.abspath(log_file)
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler) and stale.baseFilename == log_target:
            logger.removeHandler(stale)
            stale.close()

    # Make sure the output directory exists before spending time on training.
    model_dir = './{}'.format(save_path)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    data = df[df['ProductId']==product]
    texts = data[input_text].str.split()
    dictionary = corpora.Dictionary(texts)
    remove_freq(dictionary, 10)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    mm = corpora.MmCorpus('/tmp/corpus.mm')
    # NOTE(review): true division makes this a float on Python 3; left unchanged so that
    # training results stay identical - confirm whether an integer chunk size is wanted.
    chunk_size = review_counts[product]/3
    base_columns = ['product', 'num_topics', 'chunk', 'passes', 'per-word bounds', 'perplexity', 'topic diff',
                    'final perplexity', 'final topic diff', 'perplexity decreasing', 'coherence']
    rows = []
    for t in n_topics:
        for p in n_passes:
            print('training LDA with {} topics over {} passes'.format(t, p))
            # A fresh, truncated log per model: otherwise passes logged by an earlier grid
            # point (e.g. one trained with more passes) leak into this model's metrics.
            handler = logging.FileHandler(log_file, mode='w')
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            try:
                lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary,
                                                      num_topics=t, update_every=1,
                                                      chunksize=chunk_size, passes=p, random_state=42)
            finally:
                # always detach and close, even if training fails
                logger.removeHandler(handler)
                handler.close()
            bounds, perplex, diff = capture_logs()
            results = {'product': product, 'num_topics': t, 'chunk': chunk_size, 'passes': p}
            results['per-word bounds'] = [bounds]
            results['perplexity'] = [perplex]
            results['topic diff'] = [diff]
            # Each metric is looked up under its own highest recorded pass; NaN when nothing was logged.
            results['final perplexity'] = perplex[max(perplex)] if perplex else float('nan')
            results['final topic diff'] = diff[max(diff)] if diff else float('nan')
            # Call the helper (the bare function object is always truthy) and always write
            # the same key; fewer than two recorded passes means there is nothing to compare.
            results['perplexity decreasing'] = len(perplex) >= 2 and bool(perplexity_decreasing(perplex))
            for n in range(t):
                results['topic {}'.format(n)] = [lda.show_topic(n, 20)]
            lda.save('./{}/{}_{}_{}'.format(save_path, product, t, p))
            cm = CoherenceModel(model=lda, corpus=corpus, texts = texts, coherence='c_v')
            results['coherence'] = cm.get_coherence()
            rows.append(pd.DataFrame(data=results))
            lda.clear()

    if not rows:
        return pd.DataFrame(columns=base_columns)
    # Concatenate once instead of growing the frame inside the loop; the fixed columns come
    # first, followed by the per-topic columns in order of first appearance.
    topic_columns = []
    for frame in rows:
        for col in frame.columns:
            if col not in base_columns and col not in topic_columns:
                topic_columns.append(col)
    output = pd.concat(rows, axis=0)
    return output[base_columns + topic_columns]

# this function finds the best result (maximizing coherence) from the grid search run
# the results are saved to final output dataframe and the LDA model is also saved off

def save_best(output, final_results, save_path):
    """
    Pick the grid-search row with the highest coherence, record it, and save its model.

    output: dataframe of grid-search results; must contain the columns 'coherence',
        'product', 'num_topics' and 'passes'. It is not modified.
    final_results: dataframe accumulating the best row of every product
    save_path: directory holding the grid-search models saved as <product>_<topics>_<passes>;
        the winning model is re-saved under <save_path>/final_models/
    returns a new dataframe: final_results with the best row appended (index renumbered)
    """
    # Work on a re-indexed copy so the caller's dataframe is left untouched; as before, the
    # old index is kept as an 'index' column and idxmax yields a unique row label.
    ranked = output.reset_index()
    best_idx = ranked['coherence'].idxmax()
    product = ranked.loc[best_idx, 'product']
    print('best results for product {}:'.format(product))
    print(ranked.loc[best_idx])
    # DataFrame.append was removed in pandas 2.0; concat of the one-row frame is the replacement.
    final_results = pd.concat([final_results, ranked.loc[[best_idx]]], ignore_index=True)
    # int() keeps the file name identical to the one written during the grid search even if
    # the column was upcast to float
    t = int(ranked.loc[best_idx, 'num_topics'])
    p = int(ranked.loc[best_idx, 'passes'])
    lda = gensim.models.ldamodel.LdaModel.load("./{}/{}_{}_{}".format(save_path, product, t, p))
    final_dir = './{}/final_models'.format(save_path)
    if not os.path.isdir(final_dir):
        os.makedirs(final_dir)
    lda.save('./{}/final_models/{}_{}_{}'.format(save_path, product, t, p))
    lda.clear()
    print("Final model saved for product {} with {} topics over {} passes.".format(product, t, p))
    return final_results