In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import logging
import io
import os
import re
import itertools
import sys

# Put the parent of the current working directory on the import path
# (only once) so the project-local `src` package imported below resolves.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
from src.lda import lda_funcs

from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Logging setup.
#
# Why this matters: gensim performs various calculations while training the LDA
# model used here, and the only way to see them is in the logging output.
# Specifically, the perplexity values reported during training need to be
# captured to verify that perplexity is decreasing. That metric is needed to
# compare models and to do hyperparameter tuning.
#
# This blog post was helpful in figuring out how to make the log handler:
# https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/

# Root logger: INFO and above, formatted as "<time> : <level> : <message>".
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

# File handler for the training log (constructing it opens the file in append mode).
# NOTE(review): the handler is only created here; nothing in this notebook attaches
# it to a logger -- confirm whether lda_funcs sets up / reads this log file itself.
handler = logging.FileHandler('../models/training_output.log')

In [3]:
# Load the processed reviews, using the first CSV column as the row index,
# then preview the first two rows.
# NOTE(review): the preview still shows 'Unnamed: 0*' columns, which look like
# row indexes saved by earlier to_csv calls -- confirm they can be dropped upstream.
reviews_csv = '../data/processed/reviews.csv'
df = pd.read_csv(reviews_csv, index_col=0)
df.head(2)

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,...,clean_vanilla_x,clean_coded_x,clean_valence_x,codecount_GOOD,codecount_BAD,valencecount_GOOD,valencecount_VGOOD,valencecount_BAD,valencecount_VBAD,Sentiment
0,10108,10108,21738,B000KV61FC,A1Y1YYH71TPYC6,thefinfan54,2,2,5,1222905600,...,two small dog absolutely love tug jug many us ...,two small dog absolutely love GOODREVIEW tug j...,two small dog absolutely love VGOODREVIEW tug ...,7,3,7,3,3,0,1
1,10109,10109,21739,B000KV61FC,A1SLLKDKCZ5IPL,C. Guariglia,2,2,2,1221091200,...,idea basically good one however large papillon...,idea basically good GOODREVIEW one however lar...,idea basically good GOODREVIEW one however lar...,3,0,3,0,0,0,0


In [4]:
# Number of reviews per ProductId, ordered from fewest to most reviews;
# the tail of the ranking therefore holds the most-reviewed products.
product_ids = df['ProductId']
review_counts = product_ids.value_counts().sort_values()
review_counts.tail(12)

B002IEZJMA    487
B006MONQMC    491
B005ZBZLT4    506
B003GTR8IO    530
B005K4Q34S    541
B0013A0QXC    542
B000NMJWZO    542
B000KV61FC    556
B001EO5Q64    567
B0026RQTGE    630
Name: ProductId, dtype: int64

In [5]:
# Product group to work with: the ProductIds of the 10 most-reviewed products,
# i.e. the last ten labels of the ascending review_counts ranking.
top_ten = review_counts.iloc[-10:].index.values

## LDA Model Grid Search & Tuning

For each of the input text types (vanilla, coded, and valence coded), first run an initial grid search over 50 or 80 training passes and 6, 8, 10, or 12 topics (together with the `top_n` and `n_above` settings).

NOTE: grid search can take hours to run fully - don't run the below cells unless you actually want to perform the searching and tuning. The csv results files can simply be loaded in where noted below.

In [None]:
# LOAD RESULTS
# load in the final results for each input text type, indexed by product
def _load_final_results(csv_path):
    """Read a final-results CSV and return it indexed by its 'product' column.

    If the CSV was saved with its row index (the DataFrame.to_csv default),
    that index is read back as an 'Unnamed: 0' column. Such columns carry
    no information, so they are dropped before the frame is re-indexed.
    """
    results = pd.read_csv(csv_path)
    results = results.loc[:, ~results.columns.str.startswith('Unnamed')]
    return results.set_index('product')


vanilla_final_results = _load_final_results('../data/processed/vanilla_final_results.csv')
coded_final_results = _load_final_results('../data/processed/coded_final_results.csv')
valence_final_results = _load_final_results('../data/processed/valence_final_results.csv')

### Vanilla Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS
# create a dataframe to house the results of the model tuning from an initial grid search

vanilla_gs_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk', 
                                           'passes', 'per-word bounds', 
                                           'perplexity', 'topic diff', 
                                           'final perplexity', 'final topic diff', 
                                           'perplexity decreasing', 'coherence', 
                                           'top_n removed', 'n_above threshold'])

# for each of the top ten products, grid search over a combination of n_passes and n_topics 
# save the parameter combinations (and saved model) of the model with the highest coherence score
# The loop covers ALL of top_ten (previously it was sliced to top_ten[0:1], i.e. a
# single product), matching the coded / valence searches and the later cell that
# collects one final vanilla result per product in top_ten.
for product in top_ten:
    output = lda_funcs.tune_lda(df=df, product=product, n_passes=[50, 80], 
                                n_topics=[6, 8, 10, 12], save_path='vanilla_outputs',
                                input_text='clean_vanilla', n_below=0, 
                                top_n=[2, 10], n_above=[0.5, 1.0])
    vanilla_gs_results = lda_funcs.save_best(output, vanilla_gs_results, 
                                             save_path='vanilla_outputs')

In [None]:
# Display the vanilla grid-search results frame in full (all columns).
vanilla_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# save off the results with the best model (highest coherence) for each product
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column that comes back as 'Unnamed: 0' on the next pd.read_csv
# and piles up with every load/save cycle.
vanilla_gs_results.to_csv('../data/interim/vanilla_gs_results.csv', index=False)

# examine the key columns of the results
vanilla_gs_results[['product','coherence', 'num_topics', 
                    'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# LOAD RESULTS
vanilla_gs_results = pd.read_csv('../data/interim/vanilla_gs_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column. Drop such columns so the frame keeps the same
# columns as the freshly created results frame that is later passed to save_best.
vanilla_gs_results = vanilla_gs_results.loc[
    :, ~vanilla_gs_results.columns.str.startswith('Unnamed')
]
vanilla_gs_results[['product','coherence', 'num_topics', 
                    'passes', 'top_n removed', 'n_above threshold']]

Next, after reviewing the results of the initial grid search pass, manually tune the models for each of the products to try to reach a threshold of 0.5 for the final model coherence.

In [None]:
# Manual follow-up tuning for a single product that needs it.
# Adjust the passes / topic counts around the best model found in the earlier results.
vanilla_tuning_args = dict(
    df=df,
    product='B0026RQTGE',
    n_passes=[80],
    n_topics=[5, 6, 7],
    save_path='vanilla_outputs',
    input_text='clean_vanilla',
    n_below=0,
    top_n=[2],
    n_above=[1.0],
)
output = lda_funcs.tune_lda(**vanilla_tuning_args)

In [None]:
# view the outputs
# `output` is whatever the most recently executed tune_lda cell returned.
output

In [None]:
# if the result is an improvement, run this cell to save it
vanilla_gs_results = lda_funcs.save_best(output, vanilla_gs_results, 
                                         save_path='vanilla_outputs')

# save off the updated results dataframe
# index=False keeps the row index out of the file; otherwise every load -> save
# cycle of this CSV adds another 'Unnamed: 0*' column to the results.
vanilla_gs_results.to_csv('../data/interim/vanilla_gs_results.csv', index=False)

In [None]:
# finally, save off the best results into a final output dataframe
# create a df to collect the best models from all grid search tuning efforts
vanilla_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk', 
                                              'passes', 'per-word bounds', 'perplexity', 
                                              'topic diff', 'final perplexity', 
                                              'final topic diff', 'perplexity decreasing', 
                                              'coherence', 'top_n removed', 'n_above threshold'])

# feed each product's grid-search rows through save_best, one product at a time
for product in top_ten:
    output = vanilla_gs_results[vanilla_gs_results['product']==product]
    vanilla_final_results = lda_funcs.save_best(output, vanilla_final_results, 
                                                save_path='vanilla_outputs')

# save it off to a csv
# index=False keeps the row index out of the file, so reloading it does not
# produce a spurious 'Unnamed: 0' column.
vanilla_final_results.to_csv('../data/processed/vanilla_final_results.csv', index=False)

In [None]:
# LOAD RESULTS
# load in the final results
vanilla_final_results = pd.read_csv('../data/processed/vanilla_final_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column; drop such columns, then index the frame by product.
vanilla_final_results = (
    vanilla_final_results
    .loc[:, ~vanilla_final_results.columns.str.startswith('Unnamed')]
    .set_index('product')
)
vanilla_final_results[['coherence', 'num_topics', 'passes', 'top_n removed', 'n_above threshold']]

The results above show that the models built on the vanilla reviews (i.e., no codewords, just clean text) performed poorly - I was not able to reach the 0.5 coherence goal threshold for any of the products.

### Coded Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS

# Empty frame that collects the tuning results of the initial grid search.
coded_gs_columns = [
    'product', 'num_topics', 'chunk', 'passes',
    'per-word bounds', 'perplexity', 'topic diff',
    'final perplexity', 'final topic diff', 'perplexity decreasing',
    'coherence', 'top_n removed', 'n_above threshold',
]
coded_gs_results = pd.DataFrame(columns=coded_gs_columns)

# Grid search every product in top_ten over the pass / topic / filter settings,
# then hand each product's output to save_best together with the running results
# (intended to keep the highest-coherence model per product; see src/lda/lda_funcs).
for product in top_ten:
    output = lda_funcs.tune_lda(
        df=df,
        product=product,
        n_passes=[50, 80],
        n_topics=[6, 8, 10, 12],
        save_path='coded_outputs',
        input_text='clean_coded',
        n_below=0,
        top_n=[2, 10],
        n_above=[0.5, 1.0],
    )
    coded_gs_results = lda_funcs.save_best(
        output, coded_gs_results, save_path='coded_outputs'
    )

In [None]:
# Display the coded grid-search results frame in full (all columns).
coded_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# save off the results with the best model (highest coherence) for each product
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column that comes back as 'Unnamed: 0' on the next pd.read_csv
# and piles up with every load/save cycle.
coded_gs_results.to_csv('../data/interim/coded_gs_results.csv', index=False)

# examine the key columns of the results
coded_gs_results[['product','coherence', 'num_topics', 
                  'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# LOAD RESULTS
coded_gs_results = pd.read_csv('../data/interim/coded_gs_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column. Drop such columns so the frame keeps the same
# columns as the freshly created results frame that is later passed to save_best.
coded_gs_results = coded_gs_results.loc[
    :, ~coded_gs_results.columns.str.startswith('Unnamed')
]
coded_gs_results[['product','coherence', 'num_topics', 
                  'passes', 'top_n removed', 'n_above threshold']]

Next, after reviewing the results of the initial grid search pass, manually tune the models for each of the products to try to reach a threshold of 0.5 for the final model coherence.

In [None]:
# Manual follow-up tuning for a single product that needs it.
coded_tuning_args = dict(
    df=df,
    product='B000KV61FC',
    n_passes=[50],
    n_topics=[9, 10, 11],
    save_path='coded_outputs',
    input_text='clean_coded',
    n_below=0,
    top_n=[2],
    n_above=[0.5],
)
output = lda_funcs.tune_lda(**coded_tuning_args)

In [None]:
# Inspect only the headline columns of the latest tuning output.
summary_columns = ['product', 'coherence', 'num_topics',
                   'passes', 'top_n removed', 'n_above threshold']
output[summary_columns]

In [None]:
# if the result is an improvement, run this cell to save it
coded_gs_results = lda_funcs.save_best(output, coded_gs_results, 
                                       save_path='coded_outputs')

# save off the updated results dataframe
# index=False keeps the row index out of the file; otherwise every load -> save
# cycle of this CSV adds another 'Unnamed: 0*' column to the results.
coded_gs_results.to_csv('../data/interim/coded_gs_results.csv', index=False)

In [None]:
# finally, save off the best results into a final output dataframe
# create a df to collect the best models from all grid search tuning efforts
coded_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk', 
                                            'passes', 'per-word bounds', 
                                            'perplexity', 'topic diff', 
                                            'final perplexity', 'final topic diff', 
                                            'perplexity decreasing', 'coherence', 
                                            'top_n removed', 'n_above threshold'])

# feed each product's grid-search rows through save_best, one product at a time
for product in top_ten:
    output = coded_gs_results[coded_gs_results['product']==product]
    coded_final_results = lda_funcs.save_best(output, coded_final_results, 
                                              save_path='coded_outputs')

# save it off to a csv
# index=False keeps the row index out of the file, so reloading it does not
# produce a spurious 'Unnamed: 0' column.
coded_final_results.to_csv('../data/processed/coded_final_results.csv', index=False)

In [None]:
# LOAD RESULTS
# load in the final results
coded_final_results = pd.read_csv('../data/processed/coded_final_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column; drop such columns, then index the frame by product.
coded_final_results = (
    coded_final_results
    .loc[:, ~coded_final_results.columns.str.startswith('Unnamed')]
    .set_index('product')
)
coded_final_results[['coherence', 'num_topics', 'passes', 
                     'top_n removed', 'n_above threshold']]

Using the coded review (i.e., with "GOODREVIEW" and "BADREVIEW" inserted following each positive or negative word), the coherence results are somewhat better than the uncoded reviews.

### Valence Coded Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS

# Empty frame that collects the tuning results of the initial grid search.
valence_gs_columns = [
    'product', 'num_topics', 'chunk', 'passes',
    'per-word bounds', 'perplexity', 'topic diff',
    'final perplexity', 'final topic diff', 'perplexity decreasing',
    'coherence', 'top_n removed', 'n_above threshold',
]
valence_gs_results = pd.DataFrame(columns=valence_gs_columns)

# Grid search every product in top_ten over the pass / topic / filter settings,
# then hand each product's output to save_best together with the running results
# (intended to keep the highest-coherence model per product; see src/lda/lda_funcs).
for product in top_ten:
    output = lda_funcs.tune_lda(
        df=df,
        product=product,
        n_passes=[50, 80],
        n_topics=[6, 8, 10, 12],
        save_path='valence_outputs',
        input_text='clean_valence',
        n_below=0,
        top_n=[2, 10],
        n_above=[0.5, 1.0],
    )
    valence_gs_results = lda_funcs.save_best(
        output, valence_gs_results, save_path='valence_outputs'
    )

In [None]:
# Display the valence grid-search results frame in full (all columns).
valence_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# save off the results with the best model (highest coherence) for each product
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column that comes back as 'Unnamed: 0' on the next pd.read_csv
# and piles up with every load/save cycle.
valence_gs_results.to_csv('../data/interim/valence_gs_results.csv', index=False)

# examine the key columns of the results
valence_gs_results[['product','coherence', 'num_topics', 
                    'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# LOAD RESULTS
valence_gs_results = pd.read_csv('../data/interim/valence_gs_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column. Drop such columns so the frame keeps the same
# columns as the freshly created results frame that is later passed to save_best.
valence_gs_results = valence_gs_results.loc[
    :, ~valence_gs_results.columns.str.startswith('Unnamed')
]
valence_gs_results[['product','coherence', 'num_topics', 
                    'passes', 'top_n removed', 'n_above threshold']]

Next, after reviewing the results of the initial grid search pass, manually tune the models for each of the products to try to reach a threshold of 0.5 for the final model coherence.

In [None]:
# Manual follow-up tuning for a single product that needs it.
valence_tuning_args = dict(
    df=df,
    product='B001EO5Q64',
    n_passes=[50],
    n_topics=[6],
    save_path='valence_outputs',
    input_text='clean_valence',
    n_below=0,
    top_n=[10],
    n_above=[0.5],
)
output = lda_funcs.tune_lda(**valence_tuning_args)

In [None]:
# view the outputs
# `output` is whatever the most recently executed tune_lda cell returned.
output

In [None]:
# if the result is an improvement, run this cell to save it
valence_gs_results = lda_funcs.save_best(output, valence_gs_results, save_path="valence_outputs")

# save off the updated results dataframe
# index=False keeps the row index out of the file; otherwise every load -> save
# cycle of this CSV adds another 'Unnamed: 0*' column to the results.
valence_gs_results.to_csv('../data/interim/valence_gs_results.csv', index=False)

In [None]:
# finally, save off the best results into a final output dataframe
# create a df to collect the best models from all grid search tuning efforts
valence_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk', 
                                              'passes', 'per-word bounds', 
                                              'perplexity', 'topic diff',
                                              'final perplexity', 'final topic diff', 
                                              'perplexity decreasing', 'coherence', 
                                              'top_n removed', 'n_above threshold'])

# feed each product's grid-search rows through save_best, one product at a time
for product in top_ten:
    output = valence_gs_results[valence_gs_results['product']==product]
    valence_final_results = lda_funcs.save_best(output, valence_final_results, 
                                                save_path='valence_outputs')

# save it off to a csv
# index=False keeps the row index out of the file, so reloading it does not
# produce a spurious 'Unnamed: 0' column.
valence_final_results.to_csv('../data/processed/valence_final_results.csv', index=False)

In [None]:
# Spot-check one value: the 'top_n removed' entry of the row labelled 0.
# NOTE(review): this needs a row label 0; once the frame is re-indexed by
# 'product' (as the LOAD RESULTS cells do) this lookup would likely raise KeyError.
valence_final_results.loc[0, 'top_n removed']

In [None]:
# LOAD RESULTS
# load in the final results
valence_final_results = pd.read_csv('../data/processed/valence_final_results.csv')
# If the CSV was saved with its row index (the to_csv default), that index is read
# back as an 'Unnamed: 0' column; drop such columns, then index the frame by product.
valence_final_results = (
    valence_final_results
    .loc[:, ~valence_final_results.columns.str.startswith('Unnamed')]
    .set_index('product')
)
valence_final_results[['coherence', 'num_topics', 'passes', 'top_n removed', 'n_above threshold']]