In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import logging
import io
import os
import re
import itertools

import lda_funcs # helper functions for LDA tuning

from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Set up logging: INFO-level messages, timestamped.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
handler = logging.FileHandler('training_output.log')

# The handler above is something I needed with respect to logging.
# Gensim performs various calculations while training the LDA model that I am using,
# but the only way to see them is in the logging output.
# Specifically, I need to capture the perplexity values during training to verify
# that perplexity is decreasing.
# This metric is needed to compare models and to do hyperparameter tuning.
# NOTE(review): this cell only creates the handler; it is never attached to a logger
# here (no addHandler call). Presumably lda_funcs takes care of writing
# training_output.log -- confirm.

# The following blog post was helpful to me in figuring out how to make the log handler I needed.
# https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/

In [3]:
# Load the review subset; the first CSV column becomes the row index.
reviews_path = 'reviews_subset.csv'
df = pd.read_csv(reviews_path, index_col=0)
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,...,valence_stripped,clean_vanilla_x,clean_coded_x,clean_valence_x,Vanilla Subtopic,Vanilla Subtopic Fit,Coded Subtopic,Coded Subtopic Fit,Valence Subtopic,Valence Subtopic Fit
0,0,0,B000KV61FC,A1Y1YYH71TPYC6,thefinfan54,2,2,5,1222905600,The best dog toy I ever bought :),...,two small dog absolutely love VGOODREVIEW tug ...,two small dog absolutely love tug jug many us ...,two small dog absolutely love GOODREVIEW tug j...,two small dog absolutely love VGOODREVIEW tug ...,1.0,0.395601,1.0,0.271064,1.0,0.262538
1,1,1,B000KV61FC,A1SLLKDKCZ5IPL,C. Guariglia,2,2,2,1221091200,Buried Treasure Forever!,...,idea basically good GOODREVIEW one however lar...,idea basically good one however large papillon...,idea basically good GOODREVIEW one however lar...,idea basically good GOODREVIEW one however lar...,,,,,,


In [6]:
# Tally the reviews for each product, then order the tallies ascending so that
# the most-reviewed products sit at the tail.
product_tallies = df['ProductId'].value_counts()
review_counts = product_tallies.sort_values()
review_counts.tail(12)

B002IEZJMA    487
B006MONQMC    491
B005ZBZLT4    506
B003GTR8IO    530
B005K4Q34S    541
B0013A0QXC    542
B000NMJWZO    542
B000KV61FC    556
B001EO5Q64    567
B0026RQTGE    630
Name: ProductId, dtype: int64

In [7]:
# Product group to work with: the 10 products with the largest number of reviews,
# i.e. the last 10 labels of the ascending review_counts.
top_ten = review_counts.index[-10:].values

## LDA Model Grid Search & Tuning

For each of the input text types (vanilla, coded, and valence coded), first run an initial grid search over 50 or 80 training passes and 6, 8, 10, or 12 topics, together with the number of top tokens removed (2 or 10) and the `n_above` threshold (0.5 or 1.0).

NOTE: grid search can take hours to run fully - don't run the below cells unless you actually want to perform the searching and tuning. The csv results files can simply be loaded in where noted below.

In [None]:
# LOAD RESULTS
# Read the saved final results for each input text type, indexed by product.
vanilla_final_results = pd.read_csv('vanilla_final_results.csv').set_index('product')

coded_final_results = pd.read_csv('coded_final_results.csv').set_index('product')

valence_final_results = pd.read_csv('valence_final_results.csv').set_index('product')

### Vanilla Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS

# Empty frame that collects the model tuning results from the initial grid search.
vanilla_gs_columns = [
    'product', 'num_topics', 'chunk', 'passes',
    'per-word bounds', 'perplexity', 'topic diff',
    'final perplexity', 'final topic diff',
    'perplexity decreasing', 'coherence',
    'top_n removed', 'n_above threshold',
]
vanilla_gs_results = pd.DataFrame(columns=vanilla_gs_columns)

# For each of the top ten products, grid search over the combinations of passes,
# topics, top_n and n_above, then save the parameter combination (and saved model)
# of the model with the highest coherence score.
for product in top_ten:
    output = lda_funcs.tune_lda(
        df=df,
        product=product,
        n_passes=[50, 80],
        n_topics=[6, 8, 10, 12],
        save_path='vanilla_outputs',
        input_text='clean_vanilla',
        n_below=0,
        top_n=[2, 10],
        n_above=[0.5, 1.0],
    )
    vanilla_gs_results = lda_funcs.save_best(
        output, vanilla_gs_results, save_path='vanilla_outputs'
    )

In [None]:
# Display the full vanilla grid search results frame.
vanilla_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# Save off the results with the best model (highest coherence) for each product.
# index=False keeps the row index out of the file: the LOAD RESULTS cell reads this
# CSV back without index_col, so a written index would come back as an extra
# "Unnamed: 0" column and accumulate with every save/load round trip.
vanilla_gs_results.to_csv('vanilla_gs_results.csv', index=False)
# Examine the key tuning columns.
vanilla_gs_results[['product', 'coherence', 'num_topics',
                    'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# LOAD RESULTS
# Read the saved vanilla grid search results and show the key tuning columns.
vanilla_gs_results = pd.read_csv('vanilla_gs_results.csv')
vanilla_gs_results.loc[:, ['product', 'coherence', 'num_topics',
                           'passes', 'top_n removed', 'n_above threshold']]

Next, after reviewing the results of the initial grid search pass, manually tune the model for each product to try to reach a final model coherence of at least 0.5.

In [None]:
# Run this cell for each product that needs further tuning.
# Increase or decrease the number of passes or topics depending on the best model
# found in the previous results.
output = lda_funcs.tune_lda(
    df=df,
    product='B0026RQTGE',
    n_passes=[80],
    n_topics=[5, 6, 7],
    save_path='vanilla_outputs',
    input_text='clean_vanilla',
    n_below=0,
    top_n=[2],
    n_above=[1.0],
)

In [None]:
# view the outputs of the most recent tune_lda run
output

In [None]:
# If the result is an improvement, run this cell to save it.
vanilla_gs_results = lda_funcs.save_best(output, vanilla_gs_results,
                                         save_path='vanilla_outputs')
# Save off the updated results dataframe.
# index=False: the CSV is read back without index_col, so a written index would
# return as an extra "Unnamed: 0" column on every save/load round trip.
vanilla_gs_results.to_csv('vanilla_gs_results.csv', index=False)

In [None]:
# Finally, collect the best model for each product from all grid search tuning
# efforts into a final output dataframe and save it off to a csv.
vanilla_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk',
                                              'passes', 'per-word bounds', 'perplexity',
                                              'topic diff', 'final perplexity',
                                              'final topic diff', 'perplexity decreasing',
                                              'coherence', 'top_n removed', 'n_above threshold'])

for product in top_ten:
    # All tuning rows recorded for this product; save_best adds the best one
    # to the final frame.
    output = vanilla_gs_results[vanilla_gs_results['product'] == product]
    vanilla_final_results = lda_funcs.save_best(output, vanilla_final_results,
                                                save_path='vanilla_outputs')

# index=False: the file is read back without index_col (then indexed by 'product'),
# so a written index would only come back as a stray "Unnamed: 0" column.
vanilla_final_results.to_csv('vanilla_final_results.csv', index=False)

In [None]:
# LOAD RESULTS
# Read the final vanilla results, index them by product, and show the key columns.
vanilla_final_results = pd.read_csv('vanilla_final_results.csv').set_index('product')
vanilla_final_results.loc[:, ['coherence', 'num_topics', 'passes',
                              'top_n removed', 'n_above threshold']]

The results above show that the vanilla reviews (i.e., no codewords, just clean text) performed poorly - I was not able to reach the 0.5 coherence threshold for any of the products.

### Coded Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS

# Empty frame that collects the model tuning results from the initial grid search.
coded_gs_columns = [
    'product', 'num_topics', 'chunk', 'passes',
    'per-word bounds', 'perplexity', 'topic diff',
    'final perplexity', 'final topic diff',
    'perplexity decreasing', 'coherence',
    'top_n removed', 'n_above threshold',
]
coded_gs_results = pd.DataFrame(columns=coded_gs_columns)

# For each of the top ten products, grid search over the combinations of passes,
# topics, top_n and n_above, then save the parameter combination (and saved model)
# of the model with the highest coherence score.
for product in top_ten:
    output = lda_funcs.tune_lda(
        df=df,
        product=product,
        n_passes=[50, 80],
        n_topics=[6, 8, 10, 12],
        save_path='coded_outputs',
        input_text='clean_coded',
        n_below=0,
        top_n=[2, 10],
        n_above=[0.5, 1.0],
    )
    coded_gs_results = lda_funcs.save_best(
        output, coded_gs_results, save_path='coded_outputs'
    )
    

    

In [None]:
# Display the full coded grid search results frame.
coded_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# Save off the results with the best model (highest coherence) for each product.
# index=False keeps the row index out of the file: the LOAD RESULTS cell reads this
# CSV back without index_col, so a written index would come back as an extra
# "Unnamed: 0" column and accumulate with every save/load round trip.
coded_gs_results.to_csv('coded_gs_results.csv', index=False)
# Examine the key tuning columns.
coded_gs_results[['product', 'coherence', 'num_topics',
                  'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# LOAD RESULTS
# Read the saved coded grid search results and show the key tuning columns.
coded_gs_results = pd.read_csv('coded_gs_results.csv')
coded_gs_results.loc[:, ['product', 'coherence', 'num_topics',
                         'passes', 'top_n removed', 'n_above threshold']]

Next, after reviewing the results of the initial grid search pass, manually tune the model for each product to try to reach a final model coherence of at least 0.5.

In [None]:
# Run this cell for the product that needs further tuning.
output = lda_funcs.tune_lda(
    df=df,
    product='B000KV61FC',
    n_passes=[50],
    n_topics=[9, 10, 11],
    save_path='coded_outputs',
    input_text='clean_coded',
    n_below=0,
    top_n=[2],
    n_above=[0.5],
)

In [None]:
# View the key tuning columns of the most recent tune_lda run.
output.loc[:, ['product', 'coherence', 'num_topics',
               'passes', 'top_n removed', 'n_above threshold']]

In [None]:
# If the result is an improvement, run this cell to save it.
coded_gs_results = lda_funcs.save_best(output, coded_gs_results,
                                       save_path='coded_outputs')
# Save off the updated results dataframe.
# index=False: the CSV is read back without index_col, so a written index would
# return as an extra "Unnamed: 0" column on every save/load round trip.
coded_gs_results.to_csv('coded_gs_results.csv', index=False)

In [None]:
# Finally, collect the best model for each product from all grid search tuning
# efforts into a final output dataframe and save it off to a csv.
coded_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk',
                                            'passes', 'per-word bounds',
                                            'perplexity', 'topic diff',
                                            'final perplexity', 'final topic diff',
                                            'perplexity decreasing', 'coherence',
                                            'top_n removed', 'n_above threshold'])

for product in top_ten:
    # All tuning rows recorded for this product; save_best adds the best one
    # to the final frame.
    output = coded_gs_results[coded_gs_results['product'] == product]
    coded_final_results = lda_funcs.save_best(output, coded_final_results,
                                              save_path='coded_outputs')

# index=False: the file is read back without index_col (then indexed by 'product'),
# so a written index would only come back as a stray "Unnamed: 0" column.
coded_final_results.to_csv('coded_final_results.csv', index=False)

In [None]:
# LOAD RESULTS
# Read the final coded results, index them by product, and show the key columns.
coded_final_results = pd.read_csv('coded_final_results.csv').set_index('product')
coded_final_results.loc[:, ['coherence', 'num_topics', 'passes',
                            'top_n removed', 'n_above threshold']]

Using the coded reviews (i.e., with "GOODREVIEW" or "BADREVIEW" inserted after each positive or negative word), the coherence results are somewhat better than those for the uncoded reviews.

### Valence Coded Inputs Grid Search & Tuning

In [None]:
# CAUTION - GRID SEARCH CAN TAKE MANY HOURS TO RUN
# ONLY RUN ON FIRST PASS

# Empty frame that collects the model tuning results from the initial grid search.
valence_gs_columns = [
    'product', 'num_topics', 'chunk', 'passes',
    'per-word bounds', 'perplexity', 'topic diff',
    'final perplexity', 'final topic diff',
    'perplexity decreasing', 'coherence',
    'top_n removed', 'n_above threshold',
]
valence_gs_results = pd.DataFrame(columns=valence_gs_columns)

# For each of the top ten products, grid search over the combinations of passes,
# topics, top_n and n_above, then save the parameter combination (and saved model)
# of the model with the highest coherence score.
for product in top_ten:
    output = lda_funcs.tune_lda(
        df=df,
        product=product,
        n_passes=[50, 80],
        n_topics=[6, 8, 10, 12],
        save_path='valence_outputs',
        input_text='clean_valence',
        n_below=0,
        top_n=[2, 10],
        n_above=[0.5, 1.0],
    )
    valence_gs_results = lda_funcs.save_best(
        output, valence_gs_results, save_path='valence_outputs'
    )

In [None]:
# Display the full valence grid search results frame.
valence_gs_results

In [None]:
# RUN ONLY ON FIRST PASS
# Save off the results with the best model (highest coherence) for each product.
# index=False keeps the row index out of the file: the LOAD RESULTS cell reads this
# CSV back without index_col, so a written index would come back as an extra
# "Unnamed: 0" column and accumulate with every save/load round trip.
valence_gs_results.to_csv('valence_gs_results.csv', index=False)
# Examine the key tuning columns.
valence_gs_results[['product', 'coherence', 'num_topics',
                    'passes', 'top_n removed', 'n_above threshold']]

In [4]:
# LOAD RESULTS
# Read the saved valence grid search results and show the key tuning columns.
valence_gs_results = pd.read_csv('valence_gs_results.csv')
valence_gs_results.loc[:, ['product', 'coherence', 'num_topics',
                           'passes', 'top_n removed', 'n_above threshold']]

Unnamed: 0,product,coherence,num_topics,passes,top_n removed,n_above threshold
0,B002IEZJMA,0.492564,12,50,10,0.5
1,B006MONQMC,0.502877,8,50,2,1.0
2,B005ZBZLT4,0.470262,12,80,2,0.5
3,B003GTR8IO,0.518304,12,80,2,1.0
4,B005K4Q34S,0.497359,8,80,2,0.5
5,B0013A0QXC,0.485961,10,80,2,0.5
6,B000NMJWZO,0.490841,8,50,2,1.0
7,B000KV61FC,0.523329,8,80,10,0.5
8,B001EO5Q64,0.488493,6,50,10,0.5
9,B0026RQTGE,0.438051,6,80,2,0.5


Next, after reviewing the results of the initial grid search pass, manually tune the model for each product to try to reach a final model coherence of at least 0.5.

In [None]:
# Run this cell for the product that needs further tuning.
output = lda_funcs.tune_lda(
    df=df,
    product='B001EO5Q64',
    n_passes=[50],
    n_topics=[6],
    save_path='valence_outputs',
    input_text='clean_valence',
    n_below=0,
    top_n=[10],
    n_above=[0.5],
)

In [None]:
# view the outputs of the most recent tune_lda run
output

In [None]:
# If the result is an improvement, run this cell to save it.
valence_gs_results = lda_funcs.save_best(output, valence_gs_results,
                                         save_path="valence_outputs")
# Save off the updated results dataframe.
# index=False: the CSV is read back without index_col, so a written index would
# return as an extra "Unnamed: 0" column on every save/load round trip.
valence_gs_results.to_csv('valence_gs_results.csv', index=False)

In [9]:
# Finally, collect the best model for each product from all grid search tuning
# efforts into a final output dataframe and save it off to a csv.
valence_final_results = pd.DataFrame(columns=['product', 'num_topics', 'chunk',
                                              'passes', 'per-word bounds',
                                              'perplexity', 'topic diff',
                                              'final perplexity', 'final topic diff',
                                              'perplexity decreasing', 'coherence',
                                              'top_n removed', 'n_above threshold'])

for product in top_ten:
    # All tuning rows recorded for this product; save_best adds the best one
    # to the final frame.
    output = valence_gs_results[valence_gs_results['product'] == product]
    valence_final_results = lda_funcs.save_best(output, valence_final_results,
                                                save_path='valence_outputs')

# index=False: the file is read back without index_col (then indexed by 'product'),
# so a written index would only come back as a stray "Unnamed: 0" column.
valence_final_results.to_csv('valence_final_results.csv', index=False)

2018-10-04 13:51:02,662 : INFO : loading LdaModel object from ./valence_outputs/B002IEZJMA_12_50_10_0.5
2018-10-04 13:51:02,669 : INFO : loading expElogbeta from ./valence_outputs/B002IEZJMA_12_50_10_0.5.expElogbeta.npy with mmap=None
2018-10-04 13:51:02,674 : INFO : setting ignored attribute state to None
2018-10-04 13:51:02,680 : INFO : setting ignored attribute id2word to None
2018-10-04 13:51:02,682 : INFO : setting ignored attribute dispatcher to None
2018-10-04 13:51:02,683 : INFO : loaded ./valence_outputs/B002IEZJMA_12_50_10_0.5
2018-10-04 13:51:02,685 : INFO : loading LdaState object from ./valence_outputs/B002IEZJMA_12_50_10_0.5.state
2018-10-04 13:51:02,690 : INFO : loaded ./valence_outputs/B002IEZJMA_12_50_10_0.5.state
2018-10-04 13:51:02,703 : INFO : saving LdaState object under ./valence_outputs/final_models/B002IEZJMA_12_50_10_0.5.state, separately None
2018-10-04 13:51:02,718 : INFO : saved ./valence_outputs/final_models/B002IEZJMA_12_50_10_0.5.state
2018-10-04 13:51:02

best results for product B002IEZJMA:
level_0                                                                  0
Unnamed: 0                                                               0
Unnamed: 0.1                                                             0
product                                                         B002IEZJMA
num_topics                                                              12
chunk                                                              162.333
passes                                                                  50
per-word bounds          {0: -8.081, 1: -5.316, 2: -4.657, 3: -4.471, 4...
perplexity               {0: 270.7, 1: 39.8, 2: 25.2, 3: 22.2, 4: 21.1,...
topic diff               {0: 0.792344, 1: 0.500671, 2: 0.397751, 3: 0.3...
final perplexity                                                      17.5
final topic diff                                                  0.077463
perplexity decreasing                                          

2018-10-04 13:51:02,849 : INFO : saved ./valence_outputs/final_models/B006MONQMC_8_50_2_1.0
2018-10-04 13:51:02,892 : INFO : loading LdaModel object from ./valence_outputs/B005ZBZLT4_12_80_2_0.5
2018-10-04 13:51:02,896 : INFO : loading expElogbeta from ./valence_outputs/B005ZBZLT4_12_80_2_0.5.expElogbeta.npy with mmap=None
2018-10-04 13:51:02,901 : INFO : setting ignored attribute state to None
2018-10-04 13:51:02,904 : INFO : setting ignored attribute id2word to None
2018-10-04 13:51:02,906 : INFO : setting ignored attribute dispatcher to None
2018-10-04 13:51:02,908 : INFO : loaded ./valence_outputs/B005ZBZLT4_12_80_2_0.5
2018-10-04 13:51:02,909 : INFO : loading LdaState object from ./valence_outputs/B005ZBZLT4_12_80_2_0.5.state
2018-10-04 13:51:02,915 : INFO : loaded ./valence_outputs/B005ZBZLT4_12_80_2_0.5.state
2018-10-04 13:51:02,923 : INFO : saving LdaState object under ./valence_outputs/final_models/B005ZBZLT4_12_80_2_0.5.state, separately None
2018-10-04 13:51:02,928 : INFO : 

Final model saved for product B006MONQMC with 8 topics over 50 passes, removing top 2 tokens and token review threshold 1.0.
best results for product B005ZBZLT4:
level_0                                                                  2
Unnamed: 0                                                               2
Unnamed: 0.1                                                             2
product                                                         B005ZBZLT4
num_topics                                                              12
chunk                                                              168.667
passes                                                                  80
per-word bounds          {0: -6.924, 1: -5.579, 2: -5.367, 3: -5.186, 4...
perplexity               {0: 121.5, 1: 47.8, 2: 41.3, 3: 36.4, 4: 32.3,...
topic diff               {0: 0.775349, 1: 0.502837, 2: 0.395454, 3: 0.3...
final perplexity                                                      23.9
final topic d

2018-10-04 13:51:03,060 : INFO : saving LdaState object under ./valence_outputs/final_models/B003GTR8IO_12_80_2_1.0.state, separately None
2018-10-04 13:51:03,068 : INFO : saved ./valence_outputs/final_models/B003GTR8IO_12_80_2_1.0.state
2018-10-04 13:51:03,074 : INFO : saving LdaModel object under ./valence_outputs/final_models/B003GTR8IO_12_80_2_1.0, separately ['expElogbeta', 'sstats']
2018-10-04 13:51:03,078 : INFO : not storing attribute state
2018-10-04 13:51:03,080 : INFO : not storing attribute id2word
2018-10-04 13:51:03,083 : INFO : not storing attribute dispatcher
2018-10-04 13:51:03,085 : INFO : storing np array 'expElogbeta' to ./valence_outputs/final_models/B003GTR8IO_12_80_2_1.0.expElogbeta.npy
2018-10-04 13:51:03,094 : INFO : saved ./valence_outputs/final_models/B003GTR8IO_12_80_2_1.0
2018-10-04 13:51:03,142 : INFO : loading LdaModel object from ./valence_outputs/B005K4Q34S_8_80_2_0.5
2018-10-04 13:51:03,150 : INFO : loading expElogbeta from ./valence_outputs/B005K4Q34S

Final model saved for product B003GTR8IO with 12 topics over 80 passes, removing top 2 tokens and token review threshold 1.0.
best results for product B005K4Q34S:
level_0                                                                  4
Unnamed: 0                                                               4
Unnamed: 0.1                                                             4
product                                                         B005K4Q34S
num_topics                                                               8
chunk                                                              180.333
passes                                                                  80
per-word bounds          {0: -8.385, 1: -5.311, 2: -4.623, 3: -4.254, 4...
perplexity               {0: 334.4, 1: 39.7, 2: 24.6, 3: 19.1, 4: 17.0,...
topic diff               {0: 0.661038, 1: 0.533929, 2: 0.475783, 3: 0.4...
final perplexity                                                      13.1
final topic 

2018-10-04 13:51:03,321 : INFO : not storing attribute id2word
2018-10-04 13:51:03,327 : INFO : not storing attribute dispatcher
2018-10-04 13:51:03,329 : INFO : storing np array 'expElogbeta' to ./valence_outputs/final_models/B0013A0QXC_10_80_2_0.5.expElogbeta.npy
2018-10-04 13:51:03,336 : INFO : saved ./valence_outputs/final_models/B0013A0QXC_10_80_2_0.5
2018-10-04 13:51:03,401 : INFO : loading LdaModel object from ./valence_outputs/B000NMJWZO_8_50_2_1.0
2018-10-04 13:51:03,411 : INFO : loading expElogbeta from ./valence_outputs/B000NMJWZO_8_50_2_1.0.expElogbeta.npy with mmap=None
2018-10-04 13:51:03,425 : INFO : setting ignored attribute state to None
2018-10-04 13:51:03,429 : INFO : setting ignored attribute id2word to None
2018-10-04 13:51:03,431 : INFO : setting ignored attribute dispatcher to None
2018-10-04 13:51:03,433 : INFO : loaded ./valence_outputs/B000NMJWZO_8_50_2_1.0
2018-10-04 13:51:03,436 : INFO : loading LdaState object from ./valence_outputs/B000NMJWZO_8_50_2_1.0.st

Final model saved for product B0013A0QXC with 10 topics over 80 passes, removing top 2 tokens and token review threshold 0.5.
best results for product B000NMJWZO:
level_0                                                                  6
Unnamed: 0                                                               6
Unnamed: 0.1                                                             6
product                                                         B000NMJWZO
num_topics                                                               8
chunk                                                              180.667
passes                                                                  50
per-word bounds          {0: -6.914, 1: -5.486, 2: -5.252, 3: -5.084, 4...
perplexity               {0: 120.6, 1: 44.8, 2: 38.1, 3: 33.9, 4: 32.2,...
topic diff               {0: 1.13664, 1: 0.730187, 2: 0.575428, 3: 0.50...
final perplexity                                                      23.3
final topic 

2018-10-04 13:51:03,568 : INFO : saving LdaState object under ./valence_outputs/final_models/B000KV61FC_8_80_10_0.5.state, separately None
2018-10-04 13:51:03,572 : INFO : saved ./valence_outputs/final_models/B000KV61FC_8_80_10_0.5.state
2018-10-04 13:51:03,578 : INFO : saving LdaModel object under ./valence_outputs/final_models/B000KV61FC_8_80_10_0.5, separately ['expElogbeta', 'sstats']
2018-10-04 13:51:03,580 : INFO : not storing attribute state
2018-10-04 13:51:03,584 : INFO : not storing attribute id2word
2018-10-04 13:51:03,587 : INFO : not storing attribute dispatcher
2018-10-04 13:51:03,590 : INFO : storing np array 'expElogbeta' to ./valence_outputs/final_models/B000KV61FC_8_80_10_0.5.expElogbeta.npy
2018-10-04 13:51:03,596 : INFO : saved ./valence_outputs/final_models/B000KV61FC_8_80_10_0.5
2018-10-04 13:51:03,648 : INFO : loading LdaModel object from ./valence_outputs/B001EO5Q64_6_50_10_0.5
2018-10-04 13:51:03,655 : INFO : loading expElogbeta from ./valence_outputs/B001EO5Q6

Final model saved for product B000KV61FC with 8 topics over 80 passes, removing top 10 tokens and token review threshold 0.5.
best results for product B001EO5Q64:
level_0                                                                  8
Unnamed: 0                                                               8
Unnamed: 0.1                                                             8
product                                                         B001EO5Q64
num_topics                                                               6
chunk                                                                  189
passes                                                                  50
per-word bounds          {0: -7.686, 1: -7.134, 2: -7.035, 3: -6.969, 4...
perplexity               {0: 205.9, 1: 140.5, 2: 131.1, 3: 125.3, 4: 12...
topic diff               {0: 1.266589, 1: 0.476273, 2: 0.333173, 3: 0.2...
final perplexity                                                     109.6
final topic 

2018-10-04 13:51:03,808 : INFO : saved ./valence_outputs/final_models/B0026RQTGE_5_80_10_0.5.state
2018-10-04 13:51:03,822 : INFO : saving LdaModel object under ./valence_outputs/final_models/B0026RQTGE_5_80_10_0.5, separately ['expElogbeta', 'sstats']
2018-10-04 13:51:03,825 : INFO : not storing attribute state
2018-10-04 13:51:03,827 : INFO : not storing attribute id2word
2018-10-04 13:51:03,832 : INFO : not storing attribute dispatcher
2018-10-04 13:51:03,835 : INFO : storing np array 'expElogbeta' to ./valence_outputs/final_models/B0026RQTGE_5_80_10_0.5.expElogbeta.npy
2018-10-04 13:51:03,840 : INFO : saved ./valence_outputs/final_models/B0026RQTGE_5_80_10_0.5


Final model saved for product B0026RQTGE with 5 topics over 80 passes, removing top 10 tokens and token review threshold 0.5.


In [None]:
# Spot-check the 'top_n removed' value of the row labeled 0.
# NOTE(review): .loc[0, ...] needs a row label 0, so this presumably has to run
# before the LOAD RESULTS cell below re-indexes the frame by 'product' -- confirm.
valence_final_results.loc[0, 'top_n removed']

In [None]:
# LOAD RESULTS
# Read the final valence results, index them by product, and show the key columns.
valence_final_results = pd.read_csv('valence_final_results.csv').set_index('product')
valence_final_results.loc[:, ['coherence', 'num_topics', 'passes',
                              'top_n removed', 'n_above threshold']]