# Importing packages, setting working directory, and loading file

In [None]:
try:
  import tmtoolkit
except:
  !pip install tmtoolkit
  import tmtoolkit

try:
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except:
  !pip install tmtoolkit['lda']
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel

try:
  from lda import LDA
except:
  !pip install lda
  from lda import LDA

import logging
import pickle
import random
import warnings

import nltk

# Fetch the NLTK resources this notebook asks for, one after another, in a
# fixed order.
for _resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
  nltk.download(_resource)

# Seed Python's built-in random module so runs start from the same state.
random.seed(20191120)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#For all Steve Madden Reviews
working_directory = '/content/drive/MyDrive/APRD6343/TopicModeling'

# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickle files that you produced yourself.
# Every file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, rather than being left open for the garbage collector.
with open('%s/final_labels.p' % working_directory, 'rb') as _fh:
  doc_labels = pickle.load(_fh)
with open('%s/final_dtm.p' % working_directory, 'rb') as _fh:
  dtm_sm = pickle.load(_fh)  # used later as the 'smaller' document-term matrix
with open('%s/big_dtm.p' % working_directory, 'rb') as _fh:
  dtm_bg = pickle.load(_fh)  # used later as the 'bigger' document-term matrix
with open('%s/big_vocab.p' % working_directory, 'rb') as _fh:
  vocab_bg = pickle.load(_fh)
with open('%s/small_vocab.p' % working_directory, 'rb') as _fh:
  vocab_sm = pickle.load(_fh)

In [None]:
#For just Steve Madden Reviews with 2 or fewer stars
working_directory2 = '/content/drive/MyDrive/APRD6343/TopicModeling'

# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickle files that you produced yourself.
# Every file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, rather than being left open for the garbage collector.
with open('%s/2final_labels.p' % working_directory2, 'rb') as _fh:
  doc2_labels = pickle.load(_fh)
with open('%s/2final_dtm.p' % working_directory2, 'rb') as _fh:
  dtm2_sm = pickle.load(_fh)  # used later as the 'smaller' document-term matrix
with open('%s/2big_dtm.p' % working_directory2, 'rb') as _fh:
  dtm2_bg = pickle.load(_fh)  # used later as the 'bigger' document-term matrix
with open('%s/2big_vocab.p' % working_directory2, 'rb') as _fh:
  vocab2_bg = pickle.load(_fh)
with open('%s/2small_vocab.p' % working_directory2, 'rb') as _fh:
  vocab2_sm = pickle.load(_fh)

In [None]:
#For just Steve Madden Reviews with 4 or more stars
working_directory3 = '/content/drive/MyDrive/APRD6343/TopicModeling'

# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickle files that you produced yourself.
# Every file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, rather than being left open for the garbage collector.
with open('%s/3final_labels.p' % working_directory3, 'rb') as _fh:
  doc3_labels = pickle.load(_fh)
with open('%s/3final_dtm.p' % working_directory3, 'rb') as _fh:
  dtm3_sm = pickle.load(_fh)  # used later as the 'smaller' document-term matrix
with open('%s/3big_dtm.p' % working_directory3, 'rb') as _fh:
  dtm3_bg = pickle.load(_fh)  # used later as the 'bigger' document-term matrix
with open('%s/3big_vocab.p' % working_directory3, 'rb') as _fh:
  vocab3_bg = pickle.load(_fh)
with open('%s/3small_vocab.p' % working_directory3, 'rb') as _fh:
  vocab3_sm = pickle.load(_fh)

# Creating topic models

In [None]:
# Keep the notebook output quiet while the topic models are fitted.
# The 'lda' logger stops forwarding records to its ancestor loggers and gets a
# do-nothing handler, so its messages are dropped.
logger = logging.getLogger('lda')
logger.propagate = False
logger.addHandler(logging.NullHandler())
# This filter carries no category or module restriction: it hides every Python
# warning raised from here on, not only those coming from lda.
warnings.filterwarnings('ignore')

In [None]:
#For all Steve Madden Reviews
#Set data to use
#dtms = document term matrices to process
dtms = {
    'bigger': dtm_bg,
    'smaller': dtm_sm,
}

# Number of topics for this run. The alpha expression needs `k`, which is not
# assigned anywhere in the visible notebook (NameError on a fresh kernel), so
# it is bound here and also feeds n_topics to keep the two in sync.
# NOTE(review): assumes k was meant to be the topic count - confirm.
k = 30

#Create dictionaries with parameters to build topic models
lda_params = {
    'n_topics': k,
    'eta': 1.3,
    'n_iter': 500,
    'random_state': 20191122,  # fixed seed for the sampler
    'alpha': 1 / (5 * k),
}

# Fit one model per entry in dtms, all with the same constant parameters.
models = compute_models_parallel(dtms, constant_parameters=lda_params)

In [None]:
#For just Steve Madden Reviews with 2 or fewer stars
dtms2 = {
    'bigger': dtm2_bg,
    'smaller': dtm2_sm,
}

# Number of topics for this run. The alpha expression needs `k`, which is not
# assigned anywhere in the visible notebook (NameError on a fresh kernel), so
# it is bound here and also feeds n_topics to keep the two in sync.
# NOTE(review): assumes k was meant to be the topic count - confirm.
k = 15

lda_params2 = {
    'n_topics': k,
    'eta': 2,
    'n_iter': 500,
    'random_state': 20191122,  # fixed seed for the sampler
    'alpha': 1 / (7 * k),
}

# Fit one model per entry in dtms2, all with the same constant parameters.
models2 = compute_models_parallel(dtms2, constant_parameters=lda_params2)

In [None]:
#For just Steve Madden Reviews with 4 or more stars
dtms3 = {
    'bigger': dtm3_bg,
    'smaller': dtm3_sm,
}

# Number of topics for this run. The alpha expression needs `k`, which is not
# assigned anywhere in the visible notebook (NameError on a fresh kernel), so
# it is bound here and also feeds n_topics to keep the two in sync.
# NOTE(review): assumes k was meant to be the topic count - confirm.
k = 25

lda_params3 = {
    'n_topics': k,
    'eta': 1.3,
    'n_iter': 500,
    'random_state': 20191122,  # fixed seed for the sampler
    'alpha': 1 / (5 * k),
}

# Fit one model per entry in dtms3, all with the same constant parameters.
models3 = compute_models_parallel(dtms3, constant_parameters=lda_params3)

In [None]:
#Model 1 with all Steve Madden Reviews
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

# models['smaller'] holds the results for the 'smaller' matrix; [0] takes the
# first result and [1] its second element, which is used below as the fitted
# model (presumably a (parameters, model) pair - confirm in the tmtoolkit docs).
model_sm = models['smaller'][0][1] #Extract smaller model
# Print the 5 top-ranked vocabulary terms of every topic with their weights.
print_ldamodel_topic_words(model_sm.topic_word_, vocab_sm, top_n=5)

topic_1
> #1. size (0.135871)
> #2. order (0.068912)
> #3. review (0.063605)
> #4. small (0.043280)
> #5. big (0.041022)
topic_2
> #1. great (0.104881)
> #2. look (0.062247)
> #3. good (0.059543)
> #4. price (0.050635)
> #5. comfortable (0.048089)
topic_3
> #1. jean (0.083531)
> #2. look (0.081944)
> #3. great (0.066441)
> #4. dress (0.064732)
> #5. boot (0.057408)
topic_4
> #1. love (0.144253)
> #2. color (0.087833)
> #3. great (0.053815)
> #4. perfect (0.046679)
> #5. comfortable (0.044190)
topic_5
> #1. heel (0.109100)
> #2. high (0.056938)
> #3. walk (0.035310)
> #4. platform (0.028718)
> #5. look (0.028024)
topic_6
> #1. return (0.043701)
> #2. order (0.039633)
> #3. box (0.035158)
> #4. try (0.027430)
> #5. pair (0.026887)
topic_7
> #1. size (0.189256)
> #2. small (0.091935)
> #3. order (0.082940)
> #4. run (0.056388)
> #5. large (0.043274)
topic_8
> #1. heel (0.074407)
> #2. comfortable (0.060641)
> #3. love (0.059156)
> #4. high (0.044850)
> #5. walk (0.039586)
topic_9
> #1. so

In [None]:
#Model 2 with just Steve Madden Reviews with 2 or fewer stars
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

# models2['smaller'] holds the results for the 'smaller' matrix; [0] takes the
# first result and [1] its second element, which is used below as the fitted
# model (presumably a (parameters, model) pair - confirm in the tmtoolkit docs).
model_sm2 = models2['smaller'][0][1]
# Print the 5 top-ranked vocabulary terms of every topic with their weights.
print_ldamodel_topic_words(model_sm2.topic_word_, vocab2_sm, top_n=5)

topic_1
> #1. boot (0.131038)
> #2. foot (0.035291)
> #3. look (0.034134)
> #4. zipper (0.028927)
> #5. calf (0.019960)
topic_2
> #1. look (0.029543)
> #2. legging (0.020624)
> #3. walk (0.020067)
> #4. day (0.017280)
> #5. first (0.016722)
topic_3
> #1. look (0.098991)
> #2. boot (0.067079)
> #3. cheap (0.034516)
> #4. feel (0.027678)
> #5. return (0.026050)
topic_4
> #1. boot (0.103088)
> #2. color (0.040090)
> #3. look (0.032371)
> #4. order (0.027141)
> #5. black (0.022659)
topic_5
> #1. size (0.129581)
> #2. small (0.057264)
> #3. order (0.040903)
> #4. run (0.029123)
> #5. pair (0.021924)
topic_6
> #1. buy (0.043745)
> #2. month (0.039565)
> #3. look (0.034829)
> #4. boot (0.030092)
> #5. pair (0.027584)
topic_7
> #1. foot (0.069165)
> #2. look (0.034003)
> #3. narrow (0.031685)
> #4. heel (0.025889)
> #5. sole (0.025502)
topic_8
> #1. pair (0.053221)
> #2. purchase (0.022876)
> #3. return (0.021475)
> #4. look (0.019141)
> #5. order (0.018207)
topic_9
> #1. bag (0.055206)
> #2. 

In [None]:
#Model 3 with just Steve Madden Reviews with 4 or more stars
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

# models3['smaller'] holds the results for the 'smaller' matrix; [0] takes the
# first result and [1] its second element, which is used below as the fitted
# model (presumably a (parameters, model) pair - confirm in the tmtoolkit docs).
model_sm3 = models3['smaller'][0][1]
# Print the 5 top-ranked vocabulary terms of every topic with their weights.
print_ldamodel_topic_words(model_sm3.topic_word_, vocab3_sm, top_n=5)

topic_1
> #1. jean (0.085712)
> #2. look (0.085263)
> #3. great (0.074947)
> #4. dress (0.056110)
> #5. boot (0.055811)
topic_2
> #1. look (0.053187)
> #2. sole (0.031676)
> #3. make (0.029707)
> #4. nice (0.024646)
> #5. great (0.022818)
topic_3
> #1. heel (0.101371)
> #2. high (0.054836)
> #3. look (0.042977)
> #4. walk (0.036072)
> #5. comfortable (0.034571)
topic_4
> #1. sock (0.105348)
> #2. boot (0.056964)
> #3. size (0.042867)
> #4. thick (0.032250)
> #5. thin (0.023374)
topic_5
> #1. foot (0.065008)
> #2. ankle (0.049748)
> #3. strap (0.049457)
> #4. walk (0.025768)
> #5. cute (0.021844)
topic_6
> #1. size (0.117700)
> #2. order (0.076646)
> #3. review (0.062560)
> #4. read (0.035191)
> #5. small (0.029824)
topic_7
> #1. heel (0.079506)
> #2. size (0.058327)
> #3. comfortable (0.041786)
> #4. high (0.039003)
> #5. true (0.037457)
topic_8
> #1. warm (0.048689)
> #2. legging (0.042337)
> #3. great (0.030729)
> #4. comfortable (0.030510)
> #5. soft (0.029415)
topic_9
> #1. boot (0

In [None]:
#For all Steve Madden Reviews
# Same extraction as for the smaller model, but for the 'bigger' matrix:
# first result, second element (used as the fitted model).
model_bg = models['bigger'][0][1]
# Print the 5 top-ranked terms per topic, using the bigger vocabulary.
print_ldamodel_topic_words(model_bg.topic_word_, vocab_bg, top_n=5)

topic_1
> #1. bright (0.000174)
> #2. n (0.000174)
> #3. eventually (0.000174)
> #4. belly (0.000174)
> #5. darn (0.000174)
topic_2
> #1. profile (0.000227)
> #2. ther (0.000227)
> #3. nearly (0.000174)
> #4. dressed (0.000174)
> #5. mother (0.000174)
topic_3
> #1. packaging (0.000227)
> #2. yea (0.000227)
> #3. wah (0.000227)
> #4. dar (0.000174)
> #5. accentuate (0.000174)
topic_4
> #1.  (0.131511)
> #2. be (0.044384)
> #3. i (0.038504)
> #4. the (0.033953)
> #5. and (0.029522)
topic_5
> #1.  (0.118956)
> #2. be (0.046496)
> #3. i (0.044766)
> #4. the (0.040415)
> #5. a (0.026695)
topic_6
> #1.  (0.079505)
> #2. the (0.045640)
> #3. be (0.027098)
> #4. i (0.018093)
> #5. it (0.016168)
topic_7
> #1. c (0.000174)
> #2. 100 (0.000174)
> #3. literally (0.000174)
> #4. fabulously (0.000174)
> #5. smooth (0.000174)
topic_8
> #1. calfs (0.000174)
> #2. boutique (0.000174)
> #3. exist (0.000174)
> #4. exception (0.000174)
> #5. result (0.000174)
topic_9
> #1. prefer (0.000174)
> #2. gorgeous

In [None]:
#For just Steve Madden Reviews with 2 or fewer stars
# Same extraction as for the smaller model, but for the 'bigger' matrix:
# first result, second element (used as the fitted model).
model_bg2 = models2['bigger'][0][1]
# Print the 5 top-ranked terms per topic, using the matching bigger vocabulary.
print_ldamodel_topic_words(model_bg2.topic_word_, vocab2_bg, top_n=5)

topic_1
> #1. terribly (0.000479)
> #2. painter (0.000329)
> #3. 65 (0.000329)
> #4. buyit (0.000329)
> #5. probley (0.000329)
topic_2
> #1. bruklyns (0.000330)
> #2. 999 (0.000330)
> #3. fleece (0.000330)
> #4. chukka (0.000330)
> #5. totaly (0.000330)
topic_3
> #1. not (0.000329)
> #2. bright (0.000329)
> #3. stop (0.000329)
> #4. aso (0.000329)
> #5. ashamed (0.000329)
topic_4
> #1. y (0.000616)
> #2. talla (0.000616)
> #3. reginald (0.000616)
> #4. closet (0.000470)
> #5. el (0.000470)
topic_5
> #1.  (0.116232)
> #2. the (0.048904)
> #3. be (0.043734)
> #4. i (0.036246)
> #5. and (0.021669)
topic_6
> #1.  (0.111489)
> #2. be (0.043212)
> #3. the (0.039453)
> #4. i (0.038992)
> #5. and (0.023214)
topic_7
> #1. single (0.000477)
> #2. detach (0.000477)
> #3. 8211 (0.000477)
> #4. laver (0.000477)
> #5. insanely (0.000328)
topic_8
> #1. taco (0.000477)
> #2. juanita (0.000477)
> #3. puddle (0.000328)
> #4. snow (0.000328)
> #5. resulting (0.000328)
topic_9
> #1. necklace (0.000625)
> 

In [None]:
#For just Steve Madden Reviews with 4 or more stars
# The model must come from models3 (the 4+ star run) so that it matches
# vocab3_bg. Reading models2 here would pair the low-star model's topic-word
# matrix with the high-star vocabulary and print unrelated words.
model_bg3 = models3['bigger'][0][1]
# Print the 5 top-ranked terms per topic, using the matching bigger vocabulary.
print_ldamodel_topic_words(model_bg3.topic_word_, vocab3_bg, top_n=5)

topic_1
> #1. goodwould (0.000479)
> #2. disign (0.000329)
> #3. 2star (0.000329)
> #4. advertized (0.000329)
> #5. economical (0.000329)
topic_2
> #1. addams (0.000330)
> #2. 34budget34 (0.000330)
> #3. bulletin (0.000330)
> #4. aluminum (0.000330)
> #5. gut (0.000330)
topic_3
> #1. depict (0.000329)
> #2. acne (0.000329)
> #3. function (0.000329)
> #4. 52534 (0.000329)
> #5. 52 (0.000329)
topic_4
> #1. inequitable (0.000616)
> #2. glisten (0.000616)
> #3. est (0.000616)
> #4. ample (0.000470)
> #5. block (0.000470)
topic_5
> #1.  (0.116232)
> #2. gorillapod (0.048904)
> #3. 75 (0.043734)
> #4. clour (0.036246)
> #5. 34wow (0.021669)
topic_6
> #1.  (0.111489)
> #2. 75 (0.043212)
> #3. gorillapod (0.039453)
> #4. clour (0.038992)
> #5. 34wow (0.023214)
topic_7
> #1. firt (0.000477)
> #2. batmanforty (0.000477)
> #3. 34age34 (0.000477)
> #4. considering (0.000477)
> #5. combination (0.000328)
topic_8
> #1. glamorus (0.000477)
> #2. completly (0.000477)
> #3. eleagent (0.000328)
> #4. fl