# Part 4: Running Topic Models



#Imports/Loading Data

In [1]:
try:
    import tmtoolkit
except:
    !pip install tmtoolkit
    import tmtoolkit

try:
    from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except:
    !pip install tmtoolkit['lda']
    from tmtoolkit.topicmod.tm_lda import compute_models_parallel

try:
    from lda import LDA
except:
    !pip install lda
    from lda import LDA

import pickle #loading pickles
import logging #supress logs from lda
import warnings
import scipy.sparse

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import random
random.seed(20210128)

def get_time():
    from datetime import datetime, timedelta
    d = datetime.today() - timedelta(hours=7, minutes=0)
    return d.strftime('%H:%M:%S %p')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data: 

Loads the document term matrices we created

In [2]:
working_directory = '/content/drive/MyDrive/APRD6343'

#import files
doc_labels = pickle.load(open('%s/doc_labels1.p' % working_directory, 'rb')) #labels
dtm_sm = scipy.sparse.load_npz('%s/small_dtm1.npz' % working_directory) #more processed dtm (more processed = less shit in it)
dtm_bg = scipy.sparse.load_npz('%s/big_dtm1.npz' % working_directory) #less processed dtm
vocab_bg = pickle.load(open('%s/big_vocab1.p' % working_directory, 'rb')) #less processed vocab (keys)
vocab_sm = pickle.load(open('%s/small_vocab1.p' % working_directory, 'rb')) #more processed vocab (keys)
corpus = pickle.load(open('%s/corpus1.p' % working_directory, 'rb')) #corpus

In [3]:
#import logger for lda and ignore / get rid of warnings
logger = logging.getLogger('lda')
logger.addHandler(logging.NullHandler())
logger.propagate = False
warnings.filterwarnings('ignore')

#Creating Models

##1/ Low beta and Low alpha
*   k = 25
*   beta = 0.01
*   alpha = 1/(10*k)

###build

In [None]:
#set data to use (dictionaries)

#note, can add models if we wanted to
dtms = {
    'smaller': dtm_sm
}

#add fixed hyperparamaters (that we need for lda)

lda_params = {
    'n_topics': 25, #k term. get from notebook 5EvaluatingTopicModels
    'eta': 0.01,
    'n_iter': 500,
    'random_state': 20210128, #to make results reproducible
    'alpha': 1/(10*25)
}

In [None]:
print("Time of start: '%s'" % get_time())

Time of start: '14:37:34 PM'


In [None]:
#create the topic models
models = compute_models_parallel(dtms, constant_parameters = lda_params)
print("Time of finish: '%s'" % get_time())

Time of finish: '14:42:07 PM'


__NOTE__: PARAMATER MEANINGS

*n_topics*: Should not see the same words appear in multiple topics. Smaller n_topics (or k) will create less word overlap!!

*eta*: higher the eta, the less granular, the lower the eta the more granular. In general, the higher you set the eta, the lower number of topics in your model. .01 = more topics usually. If they appear to have overlap (have the same words) then make it smaller or bigger

*n_iter*: How many times we will run the cluster modeling to try and find the best fit. 500 is probably good, but 1000 is ideal (it just takes a while)


###view

In [None]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
model_sm = models['smaller'][0][1] #extracts smaller model and pulls first and second item from the model
print_ldamodel_topic_words(model_sm.topic_word_, vocab_sm, top_n=3) #top_n=5 specifies how many words to print from each topic. 

topic_1
> #1. pant (0.118412)
> #2. size (0.092726)
> #3. waist (0.075931)
topic_2
> #1. jacket (0.144163)
> #2. pocket (0.036544)
> #3. coat (0.036257)
topic_3
> #1. pant (0.136149)
> #2. pocket (0.100450)
> #3. short (0.054781)
topic_4
> #1. size (0.091921)
> #2. jean (0.074346)
> #3. pant (0.073724)
topic_5
> #1. sock (0.242329)
> #2. foot (0.125096)
> #3. size (0.065514)
topic_6
> #1. size (0.098758)
> #2. waist (0.046577)
> #3. overall (0.042408)
topic_7
> #1. jean (0.161233)
> #2. pair (0.071584)
> #3. quality (0.054205)
topic_8
> #1. size (0.148068)
> #2. jacket (0.080433)
> #3. shirt (0.070014)
topic_9
> #1. coat (0.198041)
> #2. year (0.062038)
> #3. jacket (0.051131)
topic_10
> #1. shirt (0.290295)
> #2. quality (0.050263)
> #3. work (0.049176)
topic_11
> #1. sweatshirt (0.113311)
> #2. jacket (0.059612)
> #3. quality (0.049924)
topic_12
> #1. jacket (0.062029)
> #2. coat (0.051841)
> #3. winter (0.042402)
topic_13
> #1. pant (0.189904)
> #2. pocket (0.097128)
> #3. work (0.0

note: there is a lot of word overlap between topics and not a lot of correlation within topics. This means not the best model. 

To decrease the repitition in topics, we an either decrease K or change eta. Maybe increase number of topics too?

##2/ Low alpha and Reg beta
*   k = 30
*   beta = 0.1
*   alpha = 1/(10*k)

### build

In [None]:
print("Time of start: '%s'" % get_time())

In [None]:
# set data to use
dtms = {
    'smaller': dtm_sm
}

# and fixed hyperparameters
lda_params = {
    'n_topics': 30,
    'eta': .1,
    'alpha': 1/(10*30),
    'n_iter': 1000,
    'random_state': 20210128  # to make results reproducible
}

#create the topic models
models2 = compute_models_parallel(dtms, constant_parameters = lda_params)
print("Time of finish: '%s'" % get_time())

Time of finish: '2021-02-10 20:47:02.918284'


### view

In [None]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
model2_sm = models2['smaller'][0][1] #extracts smaller model and pulls first and second item from the model
print_ldamodel_topic_words(model2_sm.topic_word_, vocab_sm, top_n=3) #top_n=5 specifies how many words to print from each topic. 

topic_1
> #1. pocket (0.160001)
> #2. pant (0.152627)
> #3. pair (0.044671)
topic_2
> #1. wallet (0.254235)
> #2. pocket (0.094775)
> #3. money (0.043008)
topic_3
> #1. short (0.159956)
> #2. pocket (0.070905)
> #3. pair (0.057895)
topic_4
> #1. sweatshirt (0.101383)
> #2. hoodie (0.066856)
> #3. quality (0.043038)
topic_5
> #1. boot (0.238398)
> #2. pair (0.064957)
> #3. foot (0.050642)
topic_6
> #1. vest (0.102071)
> #2. pocket (0.050140)
> #3. color (0.045510)
topic_7
> #1. pant (0.090596)
> #2. work (0.066710)
> #3. jean (0.055436)
topic_8
> #1. shirt (0.209316)
> #2. sleeve (0.052644)
> #3. color (0.039215)
topic_9
> #1. shirt (0.268178)
> #2. quality (0.061622)
> #3. work (0.051838)
topic_10
> #1. sock (0.208096)
> #2. size (0.128403)
> #3. foot (0.101111)
topic_11
> #1. jacket (0.216781)
> #2. pocket (0.052083)
> #3. hood (0.045791)
topic_12
> #1. product (0.091587)
> #2. quality (0.076257)
> #3. amazon (0.068799)
topic_13
> #1. jean (0.237019)
> #2. pair (0.069520)
> #3. fit (0

##3/ High beta and Reg alpha
* k = 35
* beta = .5
* alpha = 1/k

__*NOTE: BEST MODEL*__

### build

In [None]:
print("Time of start: '%s'" % get_time())

Time of start: '2021-02-10 21:00:52.633705'


In [None]:
#set data to use
dtms = {
    'smaller': dtm_sm
}

#and fixed hyperparameters
lda_params = {
    'n_topics': 35,
    'eta': .5,
    'alpha': 1/35,
    'n_iter': 1000,
    'random_state': 20210128  #to make results reproducible
}

#create the topic models
models3 = compute_models_parallel(dtms, constant_parameters = lda_params)
print("Time of finish: '%s'" % get_time())

Time of finish: '2021-02-10 21:02:11.442903'


### view

In [None]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
model3_sm = models3['smaller'][0][1] #extracts smaller model and pulls first and second item from the model
print_ldamodel_topic_words(model3_sm.topic_word_, vocab_sm, top_n=3) #top_n=5 specifies how many words to print from each topic. 

topic_1
> #1. sock (0.254454)
> #2. size (0.129860)
> #3. foot (0.113053)
topic_2
> #1. wallet (0.258247)
> #2. pocket (0.105915)
> #3. money (0.050585)
topic_3
> #1. size (0.142064)
> #2. price (0.099189)
> #3. store (0.089415)
topic_4
> #1. pant (0.195349)
> #2. pair (0.091815)
> #3. work (0.077420)
topic_5
> #1. shirt (0.381052)
> #2. quality (0.047871)
> #3. pocket (0.035288)
topic_6
> #1. one (0.113707)
> #2. year (0.110729)
> #3. jacket (0.081186)
topic_7
> #1. jacket (0.411601)
> #2. weather (0.033829)
> #3. winter (0.030524)
topic_8
> #1. brand (0.087680)
> #2. quality (0.087680)
> #3. work (0.082542)
topic_9
> #1. shirt (0.334226)
> #2. quality (0.075703)
> #3. color (0.063961)
topic_10
> #1. product (0.202918)
> #2. quality (0.189286)
> #3. price (0.085256)
topic_11
> #1. coat (0.411426)
> #2. year (0.056716)
> #3. winter (0.056099)
topic_12
> #1. size (0.071552)
> #2. shoulder (0.053052)
> #3. arm (0.047610)
topic_13
> #1. pant (0.304052)
> #2. work (0.116021)
> #3. pair (0.

##4/ High beta and Low alpha
*   k = 25
*   beta = 0.5
*   alpha = 1/(10*k)


### build

In [None]:
print("Time of start: '%s'" % get_time())

Time of start: '14:28:28 PM'


In [None]:
#set data to use
dtms = {
    'smaller': dtm_sm
}

#and fixed hyperparameters
lda_params = {
    'n_topics': 25,
    'eta': .5,
    'alpha': 1/(10*35),
    'n_iter': 1000,
    'random_state': 20210128  #to make results reproducible
}

#create the topic models
models4 = compute_models_parallel(dtms, constant_parameters = lda_params)
print("Time of finish: '%s'" % get_time())

Time of finish: '14:29:26 PM'


### view

In [None]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
model4_sm = models4['smaller'][0][1] #extracts smaller model and pulls first and second item from the model
print_ldamodel_topic_words(model4_sm.topic_word_, vocab_sm, top_n=3) #top_n=5 specifies how many words to print from each topic. 

topic_1
> #1. size (0.070609)
> #2. pocket (0.063987)
> #3. pant (0.062755)
topic_2
> #1. jacket (0.166357)
> #2. pocket (0.057169)
> #3. coat (0.052499)
topic_3
> #1. pant (0.143094)
> #2. pocket (0.081844)
> #3. work (0.061024)
topic_4
> #1. shirt (0.289615)
> #2. color (0.040021)
> #3. quality (0.029902)
topic_5
> #1. sock (0.161284)
> #2. boot (0.146952)
> #3. foot (0.096306)
topic_6
> #1. pant (0.109541)
> #2. work (0.049244)
> #3. pair (0.047741)
topic_7
> #1. pair (0.094459)
> #2. pant (0.073974)
> #3. jean (0.054714)
topic_8
> #1. jacket (0.133121)
> #2. size (0.094314)
> #3. sleeve (0.039957)
topic_9
> #1. shirt (0.236493)
> #2. quality (0.087368)
> #3. work (0.044631)
topic_10
> #1. pant (0.171700)
> #2. pocket (0.113974)
> #3. pair (0.054558)
topic_11
> #1. jacket (0.291330)
> #2. one (0.033150)
> #3. year (0.030572)
topic_12
> #1. gift (0.082249)
> #2. christmas (0.080772)
> #3. son (0.068033)
topic_13
> #1. wallet (0.247940)
> #2. pocket (0.084528)
> #3. money (0.040515)
t