## Topic Modeling

In [1]:
import pandas as pd
import pickle
import gensim
from pprint import pprint


In [2]:
with open("clean_sents.pkl", "rb") as f:
    clean_sents = pickle.load(f)

clean_sents[0]

['climb',
 'owl',
 'head',
 'october',
 'complete',
 'northeast',
 'new_england',
 'new_hampshire',
 'list',
 'peak',
 'mean',
 'go_back',
 'true_summit',
 'relocate',
 'mile',
 'get',
 'fish',
 'fry',
 'saying',
 'go',
 'new_england',
 'high',
 'peak',
 'remain',
 'finish',
 'list',
 'finally',
 'decide',
 'update',
 'credential',
 'grab',
 'new',
 'notsonew',
 'tippy',
 'august',
 'nice',
 'long',
 'hike',
 'thought',
 'scramble',
 'go',
 'way',
 'tough',
 'longer_than',
 'turned_out',
 'water',
 'crossing',
 'send',
 'return',
 'mean',
 'chance',
 'soak',
 'foot',
 'nice',
 'cold',
 'water',
 'soul_until',
 'start',
 'take',
 'four_hour',
 'ten_minute',
 'lincoln_wood',
 'parking_lot',
 'river',
 'crossing',
 'high',
 'take',
 'shoe',
 'sock',
 'keep',
 'dry',
 'first_time',
 'simply',
 'wad',
 'army',
 'engineer',
 'officer',
 'meet',
 'fun',
 'talk',
 'turn',
 'right',
 'head',
 'uphill',
 'garfield',
 'second',
 'mile',
 'cold',
 'bit',
 'slow',
 'time',
 'get',
 'galehead',
 'hu

In [3]:
id2word = gensim.corpora.Dictionary(clean_sents)

corpus = [id2word.doc2bow(t) for t in clean_sents]

In [4]:
[(id2word[id], freq) for id, freq in corpus[0]]

[('about_am', 1),
 ('about_ft', 2),
 ('about_hour', 1),
 ('accord', 1),
 ('actual', 1),
 ('actually', 3),
 ('adapt', 1),
 ('adventure', 1),
 ('after_leav', 1),
 ('ahead', 1),
 ('amazed', 1),
 ('announce', 1),
 ('anyways', 1),
 ('approach', 1),
 ('army', 1),
 ('as_expect', 1),
 ('august', 1),
 ('awhile', 1),
 ('bad', 3),
 ('battle', 1),
 ('beautiful', 1),
 ('bent', 1),
 ('bike', 4),
 ('bit', 1),
 ('black', 1),
 ('blame', 1),
 ('blood', 1),
 ('blowdown', 1),
 ('branch', 1),
 ('brook', 1),
 ('brook_trail', 2),
 ('bummer', 1),
 ('bushwack', 1),
 ('bushwhack', 3),
 ('busy', 1),
 ('cairn', 6),
 ('came_back', 2),
 ('carefully', 1),
 ('challenge', 1),
 ('chance', 1),
 ('change', 2),
 ('cheated', 1),
 ('circle', 1),
 ('class', 2),
 ('clear', 2),
 ('climb', 4),
 ('clothe', 1),
 ('club', 1),
 ('cold', 2),
 ('colorado', 1),
 ('come', 2),
 ('company', 1),
 ('complete', 2),
 ('confirm', 1),
 ('continue', 1),
 ('could_hear', 1),
 ('count', 1),
 ('course', 1),
 ('credential', 1),
 ('cross', 1),
 ('cro

In [5]:
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           workers = 3,
                                           random_state=100,
                                           chunksize=100,
                                           passes=5,
                                           per_word_topics=True)

lda_model.save('lda_model.model')

In [6]:
lda_model = gensim.models.ldamulticore.LdaMulticore.load('lda_model.model')

In [7]:
pprint(lda_model.print_topics())

[(0,
  '0.031*"climb" + 0.013*"route" + 0.011*"great" + 0.009*"snow" + '
  '0.009*"mountain" + 0.009*"weather" + 0.008*"good" + 0.007*"time" + '
  '0.006*"start" + 0.006*"nice"'),
 (1,
  '0.025*"hike" + 0.020*"trail" + 0.016*"climb" + 0.013*"great" + 0.011*"view" '
  '+ 0.011*"peak" + 0.010*"snow" + 0.009*"nice" + 0.009*"time" + '
  '0.009*"mountain"'),
 (2,
  '0.027*"climb" + 0.014*"route" + 0.010*"great" + 0.010*"snow" + 0.009*"peak" '
  '+ 0.008*"fun" + 0.008*"lake" + 0.008*"hike" + 0.007*"way" + 0.007*"get"'),
 (3,
  '0.022*"climb" + 0.013*"great" + 0.012*"hike" + 0.011*"snow" + 0.010*"route" '
  '+ 0.008*"way" + 0.008*"peak" + 0.007*"nice" + 0.007*"mountain" + '
  '0.007*"time"'),
 (4,
  '0.013*"hike" + 0.010*"drive" + 0.009*"nice" + 0.008*"climb" + 0.008*"way" + '
  '0.008*"road" + 0.007*"trail" + 0.007*"mountain" + 0.007*"time" + '
  '0.007*"view"')]


In [20]:
# add more stop words

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['climb','hike','mountain','peak','great','good','time','nice','beautiful','view','weather',
                   'route','trail','ridge','go','way','get','take','start'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [21]:
clean_sents_stop = remove_stopwords(clean_sents)

In [22]:
id2word_stop = gensim.corpora.Dictionary(clean_sents_stop)

corpus_stop = [id2word_stop.doc2bow(t) for t in clean_sents_stop]

In [23]:
lda_stop = gensim.models.ldamulticore.LdaMulticore(corpus=corpus_stop,
                                           id2word=id2word_stop,
                                           num_topics=5, 
                                           workers=3,
                                           random_state=100,
                                           chunksize=100,
                                           passes=5,
                                           per_word_topics=True)

lda_stop.save('lda_stop.model')

In [24]:
pprint(lda_stop.print_topics())

[(0,
  '0.010*"snow" + 0.008*"hut" + 0.008*"reach" + 0.006*"hour" + 0.006*"glacier" '
  '+ 0.005*"condition" + 0.005*"night" + 0.005*"long" + 0.005*"trip" + '
  '0.004*"leave"'),
 (1,
  '0.015*"snow" + 0.007*"fun" + 0.006*"long" + 0.005*"little" + 0.005*"trip" + '
  '0.004*"hour" + 0.004*"easy" + 0.004*"leave" + 0.004*"car" + 0.004*"rock"'),
 (2,
  '0.014*"snow" + 0.007*"camp" + 0.006*"glacier" + 0.005*"fun" + 0.005*"trip" '
  '+ 0.005*"lake" + 0.005*"rock" + 0.005*"leave" + 0.004*"long" + '
  '0.004*"hour"'),
 (3,
  '0.008*"snow" + 0.007*"fun" + 0.006*"drive" + 0.006*"road" + 0.006*"lake" + '
  '0.005*"trip" + 0.005*"little" + 0.005*"rock" + 0.005*"long" + 0.004*"easy"'),
 (4,
  '0.011*"fun" + 0.008*"rock" + 0.008*"pitch" + 0.007*"snow" + 0.005*"class" + '
  '0.005*"traverse" + 0.005*"climbing" + 0.005*"easy" + 0.004*"long" + '
  '0.004*"couloir"')]


In [None]:
lda_stop2 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus_stop,
                                           id2word=id2word_stop,
                                           num_topics=4, 
                                           workers=3,
                                           random_state=100,
                                           chunksize=100,
                                           passes=30,
                                           per_word_topics=True)

lda_stop2.save('lda_stop2.model')

In [None]:
pprint(lda_stop2.print_topics())