## Topic Modeling

In [1]:
import pandas as pd
import pickle
import gensim
from pprint import pprint


In [7]:
# Load the pre-processed documents from disk; the output below shows that each
# entry is a list of string tokens.
# NOTE(review): pickle.load can execute arbitrary code, so only load files you trust.
with open("clean_sents.pkl", "rb") as f:
    clean_sents = pickle.load(f)

# Display the first document as a sanity check.
clean_sents[0]

['climb',
 'owl',
 'head',
 'october',
 'complete',
 'northeast',
 'new_england',
 'new_hampshire',
 'list',
 'peak',
 'mean',
 'go_back',
 'true_summit',
 'relocate',
 'mile',
 'get',
 'fish',
 'fry',
 'saying',
 'go',
 'new_england',
 'high',
 'peak',
 'remain',
 'finish',
 'list',
 'finally',
 'decide',
 'update',
 'credential',
 'grab',
 'new',
 'notsonew',
 'tippy',
 'august',
 'nice',
 'long',
 'hike',
 'thought',
 'scramble',
 'go',
 'way',
 'tough',
 'longer_than',
 'turned_out',
 'water',
 'crossing',
 'send',
 'return',
 'mean',
 'chance',
 'soak',
 'foot',
 'nice',
 'cold',
 'water',
 'soul_until',
 'start',
 'take',
 'four_hour',
 'ten_minute',
 'lincoln_wood',
 'parking_lot',
 'river',
 'crossing',
 'high',
 'take',
 'shoe',
 'sock',
 'keep',
 'dry',
 'first_time',
 'simply',
 'wad',
 'army',
 'engineer',
 'officer',
 'meet',
 'fun',
 'talk',
 'turn',
 'right',
 'head',
 'uphill',
 'garfield',
 'second',
 'mile',
 'cold',
 'bit',
 'slow',
 'time',
 'get',
 'galehead',
 'hu

In [3]:
# Token <-> integer-id mapping built from every document in clean_sents.
id2word = gensim.corpora.Dictionary(clean_sents)

# Bag-of-words encoding of each document, one id2word.doc2bow call per document.
corpus = list(map(id2word.doc2bow, clean_sents))

In [4]:
# Human-readable view of the first document's bag-of-words: (token, count) pairs.
[(id2word[token_id], count) for token_id, count in corpus[0]]

[('about_am', 1),
 ('about_ft', 2),
 ('about_hour', 1),
 ('accord', 1),
 ('actual', 1),
 ('actually', 3),
 ('adapt', 1),
 ('adventure', 1),
 ('after_leav', 1),
 ('ahead', 1),
 ('amazed', 1),
 ('announce', 1),
 ('anyways', 1),
 ('approach', 1),
 ('army', 1),
 ('as_expect', 1),
 ('august', 1),
 ('awhile', 1),
 ('bad', 3),
 ('battle', 1),
 ('beautiful', 1),
 ('bent', 1),
 ('bike', 4),
 ('bit', 1),
 ('black', 1),
 ('blame', 1),
 ('blood', 1),
 ('blowdown', 1),
 ('branch', 1),
 ('brook', 1),
 ('brook_trail', 2),
 ('bummer', 1),
 ('bushwack', 1),
 ('bushwhack', 3),
 ('busy', 1),
 ('cairn', 6),
 ('came_back', 2),
 ('carefully', 1),
 ('challenge', 1),
 ('chance', 1),
 ('change', 2),
 ('cheated', 1),
 ('circle', 1),
 ('class', 2),
 ('clear', 2),
 ('climb', 4),
 ('clothe', 1),
 ('club', 1),
 ('cold', 2),
 ('colorado', 1),
 ('come', 2),
 ('company', 1),
 ('complete', 2),
 ('confirm', 1),
 ('continue', 1),
 ('could_hear', 1),
 ('count', 1),
 ('course', 1),
 ('credential', 1),
 ('cross', 1),
 ('cro

In [5]:
# Baseline LDA model on the unfiltered corpus: 5 topics, 5 passes.
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=5,
    chunksize=100,
    workers=3,
    random_state=100,
    per_word_topics=True,
)

# Persist the trained model so it can be reloaded without retraining.
lda_model.save('lda_model.model')

In [6]:
# Reload the baseline model from the file written by lda_model.save(...).
lda_model = gensim.models.ldamulticore.LdaMulticore.load('lda_model.model')

In [7]:
# Pretty-print each topic as (topic id, weighted top words).
pprint(lda_model.print_topics())

[(0,
  '0.031*"climb" + 0.013*"route" + 0.011*"great" + 0.009*"snow" + '
  '0.009*"mountain" + 0.009*"weather" + 0.008*"good" + 0.007*"time" + '
  '0.006*"start" + 0.006*"nice"'),
 (1,
  '0.025*"hike" + 0.020*"trail" + 0.016*"climb" + 0.013*"great" + 0.011*"view" '
  '+ 0.011*"peak" + 0.010*"snow" + 0.009*"nice" + 0.009*"time" + '
  '0.009*"mountain"'),
 (2,
  '0.027*"climb" + 0.014*"route" + 0.010*"great" + 0.010*"snow" + 0.009*"peak" '
  '+ 0.008*"fun" + 0.008*"lake" + 0.008*"hike" + 0.007*"way" + 0.007*"get"'),
 (3,
  '0.022*"climb" + 0.013*"great" + 0.012*"hike" + 0.011*"snow" + 0.010*"route" '
  '+ 0.008*"way" + 0.008*"peak" + 0.007*"nice" + 0.007*"mountain" + '
  '0.007*"time"'),
 (4,
  '0.013*"hike" + 0.010*"drive" + 0.009*"nice" + 0.008*"climb" + 0.008*"way" + '
  '0.008*"road" + 0.007*"trail" + 0.007*"mountain" + 0.007*"time" + '
  '0.007*"view"')]


In [2]:
# add more stop words

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Domain words that dominated every topic of the baseline model.
# NOTE(review): the model outputs further down still list "reach", "hour" and
# "trip", so they were presumably produced before those words were added here;
# re-run the downstream cells to refresh them.
stop_words.extend(['climb','hike','mountain','peak','great','good','time','hour','nice','beautiful','view',
                   'route','trail','ridge','go','way','get','take','start','trip','reach','weather'])

def remove_stopwords(texts):
    """Return a copy of ``texts`` with every stop word removed.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each document is converted with ``str()`` and
        re-tokenised by ``gensim.utils.simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One token list per input document, without any token found in the
        module-level ``stop_words``.

    Notes
    -----
    NOTE(review): with its default settings ``simple_preprocess`` lowercases
    text and discards tokens shorter than 2 or longer than 15 characters, so
    long joined phrases may be dropped here as well -- confirm this is intended.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the list rescans every stop word for every token.
    stop_set = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

In [9]:
# Apply the stop-word filter to every document.
clean_sents_stop = remove_stopwords(clean_sents)

In [10]:
# Rebuild the token <-> id mapping from the stop-word-filtered documents.
id2word_stop = gensim.corpora.Dictionary(clean_sents_stop)

# Bag-of-words encoding of each filtered document.
corpus_stop = list(map(id2word_stop.doc2bow, clean_sents_stop))

In [23]:
# Filtered corpus, 5 topics, 5 passes.

lda_stop = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus_stop,
    id2word=id2word_stop,
    num_topics=5,
    passes=5,
    chunksize=100,
    workers=3,
    random_state=100,
    per_word_topics=True,
)

# Save the trained model next to the notebook.
lda_stop.save('lda_stop.model')

In [24]:
# Pretty-print the topics of the 5-topic model trained on the filtered corpus.
pprint(lda_stop.print_topics())

[(0,
  '0.010*"snow" + 0.008*"hut" + 0.008*"reach" + 0.006*"hour" + 0.006*"glacier" '
  '+ 0.005*"condition" + 0.005*"night" + 0.005*"long" + 0.005*"trip" + '
  '0.004*"leave"'),
 (1,
  '0.015*"snow" + 0.007*"fun" + 0.006*"long" + 0.005*"little" + 0.005*"trip" + '
  '0.004*"hour" + 0.004*"easy" + 0.004*"leave" + 0.004*"car" + 0.004*"rock"'),
 (2,
  '0.014*"snow" + 0.007*"camp" + 0.006*"glacier" + 0.005*"fun" + 0.005*"trip" '
  '+ 0.005*"lake" + 0.005*"rock" + 0.005*"leave" + 0.004*"long" + '
  '0.004*"hour"'),
 (3,
  '0.008*"snow" + 0.007*"fun" + 0.006*"drive" + 0.006*"road" + 0.006*"lake" + '
  '0.005*"trip" + 0.005*"little" + 0.005*"rock" + 0.005*"long" + 0.004*"easy"'),
 (4,
  '0.011*"fun" + 0.008*"rock" + 0.008*"pitch" + 0.007*"snow" + 0.005*"class" + '
  '0.005*"traverse" + 0.005*"climbing" + 0.005*"easy" + 0.004*"long" + '
  '0.004*"couloir"')]


In [25]:
# Filtered corpus, 4 topics, 30 passes.

lda_stop2 = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus_stop,
    id2word=id2word_stop,
    num_topics=4,
    passes=30,
    chunksize=100,
    workers=3,
    random_state=100,
    per_word_topics=True,
)

# Save the trained model next to the notebook.
lda_stop2.save('lda_stop2.model')

In [26]:
# Pretty-print the topics of the 4-topic, 30-pass model.
pprint(lda_stop2.print_topics())

[(0,
  '0.009*"snow" + 0.009*"hut" + 0.008*"reach" + 0.006*"hour" + '
  '0.006*"condition" + 0.005*"glacier" + 0.005*"long" + 0.005*"night" + '
  '0.005*"guide" + 0.004*"easy"'),
 (1,
  '0.014*"snow" + 0.007*"fun" + 0.006*"long" + 0.005*"little" + 0.004*"trip" + '
  '0.004*"easy" + 0.004*"winter" + 0.004*"hour" + 0.004*"trailhead" + '
  '0.004*"road"'),
 (2,
  '0.015*"snow" + 0.008*"fun" + 0.007*"camp" + 0.006*"rock" + 0.006*"lake" + '
  '0.005*"leave" + 0.005*"trip" + 0.005*"glacier" + 0.005*"long" + '
  '0.004*"traverse"'),
 (3,
  '0.008*"fun" + 0.007*"drive" + 0.006*"road" + 0.005*"trip" + 0.005*"rock" + '
  '0.005*"lake" + 0.005*"little" + 0.004*"long" + 0.004*"easy" + 0.004*"find"')]


The model with 30 passes isn't much different from the one above with 5 passes, but it took nearly 10 times longer to run, so I'll stick with 5 passes while testing different numbers of topics.

In [30]:
# Filtered corpus, 3 topics, 5 passes.

lda_stop3 = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus_stop,
    id2word=id2word_stop,
    num_topics=3,
    passes=5,
    chunksize=100,
    workers=3,
    random_state=100,
    per_word_topics=True,
)

# Save the trained model; a later cell reloads it from this file.
lda_stop3.save('lda_stop3.model')

In [33]:
# Pretty-print the 30 highest-weighted words of each topic in the 3-topic model.
pprint(lda_stop3.print_topics(num_words=30))

[(0,
  '0.012*"snow" + 0.009*"fun" + 0.007*"lake" + 0.006*"long" + 0.006*"rock" + '
  '0.005*"camp" + 0.005*"traverse" + 0.004*"little" + 0.004*"class" + '
  '0.004*"easy" + 0.004*"car" + 0.004*"leave" + 0.003*"pass" + 0.003*"find" + '
  '0.003*"scramble" + 0.003*"climbing" + 0.003*"trailhead" + 0.003*"foot" + '
  '0.003*"head" + 0.003*"come" + 0.003*"solo" + 0.003*"north" + 0.003*"right" '
  '+ 0.003*"summit" + 0.003*"pitch" + 0.003*"approach" + 0.003*"descent" + '
  '0.003*"look" + 0.003*"couloir" + 0.002*"south"'),
 (1,
  '0.012*"snow" + 0.006*"fun" + 0.005*"long" + 0.005*"rock" + 0.005*"easy" + '
  '0.004*"glacier" + 0.004*"hut" + 0.004*"condition" + 0.004*"leave" + '
  '0.004*"little" + 0.004*"descent" + 0.004*"traverse" + 0.003*"wind" + '
  '0.003*"ice" + 0.003*"camp" + 0.003*"night" + 0.003*"cloud" + 0.003*"summit" '
  '+ 0.003*"people" + 0.003*"come" + 0.003*"steep" + 0.003*"ski" + 0.003*"day" '
  '+ 0.003*"ascent" + 0.003*"walk" + 0.003*"bit" + 0.003*"find" + '
  '0.002*"climb

These three topics are finally starting to look reasonable.  I've picked out a few keywords for each topic:  
- Topic 0: lake, camp, traverse, scramble
- Topic 1: glacier, hut, condition, ice
- Topic 2: drive, road, walk


In [30]:
# testing the top topics for a couple mountains

# Reload with the same class that trained and saved the model, consistent with
# how 'lda_model.model' is reloaded earlier in the notebook.
lda_stop3 = gensim.models.ldamulticore.LdaMulticore.load('lda_stop3.model')


def ranked_topics(model, bow):
    """Return one document's (topic_id, probability) pairs, most probable first.

    Parameters
    ----------
    model : object
        Topic model exposing ``get_document_topics``.
    bow : list
        Bag-of-words representation of a single document.
    """
    return sorted(model.get_document_topics(bow), key=lambda pair: pair[1], reverse=True)


mtn1 = ranked_topics(lda_stop3, corpus_stop[0])    # Owl's Head Mountain
mtn2 = ranked_topics(lda_stop3, corpus_stop[3])    # Aconcagua
mtn3 = ranked_topics(lda_stop3, corpus_stop[853])  # Y Mountain

print(f"Top topics for Owl's Head Mountain {mtn1}")
print(f"Top topics for Aconcagua {mtn2}")
print(f"Top topics for Y Mountain {mtn3}")

Top topics for Owl's Head Mountain [(2, 0.9287309), (0, 0.06018344), (1, 0.011085696)]
Top topics for Aconcagua [(1, 0.9997894)]
Top topics for Y Mountain [(2, 0.99875593)]


Creating an LDA visualization, we can dig a little deeper on the specific words in each topic.

In [22]:
import pyLDAvis

# Newer pyLDAvis releases renamed the gensim adapter module from
# `pyLDAvis.gensim` to `pyLDAvis.gensim_models`. Try the new name first and
# fall back to the old one so the cell runs on either version.
try:
    from pyLDAvis import gensim_models as gensimvis
except ImportError:
    from pyLDAvis import gensim as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualization for the 3-topic model and write it to a standalone
# HTML file.
vis = gensimvis.prepare(lda_stop3, corpus_stop, id2word_stop)
pyLDAvis.save_html(vis, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


If you slide the relevance metric to $\lambda = 0.2$, terms are ranked more by how distinctive they are for a topic (their probability within the topic relative to their overall probability across the corpus) than by their raw probability within the topic.  This emphasizes the words that I plucked out originally and adds a few more as well (**bolded** below).

- Topic 0: lake, camp, traverse, scramble, **class, pitch, exposure, approach** *(rock climbing words)*
- Topic 1: glacier, hut, condition, ice, **normal_route, crevasse, refuge, guide** *(mountaineering words)*
- Topic 2: drive, road, walk, **highpoint, tram, mile, hiking** *(leisurely hiking words)*

*Note: the topic numbers in the visualization do not match the model's topic IDs used above, but the top words match.*

In [32]:
# Load the pickled `dtm` object; its index supplies the mountain names used below.
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code, so only load files you trust.
with open('dtm_combined.pkl', 'rb') as dtm_file:
    dtm = pickle.load(dtm_file)

In [62]:
# Dominant topic of every document: the topic id with the highest probability.
# Iterating corpus_stop directly avoids building a transformed corpus just to
# take its length, and max() finds the top entry without sorting the full list
# (on ties it keeps the first entry, as the previous stable sort did).
tops = [
    max(lda_stop3.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in corpus_stop
]

tops[:10]

[2, 2, 2, 1, 0, 1, 1, 1, 1, 1]

In [63]:
# Pair each entry of dtm.index (the mountain names) with its dominant topic, in order.
mtn_topics = [*zip(dtm.index, tops)]
mtn_topics[:10]

[('"Owl\'s Head Mountain" (Peak above Owl\'s Head)', 2),
 ('Abajo Peak', 2),
 ('Abercrombie Mountain', 2),
 ('Aconcagua', 1),
 ('Adams, Mount', 0),
 ('Agassiz Peak', 1),
 ('Agung', 1),
 ('Aiguille Dibona', 1),
 ('Aiguille Verte', 1),
 ("Aiguille d'Argentière", 1)]

In [64]:
# One row per mountain with its dominant topic.
df_mtn_topics = pd.DataFrame({'mountain': dtm.index, 'topic': tops})
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df_mtn_topics.info()
# Fixed seed (same value as the LDA random_state) so the displayed sample is
# identical on every run.
df_mtn_topics.sample(20, random_state=100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 2 columns):
mountain    861 non-null object
topic       861 non-null int64
dtypes: int64(1), object(1)
memory usage: 13.5+ KB
None


Unnamed: 0,mountain,topic
254,Half Dome,0
70,Blue Mountain,2
360,Marys Peak,2
670,Ptarmigan Traverse,1
601,Naranjo de Bulnes (Picu Urriellu),2
122,Cheyenne Mountain,2
548,Mount Stone,0
537,Mount Shavano,0
225,Gobblers Knob,0
551,Mount Sunflower,2


In [76]:
# exporting mountain topic dataframe to CSV
# (index=False keeps the row index out of the file)

df_mtn_topics.to_csv('./mtn_topics.csv', index=False)

In [6]:
# Read the mountain location metadata and the topic assignments written to
# './mtn_topics.csv' earlier in the notebook.
df_locations = pd.read_csv('./mtn_locations.csv')
df_mtn_topics = pd.read_csv('./mtn_topics.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df_locations.info()
df_locations

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 6 columns):
mountain     960 non-null object
continent    960 non-null object
country      960 non-null object
location     921 non-null object
url          960 non-null object
img_url      960 non-null object
dtypes: object(6)
memory usage: 45.1+ KB
None


Unnamed: 0,mountain,continent,country,location,url,img_url
0,Mount Whitney,North America,United States,California,http://www.summitpost.org/mount-whitney/150227,https://sp-images.summitpost.org/469726.JPG?au...
1,Mount Rainier,North America,United States,Washington,http://www.summitpost.org/mount-rainier/150291,https://sp-images.summitpost.org/457178.jpg?au...
2,Mount Shasta,North America,United States,California,http://www.summitpost.org/mount-shasta/150188,https://sp-images.summitpost.org/878331.jpg?au...
3,Mount Hood,North America,United States,Oregon,http://www.summitpost.org/mount-hood/150189,https://sp-images.summitpost.org/5175.jpg?auto...
4,Denali,North America,United States,Alaska,http://www.summitpost.org/denali/150199,https://sp-images.summitpost.org/831080.JPG?au...
5,Mount Elbert,North America,United States,Colorado,http://www.summitpost.org/mount-elbert/150325,https://sp-images.summitpost.org/844972.JPG?au...
6,Katahdin,North America,United States,Maine,http://www.summitpost.org/katahdin/150219,https://sp-images.summitpost.org/74769.jpg?aut...
7,Aconcagua,South America,Argentina,Mendoza,http://www.summitpost.org/aconcagua/150197,https://sp-images.summitpost.org/146685.jpg?au...
8,Mount Adams,North America,United States,Washington,http://www.summitpost.org/mount-adams/150198,https://sp-images.summitpost.org/80003.jpg?aut...
9,Grand Teton,North America,United States,Wyoming,http://www.summitpost.org/grand-teton/150312,https://sp-images.summitpost.org/844255.jpg?au...


In [7]:
# Raise the pandas row-display limit for the rest of the session.
pd.set_option('display.max_rows', 999)

# Left join on the mountain name: every row of df_mtn_topics is kept and the
# location columns are attached where a matching name exists.
# NOTE(review): if df_locations contains repeated mountain names, the matching
# topic rows will be repeated in the result -- confirm names are unique.
df_joined = df_mtn_topics.join(
    df_locations.set_index('mountain'),
    on='mountain',
    how='left',
)
df_joined

Unnamed: 0,mountain,topic,continent,country,location,url,img_url
0,"""Owl's Head Mountain"" (Peak above Owl's Head)",2,North America,United States,New Hampshire,http://www.summitpost.org/owl-s-head-mountain-...,https://sp-images.summitpost.org/328693.JPG?au...
1,Abajo Peak,2,North America,United States,Utah,http://www.summitpost.org/abajo-peak/153601,https://sp-images.summitpost.org/86784.jpg?aut...
2,Abercrombie Mountain,2,North America,United States,Washington,http://www.summitpost.org/abercrombie-mountain...,https://sp-images.summitpost.org/36084.jpg?aut...
3,Aconcagua,1,South America,Argentina,Mendoza,http://www.summitpost.org/aconcagua/150197,https://sp-images.summitpost.org/146685.jpg?au...
4,"Adams, Mount",0,North America,United States,Colorado,http://www.summitpost.org/adams-mount/150608,https://sp-images.summitpost.org/43479.jpg?aut...
5,Agassiz Peak,1,North America,United States,Arizona,http://www.summitpost.org/agassiz-peak/150289,http://www.summitpost.org/agassiz-peak/150289
6,Agung,1,Asia,Indonesia,Bali,http://www.summitpost.org/agung/152336,https://sp-images.summitpost.org/286411.jpg?au...
7,Aiguille Dibona,1,Europe,France,Écrins Massif/Isère,http://www.summitpost.org/aiguille-dibona/150836,https://sp-images.summitpost.org/812061.JPG?au...
8,Aiguille Verte,1,Europe,France,Haute Savoie (Mont Blanc),http://www.summitpost.org/aiguille-verte/150223,https://sp-images.summitpost.org/737593.jpg?au...
9,Aiguille d'Argentière,1,Europe,France,Haute Savoie (Mont Blanc),http://www.summitpost.org/aiguille-d-argenti-r...,https://sp-images.summitpost.org/597767.jpg?au...


In [8]:
# Write the joined topic/location table to CSV without the row index.
df_joined.to_csv('./mtn_joined.csv', index=False)