In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv('climber_logs.csv')
data['comment_date'] = pd.to_datetime(data['comment_date'])
data['climb_date'] = pd.to_datetime(data['climb_date'])
data['comment'] = data['comment'].replace(np.nan, '', regex=True)

print(data.info())
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35638 entries, 0 to 35637
Data columns (total 4 columns):
mountain        35638 non-null object
comment         35638 non-null object
comment_date    35638 non-null datetime64[ns]
climb_date      22860 non-null datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 1.1+ MB
None


Unnamed: 0,mountain,comment,comment_date,climb_date
0,Mount Whitney,Did this in a single day... very difficult for...,2018-11-19 08:51:00,2015-09-05
1,Mount Whitney,Mountaineers Route - first 14er,2018-11-01 06:34:00,NaT
2,Mount Whitney,Standard route from Whitney Portal via Trail C...,2018-10-01 12:44:00,2018-10-27
3,Mount Whitney,Worth hiking the 220-something miles along the...,2018-09-10 10:23:00,2016-07-23
4,Mount Whitney,"My brother John, his son and I backpacked in t...",2018-08-15 02:28:00,2018-08-10
5,Mount Whitney,Words will never describe the breathtaking vie...,2018-08-02 11:50:00,2018-07-01
6,Mount Whitney,Portal out and back,2018-07-31 07:51:00,2018-07-29
7,Mount Whitney,Great 2am ascent from Trail Camp.\nhttps://the...,2018-07-28 20:09:00,2018-07-28
8,Mount Whitney,Hiked from guitar lake in the middle of the ni...,2018-07-24 06:33:00,2014-08-08
9,Mount Whitney,Summited Whitney via the Whitney Trail with tw...,2018-07-02 07:31:00,2018-06-24


In [3]:
# preprocess text - change to lowercase, remove numbers and punctuation

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
no_new_line = lambda x: re.sub('\n', '', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

data['comment'] = data.comment.map(alphanumeric).map(punc_lower).map(no_new_line)
data.head()

Unnamed: 0,mountain,comment,comment_date,climb_date
0,Mount Whitney,did this in a single day very difficult for...,2018-11-19 08:51:00,2015-09-05
1,Mount Whitney,mountaineers route first,2018-11-01 06:34:00,NaT
2,Mount Whitney,standard route from whitney portal via trail c...,2018-10-01 12:44:00,2018-10-27
3,Mount Whitney,worth hiking the something miles along the j...,2018-09-10 10:23:00,2016-07-23
4,Mount Whitney,my brother john his son and i backpacked in t...,2018-08-15 02:28:00,2018-08-10


In [29]:
sample = data['comment'].sample(1000)

## Find Top Words with Count Vectorizer

In [32]:
cv1 = CountVectorizer(stop_words='english')
doc_word = cv1.fit_transform(sample).toarray()
doc_word.shape

(1000, 4875)

In [33]:
feature_names = cv1.get_feature_names()
sum_words = doc_word.sum(axis=0)

words_freq = [(word, sum_words[0, idx]) for word, idx in cv1.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

IndexError: too many indices for array

In [34]:
sum_words

array([1, 3, 4, ..., 1, 3, 1], dtype=int64)

In [28]:
words_freq[:20]

[(('little', 2443), 1),
 (('long', 2463), 1),
 (('easy', 1316), 1),
 (('nice', 2824), 1),
 (('views', 4533), 1),
 (('awesome', 288), 1),
 (('weekend', 4616), 1),
 (('climb', 807), 1),
 (('summit', 4097), 1),
 (('daniel', 1064), 1),
 (('night', 2835), 1),
 (('camped', 626), 1),
 (('moraine', 2724), 1),
 (('glacier', 1741), 1),
 (('big', 418), 1),
 (('mistake', 2680), 1),
 (('sun', 4106), 1),
 (('went', 4627), 1),
 (('winds', 4679), 1),
 (('picked', 3095), 1)]

## Find Top Words with TF-IDF Vectorizer

In [14]:
cv_tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = cv_tfidf.fit_transform(sample).toarray()
X_tfidf.shape

(1000, 4787)

In [15]:

feature_array = np.array(cv_tfidf.get_feature_names())
tfidf_sorting = np.argsort(X_tfidf.flatten()[::-1])

n = 10
top_n = feature_array[tfidf_sorting][:n]

IndexError: index 3188068 is out of bounds for axis 1 with size 4787

## testing LSA topic modeling

In [5]:
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(sample)
doc_word.shape

(1000, 4787)

In [6]:
pd.DataFrame(doc_word.toarray(), index=sample, columns=vectorizer.get_feature_names()).head(10)

Unnamed: 0_level_0,aasgard,ability,able,abol,abondoned,aborting,abseil,absence,absolutely,abundant,...,zermatt,zero,zig,ziplock,zone,zoo,zoomed,zurbriggen,åsa,çlear
comment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a little long,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
very easy nice views up top,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
had an awesome weekend to climb to the e summit of daniel night camped at the moraine of the glacier big mistake as when the sun went down the winds picked up and blew ultra fine glacial dust into my tent which i am still trying to get out skirted the glacier as i was solo and tried staying on rock as much as possible interestingly on the snow field above hyas creek glacier i found an old person helipod that had crashed and melted out views in all direction were spectacular,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gorgeous day lots of sp ers on the summit together today forjan you are an animal running to the summit is superhuman mybackyard and fossana only minutes and minutes behind forjan you qualify as animals also with your second summit of the day wingding it was a pleasure to make your acquaintance along with dave bruce alan and frank i guess mt conness was violated from all sides today,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kelso ridge was very fun,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
this is a long hike make sure to leave enough time to get down before the thunderstorms roll in,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
very straight forward hike trails are well marked and maintained,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
great climb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
took the west route and was surprised at how much more work it is to get to the summit from pandora s box great views of the lakes make it all worth it,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
not sure when i did it sometime in between february and april i encountered some bad vibes from other guys up there and a climbing group that thought they owned the mountain for a day you know how that goes it was a great day otherwise good views and nice hike needed my ice axe for the last bit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
lsa = TruncatedSVD(2) # getting N most important components/topics
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.05362247, 0.02531442])

In [8]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aasgard,ability,able,abol,abondoned,aborting,abseil,absence,absolutely,abundant,...,zermatt,zero,zig,ziplock,zone,zoo,zoomed,zurbriggen,åsa,çlear
component_1,0.001,0.001,0.031,0.0,0.001,0.0,0.001,0.001,0.005,0.0,...,0.013,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001
component_2,0.002,0.0,0.052,0.0,0.0,0.0,0.0,-0.001,0.0,0.0,...,0.005,-0.002,-0.002,0.001,-0.002,-0.0,-0.001,-0.001,0.0,0.0


In [9]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [10]:
display_topics(lsa, vectorizer.get_feature_names(), 5)


Topic  0
summit, day, trail, time, route

Topic  1
time, route, base, exposure, reached


In [11]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = sample,
             columns = ["component_1","component_2" ])
Vt

Unnamed: 0_level_0,component_1,component_2
comment,Unnamed: 1_level_1,Unnamed: 2_level_1
a little long,0.11831,-0.02903
very easy nice views up top,0.21568,-0.09829
had an awesome weekend to climb to the e summit of daniel night camped at the moraine of the glacier big mistake as when the sun went down the winds picked up and blew ultra fine glacial dust into my tent which i am still trying to get out skirted the glacier as i was solo and tried staying on rock as much as possible interestingly on the snow field above hyas creek glacier i found an old person helipod that had crashed and melted out views in all direction were spectacular,1.40288,-0.44241
gorgeous day lots of sp ers on the summit together today forjan you are an animal running to the summit is superhuman mybackyard and fossana only minutes and minutes behind forjan you qualify as animals also with your second summit of the day wingding it was a pleasure to make your acquaintance along with dave bruce alan and frank i guess mt conness was violated from all sides today,2.19534,-2.02799
kelso ridge was very fun,0.25307,0.12359
this is a long hike make sure to leave enough time to get down before the thunderstorms roll in,0.45591,0.30053
very straight forward hike trails are well marked and maintained,0.13776,-0.09279
great climb,0.33241,0.14602
took the west route and was surprised at how much more work it is to get to the summit from pandora s box great views of the lakes make it all worth it,1.11784,-0.22949
not sure when i did it sometime in between february and april i encountered some bad vibes from other guys up there and a climbing group that thought they owned the mountain for a day you know how that goes it was a great day otherwise good views and nice hike needed my ice axe for the last bit,1.77290,-0.52779
