In [37]:
# NLP Processing
# Processing by number of stars in the review
import pandas as pd
reviews = pd.read_csv ('OnTheSnow_SkiAreaReviews.csv')
reviews.columns = ['placeholder','state','ski_area','reviewer_name','review_date',
                        'review_stars','review_text']

ski_areas = reviews.groupby(['ski_area'])['ski_area'].count().sort_values(ascending=False)
#ski_areas.columns = ['ski_area']

In [44]:
ski_areas_stars = reviews.groupby(['ski_area','review_stars'])['review_stars'].mean().sort_values(ascending=False)

In [41]:
#ski_areas.iloc[:,1].sort_values()
print(ski_areas.head(10))
print(ski_areas_stars.head(10))


ski_area
ski-brule                   1315
killington-resort            204
vail                         203
winter-park-resort           191
blue-mountain-ski-area       189
breckenridge                 188
ski-apache                   172
heavenly-mountain-resort     157
wolf-creek-ski-area          155
steamboat                    150
Name: ski_area, dtype: int64
ski_area                     review_stars
woods-valley-ski-area        5               5.0
royal-mountain-ski-area      5               5.0
perfect-north-slopes         5               5.0
canaan-valley-resort         5               5.0
pico-mountain-at-killington  5               5.0
campgaw-mountain             5               5.0
pine-knob-ski-resort         5               5.0
pine-mountain                5               5.0
camelback-mountain-resort    5               5.0
plattekill-mountain          5               5.0
Name: review_stars, dtype: float64


In [26]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np



# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def get_clean_review(df):
    clean_review = []
    for index, row in df.iterrows():
        review_str = str(row['review_text'])
        # split the string into a list of words
        tokens = word_tokenize(review_str)
        lower_tokens = [t.lower() for t in tokens]

        # removes anything not alpha characters such as punctuation and numbers
        alpha_only = [t for t in lower_tokens if t.isalpha()]

        # removes the stop words from the text
        no_stops = [t for t in alpha_only if t not in stopwords.words('english')]

        # Lemmatize all tokens into a new list: lemmatized
        lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

        # make a list of lists
        for w in lemmatized:
            clean_review.append(w)
        
    return clean_review

reviews_ski_area = []

for sa in ski_areas.index:
    print(sa)
    reviews_to_pass= reviews[reviews['ski_area']==sa]
    print(len(reviews_to_pass))
    reviews_ski_area.append( get_clean_review(reviews_to_pass))



ski-brule
1315
killington-resort
204
vail
203
winter-park-resort
191
blue-mountain-ski-area
189
breckenridge
188
ski-apache
172
heavenly-mountain-resort
157
wolf-creek-ski-area
155
steamboat
150
mountain-creek-resort
148
camelback-mountain-resort
146
snowshoe-mountain-resort
146
mammoth-mountain-ski-area
142
keystone
141
alta-ski-area
140
taos-ski-valley
138
arapahoe-basin-ski-area
138
copper-mountain-resort
138
big-sky-resort
136
mount-snow
135
park-city-mountain-resort
135
sno-mountain
134
snowbird
134
jay-peak
133
okemo-mountain-resort
132
sugar-mountain-resort
129
sunrise-park-resort
121
indianhead-mountain
115
telluride
114
squaw-valley-usa
113
hunter-mountain
113
stratton-mountain
111
granite-peak-ski-area
111
angel-fire-resort
111
elk-mountain-ski-resort
108
arizona-snowbowl
107
kirkwood
107
loveland
107
beaver-creek
106
stowe-mountain-resort
106
jackson-hole
103
northstar-california
102
stevens-pass-resort
102
mt-bachelor
101
mt-hood-meadows
101
deer-valley-resort
100
ski-santa

In [29]:
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary_ski_area = Dictionary(reviews_ski_area)

# Create a MmCorpus: corpus
corpus_ski_areas = [dictionary_ski_area.doc2bow(r) for r in reviews_ski_area]


In [33]:
from collections import defaultdict
import itertools

count =1 

#print("Here are the lengths of the documents in the corpus")
#for doc in corpus_ski_areas:
#    print(len(doc))

for doc in  corpus_ski_areas:
    print("Here is ski area:", count)
    count +=1
    # Sort the doc for frequency: bow_doc
    bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

    # Print the top 5 words of the document alongside the count
    for word_id, word_count in bow_doc[:5]:
        print(dictionary_ski_area.get(word_id), word_count)

    # Create the defaultdict: total_word_count
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus_ski_areas):
        total_word_count[word_id] += word_count
    


Here is ski area: 1
great 820
ski 757
brule 524
family 442
friendly 390
Here is ski area: 2
killington 300
mountain 213
ski 172
great 152
trail 142
Here is ski area: 3
vail 416
day 205
ski 200
get 158
back 152
Here is ski area: 4
park 211
ski 197
winter 185
mountain 172
lift 165
Here is ski area: 5
mountain 268
blue 256
trail 158
lift 156
ski 136
Here is ski area: 6
ski 170
great 169
peak 168
mountain 160
breck 150
Here is ski area: 7
ski 286
snow 174
apache 165
lift 164
mountain 139
Here is ski area: 8
heavenly 206
mountain 149
lift 139
ski 126
get 115
Here is ski area: 9
wolf 206
creek 193
snow 167
ski 130
lift 128
Here is ski area: 10
steamboat 194
mountain 122
ski 108
skiing 104
great 100
Here is ski area: 11
mountain 237
creek 155
lift 137
get 125
ski 109
Here is ski area: 12
camelback 166
mountain 164
lift 156
line 134
trail 125
Here is ski area: 13
snowshoe 239
resort 158
lift 157
ski 145
snow 139
Here is ski area: 14
mammoth 234
mountain 185
great 104
snow 104
ski 93
Here is sk

Here is ski area: 125
attitash 62
peak 39
great 34
lift 33
ski 32
Here is ski area: 126
wilmot 42
ski 37
lift 36
place 36
hill 34
Here is ski area: 127
great 43
time 40
mountain 37
place 36
peter 34
Here is ski area: 128
great 55
tamarack 51
run 37
ski 37
mountain 33
Here is ski area: 129
place 51
day 41
vega 40
lift 38
snow 31
Here is ski area: 130
mountain 48
trail 44
waterville 40
great 37
day 31
Here is ski area: 131
whitecap 57
run 48
lift 46
skiing 44
ski 41
Here is ski area: 132
alpental 64
terrain 40
snow 39
get 35
great 28
Here is ski area: 133
mountain 65
sugarloaf 48
great 44
ski 39
get 29
Here is ski area: 134
jack 60
frost 60
snow 54
mountain 49
ski 46
Here is ski area: 135
summit 53
run 41
ski 41
great 40
area 37
Here is ski area: 136
bear 60
mountain 47
park 30
day 28
snow 28
Here is ski area: 137
pas 80
willamette 75
lift 73
ski 54
get 53
Here is ski area: 138
ski 40
great 36
family 32
kid 32
place 30
Here is ski area: 139
knob 83
blue 79
mountain 63
ski 59
good 45
Here

Here is ski area: 250
lift 27
ski 25
snowstar 25
time 24
day 21
Here is ski area: 251
run 22
great 19
andes 18
hill 17
place 17
Here is ski area: 252
swain 31
skiing 22
year 22
great 21
condition 14
Here is ski area: 253
great 22
tussey 22
mountain 17
place 16
ski 14
Here is ski area: 254
mountain 34
magic 22
run 22
hill 15
really 15
Here is ski area: 255
great 24
run 22
mountain 19
ski 18
mccauley 15
Here is ski area: 256
great 27
casper 24
ski 17
city 15
run 13
Here is ski area: 257
ski 32
tog 18
lift 17
son 16
black 15
Here is ski area: 258
lift 14
day 13
ski 11
get 10
great 10
Here is ski area: 259
resort 23
trail 18
rock 17
eagle 15
family 15
Here is ski area: 260
eaglecrest 22
terrain 21
lift 20
area 19
mountain 19
Here is ski area: 261
cooper 19
spur 18
ski 17
kid 15
would 15
Here is ski area: 262
mountain 22
soldier 16
great 12
day 11
ski 10
Here is ski area: 263
hill 40
ski 23
dry 22
small 12
kid 10
Here is ski area: 264
holimont 16
good 14
ski 14
skiing 14
great 12
Here is sk

In [54]:
from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(corpus_ski_areas)

# Calculate the tfidf weights of doc: tfidf_weights
counter = 0
for doc in corpus_ski_areas:
    tfidf_weights = tfidf[doc]
    
    print('\n',ski_areas.index.values[counter]," key words are:")
    counter +=1
    
    # Sort the weights from highest to lowest: sorted_tfidf_weights
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

    # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights[:5]:
        print(dictionary_ski_area.get(term_id), round(weight,3))



 ski-brule  key words are:
brule 0.963
homestead 0.204
bbq 0.07
skiwee 0.031
carload 0.027

 killington-resort  key words are:
killington 0.854
pico 0.193
snowshed 0.103
powdr 0.091
gondola 0.086

 vail  key words are:
vail 0.797
bowl 0.206
lionshead 0.195
basin 0.13
sky 0.118

 winter-park-resort  key words are:
jane 0.574
wp 0.545
mary 0.327
mj 0.181
parsenn 0.127

 blue-mountain-ski-area  key words are:
pa 0.353
blue 0.268
burma 0.235
poconos 0.173
razor 0.142

 breckenridge  key words are:
breck 0.661
breckenridge 0.492
imperial 0.26
peak 0.178
town 0.098

 ski-apache  key words are:
apache 0.872
ruidoso 0.288
gondola 0.109
capitan 0.096
texas 0.088

 heavenly-mountain-resort  key words are:
heavenly 0.792
nevada 0.252
gondola 0.205
tahoe 0.17
casino 0.132

 wolf-creek-ski-area  key words are:
wolf 0.747
creek 0.372
pagosa 0.37
alberta 0.233
wc 0.108

 steamboat  key words are:
steamboat 0.935
morningside 0.108
town 0.061
gondola 0.061
colorado 0.053

 mountain-creek-resort  key w

 greek-peak  key words are:
greek 0.907
cny 0.114
toggenburg 0.111
labrador 0.069
alpha 0.069

 sipapu-ski-and-summer-resort  key words are:
sipapu 0.923
tao 0.103
fe 0.088
santa 0.084
teacher 0.069

 caberfae-peaks-ski-golf-resort  key words are:
caberfae 0.82
crystal 0.264
boyne 0.179
michigan 0.167
dub 0.129

 pine-mountain  key words are:
pine 0.696
famers 0.148
condo 0.109
birthday 0.106
brule 0.101

 roundtop-mountain-resort  key words are:
roundtop 0.838
lafayette 0.192
minuteman 0.169
liberty 0.148
ramrod 0.107

 nubs-nob-ski-area  key words are:
nub 0.862
nob 0.317
boyne 0.241
pintail 0.13
michigan 0.12

 attitash  key words are:
attitash 0.907
bear 0.131
bretton 0.106
peak 0.09
triple 0.075

 wilmot-mountain  key words are:
wilmot 0.835
wilmont 0.152
profoundly 0.11
wisconsin 0.096
alpine 0.084

 mount-peter-ski-area  key words are:
peter 0.811
tuxedo 0.109
mount 0.101
hermon 0.096
mt 0.089

 tamarack-resort  key words are:
tamarack 0.832
brundage 0.332
bogus 0.128
idaho 0.12

timely 0.144

 kelly-canyon-ski-area  key words are:
kelly 0.707
kelleys 0.269
rexburg 0.236
canyon 0.157
idaho 0.152

 king-pine  key words are:
kp 0.397
pine 0.359
king 0.351
thirty 0.286
purity 0.198

 giants-ridge-resort  key words are:
giant 0.686
lutsen 0.308
ridge 0.287
reccomendation 0.186
wynne 0.173

 woods-valley-ski-area  key words are:
glogging 0.323
steered 0.283
judging 0.221
pie 0.205
zone 0.198

 royal-mountain-ski-area  key words are:
royal 0.709
laided 0.33
moutian 0.266
privately 0.202
jim 0.196

 ski-mystic-at-deer-mountain  key words are:
mystic 0.468
deer 0.394
terry 0.323
brianhead 0.146
virtual 0.146

 bousquet-ski-area  key words are:
bosquet 0.521
bousquet 0.35
comer 0.263
pittsfield 0.261
bouquet 0.174

 blue-hills-ski-area  key words are:
yell 0.35
bluehills 0.265
monopolized 0.265
boston 0.258
certification 0.233

 snow-ridge  key words are:
tug 0.191
utica 0.191
titus 0.175
whiteface 0.174
pod 0.164

 pine-knob-ski-resort  key words are:
knob 0.309
shaboo