In [51]:
# NLP Processing
# Processing by number of stars in the review

#  The first step is to read in the dataset.  
#  This data set contains 'ski resort reviews'
#  And the source is https://www.kaggle.com/datasets/fredkellner/onthesnow-ski-area-reviews

import pandas as pd
reviews = pd.read_csv ('OnTheSnow_SkiAreaReviews.csv')
#  give the dataset friendlier column names.
reviews.columns = ['placeholder','state','ski_area','reviewer_name','review_date',
                        'review_stars','review_text']

#  break up the dataset into 5 subsets by the number of stars in the review
reviews_1star = reviews[reviews['review_stars']==1]
reviews_2star = reviews[reviews['review_stars']==2]
reviews_3star = reviews[reviews['review_stars']==3]
reviews_4star = reviews[reviews['review_stars']==4]
reviews_5star = reviews[reviews['review_stars']==5]

In [52]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np


# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


#  This defines a function to pre-process the text by 
#  removing any numbers and punctuation.  It tokenizes the sentences into words. 
#  It also removes common stop words in the english language.  
#  Finally it lemmatizes the words to have only roots of the words not endings. 

def get_clean_review(df):
    clean_review = []
    for index, row in df.iterrows():
        review_str = str(row['review_text'])
        # split the string into a list of words
        tokens = word_tokenize(review_str)
        lower_tokens = [t.lower() for t in tokens]

        # removes anything not alpha characters such as punctuation and numbers
        alpha_only = [t for t in lower_tokens if t.isalpha()]

        # removes the stop words from the text
        no_stops = [t for t in alpha_only if t not in stopwords.words('english')]

        # Lemmatize all tokens into a new list: lemmatized
        lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]
        lemmatized = no_stops
        
        # make a list of lists
        for w in lemmatized:
            clean_review.append(w)
        
    return clean_review


#  The function returns a list of words broken down by the number of stars
#  There are 5 documents which will later be used in Gensim to do TFIDF analysis

review_for_dictionary1 = get_clean_review(reviews_1star)
review_for_dictionary2 = get_clean_review(reviews_2star)
review_for_dictionary3 = get_clean_review(reviews_3star)
review_for_dictionary4 = get_clean_review(reviews_4star)
review_for_dictionary5 = get_clean_review(reviews_5star)

#  Gensim requires the 5 documents to be in list forms.  Each document is a list 
#  of cleaned words.  The below line creates a list of lists.  

review_for_dictionary=[review_for_dictionary1,review_for_dictionary2,review_for_dictionary3,review_for_dictionary4,review_for_dictionary5]



In [54]:
# Import Counter
from collections import Counter

# print(clean_review[0:9])

# Create the bag-of-words: bow
# bow = Counter(clean_review)

# Print the 10 most common tokens
# print(bow.most_common(40))


In [55]:
# frequencies from gensim
# pip install --upgrade gensim
from gensim.corpora.dictionary import Dictionary

counter = 1
print("Here is the number of words in reviews of the reviews:")
for r in review_for_dictionary:
    print("There are", len(r), "in the",counter,"star reviews")
    counter += 1 
    
dictionary = Dictionary(review_for_dictionary)
#print(dictionary)
#print(clean_review[0:10])

# Create a Dictionary from the articles: dictionary
#dictionary = Dictionary(clean_review)

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(r) for r in review_for_dictionary]


Here is the length of the reviews:
94185
77809
123956
266096
431507


In [71]:

from collections import defaultdict
import itertools

count =1 

print("Here are the lengths of the documents in the corpus")
for doc in corpus:
    print(len(doc))

for doc in  corpus:
    print("These are the words in the ratings with", count, " star")
    count +=1
    # Sort the doc for frequency: bow_doc
    bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

    # Print the top 5 words of the document alongside the count
    for word_id, word_count in bow_doc[:5]:
        print(dictionary.get(word_id), word_count)

    # Create the defaultdict: total_word_count
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus):
        total_word_count[word_id] += word_count
    


Here are the lengths of the documents in the corpus
7674
7103
9046
12920
16017
These are the words in the ratings with 1  star
ski 1316
lift 1290
mountain 1156
snow 955
great 918
These are the words in the ratings with 2  star
ski 1131
mountain 947
lift 923
great 824
snow 771
These are the words in the ratings with 3  star
ski 1790
mountain 1648
lift 1548
great 1335
day 1185
These are the words in the ratings with 4  star
ski 3668
great 3450
mountain 3406
lift 3129
snow 2812
These are the words in the ratings with 5  star
ski 6852
great 6243
mountain 5690
lift 4718
snow 4375


In [94]:
#print(dictionary)

In [92]:
#print(corpus)

In [93]:
#print(doc)

In [56]:
from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
counter = 1
for doc in corpus:
    tfidf_weights = tfidf[doc]
    
    print('\n',counter," Star Reviews key words are:")
    counter +=1
    
    # Sort the weights from highest to lowest: sorted_tfidf_weights
    sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

    # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights[:10]:
        print(dictionary.get(term_id), round(weight,3))



 1  Star Reviews key words are:
holliday 0.2
illness 0.105
sledders 0.105
jp 0.1
mc 0.1
der 0.09
es 0.088
thunderstruck 0.088
wandered 0.07
woodloch 0.07

 2  Star Reviews key words are:
mc 0.11
coperate 0.086
initiated 0.086
kelleys 0.086
majic 0.086
meh 0.086
sustainable 0.086
millions 0.073
blady 0.064
chump 0.064

 3  Star Reviews key words are:
song 0.095
division 0.094
filed 0.094
sanford 0.079
tuff 0.079
activated 0.063
cables 0.063
clyde 0.063
enuff 0.063
garlic 0.063

 4  Star Reviews key words are:
brule 0.096
clair 0.071
invoice 0.071
bleecker 0.062
comparisons 0.062
presentation 0.062
kaatskill 0.061
honeycomb 0.059
afton 0.055
hmsr 0.053

 5  Star Reviews key words are:
brule 0.331
josh 0.087
nub 0.078
monarch 0.07
saloon 0.065
homestead 0.056
peruvian 0.056
coolest 0.053
disappoints 0.05
blackjack 0.048


In [81]:
type(tfidf)

gensim.models.tfidfmodel.TfidfModel

In [67]:
# find a ski_center with a particular word.
import re 

search_word = "nub"
search_word = search_word.lower()

temp = ''
for i in range(len(reviews)) :
    r,sc = reviews.loc[i, "review_text"], reviews.loc[i, "ski_area"]
    if not r != r:
        r = r.lower()
        if re.search(search_word,r):
            if sc != temp: 
                print(sc)
                temp = sc

jackson-hole
lutsen-mountains
mountain-creek-resort
boyne-mountain-resort
crystal-mountain
shanty-creek
nubs-nob-ski-area
loon-mountain
cannonsburg
alpine-valley-resort
boyne-highlands
caberfae-peaks-ski-golf-resort
devils-head
mount-brighton
mount-holly
spring-mountain-ski-area


In [63]:
r = reviews.loc[0,'review_text']
r != r

False

In [85]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np


def get_named_entity_list(df):
    ne = []
    for index, row in df.iterrows():
        review_str = str(row['review_text'])
        # Tokenize the article into sentences: sentences
        sentences = sent_tokenize(review_str)

        # Tokenize each sentence into words: token_sentences
        token_sentences = [word_tokenize(sent) for sent in sentences]

        # Tag each tokenized sentence into parts of speech: pos_sentences
        pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences] 

        # Create the named entity chunks: chunked_sentences
        chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=True)
        
        # Test for stems of the tree with 'NE' tags
        for sent in chunked_sentences:
            for chunk in sent:
                if hasattr(chunk, "label") and chunk.label() == "NE":
                    ne.append(chunk)
    return ne

ne_chunks = []             
ne_chunks.append(get_named_entity_list(reviews_1star))
ne_chunks.append(get_named_entity_list(reviews_2star))
ne_chunks.append(get_named_entity_list(reviews_3star))
ne_chunks.append(get_named_entity_list(reviews_4star))
ne_chunks.append(get_named_entity_list(reviews_5star))



In [87]:
print(type(chunked_sentences)

SyntaxError: unexpected EOF while parsing (2669513967.py, line 1)

In [81]:
print(type(ne_chunks))

print(ne_chunks[0][0:6])
print(ne_chunks[1][0:6])
print(ne_chunks[2][0:6])
print(ne_chunks[3][0:6])

<class 'list'>
[Tree('NE', [('Squaw', 'NNP')]), Tree('NE', [('Squaw', 'NNP')]), Tree('NE', [('Sugar', 'NNP'), ('Bowl', 'NNP')]), Tree('NE', [('Sugar', 'NNP'), ('Bowl', 'NNP')]), Tree('NE', [('Squaw', 'NNP')]), Tree('NE', [('Squaw', 'NNP')])]
[Tree('NE', [('Plenty', 'NN')]), Tree('NE', [('Skip', 'NNP'), ('Tahoe', 'NNP')]), Tree('NE', [('Utah', 'NNP')]), Tree('NE', [('Colorado', 'NNP')]), Tree('NE', [('Canadian', 'JJ'), ('Rockies', 'NNPS')]), Tree('NE', [('Tahoe', 'NNP')])]
[Tree('NE', [('Due', 'NNP')]), Tree('NE', [('Super', 'NNP')]), Tree('NE', [('WTF', 'NNP')]), Tree('NE', [('Large', 'JJ')]), Tree('NE', [('Squaw', 'NNP')]), Tree('NE', [('Solitude', 'NNP')])]
[Tree('NE', [('SV', 'NNP')]), Tree('NE', [('Squaw', 'NNP'), ('Valley', 'NNP')]), Tree('NE', [('Visibility', 'NN')]), Tree('NE', [('Great', 'NNP'), ('Expert', 'NNP'), ('Mountain', 'NNP')]), Tree('NE', [('Alpine', 'NNP'), ('Meadows', 'NNP')]), Tree('NE', [('Squaw', 'NNP'), ('Squaw', 'NNP'), ('Valley', 'NNP')])]


In [83]:
from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(ne_chunks)

# Calculate the tfidf weights of doc: tfidf_weights
#counter = 1
#for doc in corpus:
tfidf_weights = tfidf[ne_chunks[0]]
    
#    print('\n',counter," Star Reviews key words are:")
#    counter +=1
    
    # Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

    # Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:10]:
    print(dictionary.get(term_id), round(weight,3))


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Importing libraries
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
  
# Input the file 
txt1 = []
with open('C:\\Users\\DELL\\Desktop\\MachineLearning1.txt') as file:
    txt1 = file.readlines()
  
# Preprocessing
def remove_string_special_characters(s):
      
    # removes special characters with ' '
    stripped = re.sub('[^a-zA-z\s]', '', s)
    stripped = re.sub('_', '', stripped)
      
    # Change any white space to one space
    stripped = re.sub('\s+', ' ', stripped)
      
    # Remove start and end white spaces
    stripped = stripped.strip()
    if stripped != '':
            return stripped.lower()
          
# Stopword removal 
stop_words = set(stopwords.words('english'))
your_list = ['skills', 'ability', 'job', 'description']
for i, line in enumerate(txt1):
    txt1[i] = ' '.join([x for 
        x in nltk.word_tokenize(line) if 
        ( x not in stop_words ) and ( x not in your_list )])
      
# Getting trigrams 
vectorizer = CountVectorizer(ngram_range = (3,3))
X1 = vectorizer.fit_transform(txt1) 
features = (vectorizer.get_feature_names())
print("\n\nFeatures : \n", features)
print("\n\nX1 : \n", X1.toarray())
  
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (3,3))
X2 = vectorizer.fit_transform(txt1)
scores = (X2.toarray())
print("\n\nScores : \n", scores)
  
# Getting top ranking features
sums = X2.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ("\n\nWords head : \n", words.head(7))