# Machine Learning: NLP for important words

In [1]:
# Dependencies
import pandas as pd
import sqlite3

## Load data

In [2]:
# Create connections to database
conn = sqlite3.connect("Data/Hotels.db")

#Load the database table into a pandas dataframe
ratings = pd.read_sql_query("select * from ratings;", conn)
conn.close()

# Preview the dataframe
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,We booked a 3 night stay at Rancho Valencia to...,Amazing Property and Experience,,
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,Currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,I live in Md and the Aloft is my Home away fro...,ALWAYS GREAT STAY...,Laurel,MD


## Data cleaning

### Review aggregation

In [3]:
# Combine text and title for each hotel review
ratings["review"] = ratings["reviews_text"] + [""] + ratings["reviews_title"]
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,review
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,,Our experience at Rancho Valencia was absolute...
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,,Amazing place. Everyone was extremely warm and...
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,We booked a 3 night stay at Rancho Valencia to...,Amazing Property and Experience,,,We booked a 3 night stay at Rancho Valencia to...
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,Currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA,Currently in bed writing this for the past hr ...
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,I live in Md and the Aloft is my Home away fro...,ALWAYS GREAT STAY...,Laurel,MD,I live in Md and the Aloft is my Home away fro...


In [4]:
# Create a new dataframe that contains the combined text and excludes other variables
ratings2 = ratings.drop(columns = ["reviews_date", "index", "reviews_userCity", 
                                   "reviews_userProvince", "reviews_sourceURLs", 
                                   "reviews_text", "reviews_title"])
ratings2.head()

Unnamed: 0,name,reviews_rating,review
0,Rancho Valencia Resort Spa,5.0,Our experience at Rancho Valencia was absolute...
1,Rancho Valencia Resort Spa,5.0,Amazing place. Everyone was extremely warm and...
2,Rancho Valencia Resort Spa,5.0,We booked a 3 night stay at Rancho Valencia to...
3,Aloft Arundel Mills,2.0,Currently in bed writing this for the past hr ...
4,Aloft Arundel Mills,5.0,I live in Md and the Aloft is my Home away fro...


In [5]:
# Calculate the mean of the review rating per hotel
avg_ratings = ratings2.groupby("name")["reviews_rating"].mean()
avg_ratings = avg_ratings.reset_index().round()
avg_ratings.head()

Unnamed: 0,name,reviews_rating
0,1906 Lodge At Coronado Beach,5.0
1,250 Main Hotel,5.0
2,AC Hotel Chicago Downtown,4.0
3,AC Hotel Miami Beach,5.0
4,AC Hotel by Marriott Boston Downtown,5.0


In [6]:
# Aggregate the review text based on hotel name
reviews = ratings2.groupby("name")["review"].apply(list).reset_index()
reviews.head()

Unnamed: 0,name,review
0,1906 Lodge At Coronado Beach,[We had an absolutely delightful stay at the c...
1,250 Main Hotel,[I am a frequent business traveler and stay in...
2,AC Hotel Chicago Downtown,[Bad: Wish there was a hot tub after walking a...
3,AC Hotel Miami Beach,[A recent stay in early April 2016 was most en...
4,AC Hotel by Marriott Boston Downtown,[Bad: Dont change a thing. Good: Staff was ver...


In [7]:
# Merge the reviews and ratings dataframes
ratings3 = pd.merge(avg_ratings, reviews, on = "name")

# Remove "More" from the strings
temp_list = [lst[0] for lst in ratings3["review"]]
temp_list2 = [string.replace("More", "") for string in temp_list]
ratings3["review2"] = temp_list2

# Put all the letters in lowercase
ratings3["review2"] = ratings3["review2"].str.lower()

ratings4 = ratings3.drop(columns = ["review"])
ratings4.head()

Unnamed: 0,name,reviews_rating,review2
0,1906 Lodge At Coronado Beach,5.0,we had an absolutely delightful stay at the ch...
1,250 Main Hotel,5.0,i am a frequent business traveler and stay in ...
2,AC Hotel Chicago Downtown,4.0,bad: wish there was a hot tub after walking ar...
3,AC Hotel Miami Beach,5.0,a recent stay in early april 2016 was most enj...
4,AC Hotel by Marriott Boston Downtown,5.0,bad: dont change a thing. good: staff was very...


### Review cleanup

In [8]:
# Dependencies
import re, string

import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = set(string.punctuation)

# Lemmatise the words in each list to retain their roots
lemmatiser = WordNetLemmatizer()

In [10]:
# Transform the poem in preparation for word counts
words_list = []
preprocessed_text = []
for review in ratings4["review2"]:
    
    # Create a list of words per poem after the words are converted to lowercase    
    words = word_tokenize(review)
    
    # Filter to remove stop words and punctuations    
    words2 = [word for word in words if word not in stops and word not in exclude]
    
    # Lemmatise each word (if it's a verb, convert to root verb)
    words3 = [lemmatiser.lemmatize(word, pos = "v") for word in words2]
    
    # Add the filtered list of words (representing each poem)
    words_list.append(words3)
    
    # Convert the list of strings back to one string
    words4 = " ".join(words3)
    
    # Add the filtered list of words (representing each poem)
    preprocessed_text.append(words4)

ratings4["tokens"] = words_list    
ratings4["filteredReview"] = preprocessed_text
ratings5 = ratings4.drop(columns = ["review2"])
ratings5.head()

Unnamed: 0,name,reviews_rating,tokens,filteredReview
0,1906 Lodge At Coronado Beach,5.0,"[absolutely, delightful, stay, charm, 1906, lo...",absolutely delightful stay charm 1906 lodge 's...
1,250 Main Hotel,5.0,"[frequent, business, traveler, stay, many, typ...",frequent business traveler stay many type hote...
2,AC Hotel Chicago Downtown,4.0,"[bad, wish, hot, tub, walk, around, town, day,...",bad wish hot tub walk around town day breakfas...
3,AC Hotel Miami Beach,5.0,"[recent, stay, early, april, 2016, enjoyable, ...",recent stay early april 2016 enjoyable propert...
4,AC Hotel by Marriott Boston Downtown,5.0,"[bad, dont, change, thing, good, staff, accomm...",bad dont change thing good staff accommodate e...


In [11]:
# Create a list of average review ratings
rates_list = list(set(ratings5["reviews_rating"]))

# Create a function that creates dataframes for each hotel rating
def create_df(rating):
    temp_df = ratings5[ratings5["reviews_rating"] == rating]
    return temp_df

# Create a list of dataframes by hotel rating
rate_df = [create_df(rate) for rate in rates_list]

rates = []
for r in rate_df:
    rates.append(r.groupby("reviews_rating")["filteredReview"].apply(list))
    
# Convert the pandas series to a dataframe    
rate_df2 = pd.DataFrame(pd.concat(rates,axis = 0).reset_index())

# Convert filteredReview from list to string
rate_df2["filteredReview"] = rate_df2["filteredReview"].apply(", ".join)

rate_df2

Unnamed: 0,reviews_rating,filteredReview
0,1.0,stay half hour leave say gross loud ... .. gue...
1,2.0,check room give another couple room come refri...
2,3.0,impress scene walk n't take long realize ace f...
3,4.0,bad wish hot tub walk around town day breakfas...
4,5.0,absolutely delightful stay charm 1906 lodge 's...


## Word importance
Source: https://stevenloria.com/tf-idf/

In [12]:
# Dependencies
import math
from textblob import TextBlob as tb

In [13]:
# Create a function that calculates term frequency
def tf(word, review):
    return review.words.count(word) / len(review.words)

# Create a function that determines the number of documents that contain a certain word
def n_docs(word, reviewlist):
    return sum(1 for review in reviewlist if word in review.words)

# Create a function that determines the inverse document frequency (IDF)
# IDF = how common a word is among all the documents in reviewlist
def idf(word, reviewlist):
    return math.log(len(reviewlist) / (1 + n_docs(word, reviewlist)))

def tdidf(word, review, reviewlist):
    return tf(word, review) * idf(word, reviewlist)

In [14]:
# Create a function that calculates the TF-IDF for important words in dataframes
def calc_TFIDF(df):
    reviewlist = [tb(review) for review in df["filteredReview"]]
    
    # Create an empty list to be filled with text blobs from cleaning reviewlist
    reviewlist2 = []

    # Loop through the reviewlist
    for i in range(0, len(reviewlist)):

        # Remove words that are shorter than 3 characters
        new_string = ' '.join([w for w in str(reviewlist[i]).split() if len(w) > 3])

        # Replace em dash and period with space
        new_string2 = new_string.replace("—", " ")
        new_string2 = new_string.replace(".", " ")

        # Convert string to text blob
        new_string2 = tb(new_string2)

        # Append the text blob to the list of text blobs
        reviewlist2.append(new_string2)
        
    # Calculate the five most important words
    impt_words = []
    for i, review in enumerate(reviewlist2):
        scores = {word: tdidf(word, review, reviewlist2) for word in review.words}
        sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse = True)

        for word, score in sorted_words[:10]:
            impt_words.append((i + 1, word, round(score, 5)))
            
    # Create a dataframe of important words per review
    df2 = pd.DataFrame(impt_words, columns = ["HotelNo", "Word", "TF-IDF"])

    # Add titles for each review in df2
    titles = []
    for i in range(0, len(df)):
        for p in df2.HotelNo:
            if i == p - 1:
                title = list(df["name"])[i]
                titles.append(title) 

    df2["HotelName"] = titles     
    
    # Create a dictionary of hotel names and ratings
    names = list(df["name"])
    rev_rates = list(df["reviews_rating"])
    names_dict = dict(zip(names, rev_rates))
    
    # Add a column containing state names
    df2["Ratings"] = df2["HotelName"].map(names_dict)
    df2.head()
    
    return df2

In [15]:
# Create a function that calculates the TF-IDF for important words in dataframes
def calc_TFIDF2(df):   
    reviewlist = [tb(review) for review in rate_df2["filteredReview"]]

    # Create an empty list to be filled with text blobs from cleaning reviewlist
    reviewlist2 = []

    # Loop through the reviewlist
    for i in range(0, len(reviewlist)):

        # Remove words that are shorter than 3 characters
        new_string = ' '.join([w for w in str(reviewlist[i]).split() if len(w) > 3])

        # Replace em dash and period with space
        new_string2 = new_string.replace("—", " ")
        new_string2 = new_string.replace(".", " ")

        # Convert string to text blob
        new_string2 = tb(new_string2)

        # Append the text blob to the list of text blobs
        reviewlist2.append(new_string2)

    # Calculate the five most important words
    impt_words = []
    for i, review in enumerate(reviewlist2):
        scores = {word: tdidf(word, review, reviewlist2) for word in review.words}
        sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse = True)

        for word, score in sorted_words[:10]:
            impt_words.append((i + 1, word, round(score, 5)))

    # Create a dataframe of important words per review
    df2 = pd.DataFrame(impt_words, columns = ["Rating", "Word", "TF-IDF"])

    return df2       

In [16]:
# Calculate TF-IDF for the top 10 important words for each hotel
ratings6 = calc_TFIDF(ratings5)
ratings6.head()

Unnamed: 0,HotelNo,Word,TF-IDF,HotelName,Ratings
0,1,1906,0.21701,1906 Lodge At Coronado Beach,5.0
1,1,plunge,0.21701,1906 Lodge At Coronado Beach,5.0
2,1,culinary,0.21701,1906 Lodge At Coronado Beach,5.0
3,1,entre,0.21701,1906 Lodge At Coronado Beach,5.0
4,1,cruiser,0.21701,1906 Lodge At Coronado Beach,5.0


In [17]:
# Calculate TF-IDF for the top 10 important words for each hotel rating
ratings7 = calc_TFIDF2(rate_df2)
ratings7

Unnamed: 0,Rating,Word,TF-IDF
0,1,ruin,0.00111
1,1,hunters,0.00111
2,1,scream,0.00111
3,1,absolute,0.00111
4,1,priceline,0.00103
5,1,avoid,0.00083
6,1,dump,0.00083
7,1,rotten,0.00074
8,1,ownership,0.00074
9,1,india,0.00074


### Visualise the important words

In [18]:
# Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "whitegrid")
import numpy as np

from ipywidgets import widgets, interactive

In [19]:
# Create a widget containing ratings (sorted)
rates_list.sort()

review_rates = widgets.Dropdown(options = ["Choose a hotel guest rating..."] + rates_list, 
                                value = "Choose a hotel guest rating...", 
                              description = "Hotel Rating:", disabled = False)

In [20]:
# Create a filter based on review rate
def plot_it(review_rates):
    if review_rates != "Choose a hotel guest rating...":
        df3 = ratings7[ratings7["Rating"] == review_rates]
        
        plt.figure(figsize = (10, 6))
        sns.set(font_scale = 1.5)
        graph = sns.barplot(y = "Word", x = "TF-IDF", data = df3, palette = "Blues_d")
        graph.set_xlim(0, 0.0012)

In [21]:
# Plot the data by poem title
interactive(plot_it, review_rates = review_rates)

interactive(children=(Dropdown(description='Hotel Rating:', options=('Choose a hotel guest rating...', 1.0, 2.…