In [1]:
# Dependencies
import pandas as pd
import numpy as np
import sqlite3
from pyspark import SparkContext
sc =SparkContext()
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [2]:
# Create connections to database
conn = sqlite3.connect("Data/Hotels.db")

#Load the database table into a pandas dataframe
ratings = pd.read_sql_query("select * from ratings;", conn)
conn.close()

# Preview the dataframe
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,We booked a 3 night stay at Rancho Valencia to...,Amazing Property and Experience,,
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,Currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,I live in Md and the Aloft is my Home away fro...,ALWAYS GREAT STAY...,Laurel,MD


In [3]:
#Grouping ratings to get 5 unique ratings
rating_list = ratings['reviews_rating'].tolist()
new_list = []
for rating in rating_list:
    if rating >= 5.0: 
        new_list.append(5.0)
    elif rating >= 4 and rating < 5:
        new_list.append(4.0)
    elif rating >= 3 and rating < 4:
        new_list.append(3.0)
    elif rating >= 2 and rating < 3:
        new_list.append(2.0)
    else:
        new_list.append(1.0)        

In [4]:
# Put all letters in lower case
# Split hotel reviews_rating to "good"/"bad"
ratings["reviews_text"] = ratings["reviews_text"].str.lower()
#ratings["rating"] = np.where(ratings["reviews_rating"]>= 4, 'good', 'bad')
ratings["rating"] = new_list
ratings['reviews_text'] = ratings['reviews_text'].astype(str)
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,rating
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,our experience at rancho valencia was absolute...,Best romantic vacation ever!!!!,,,5.0
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,amazing place. everyone was extremely warm and...,Sweet sweet serenity,,,5.0
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,we booked a 3 night stay at rancho valencia to...,Amazing Property and Experience,,,5.0
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA,2.0
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,i live in md and the aloft is my home away fro...,ALWAYS GREAT STAY...,Laurel,MD,5.0


In [6]:
ratings['reviews_rating'].unique()

array([5.  , 2.  , 4.  , 3.  , 1.  , 4.8 , 4.6 , 3.55, 4.4 , 4.15, 2.5 ,
       3.95, 2.9 , 3.35, 3.75, 4.5 , 2.1 , 1.65, 3.15, 2.7 , 1.45, 2.75,
       2.3 , 3.5 , 4.25, 1.25, 1.9 , 3.45, 3.25, 4.75])

In [5]:
# Dependencies
import re, string

import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = set(string.punctuation)

In [7]:
words_list = []
preprocessed_text = []
for review in ratings["reviews_text"]:
    
    # Create a list of words per rating after the words are converted to lowercase    
    words = word_tokenize(review)
    
    # Filter to remove stop words and punctuations    
    words2 = [word for word in words if word not in stops and word not in exclude]
    
    # Add the filtered list of words
    words_list.append(words2)
    
    # Convert the list of strings back to one string
    words3 = " ".join(words2)
    
    # Add the filtered list of words
    preprocessed_text.append(words3)
   
ratings["filteredReview"] = preprocessed_text
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,rating,filteredReview
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,our experience at rancho valencia was absolute...,Best romantic vacation ever!!!!,,,5.0,experience rancho valencia absolutely perfect ...
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,amazing place. everyone was extremely warm and...,Sweet sweet serenity,,,5.0,amazing place everyone extremely warm welcomin...
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,we booked a 3 night stay at rancho valencia to...,Amazing Property and Experience,,,5.0,booked 3 night stay rancho valencia play tenni...
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA,2.0,currently bed writing past hr 1/2 dogs barking...
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,i live in md and the aloft is my home away fro...,ALWAYS GREAT STAY...,Laurel,MD,5.0,live md aloft home away home ... stayed 1 nigh...


In [8]:
# Remove columns that will not be used in the analysis
df = ratings.drop(columns=["reviews_text","reviews_date","reviews_sourceURLs","reviews_title","reviews_userCity","reviews_userProvince"],axis=1)
df.head()

Unnamed: 0,index,name,reviews_rating,rating,filteredReview
0,0,Rancho Valencia Resort Spa,5.0,5.0,experience rancho valencia absolutely perfect ...
1,1,Rancho Valencia Resort Spa,5.0,5.0,amazing place everyone extremely warm welcomin...
2,2,Rancho Valencia Resort Spa,5.0,5.0,booked 3 night stay rancho valencia play tenni...
3,3,Aloft Arundel Mills,2.0,2.0,currently bed writing past hr 1/2 dogs barking...
4,4,Aloft Arundel Mills,5.0,5.0,live md aloft home away home ... stayed 1 nigh...


In [9]:
#Convert Pandas DataFrame to Spark DataFrame
spark_ratings = sqlContext.createDataFrame(df)
spark_ratings.show(5)

+-----+--------------------+--------------+------+--------------------+
|index|                name|reviews_rating|rating|      filteredReview|
+-----+--------------------+--------------+------+--------------------+
|    0|Rancho Valencia R...|           5.0|   5.0|experience rancho...|
|    1|Rancho Valencia R...|           5.0|   5.0|amazing place eve...|
|    2|Rancho Valencia R...|           5.0|   5.0|booked 3 night st...|
|    3| Aloft Arundel Mills|           2.0|   2.0|currently bed wri...|
|    4| Aloft Arundel Mills|           5.0|   5.0|live md aloft hom...|
+-----+--------------------+--------------+------+--------------------+
only showing top 5 rows



In [10]:
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
data = spark_ratings.withColumn('length', length(spark_ratings['filteredReview']))
data.show(5)

+-----+--------------------+--------------+------+--------------------+------+
|index|                name|reviews_rating|rating|      filteredReview|length|
+-----+--------------------+--------------+------+--------------------+------+
|    0|Rancho Valencia R...|           5.0|   5.0|experience rancho...|   112|
|    1|Rancho Valencia R...|           5.0|   5.0|amazing place eve...|   202|
|    2|Rancho Valencia R...|           5.0|   5.0|booked 3 night st...|   335|
|    3| Aloft Arundel Mills|           2.0|   2.0|currently bed wri...|   125|
|    4| Aloft Arundel Mills|           5.0|   5.0|live md aloft hom...|   186|
+-----+--------------------+--------------+------+--------------------+------+
only showing top 5 rows



## Feature Transformations

In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [12]:
#tokenizer = Tokenizer(inputCol="filteredReview", outputCol="token_text")
#tokenized = tokenizer.transform(spark_ratings)
#tokenized.show(3, truncate=False)

In [13]:
#stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
#filtered = stopremove.transform(tokenized)
#filtered.show(3, truncate=False)

In [14]:
#hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token',numFeatures=pow(2,4))
#hashed_df = hashingTF.transform(filtered)
#hashed_df.show(3, truncate=False)

In [15]:
#idf = IDF(inputCol="hash_token", outputCol="idf_token")
#idfModel = idf.fit(hashed_df)
#rescaledData = idfModel.transform(hashed_df)

In [16]:
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='rating',outputCol='label')
tokenizer = Tokenizer(inputCol="filteredReview", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token',numFeatures=pow(2,4))
idf = IDF(inputCol="hash_token", outputCol="idf_token")

In [17]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['label','idf_token', 'length'], outputCol='features')

In [18]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [19]:
 # Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)
cleaned.show(3)

+-----+--------------------+--------------+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|index|                name|reviews_rating|rating|      filteredReview|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+-----+--------------------+--------------+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|Rancho Valencia R...|           5.0|   5.0|experience rancho...|   112|  0.0|[experience, ranc...|[experience, ranc...|(16,[1,4,5,6,8,9,...|(16,[1,4,5,6,8,9,...|[0.0,0.0,0.184744...|
|    1|Rancho Valencia R...|           5.0|   5.0|amazing place eve...|   202|  0.0|[amazing, place, ...|[amazing, place, ...|(16,[2,3,4,5,6,7,...|(16,[2,3,4,5,6,7,...|[0.0,0.0,0.0,0.28...|
|    2|Rancho Valencia R...|           5.0|   5.0|

In [20]:
# Show label of ham spame and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[0.0,0.0,0.184744...|
|  0.0|[0.0,0.0,0.0,0.28...|
|  0.0|[0.0,0.5991695591...|
|  3.0|[3.0,0.2995847795...|
|  0.0|[0.0,1.4979238978...|
|  0.0|[0.0,0.5991695591...|
|  0.0|[0.0,0.5991695591...|
|  0.0|(18,[2,3,4,5,10,1...|
|  0.0|[0.0,0.5991695591...|
|  0.0|[0.0,0.5991695591...|
|  0.0|[0.0,0.2995847795...|
|  0.0|[0.0,0.2995847795...|
|  0.0|[0.0,1.4979238978...|
|  0.0|[0.0,0.2995847795...|
|  0.0|[0.0,0.8987543387...|
|  1.0|[1.0,0.5991695591...|
|  0.0|[0.0,0.5991695591...|
|  2.0|[2.0,0.5991695591...|
|  0.0|[0.0,0.5991695591...|
|  1.0|[1.0,0.0,0.738976...|
+-----+--------------------+
only showing top 20 rows



In [21]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

In [22]:
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [23]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(3)

+-----+--------------------+--------------+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|index|                name|reviews_rating|rating|      filteredReview|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   10|Hampton Inn Suite...|           5.0|   5.0|staff friendly he...|   218|  0.0|[staff, friendly,...|[staff, friendly,...|(16,[0,1,2,4,6,7,...|(16,[0,1,2,4,6,7,...|[0.0,0.2995847795...|[-59.087071149076...|[0.77843363721628...|       0.0|
| 1000|Hotel Zetta San F...|

In [24]:
# rating: 5 - 0.0, 4 - 1.0, 3 - 2.0, 2 - 3.0, 1 - 4.0
test_results.select(['label', 'prediction']).show(10)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  3.0|       4.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 10 rows



In [25]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting ratings was: ", acc)

Accuracy of model at predicting ratings was:  0.7687595273646528


## Popular Words

In [102]:
new = df["filteredReview"].str.lower().str.split()
new.head()

0    [experience, rancho, valencia, absolutely, per...
1    [amazing, place, everyone, extremely, warm, we...
2    [booked, 3, night, stay, rancho, valencia, pla...
3    [currently, bed, writing, past, hr, 1/2, dogs,...
4    [live, md, aloft, home, away, home, ..., staye...
Name: filteredReview, dtype: object

In [124]:
new.text.apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0)

AttributeError: 'Series' object has no attribute 'text'

In [104]:
results = set()
df['filteredReview'].str.lower().str.split().apply(results.update)
print(results)



In [108]:
wordcount = {}

In [113]:
results2 = {}

In [122]:
for word in results:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

In [123]:
wordcount 

{'everyone': 2,
 'sacrifice': 2,
 'facilitites': 2,
 'jeans': 2,
 'sojourn': 2,
 'antonio': 2,
 'price.the': 2,
 'voluntarily': 2,
 'pair': 2,
 'laminate': 2,
 'uptodate': 2,
 'crossing': 2,
 'barn': 2,
 'home2suites': 2,
 'lancaster': 2,
 'boon': 2,
 'heater/ac': 2,
 '30-minute': 2,
 'é¨': 2,
 'ttl': 2,
 'plymouth': 2,
 'chemicals': 2,
 'face/hand': 2,
 'generated': 2,
 'management': 2,
 'hell': 2,
 'adelaide': 2,
 'yale': 2,
 'yell': 2,
 'fl': 2,
 'record-breaking': 2,
 'sighted': 2,
 'w/couch': 2,
 'maritime': 2,
 'check-in/check-out': 2,
 'ci': 2,
 'tonthe': 2,
 'tad': 2,
 'dawson': 2,
 'buenos': 2,
 'pris': 2,
 'increase': 2,
 'sinuses': 2,
 'fran': 2,
 'kind..breakfast': 2,
 'free..it': 2,
 'multi-prong': 2,
 'rmnp': 2,
 'michelle': 2,
 'tulip': 2,
 'reproach': 2,
 'extended': 2,
 'service': 2,
 'occupants': 2,
 'informed': 2,
 '17:00': 2,
 '7-11.': 2,
 'w.': 2,
 'heve': 2,
 'out-dated': 2,
 'reiteradamente': 2,
 'classic': 2,
 'yachats': 2,
 'hair': 2,
 'har': 2,
 '5048': 2,
 'e

In [105]:
import collections

In [106]:
n_print = int(input("How many most common words to print: "))
print("\nOK. The {} most common words are as follows\n".format(n_print))
word_counter = collections.Counter(results)
for word, count in word_counter.most_common(n_print):
    print(word, ": ", count)

How many most common words to print: 120

OK. The 120 most common words are as follows

everyone :  1
sacrifice :  1
facilitites :  1
jeans :  1
sojourn :  1
antonio :  1
price.the :  1
voluntarily :  1
pair :  1
laminate :  1
uptodate :  1
crossing :  1
barn :  1
home2suites :  1
lancaster :  1
boon :  1
heater/ac :  1
30-minute :  1
é¨ :  1
ttl :  1
plymouth :  1
chemicals :  1
face/hand :  1
generated :  1
management :  1
hell :  1
adelaide :  1
yale :  1
yell :  1
fl :  1
record-breaking :  1
sighted :  1
w/couch :  1
maritime :  1
check-in/check-out :  1
ci :  1
tonthe :  1
tad :  1
dawson :  1
buenos :  1
pris :  1
increase :  1
sinuses :  1
fran :  1
kind..breakfast :  1
free..it :  1
multi-prong :  1
rmnp :  1
michelle :  1
tulip :  1
reproach :  1
extended :  1
service :  1
occupants :  1
informed :  1
17:00 :  1
7-11. :  1
w. :  1
heve :  1
out-dated :  1
reiteradamente :  1
classic :  1
yachats :  1
hair :  1
har :  1
5048 :  1
excelente :  1
think :  1
spackle :  1
culver :

In [86]:
test = test_results.select(['filteredReview'])
test.show()

+--------------------+
|      filteredReview|
+--------------------+
|staff friendly he...|
|good loved room d...|
|bad would 've lov...|
|bad one person to...|
|bad expensive add...|
|bad personel good...|
|bad n/a good mode...|
|bad like fact res...|
|bad wifi system n...|
|good hotel staff ...|
|bad cost valet go...|
|great location ex...|
|breakfast 17 buck...|
|bad booked 2 room...|
|friendly staff gr...|
|bad poor room com...|
|bad breakfast two...|
|upon arriving see...|
|     bed comfortable|
|bad room small ic...|
+--------------------+
only showing top 20 rows



In [99]:
to_list = [list(row) for row in test.collect()]
for element in to_list:
    element.replace('[.,', '')

AttributeError: 'list' object has no attribute 'replace'

In [98]:
to_list

[['staff friendly helpful rooms large nicely furnished feels new first stay actually opulent require free breakfast good sufficient choices including fresh fruit salad eggs bacon bagels pastries great location purpose ...'],
 ['good loved room decor plinko wall game room area'],
 ["bad would 've loved bit larger rooms ended get second room kids older kids since suite simply tight us husband said mattress ready replaced thought funny sign toiletries saying would charge taken home overall really great time loved environment good loved staff friendly helpful good recommendations loved attached restaurant coffee breakfast overall atmosphere cool well done location us great"],
 ["bad one person touch familiar jokey reception checking really problem certainly n't rude might think adds character good central location nice rooms comfortable bed could shop happily westfield mall right next door market street union square"],
 ['bad expensive adding tha taxes become heavy good new age excellant s

In [61]:
wordcount = {}

In [93]:
for word in to_list:
    word = word.replace(".","")
    word = word.replace(",","")
    word = word.replace(":","")
    word = word.replace("\"","")
    word = word.replace("[","")
    word = word.replace("]","")
    word = word.replace("!","")
    word = word.replace("â€œ","")
    word = word.replace("â€˜","")
    word = word.replace("*","")
    if word not in reviewlist:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

AttributeError: 'list' object has no attribute 'replace'

In [68]:
wordcount

{}

## Word Importance 

In [26]:
import math
from textblob import TextBlob as tb

In [27]:
def tf(word, review):
    return review.words.count(word) / len(review.words)

def n_containing(word, reviewlist):
    return sum(1 for review in reviewlist if review in review.words)

def idf(word, reviewlist):
    return math.log(len(reviewlist) / (1 + n_containing(word, reviewlist)))

def tfidf(word, review, reviewlist):
    return tf(word, review) * idf(word, reviewlist)

In [40]:
good_reviews = df[(df['rating'] >= 5.0)].head(100)

index             100
name              100
reviews_rating    100
rating            100
filteredReview    100
dtype: int64

In [54]:
bad_reviews = df.loc[(df['rating'] <= 2.0)].head(10)

index             10
name              10
reviews_rating    10
rating            10
filteredReview    10
dtype: int64

In [56]:
# Create the reviewlist from df["lines"]
good_reviewlist = [tb(review) for review in good_reviews["filteredReview"]]
bad_reviewlist = [tb(review) for review in bad_reviews["filteredReview"]]

In [57]:
# Create an empty list to be filled with text blobs from cleaning poemlist
good_reviewlist2 = []

# Loop through the poemlist
for i in range(0, len(good_reviewlist)):
    
    # Remove words that are shorter than 2 characters
    new_string = ' '.join([w for w in str(good_reviewlist[i]).split() if len(w) > 2])
    
    # Replace emm dash with space
    new_string2 = new_string.replace("—", " ")
    
    # Convert string to text blob
    new_string2 = tb(new_string2)

    # Append the text blob to the list of text blobs
    good_reviewlist2.append(new_string2)

[TextBlob("experience rancho valencia absolutely perfect beginning end felt special happy stayed would come back heart beat"),
 TextBlob("amazing place everyone extremely warm welcoming 've stayed top notch places definitely top great romantic getaway take kids along couple stuffed animals waiting girls upon arrival n't wait back"),
 TextBlob("booked night stay rancho valencia play tennis since one highest rated tennis resorts america place really top luxury standpoint overall experience villas really perfect staff great attention details includes fresh squeezed orange juice morning restaurants bar room service amazing tennis program really impressive well want come back"),
 TextBlob("live aloft home away home ... stayed night 7-7-16 ... staff great especially olivia extra special remembered voice phone ... tells alert pays attention customer needs.and thumbs ..."),
 TextBlob("stayed family daughters wedding accommodating staff olivia excellent rooms well maintained would highly recomm

In [58]:
# Create an empty list to be filled with text blobs from cleaning poemlist
bad_reviewlist2 = []

# Loop through the poemlist
for i in range(0, len(bad_reviewlist)):
    
    # Remove words that are shorter than 2 characters
    new_string = ' '.join([w for w in str(bad_reviewlist[i]).split() if len(w) > 2])
    
    # Replace emm dash with space
    new_string2 = new_string.replace("—", " ")
    
    # Convert string to text blob
    new_string2 = tb(new_string2)

    # Append the text blob to the list of text blobs
    bad_reviewlist2.append(new_string2)
    

[TextBlob("currently bed writing past 1/2 dogs barking squealing call front desk advise basically told nothing 315.00 n't sleep"),
 TextBlob("getting bait switch decided rather stay anywhere else sure wasnt missing much looks like project style crack house apartments front desk old middle eastern lady extremely rude charged got travel stay hotels 250 days/ year trust ..."),
 TextBlob("choice stay tornado hit area vineland without power charged 190 one night would n't accept aaa card leaving found charged cats n't another person husband ..."),
 TextBlob("heat room work properly remote broken excessive noise"),
 TextBlob("staff n't helpful one tried come clean room several nights rooms small good location"),
 TextBlob("breakfast bucks bit disgrace particular building"),
 TextBlob("bad booked rooms night boston trip king room double double double beds teenagers booked december july visit emailed twice arrival request rooms close floor times got reply alerted honest sums 'boxer customer se

In [45]:
# Calculate the most important words
impt_words = []
for i, review in enumerate( bad_reviewlist2):
    scores = {word: tfidf(word, review, bad_reviewlist2) for word in review.words}
    sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse = True)
    
    for word, score in sorted_words[:5]:
        impt_words.append((i + 1, word, round(score, 5)))

In [60]:
df2 = pd.DataFrame(impt_words, columns = ["review_number", "word", "TD-IDF"])
df2.head()

Unnamed: 0,review_number,word,TD-IDF
0,1,currently,0.25584
1,1,bed,0.25584
2,1,writing,0.25584
3,1,past,0.25584
4,1,1/2,0.25584


In [46]:
# Create a dataframe of important words per review
df2 = pd.DataFrame(impt_words, columns = ["review_number", "word", "TD-IDF"])

df2["rating"] = new_list

# Remove 
df2.head()

ValueError: Length of values does not match length of index