In [1]:
# Dependencies
import pandas as pd
import sqlite3
from pyspark import SparkContext
sc =SparkContext()
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [2]:
# Create connections to database
conn = sqlite3.connect("Data/Hotels.db")

#Load the database table into a pandas dataframe
ratings = pd.read_sql_query("select * from ratings;", conn)
conn.close()

# Preview the dataframe
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,We booked a 3 night stay at Rancho Valencia to...,Amazing Property and Experience,,
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,Currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,I live in Md and the Aloft is my Home away fro...,ALWAYS GREAT STAY...,Laurel,MD


In [3]:
# Put all letters in lower case
ratings["reviews_text"] = ratings["reviews_text"].str.lower()
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,our experience at rancho valencia was absolute...,Best romantic vacation ever!!!!,,
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,amazing place. everyone was extremely warm and...,Sweet sweet serenity,,
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,we booked a 3 night stay at rancho valencia to...,Amazing Property and Experience,,
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,i live in md and the aloft is my home away fro...,ALWAYS GREAT STAY...,Laurel,MD


In [4]:
ratings['reviews_text'] = ratings['reviews_text'].astype(str)

In [5]:
# Dependencies
import re, string

import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/janelchadiarova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = set(string.punctuation)

In [7]:
# Transform the poem in preparation for word counts
words_list = []
preprocessed_text = []
for review in ratings["reviews_text"]:
    
    # Create a list of words per poem after the words are converted to lowercase    
    words = word_tokenize(review)
    
    # Filter to remove stop words and punctuations    
    words2 = [word for word in words if word not in stops and word not in exclude]
    
    # Add the filtered list of words (representing each poem)
    words_list.append(words2)
    
    # Convert the list of strings back to one string
    words3 = " ".join(words2)
    
    # Add the filtered list of words (representing each poem)
    preprocessed_text.append(words3)
   
ratings["filteredReview"] = preprocessed_text
ratings.head()

Unnamed: 0,index,name,reviews_date,reviews_rating,reviews_sourceURLs,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,filteredReview
0,0,Rancho Valencia Resort Spa,2013-11-14T00:00:00Z,5.0,www.hotels.com,our experience at rancho valencia was absolute...,Best romantic vacation ever!!!!,,,experience rancho valencia absolutely perfect ...
1,1,Rancho Valencia Resort Spa,2014-07-06T00:00:00Z,5.0,www.hotels.com,amazing place. everyone was extremely warm and...,Sweet sweet serenity,,,amazing place everyone extremely warm welcomin...
2,2,Rancho Valencia Resort Spa,2015-01-02T00:00:00Z,5.0,www.hotels.com,we booked a 3 night stay at rancho valencia to...,Amazing Property and Experience,,,booked 3 night stay rancho valencia play tenni...
3,3,Aloft Arundel Mills,2016-05-15T00:00:00Z,2.0,www.tripadvisor.com,currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA,currently bed writing past hr 1/2 dogs barking...
4,4,Aloft Arundel Mills,2016-07-09T00:00:00Z,5.0,www.tripadvisor.com,i live in md and the aloft is my home away fro...,ALWAYS GREAT STAY...,Laurel,MD,live md aloft home away home ... stayed 1 nigh...


In [8]:
# Remove columns that will not be used in the analysis
df = ratings.drop(columns=["reviews_text","reviews_date","reviews_sourceURLs","reviews_title","reviews_userCity","reviews_userProvince"],axis=1)
df = df.rename(index=str, columns={"reviews_rating": "label"})
# Preview the dataframe
df.head()

Unnamed: 0,index,name,label,filteredReview
0,0,Rancho Valencia Resort Spa,5.0,experience rancho valencia absolutely perfect ...
1,1,Rancho Valencia Resort Spa,5.0,amazing place everyone extremely warm welcomin...
2,2,Rancho Valencia Resort Spa,5.0,booked 3 night stay rancho valencia play tenni...
3,3,Aloft Arundel Mills,2.0,currently bed writing past hr 1/2 dogs barking...
4,4,Aloft Arundel Mills,5.0,live md aloft home away home ... stayed 1 nigh...


In [9]:
spark_ratings = sqlContext.createDataFrame(df)
spark_ratings.show(5)

+-----+--------------------+-----+--------------------+
|index|                name|label|      filteredReview|
+-----+--------------------+-----+--------------------+
|    0|Rancho Valencia R...|  5.0|experience rancho...|
|    1|Rancho Valencia R...|  5.0|amazing place eve...|
|    2|Rancho Valencia R...|  5.0|booked 3 night st...|
|    3| Aloft Arundel Mills|  2.0|currently bed wri...|
|    4| Aloft Arundel Mills|  5.0|live md aloft hom...|
+-----+--------------------+-----+--------------------+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import length
data = spark_ratings.withColumn('length', length(spark_ratings['filteredReview']))
data.show(5)

+-----+--------------------+-----+--------------------+------+
|index|                name|label|      filteredReview|length|
+-----+--------------------+-----+--------------------+------+
|    0|Rancho Valencia R...|  5.0|experience rancho...|   112|
|    1|Rancho Valencia R...|  5.0|amazing place eve...|   202|
|    2|Rancho Valencia R...|  5.0|booked 3 night st...|   335|
|    3| Aloft Arundel Mills|  2.0|currently bed wri...|   125|
|    4| Aloft Arundel Mills|  5.0|live md aloft hom...|   186|
+-----+--------------------+-----+--------------------+------+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [12]:
tokenizer = Tokenizer(inputCol="filteredReview", outputCol="token_text")
tokenized = tokenizer.transform(spark_ratings)
tokenized.show(3, truncate=False)

+-----+--------------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|index|name                      |label|filteredReview                                                                                                                                                                                        

In [13]:
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
filtered = stopremove.transform(tokenized)
filtered.show(3, truncate=False)

+-----+--------------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token',numFeatures=pow(2,4))
hashed_df = hashingTF.transform(filtered)
hashed_df.show(3, truncate=False)

+-----+--------------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
idf = IDF(inputCol="hash_token", outputCol="idf_token")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [16]:
tokenizer = Tokenizer(inputCol="filteredReview", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token',numFeatures=pow(2,4))
idf = IDF(inputCol="hash_token", outputCol="idf_token")

In [17]:
 from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [18]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

In [19]:
 # Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)
cleaned.show(3)

+-----+--------------------+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|index|                name|label|      filteredReview|length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+-----+--------------------+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|Rancho Valencia R...|  5.0|experience rancho...|   112|[experience, ranc...|[experience, ranc...|(16,[1,4,5,6,8,9,...|(16,[1,4,5,6,8,9,...|[0.0,0.1847442471...|
|    1|Rancho Valencia R...|  5.0|amazing place eve...|   202|[amazing, place, ...|[amazing, place, ...|(16,[2,3,4,5,6,7,...|(16,[2,3,4,5,6,7,...|[0.0,0.0,0.289650...|
|    2|Rancho Valencia R...|  5.0|booked 3 night st...|   335|[booked, 3, night...|[booked, 3, night...|(16,[0,1,2,3,4,5,...|(16,[0,1,2,3,4,5,...|[0.59916955914

In [21]:
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  5.0|[0.0,0.1847442471...|
|  5.0|[0.0,0.0,0.289650...|
|  5.0|[0.59916955914253...|
|  2.0|[0.29958477957126...|
|  5.0|[1.49792389785632...|
|  5.0|[0.59916955914253...|
|  5.0|[0.59916955914253...|
|  5.0|(17,[1,2,3,4,9,10...|
|  5.0|[0.59916955914253...|
|  5.0|[0.59916955914253...|
|  5.0|[0.29958477957126...|
|  5.0|[0.29958477957126...|
|  5.0|[1.49792389785632...|
|  5.0|[0.29958477957126...|
|  5.0|[0.89875433871379...|
|  4.0|[0.59916955914253...|
|  5.0|[0.59916955914253...|
|  3.0|[0.59916955914253...|
|  5.0|[0.59916955914253...|
|  4.0|[0.0,0.7389769886...|
+-----+--------------------+
only showing top 20 rows



In [22]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

In [34]:
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [31]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(20)

+-----+--------------------+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|index|                name|label|      filteredReview|length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1|Rancho Valencia R...|  5.0|amazing place eve...|   202|[amazing, place, ...|[amazing, place, ...|(16,[2,3,4,5,6,7,...|(16,[2,3,4,5,6,7,...|[0.0,0.0,0.289650...|[-65.991521015058...|[0.04315214343598...|      27.0|
| 1009|Hotel Zetta San F...|  4.6|good really quiet...|   195|[good, really, qu...|[good, really, qu...|(16,[0,2,3,4

In [29]:
df["label"].unique()

array([5.  , 2.  , 4.  , 3.  , 1.  , 4.8 , 4.6 , 3.55, 4.4 , 4.15, 2.5 ,
       3.95, 2.9 , 3.35, 3.75, 4.5 , 2.1 , 1.65, 3.15, 2.7 , 1.45, 2.75,
       2.3 , 3.5 , 4.25, 1.25, 1.9 , 3.45, 3.25, 4.75])