In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction

# spark nlp
#from sparknlp.annotator import LemmatizerModel

# spark ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, CountVectorizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator

### Big Data Platforms

**Goal of this notebook:** Using Databricks, build a system that predicts a star rating from the text of the review written by the user.

**Resources used:**
- Installations:
  - https://www.youtube.com/watch?v=TNX_GShSyHc
  - https://johnsnowlabs.github.io/spark-nlp-workshop/databricks/index.html#Getting%20Started.html

- NLP:
  - https://towardsdatascience.com/natural-language-processing-in-apache-spark-using-nltk-part-1-2-58c68824f660
  - https://towardsdatascience.com/natural-language-processing-with-pyspark-and-spark-nlp-b5b29f8faba
  - https://medium.com/analytics-vidhya/nlp-preprocessing-pipeline-what-when-why-2fc808899d1f
  - https://www.analyticsvidhya.com/blog/2020/07/build-text-categorization-model-with-spark-nlp/

### 1. Read necessary data into spark

In [2]:
# Start (or attach to) the Spark session used by the rest of the notebook.
#
# The resource settings are handed to the builder so that they are part of the
# configuration *before* the session is created. The previous version of this
# cell called `spark.sparkContext._conf.setAll(...)` after `getOrCreate()`:
# that only edits a private conf object and does not reconfigure an already
# running context (driver/executor memory and cores are fixed at start-up).
#
# NOTE(review): if a session already exists in this kernel (e.g. on a managed
# cluster), `getOrCreate()` returns it and these start-up settings have to be
# set at cluster level instead - confirm for the target environment.
# NOTE(review): the old cell also wrote 'Spark Updated Conf' into
# `spark.app.name` after the fact; the name given to `appName` is kept as the
# single source of truth here.
spark_settings = [('spark.executor.memory', '5g'),
                  ('spark.executor.cores', '4'),
                  ('spark.cores.max', '4'),
                  ('spark.driver.memory', '8g')]

session_builder = SparkSession.builder.appName('rating_prediction_with_reviews')
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

# `conf` is kept as a name for backward compatibility; it now holds the
# configuration reported by the context through the public API
conf = spark.sparkContext.getConf()

In [13]:
# where the input lives on HDFS
FILE_PATH = "hdfs://nameservice1/user/vvenkatesan/final_project/"

# which file to read: the pre-built subset is active; switch to
# "yelp_academic_dataset_review.json" to run on the full review dump
FILE_NAME = "review_subset.json"

# read the JSON file into a Spark DataFrame
# (alternative source used previously: the Hive table big_data_group_2.review,
#  via sqlContext.sql("select * from big_data_group_2.review"))
review_location = FILE_PATH + FILE_NAME
review = spark.read.json(review_location)

In [14]:
# print the column names, types and nullability of the loaded `review` DataFrame
review.printSchema()

root
 |-- label: double (nullable = true)
 |-- text: string (nullable = true)



The most important columns are the stars (the target variable, stored as `label`) and the review text (from which the features are built) for predicting ratings from reviews. Any remaining columns may be dropped in further analysis.

In [15]:
# Bring the loaded data into a common (label, text) layout.
# The subset file is used as-is; for any other file the two relevant columns
# are selected and "stars" is renamed to "label".
if FILE_NAME != "review_subset.json":
    dat = review.select("stars", "text").withColumnRenamed("stars", "label")
else:
    dat = review

# report the number of rows
n_observations = dat.count()
print("The dataset has {} observations.".format(n_observations))

# preview the first rows
dat.show(5)

The dataset has 400430 observations.
+-----+--------------------+
|label|                text|
+-----+--------------------+
|  1.0|"Beware  of the m...|
|  1.0|$99 dollar specia...|
|  1.0|(This is for the ...|
|  1.0|***************bu...|
|  1.0|***DO NOT TAKE YO...|
+-----+--------------------+
only showing top 5 rows



In [16]:
# random train/test split with weights 0.8 / 0.2; a fixed seed is passed so the
# split can be repeated
train_df, test_df = dat.randomSplit([.8,.2],seed=3)

### 2. Feature engineering

#### Transformation 1 : HashingTF

In [17]:
# Feature pipeline 1: text -> words -> filtered -> raw_features -> features
# Each stage reads the column written by the stage before it.

# "text" -> "words"
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# "words" -> "filtered" (stop words removed)
stopword_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# "filtered" -> "raw_features" (hashed term frequencies, 1000 buckets)
hashingTF = HashingTF(inputCol="filtered", outputCol="raw_features", numFeatures=1000)

# "raw_features" -> "features" (IDF re-weighting)
idf = IDF(inputCol="raw_features", outputCol="features")

# chain the four stages
stages_1 = [tokenizer, stopword_remover, hashingTF, idf]
pipeline_1 = Pipeline(stages=stages_1)

#### Transformation 2 : CountVectorizer

In [18]:
# Feature pipeline 2: text -> words -> filtered -> features
# Same tokenizing / stop-word steps as pipeline 1, but the features are term
# counts over a vocabulary capped at 1000 entries.
# Note: `tokenizer` and `stopword_remover` are re-bound to new stage objects here.

# "text" -> "words"
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# "words" -> "filtered" (stop words removed)
stopword_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# "filtered" -> "features" (term counts)
countVectorizer = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=1000)

# chain the three stages
stages_2 = [tokenizer, stopword_remover, countVectorizer]
pipeline_2 = Pipeline(stages=stages_2)

### 3. Model development

In [19]:
# evaluator that compares the "prediction" column against the "label" column;
# the metric itself ("rmse") is chosen at each evaluate() call in the model cells
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

**Logistic regression**:

In [10]:
# Logistic regression on top of feature pipeline 1 (HashingTF + IDF).
# The timer spans fitting, scoring the test split and computing the metric.
t0 = time.time()

# classifier stage, appended after the feature pipeline
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline_model_1 = Pipeline(stages=[pipeline_1, lr])

# fit on the training split, then score the held-out split
lrm_1 = pipeline_model_1.fit(train_df)
predictions = lrm_1.transform(test_df)

# RMSE between the "prediction" and "label" columns
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(rmse)

# report wall-clock runtime in minutes
t1 = time.time()
elapsed_minutes = (t1 - t0) / 60
print("This program took {:.2f} minutes to run.".format(elapsed_minutes))

1.3147435425769072
This program took 1.04 minutes to run.


- On subset:
1.325792785296943
This program took 0.45 minutes to run.
- On full data:
1.3147435425769072
This program took 1.04 minutes to run.

In [20]:
# Logistic regression on top of feature pipeline 2 (CountVectorizer).
# The timer spans fitting, scoring the test split and computing the metric.
t0 = time.time()

# classifier stage, appended after the feature pipeline
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline_model_2 = Pipeline(stages=[pipeline_2, lr])

# fit on the training split, then score the held-out split
lrm_2 = pipeline_model_2.fit(train_df)
predictions = lrm_2.transform(test_df)

# RMSE between the "prediction" and "label" columns
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(rmse)

# report wall-clock runtime in minutes
t1 = time.time()
elapsed_minutes = (t1 - t0) / 60
print("This program took {:.2f} minutes to run.".format(elapsed_minutes))

1.0948662025083467
This program took 0.93 minutes to run.


In [12]:
# NOTE(review): this cell runs the same logistic regression + pipeline_2 code
# as the cell directly before it (it looks like a leftover from an earlier
# run on a different input) - consider keeping only one of the two.
t0 = time.time()

# logistic regression stage chained after the CountVectorizer feature pipeline
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline_model_2 = Pipeline(stages=[pipeline_2, lr])

# train on the training split
lrm_2 = pipeline_model_2.fit(train_df)

# score the test split
predictions = lrm_2.transform(test_df)

# print the RMSE of the predictions
test_rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(test_rmse)

# print the elapsed time in minutes
t1 = time.time()
runtime_minutes = (t1 - t0) / 60
print("This program took {:.2f} minutes to run.".format(runtime_minutes))

1.0982139117666376
This program took 1.42 minutes to run.


- On subset:
0.9081573127054362
This program took 1.22 minutes to run.
- On full data:
1.0982139117666376
This program took 1.42 minutes to run.

**Comments:**
- Pipeline 1 (HashingTF and IDF) yields worse results (a higher RMSE) than pipeline 2 (CountVectorizer) in this case

**Random forest:**

In [12]:
# Random forest *classifier* on top of feature pipeline 1 (HashingTF + IDF).
# The timer spans fitting, scoring the test split and computing the metric.
t0 = time.time()

# 15 trees of depth <= 5, reading "features" and predicting "label"
rf = RandomForestClassifier(maxDepth=5, numTrees=15, labelCol="label", featuresCol="features")
pipeline_model_rf_1 = Pipeline(stages=[pipeline_1, rf])

# fit on the training split, then score the held-out split
rfm_1 = pipeline_model_rf_1.fit(train_df)
predictions = rfm_1.transform(test_df)

# RMSE between the "prediction" and "label" columns
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(rmse)

# report wall-clock runtime in minutes
t1 = time.time()
elapsed_minutes = (t1 - t0) / 60
print("This program took {:.2f} minutes to run.".format(elapsed_minutes))

1.9451034399092813
This program took 2.17 minutes to run.


- On subset:
1.934188655062775
This program took 0.53 minutes to run.
- On full data:
1.9451034399092813
This program took 2.17 minutes to run.

In [11]:
# Random forest *regressor* on top of feature pipeline 1 (HashingTF + IDF).
# NOTE(review): this cell binds the same names (`rf`, `pipeline_model_rf_1`,
# `rfm_1`) as the random forest classifier cell, so whichever of the two ran
# last determines what those names hold.
t0 = time.time()

# 15 trees of depth <= 5, reading "features" and predicting "label"
rf = RandomForestRegressor(maxDepth=5, numTrees=15, labelCol="label", featuresCol="features")
pipeline_model_rf_1 = Pipeline(stages=[pipeline_1, rf])

# fit on the training split, then score the held-out split
rfm_1 = pipeline_model_rf_1.fit(train_df)
predictions = rfm_1.transform(test_df)

# RMSE between the "prediction" and "label" columns
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(rmse)

# report wall-clock runtime in minutes
t1 = time.time()
elapsed_minutes = (t1 - t0) / 60
print("This program took {:.2f} minutes to run.".format(elapsed_minutes))

1.328617320571232
This program took 2.56 minutes to run.


- On subset:
1.3261032099661794
This program took 0.59 minutes to run.
- On full data:
1.328617320571232
This program took 2.56 minutes to run.

**Comments:**
- Using the Random Forest Regressor yields better results (a lower RMSE) than using the Random Forest Classifier
- Overall, the Random Forest performs worse than the Logistic Regression