In [None]:
# install packages
#sc.install_pypi_package("pandas==0.25.1")
#sc.install_pypi_package("matplotlib")
#sc.install_pypi_package("spark-nlp==2.5.4")

# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# nlp
#from nltk.corpus import stopwords

# spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction

# spark ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, CountVectorizer, Word2Vec
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator

# spark nlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [9]:
# Debug aid: list the directories this kernel searches when importing modules.
import sys

search_path = sys.path
print(search_path)

['/home/notebook/work', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/notebook/.ipython']


In [5]:
# Shell escape: show which python executable is first on PATH.
!which python

/opt/conda/bin/python


In [6]:
# Shell escape: show which pip executable is first on PATH.
!which pip

/opt/conda/bin/pip


In [7]:
# Shell escape: show which conda executable is first on PATH.
!which conda

/opt/conda/bin/conda


In [8]:
!conda install -y -c johnsnowlabs spark-nlp

Collecting package metadata (current_repodata.json): failed

CondaHTTPError: HTTP 000 CONNECTION FAILED for url <https://conda.anaconda.org/johnsnowlabs/linux-64/current_repodata.json>
Elapsed: -

An HTTP error occurred when trying to retrieve this URL.
HTTP errors are often intermittent, and a simple retry will get you on your way.
'https://conda.anaconda.org/johnsnowlabs/linux-64'




In [1]:
# Install the Spark NLP Python package with pip, pinned to release 2.5.4.
!pip install spark-nlp==2.5.4

^C
[31mERROR: Operation cancelled by user[0m


In [7]:
# Create (or reuse) a Spark session that has the Spark NLP 2.5.4 jar attached.
# Adapted from https://nlp.johnsnowlabs.com/docs/en/install#databricks and a script from Ashish.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[8]")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.4")
    .config("spark.kryoserializer.buffer.max", "1000M")
    .getOrCreate()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Big Data Platforms

**Goal of this notebook:** Using Databricks, build a system that recommends a star rating based on the text of a user's review.

**Resources used:**
- Installations:
  - https://www.youtube.com/watch?v=TNX_GShSyHc
  - https://johnsnowlabs.github.io/spark-nlp-workshop/databricks/index.html#Getting%20Started.html

- NLP:
  - https://towardsdatascience.com/natural-language-processing-in-apache-spark-using-nltk-part-1-2-58c68824f660
  - https://towardsdatascience.com/natural-language-processing-with-pyspark-and-spark-nlp-b5b29f8faba
  - https://medium.com/analytics-vidhya/nlp-preprocessing-pipeline-what-when-why-2fc808899d1f
  - https://www.analyticsvidhya.com/blog/2020/07/build-text-categorization-model-with-spark-nlp/

### 1. Load data from S3 bucket

In [6]:
# S3 bucket and object key of the review file
AWS_BUCKET_NAME = "big-data-class-final-project"
FILE_NAME = "yelp_academic_dataset_review.json"

# Build the s3a:// URI once, then read the JSON file into a Spark DataFrame
review_path = "s3a://{}/{}".format(AWS_BUCKET_NAME, FILE_NAME)
review = spark.read.json(review_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Print the column names and types of the loaded review DataFrame
review.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)

In [8]:
# Keep only the star rating and the review text, and rename the rating
# column ("stars") to "label" so it can serve as the model target.
dat = review.select("stars", "text").withColumnRenamed("stars", "label")

# Report the number of rows (the rename does not affect the count)
n_obs = dat.count()
print("The dataset has {} observations.".format(n_obs))

# Preview the first five rows
dat.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The dataset has 8021122 observations.
+-----+--------------------+
|label|                text|
+-----+--------------------+
|  2.0|As someone who ha...|
|  1.0|I am actually hor...|
|  5.0|I love Deagan's. ...|
|  1.0|Dismal, lukewarm,...|
|  4.0|Oh happy day, fin...|
+-----+--------------------+
only showing top 5 rows

### 2. Feature engineering

In [None]:
# The import cell runs `from sparknlp.annotator import *` after importing
# Tokenizer from pyspark.ml.feature, which rebinds the bare name `Tokenizer`
# to the Spark NLP annotator. That class does not take inputCol/outputCol in
# its constructor, so import the Spark ML tokenizer under an explicit alias.
from pyspark.ml.feature import Tokenizer as MLTokenizer

# Stage 1: split the review text into words
tokenizer = MLTokenizer(inputCol="text", outputCol="words")

# Stage 2: drop stop words from the tokenized text
stopword_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")

# Stage 3: hash the remaining words into a term-frequency vector named "features"
hashingTF = HashingTF(inputCol=stopword_remover.getOutputCol(), outputCol="features")

# Chain the three stages into a single pipeline
pipeline_1 = Pipeline(stages=[tokenizer, stopword_remover, hashingTF])

# Fit the pipeline on the data and apply it to the same data
dat_encoded_1 = pipeline_1.fit(dat).transform(dat)

# Preview the first five encoded rows
dat_encoded_1.show(5)

### 3. Model building

In [None]:
# Randomly split the encoded data with weights 0.8 (train) / 0.2 (test);
# the seed is fixed so the split is repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = dat_encoded_1.randomSplit(split_weights, seed=3)

In [None]:
# Configure the logistic regression estimator
lr_params = dict(maxIter=10, regParam=0.01)
lr = LogisticRegression(**lr_params)

# Fit the model on the training split
lrm = lr.fit(train_df)

# Apply the fitted model to the test split to obtain predictions
predictions = lrm.transform(test_df)

In [None]:
# Compare the predicted value with the true label using a regression evaluator
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Compute and print the root-mean-squared error on the test predictions
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(rmse)