In [18]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# nlp
#from nltk.corpus import stopwords

# spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction

# spark ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, CountVectorizer, Word2Vec
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator



In [3]:
# start spark session (from https://nlp.johnsnowlabs.com/docs/en/install#databricks) & script Ashish
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running locally on 8 worker threads.
# The Spark NLP package is pulled in through `spark.jars.packages`;
# `spark.driver.maxResultSize` is set to "0" (no limit per the Spark config docs).
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[8]")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.4")
    .config("spark.kryoserializer.buffer.max", "1000M")
    .getOrCreate()
)

### Big Data Platforms

**Goal of this notebook:** Using Databricks, build a system that recommends a rating based on the text of the review written by the user.

**Resources used:**
- Installations:
  - https://www.youtube.com/watch?v=TNX_GShSyHc
  - https://johnsnowlabs.github.io/spark-nlp-workshop/databricks/index.html#Getting%20Started.html

- NLP:
  - https://towardsdatascience.com/natural-language-processing-in-apache-spark-using-nltk-part-1-2-58c68824f660
  - https://towardsdatascience.com/natural-language-processing-with-pyspark-and-spark-nlp-b5b29f8faba
  - https://medium.com/analytics-vidhya/nlp-preprocessing-pipeline-what-when-why-2fc808899d1f
  - https://www.analyticsvidhya.com/blog/2020/07/build-text-categorization-model-with-spark-nlp/

### 1. Load data from storage container

In [13]:
# name of the review file inside the default storage container
FILE_NAME = "review_subset.json"

# read the JSON file from the container root (wasb:///) into a Spark DataFrame
review_path = "wasb:///{}".format(FILE_NAME)
review = spark.read.json(review_path)

In [14]:
# Print the schema Spark inferred from the JSON file, to confirm the
# column names and types before any further processing.
review.printSchema()

root
 |-- label: double (nullable = true)
 |-- text: string (nullable = true)

In [16]:
# The loaded subset already exposes `label` and `text` columns (see the schema
# printed above), so it is used as-is.
# Previously: review.select("stars", "text").withColumnRenamed("stars", "label")
dat = review

# report the number of observations
n_obs = dat.count()
print("The dataset has {} observations.".format(n_obs))

# preview the first rows
dat.show(5)

The dataset has 400430 observations.
+-----+--------------------+
|label|                text|
+-----+--------------------+
|  1.0|"Beware  of the m...|
|  1.0|$99 dollar specia...|
|  1.0|(This is for the ...|
|  1.0|***************bu...|
|  1.0|***DO NOT TAKE YO...|
+-----+--------------------+
only showing top 5 rows

### 2. Feature engineering

In [19]:
# Stage 1: split the raw review text into lower-cased, whitespace-separated tokens.
# NOTE(review): punctuation stays attached to tokens (e.g. `"beware`, `(this`
# in the output below); a regex-based tokenizer may be worth trying.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: drop stop words from the token list.
stopword_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Stage 3: hash the remaining tokens into a sparse term-frequency vector.
hashingTF = HashingTF(inputCol="filtered", outputCol="features")

# Chain the three stages into a single feature pipeline.
pipeline_1 = Pipeline(stages=[tokenizer, stopword_remover, hashingTF])

# Fit the pipeline, then apply it to the same data to obtain the encoded frame.
pipeline_model_1 = pipeline_1.fit(dat)
dat_encoded_1 = pipeline_model_1.transform(dat)

# preview the encoded rows
dat_encoded_1.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|                text|               words|            filtered|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  1.0|"Beware  of the m...|["beware, , of, t...|["beware, , man, ...|(262144,[2281,205...|
|  1.0|$99 dollar specia...|[$99, dollar, spe...|[$99, dollar, spe...|(262144,[781,1789...|
|  1.0|(This is for the ...|[(this, is, for, ...|[(this, psychic, ...|(262144,[14,3068,...|
|  1.0|***************bu...|[***************b...|[***************b...|(262144,[5937,892...|
|  1.0|***DO NOT TAKE YO...|[***do, not, take...|[***do, take, car...|(262144,[7473,828...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

### 3. Model building

In [20]:
# 80/20 train/test split of the encoded data; the seed keeps the split reproducible
split_weights = [0.8, 0.2]
train_df, test_df = dat_encoded_1.randomSplit(split_weights, seed=3)

In [21]:
# Logistic regression on the hashed term-frequency features.
# featuresCol / labelCol are spelled out here; they equal the estimator defaults.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

# fit the model on the training split
lrm = lr.fit(train_df)

# score the held-out split; adds the prediction columns to the test rows
predictions = lrm.transform(test_df)

In [22]:
# RMSE between the true label and the predicted class.
# NOTE(review): this treats the predicted class as a numeric rating; a
# classification metric (accuracy / F1) could be reported alongside it.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

# evaluate predictions on the test split
rmse = evaluator.evaluate(predictions)
print(rmse)

0.942720716442