In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# mounting to aws
import urllib

# spark
from pyspark.sql import SparkSession

# spark ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, IDF, CountVectorizer, Word2Vec
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# spark nlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [2]:
# display the SparkContext attached to the active `spark` session
# NOTE(review): `spark` is referenced here before the next cell assigns it, so this
# presumably relies on the runtime (e.g. Databricks) pre-defining `spark` — confirm.
spark.sparkContext

In [3]:
# create (or reuse) the Spark session configured for Spark NLP
# (settings from https://nlp.johnsnowlabs.com/docs/en/install#databricks) & script Ashish
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[8]")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.5")
    .config("spark.kryoserializer.buffer.max", "1000M")
    .getOrCreate()
)

### Big Data Platforms

**Goal of this notebook:** Using Databricks, build a system that recommends (predicts) a star rating from the text of a review written by a user.

**Resources used:**
- Installations:
  - https://www.youtube.com/watch?v=TNX_GShSyHc
  - https://johnsnowlabs.github.io/spark-nlp-workshop/databricks/index.html#Getting%20Started.html

- NLP:
  - https://towardsdatascience.com/natural-language-processing-in-apache-spark-using-nltk-part-1-2-58c68824f660
  - https://towardsdatascience.com/natural-language-processing-with-pyspark-and-spark-nlp-b5b29f8faba
  - https://medium.com/analytics-vidhya/nlp-preprocessing-pipeline-what-when-why-2fc808899d1f
  - https://www.analyticsvidhya.com/blog/2020/07/build-text-categorization-model-with-spark-nlp/

### Step 1: Connect to S3 bucket and load data

In [6]:
# retrieve aws credentials from table (both columns are read in a single Spark job)
aws_credentials = spark.table('databricks_spark_accesskeys_1_csv')
credentials_row = aws_credentials.select("Access key ID", "Secret access key").first()
if credentials_row is None:
    # fail with a clear message instead of an opaque IndexError on an empty table
    raise ValueError("Table 'databricks_spark_accesskeys_1_csv' contains no AWS credentials.")

# mount notebook to S3 bucket
ACCESS_KEY = credentials_row[0]
SECRET_KEY = credentials_row[1]
# the secret key may contain characters such as "/" that must be escaped inside the mount URL
ENCODED_SECRET_KEY = urllib.parse.quote(SECRET_KEY, "")
AWS_BUCKET_NAME = "big-data-class-final-project"
MOUNT_NAME = "s3_v9"
MOUNT_POINT = "/mnt/%s" % MOUNT_NAME

# mounting onto an existing mount point raises an error, so only mount when it is
# not there yet; this keeps the cell safe to re-run
if not any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
    dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), MOUNT_POINT)

# display content of S3 bucket
display(dbutils.fs.ls(MOUNT_POINT))

path,name,size
dbfs:/mnt/s3_v9/_metadata,_metadata,0
dbfs:/mnt/s3_v9/bootstrap_action.sh,bootstrap_action.sh,112
dbfs:/mnt/s3_v9/e-CXPSZT7I8V76S2JSSVTHOPBE/,e-CXPSZT7I8V76S2JSSVTHOPBE/,0
dbfs:/mnt/s3_v9/j-16PDN1URFLEDB/,j-16PDN1URFLEDB/,0
dbfs:/mnt/s3_v9/j-1QDUUJ03UG2UB/,j-1QDUUJ03UG2UB/,0
dbfs:/mnt/s3_v9/j-565CHQX5IPNM/,j-565CHQX5IPNM/,0
dbfs:/mnt/s3_v9/j-QLM4SQGJ2LK8/,j-QLM4SQGJ2LK8/,0
dbfs:/mnt/s3_v9/review_subset.json,review_subset.json,248796774
dbfs:/mnt/s3_v9/review_subset_spark_nlp_hashingTF_model_pipeline/,review_subset_spark_nlp_hashingTF_model_pipeline/,0
dbfs:/mnt/s3_v9/yelp_academic_dataset_review.json,yelp_academic_dataset_review.json,6325565224


In [7]:
# name of the review file to read from the mounted bucket
# (swap in "yelp_academic_dataset_review.json" to use the larger file instead of the subset)
FILE_NAME = "review_subset.json"

# build the DBFS path of the file and read it as JSON into a Spark DataFrame
review_path = "".join(["dbfs:/mnt/", str(MOUNT_NAME), "/", str(FILE_NAME)])
review = spark.read.json(review_path)

In [8]:
# print the schema (column names and types) that Spark derived for the review data
review.printSchema()

In [9]:
# keep only the star rating and the review text, and expose the rating as "label":
# the logistic regression (default labelCol) and the evaluator (labelCol="label")
# used further down both look for a column with that name
# NOTE(review): assumes "stars" is read as a numeric column — confirm with the schema above
dat = review.select("stars", "text").withColumnRenamed("stars", "label")

# print length of data
print("The dataset has {} observations.".format(dat.count()))

# display
dat.show(5)

In [10]:
# split into train (80%) and test (20%); the fixed seed keeps the split repeatable
dat_train, dat_test = dat.randomSplit(weights=[0.8, 0.2], seed=3)

In [11]:
# show the first 5 rows of the training split as a quick sanity check
dat_train.show(5)

### Step 2: Pipelines: preprocessing, feature engineering, model training

**Pipeline 1 : HashingTF**

In [14]:
# --- Spark NLP preprocessing stages -------------------------------------------

# turn the raw "text" column into a document annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# split each document into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# clean up the tokens
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# lemmatize with the default pretrained lemmatizer model
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["normalized"])
    .setOutputCol("lemma")
)

# drop stopwords, ignoring case
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["lemma"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# reduce the remaining tokens to their stems
stemmer = (
    Stemmer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("stem")
)

# convert the stem annotations into a plain array column; annotation columns are kept
finisher = (
    Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

# --- Spark ML feature + model stages ------------------------------------------

# hash the token arrays into term-frequency vectors with 1000 features
hashingTF = HashingTF(numFeatures=1000, inputCol="token_features", outputCol="features")

# logistic regression classifier
lr = LogisticRegression(regParam=0.01, maxIter=10)

# full pipeline: preprocessing -> features -> model, in execution order
pipeline_1_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    stemmer,
    finisher,
    hashingTF,
    lr,
]
pipeline_1 = Pipeline(stages=pipeline_1_stages)

In [15]:
# fit every stage of pipeline_1 on the training split; the result is the fitted
# pipeline model used for prediction and saving below
pipeline_model_1 = pipeline_1.fit(dat_train)

In [16]:
# run the fitted pipeline on the held-out test split to obtain predictions
predictions_1 =  pipeline_model_1.transform(dat_test)

In [17]:
# evaluator that compares the "prediction" column against the "label" column
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")

# compute the root-mean-squared error of the test predictions and print it
rmse_1 = evaluator.evaluate(predictions_1, {evaluator.metricName: "rmse"})
print(rmse_1)

In [18]:
# save pipeline model
MODEL_NAME = "review_subset_spark_nlp_hashingTF_model_pipeline"
model_path = "dbfs:/mnt/" + str(MOUNT_NAME) + "/" + str(MODEL_NAME)

# a plain save() raises if the target path already exists (the bucket already holds
# a directory with this name), so overwrite explicitly to keep the cell re-runnable
pipeline_model_1.write().overwrite().save(model_path)