# Text Processing - Yelp 2021 - Part 3

This notebook covers:
* Tf-Idf Text Vectorization
* Naive Bayes Predictions
* Support Vector Machine Predictions

## Imports and Global Settings

In [1]:
import time
# Basic PySpark
import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.functions import vector_to_array
# PySpark NLP
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, IndexToString
# PySpark Classification Models
from pyspark.ml.classification import NaiveBayes, LinearSVC
# PySpark Model Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
# Stopwords
from nltk.corpus import stopwords

## Set Up Spark

In [2]:
spark = (ps.sql.SparkSession.builder
.appName("Spark NLP")
.master("local[3]")
.config("spark.driver.memory","16G")
.config("spark.driver.maxResultSize", "0")
.config("spark.kryoserializer.buffer.max", "2000M")
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.1.0")
.config("spark.driver.extraClassPath", "/home/jovyan/postgresql-42.2.20.jar")
.getOrCreate())

## Import Data

### Connecting To Data

In [3]:
db_properties = {
    "user": "postgres",
    "password": None,
    "driver": "org.postgresql.Driver"
}
db_url = 'jdbc:postgresql://None/yelp_2021_db'

In [4]:
train = spark.read.jdbc(url=db_url,table='(SELECT review_id, review_text, target_ufc_bool FROM text_data_train) AS tmp_train',properties=db_properties)

In [5]:
test = spark.read.jdbc(url=db_url,table='(SELECT review_id, review_text, target_ufc_bool FROM text_data_test) AS tmp_test',properties=db_properties)

In [6]:
train.createOrReplaceTempView("train")
test.createOrReplaceTempView("test")

## Data Overview

In [7]:
train.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)



In [8]:
train.show(5)

+--------------------+--------------------+---------------+
|           review_id|         review_text|target_ufc_bool|
+--------------------+--------------------+---------------+
|Y5G32BbSbiMlzLCJn...|Love this place!!...|           True|
|Y5MspuNJc4wvEN7To...|Came here last ni...|          False|
|Y5WF0rNHJjbqn-JHp...|There must be a w...|          False|
|Y5WikZeyRj-B3a5Qf...|The staff was ver...|           True|
|Y5ahrqYGEhMif-_YD...|Waited forever (m...|          False|
+--------------------+--------------------+---------------+
only showing top 5 rows



In [9]:
print(f'Train Records: {train.count()}')
print(f'Test Records: {test.count()}')

Train Records: 5523992
Test Records: 1382379


## Prep Work

### Majority Class Baseline (True or Quality)

In [10]:
# Train Baseline
train_baseline = spark.sql(
    """
    SELECT target_ufc_bool,
           COUNT(*) AS count,
           ROUND((COUNT(*) / (SELECT COUNT(*) FROM train)) * 100, 2) AS percent
    FROM train
    GROUP BY target_ufc_bool
    ORDER BY count DESC
    """
).show()

+---------------+-------+-------+
|target_ufc_bool|  count|percent|
+---------------+-------+-------+
|           True|2796729|  50.63|
|          False|2727263|  49.37|
+---------------+-------+-------+



In [11]:
# Test Baseline
test_baseline = spark.sql(
    """
    SELECT target_ufc_bool,
           COUNT(*) AS count,
           ROUND((COUNT(*) / (SELECT COUNT(*) FROM test)) * 100, 2) AS percent
    FROM test
    GROUP BY target_ufc_bool
    ORDER BY count DESC
    """
).show()

+---------------+------+-------+
|target_ufc_bool| count|percent|
+---------------+------+-------+
|           True|699072|  50.57|
|          False|683307|  49.43|
+---------------+------+-------+



## Text Prep

In [12]:
# Stop Words
eng_stopwords = stopwords.words('english')

In [13]:
documentAssembler = (DocumentAssembler()
                     .setInputCol('review_text')
                     .setOutputCol('document'))

sentence = (SentenceDetector()
            .setInputCols(['document'])
            .setOutputCol('sentence'))

tokenizer = (Tokenizer()
             .setInputCols(['sentence'])
             .setOutputCol('token'))

normalizer = (Normalizer()
              .setInputCols(['token'])
              .setOutputCol('normalized')
              .setLowercase(True))

lemmatizer = (LemmatizerModel.pretrained()
              .setInputCols(['normalized'])
              .setOutputCol('lemma'))

stopwords_cleaner = (StopWordsCleaner()
                     .setInputCols(['lemma'])
                     .setOutputCol('clean_lemma')
                     .setCaseSensitive(False)
                     .setStopWords(eng_stopwords))

finisher = (Finisher()
            .setInputCols(['clean_lemma'])
            .setOutputCols(["token_features"])
            .setOutputAsArray(True)
            .setCleanAnnotations(False))

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


### Class Labeling

In [14]:
label_strIdx = StringIndexer(inputCol="target_ufc_bool", outputCol="label", stringOrderType='alphabetAsc')
label_Idxstr = IndexToString(inputCol="prediction", outputCol="predicted_class", labels=["False", "True"])

### Text Prep Options

In [15]:
hashTF = HashingTF(inputCol="token_features", outputCol="tf_features")
idf = IDF(inputCol="tf_features", outputCol="features", minDocFreq=2)

### Classification Models

In [16]:
MNB_CLF = NaiveBayes(smoothing=1.0) # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.NaiveBayes.html
SVM_CLF = LinearSVC(standardization=False) # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LinearSVC.html

### Loading Everything to Pipeline

In [17]:
pipeline = (Pipeline()
            .setStages([documentAssembler,
                        sentence,
                        tokenizer,
                        normalizer,
                        lemmatizer,
                        stopwords_cleaner,
                        finisher,
                        hashTF,
                        idf,
                        label_strIdx,
                        MNB_CLF,
                        label_Idxstr
                       ]))

### Fit and Predict

In [18]:
fit_start = time.perf_counter()
cls_model = pipeline.fit(train)
fit_end = time.perf_counter()

In [19]:
transform_start = time.perf_counter()
test_pred = cls_model.transform(test)
train_pred = cls_model.transform(train)
transform_end = time.perf_counter()

### Model Evaluation

In [20]:
eval_start = time.perf_counter()
evaluator = BinaryClassificationEvaluator()
auc = evaluator.evaluate(test_pred, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(test_pred, {evaluator.metricName: "areaUnderPR"})

In [21]:
multi_evaluator = MulticlassClassificationEvaluator()
accuracy = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "weightedRecall"})
f1 = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "f1"})
eval_end = time.perf_counter()

In [22]:
print(f"Accuracy: {accuracy:.3f}")
print(f"AUC: {auc:.3f}")
print(f"AUPR: {aupr:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"Fit Time: {(fit_end - fit_start)/60:.2f} minutes")
print(f"Transform/Predict Time: {transform_end - transform_start:.2f} seconds")
print(f"Eval Time: {(eval_end - eval_start)/60:.2f} minutes")

Accuracy: 0.603
AUC: 0.324
AUPR: 0.398
Precision: 0.611
Recall: 0.603
F1 Score: 0.598
Fit Time: 955.46 minutes
Transform/Predict Time: 0.97 seconds
Eval Time: 817.27 minutes


### Saving Predictions

In [23]:
train_pred = train_pred.withColumn("Prob", vector_to_array("probability"))
test_pred = test_pred.withColumn("Prob", vector_to_array("probability"))

In [24]:
train_pred.createOrReplaceTempView("train_pred")
test_pred.createOrReplaceTempView("test_pred")

In [25]:
train_finished = spark.sql("""
                            SELECT review_id,
                                ROUND(Prob[1], 3) AS NB_tfidf_true_prob
                            FROM train_pred
                           """)

test_finished = spark.sql("""
                            SELECT review_id,
                                ROUND(Prob[1], 3) AS NB_tfidf_true_prob
                            FROM test_pred
                          """)

In [26]:
train_finished.write.jdbc(url=db_url,table='text_data_train_nm_tfidf',mode='overwrite',properties=db_properties)
test_finished.write.jdbc(url=db_url,table='text_data_test_nm_tfidf',mode='overwrite',properties=db_properties)

### Saving Model

In [27]:
sc = spark.sparkContext

In [28]:
model_name = "NB_TFIDF_100k"

In [29]:
cls_model.save(f"spark_models/{model_name}")