# Text Processing - Yelp 2021 - Part 4

This notebook covers:
* Word Embedding Models
    * Word2Vec/GloVe
    * Bert/Elmo
    * Universal Sentence Encoder

In [2]:
import time
# Basic PySpark
import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.types import *
# PySpark NLP
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, IndexToString, Word2Vec
# PySpark Classification Models
from pyspark.ml.classification import NaiveBayes, LinearSVC
# PySpark Model Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
# Stopwords
from nltk.corpus import stopwords

## Set Up Spark

In [3]:
spark = (ps.sql.SparkSession.builder
.appName("Spark NLP")
.master("local[3]")
.config("spark.driver.memory","8G")
.config("spark.driver.maxResultSize", "0")
.config("spark.kryoserializer.buffer.max", "2000M")
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.1.0")
.config("spark.driver.extraClassPath", "/home/jovyan/postgresql-42.2.20.jar")
.getOrCreate())

## Import Data

### Connecting To Data

In [4]:
db_properties = {
    "user": "postgres",
    "password": None,
    "driver": "org.postgresql.Driver"
}
db_endpoint = None
db_url = f'jdbc:postgresql://{db_endpoint}/yelp_2021_db'

In [5]:
train = spark.read.jdbc(url=db_url,table='(SELECT review_id, review_text, target_ufc_bool FROM text_data_train LIMIT 1000) AS tmp_train',properties=db_properties)

In [6]:
test = spark.read.jdbc(url=db_url,table='(SELECT review_id, review_text, target_ufc_bool FROM text_data_test LIMIT 1000) AS tmp_test',properties=db_properties)

In [7]:
train.createOrReplaceTempView("train")
test.createOrReplaceTempView("test")

## Data Overview

In [8]:
train.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)



In [9]:
train.show(5)

+--------------------+--------------------+---------------+
|           review_id|         review_text|target_ufc_bool|
+--------------------+--------------------+---------------+
|O3WmTrkzh7-7f2_Cf...|Part of my amazin...|           True|
|O3aT56IZhBqRI53co...|I've never had is...|           True|
|O3n5Oq-JFaBUk5ILg...|Solid experience ...|          False|
|O493xYpCSILJ_Gtb2...|I went here last ...|           True|
|O49wqBbevea8ZhuTp...|Overrated. Weak i...|           True|
+--------------------+--------------------+---------------+
only showing top 5 rows



In [10]:
print(f'Train Records: {train.count()}')
print(f'Test Records: {test.count()}')

Train Records: 1000
Test Records: 1000


## Prep Work

### Majority Class Baseline (True or Quality)

In [11]:
# Train Baseline
train_baseline = spark.sql(
    """
    SELECT target_ufc_bool,
           COUNT(*) AS count,
           ROUND((COUNT(*) / (SELECT COUNT(*) FROM train)) * 100, 2) AS percent
    FROM train
    GROUP BY target_ufc_bool
    ORDER BY count DESC
    """
).show()

+---------------+-----+-------+
|target_ufc_bool|count|percent|
+---------------+-----+-------+
|          False|  519|   51.9|
|           True|  481|   48.1|
+---------------+-----+-------+



In [12]:
# Test Baseline
test_baseline = spark.sql(
    """
    SELECT target_ufc_bool,
           COUNT(*) AS count,
           ROUND((COUNT(*) / (SELECT COUNT(*) FROM test)) * 100, 2) AS percent
    FROM test
    GROUP BY target_ufc_bool
    ORDER BY count DESC
    """
).show()

+---------------+-----+-------+
|target_ufc_bool|count|percent|
+---------------+-----+-------+
|           True|  524|   52.4|
|          False|  476|   47.6|
+---------------+-----+-------+



## Text Prep

In [13]:
document_assembler = (DocumentAssembler()
                      .setInputCol('review_text')
                      .setOutputCol('document'))

tokenizer = (Tokenizer()
             .setInputCols(['document'])
             .setOutputCol('tokens'))

normalizer = (Normalizer()
              .setInputCols(['tokens'])
              .setOutputCol('normalized'))

stopwords_cleaner = (StopWordsCleaner()
                     .setInputCols(['normalized'])
                     .setOutputCol('clean_tokens')
                     .setCaseSensitive(False))

lemmatizer = (LemmatizerModel.pretrained()
              .setInputCols(['clean_tokens'])
              .setOutputCol('lemma'))

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


### Class Labeling

In [14]:
label_strIdx = StringIndexer(inputCol="target_ufc_bool", outputCol="label", stringOrderType='alphabetAsc')
label_Idxstr = IndexToString(inputCol="prediction", outputCol="predicted_class", labels=["False", "True"])

### Text Prep Options

In [15]:
word_embeddings = (WordEmbeddingsModel().pretrained()
                   .setInputCols(["document", "lemma"])
                   .setOutputCol("word_embed"))

# bert_embeddings = (BertEmbeddings
#                    .pretrained()
#                    .setInputCols(["document",'lemma'])
#                    .setOutputCol("word_embed"))

# elmo_embeddings = (ElmoEmbeddings
#                    .pretrained()
#                    .setInputCols(["document",'lemma'])
#                    .setOutputCol("word_embed"))

embeddings_sentence = (SentenceEmbeddings()
                      .setInputCols(["document", "word_embed"])
                      .setOutputCol("sentence_embeddings")
                      .setPoolingStrategy("AVERAGE"))

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


### No Preprocessing Pipelines

In [16]:
use = (UniversalSentenceEncoder.pretrained()
       .setInputCols(["document"])
       .setOutputCol("sentence_embeddings"))

# bse = (BertSentenceEmbeddings.pretrained()
#        .setInputCols(["document"])
#        .setOutputCol("sentence_embeddings"))

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


### Classification Models

In [17]:
MNB_CLF = NaiveBayes(smoothing=1.0) # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.NaiveBayes.html
SVM_CLF = LinearSVC(standardization=False) # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LinearSVC.html
DL_CLF = (ClassifierDLApproach()
          .setInputCols("sentence_embeddings")
          .setOutputCol("prediction")
          .setLabelColumn("target_ufc_bool")
          .setMaxEpochs(25)
          .setEnableOutputLogs(True))

### Loading Everything to Pipeline

In [18]:
pipeline = (Pipeline()
            .setStages([document_assembler,
                        tokenizer,
                        normalizer,
                        stopwords_cleaner,
                        lemmatizer,
                        word_embeddings,
                        embeddings_sentence,
                        label_strIdx,
                        DL_CLF,
                       ]))

# pipeline = (Pipeline()
#             .setStages([document_assembler,
#                         use,
#                         label_strIdx,
#                         DL_CLF,
#                        ]))

### Fit and Predict

In [19]:
fit_start = time.perf_counter()
cls_model = pipeline.fit(train)
fit_end = time.perf_counter()

In [20]:
transform_start = time.perf_counter()
test_pred = cls_model.transform(test)
test_pred = test_pred.select(["review_id", "target_ufc_bool", "prediction.result", "prediction.metadata", "label"])
test_pred = (test_pred.withColumn("prediction", test_pred["result"].getItem(0))
             .withColumn("true_prob", test_pred["metadata"].getItem(0).getItem("True").cast("double"))
             .withColumn("prediction_label", F.udf(lambda x: 1.0 if x == "True" else 0.0, DoubleType())("prediction")))
test_pred = test_pred.select(["review_id", "label", "target_ufc_bool", "prediction", "prediction_label", "true_prob"])

train_pred = cls_model.transform(test)
train_pred = train_pred.select(["review_id", "target_ufc_bool", "prediction.result", "prediction.metadata", "label"])
train_pred = (train_pred.withColumn("prediction", train_pred["result"].getItem(0))
              .withColumn("true_prob", train_pred["metadata"].getItem(0).getItem("True").cast("double"))
              .withColumn("prediction_label", F.udf(lambda x: 1.0 if x == "True" else 0.0, DoubleType())("prediction")))
train_pred = train_pred.select(["review_id", "label", "target_ufc_bool", "prediction", "prediction_label", "true_prob"])
transform_end = time.perf_counter()

In [21]:
train_pred.show(50, truncate=False)

+----------------------+-----+---------------+----------+----------------+-----------+
|review_id             |label|target_ufc_bool|prediction|prediction_label|true_prob  |
+----------------------+-----+---------------+----------+----------------+-----------+
|--p3d1axlnA7ka_p6hO-QQ|0.0  |False          |True      |1.0             |0.68538624 |
|-1v3W4XqQcIe44_I1lZYyA|1.0  |True           |True      |1.0             |0.85042953 |
|-21y2QEKfhjxh2algH_0nQ|1.0  |True           |False     |0.0             |0.065926254|
|-358vecdAUh6ECkNfawvHw|0.0  |False          |False     |0.0             |0.014743484|
|-3_NmlYMibrapNEnS_gfcg|1.0  |True           |False     |0.0             |0.07291885 |
|-5JXmLmRoECQ2coWszNZYg|0.0  |False          |True      |1.0             |0.8159384  |
|-5tMbhsrpjFUGPttynBHKA|1.0  |True           |False     |0.0             |0.4661805  |
|-94ADRRTfjkHsyt6KVyfLg|1.0  |True           |True      |1.0             |0.81080085 |
|-C8Op9jCno7LikFtegYZqg|0.0  |False        

### Model Evaluation

In [22]:
eval_start = time.perf_counter()
evaluator = BinaryClassificationEvaluator().setRawPredictionCol("true_prob")
auc = evaluator.evaluate(test_pred, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(test_pred, {evaluator.metricName: "areaUnderPR"})

In [23]:
multi_evaluator = MulticlassClassificationEvaluator().setPredictionCol("prediction_label")
accuracy = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "weightedRecall"})
f1 = multi_evaluator.evaluate(test_pred, {multi_evaluator.metricName: "f1"})
eval_end = time.perf_counter()

In [24]:
print(f"Accuracy: {accuracy:.3f}")
print(f"AUC: {auc:.3f}")
print(f"AUPR: {aupr:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"Fit Time: {(fit_end - fit_start)/60:.2f} minutes")
print(f"Transform/Predict Time: {transform_end - transform_start:.2f} seconds")
print(f"Eval Time: {(eval_end - eval_start)/60:.2f} minutes")

Accuracy: 0.591
AUC: 0.637
AUPR: 0.646
Precision: 0.591
Recall: 0.591
F1 Score: 0.591
Fit Time: 0.40 minutes
Transform/Predict Time: 1.08 seconds
Eval Time: 0.90 minutes


### Saving Predictions

In [26]:
train_pred.createOrReplaceTempView("train_pred")
test_pred.createOrReplaceTempView("test_pred")

In [27]:
train_finished = spark.sql("""
                            SELECT review_id,
                                ROUND(true_prob, 3) AS glove_prob
                            FROM train_pred
                           """)

test_finished = spark.sql("""
                            SELECT review_id,
                                ROUND(true_prob, 3) AS glove_prob
                            FROM test_pred
                          """)

In [28]:
train_finished.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- glove_prob: double (nullable = true)



In [29]:
train_finished.show(50, truncate=False)

+----------------------+----------+
|review_id             |glove_prob|
+----------------------+----------+
|--p3d1axlnA7ka_p6hO-QQ|0.685     |
|-1v3W4XqQcIe44_I1lZYyA|0.85      |
|-21y2QEKfhjxh2algH_0nQ|0.066     |
|-358vecdAUh6ECkNfawvHw|0.015     |
|-3_NmlYMibrapNEnS_gfcg|0.073     |
|-5JXmLmRoECQ2coWszNZYg|0.816     |
|-5tMbhsrpjFUGPttynBHKA|0.466     |
|-94ADRRTfjkHsyt6KVyfLg|0.811     |
|-C8Op9jCno7LikFtegYZqg|0.608     |
|-CRezM_gS3k1mE0DTPEk1g|0.363     |
|-CrDANDLHRh-f1fQJsF0pg|0.104     |
|-FLOyBd9KC5QANDy2Oo9JQ|0.193     |
|-G1lx3ThcvaHjA3u3OoNEw|0.423     |
|-MZcA4e0wJlG3I3GBqeIsw|0.38      |
|-MZn0pcLOEqF_E_H_uAe7w|0.814     |
|-Oxm3H4OKVIB3Sue6xo7jA|0.777     |
|-GgsY3-Y0siHJZeNgXi7Ng|0.772     |
|-GzcIKwshy6qqgzsXgVvSw|0.823     |
|-HV3oatE8XxljQFYxmQ6qg|0.004     |
|-HaxGF7Hu68HIFx4GWId5w|0.16      |
|-I9dOlrJBMPUGs1q-fBt2A|0.895     |
|-IsBHUf732Rf_EL_fn2Z5Q|0.822     |
|-JQI1D1PQkl3KdXKfz1tsg|0.702     |
|-JbV9-K_5Caa8YoznsRrHw|0.314     |
|-LOR_bwHBC59wERPqew1QA|0.84

In [None]:
# train_finished.write.jdbc(url=db_url,table='text_data_train_glove',mode='overwrite',properties=db_properties)
# test_finished.write.jdbc(url=db_url,table='text_data_test_glove',mode='overwrite',properties=db_properties)

### Saving Model

In [None]:
sc = spark.sparkContext

In [None]:
model_name = "GloVe_all"

In [None]:
# cls_model.save(f"spark_models/{model_name}")