# Text Processing - Yelp 2021 - Part 1

This notebook covers:
* Basic Text Characteristics
* Text Reading Level
* Text Sentiment

### Setting Up Spark

In [1]:
import os

import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.types import *

import textstat
from textblob import TextBlob

In [2]:
# Configure the Spark session step by step, then create (or reuse) it.
_session_builder = ps.sql.SparkSession.builder
_session_builder = _session_builder.appName("NLP_2.1")
# The PostgreSQL JDBC driver jar must be on the driver classpath for the JDBC reads.
_session_builder = _session_builder.config(
    "spark.driver.extraClassPath", "/home/jovyan/postgresql-42.2.20.jar"
)
_session_builder = _session_builder.config('spark.driver.memory', '8G')
# Local mode with three worker threads.
_session_builder = _session_builder.master('local[3]')
spark = _session_builder.getOrCreate()

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [3]:
# Echo the session object as the cell output to confirm it was created.
spark

### Connecting To Data

In [4]:
# Connection settings for the PostgreSQL database that holds the review tables.
# Values come from environment variables so no credentials live in the notebook
# and a fresh kernel can run end to end without editing this cell.
# Unset variables fall back to the previous values: None for endpoint, name and
# password, and "postgres" for the user.
db_endpoint = os.environ.get("YELP_DB_ENDPOINT")  # host[:port] of the database server
db_name = os.environ.get("YELP_DB_NAME")
db_password = os.environ.get("YELP_DB_PASSWORD")

# Properties passed to spark.read.jdbc / DataFrame.write.jdbc.
db_properties = {
    "user": os.environ.get("YELP_DB_USER", "postgres"),
    "password": db_password,
    "driver": "org.postgresql.Driver"
}

db_url = f'jdbc:postgresql://{db_endpoint}/{db_name}'

In [5]:
# Load the training split from the `text_data_train` table over JDBC.
train = spark.read.jdbc(
    url=db_url,
    table='text_data_train',
    properties=db_properties,
)

In [6]:
# Load the test split from the `text_data_test` table over JDBC.
test = spark.read.jdbc(
    url=db_url,
    table='text_data_test',
    properties=db_properties,
)

In [7]:
# Register both splits as temp views so they can be queried with Spark SQL.
for _view_name, _frame in (("train", train), ("test", test)):
    _frame.createOrReplaceTempView(_view_name)

## Data Overview

In [8]:
# Print the column names and types of the raw training table.
train.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)
 |-- target_ufc_count: long (nullable = true)



In [9]:
# Preview the first five training rows.
train.show(5)

+--------------------+------------+--------------------+---------------+----------------+
|           review_id|review_stars|         review_text|target_ufc_bool|target_ufc_count|
+--------------------+------------+--------------------+---------------+----------------+
|ajrHLNbs06pRlFcId...|         1.0|I was told by num...|          False|               0|
|ak0gksy9n9mKmVN8U...|         4.0|I visited tonight...|          False|               0|
|ak13mP9WRp2NViP5u...|         5.0|The lunch special...|          False|               0|
|ak9YWfGs_1CNEufg1...|         4.0|Another coffee sh...|           True|               5|
|akDkUx3DA1qTFN5Oi...|         5.0|If you like the u...|           True|               8|
+--------------------+------------+--------------------+---------------+----------------+
only showing top 5 rows



In [10]:
# count() is a Spark action, so each call below runs a job over its split.
train_record_total = train.count()
test_record_total = test.count()

print(f'Train Records: {train_record_total}')
print(f'Test Records: {test_record_total}')

Train Records: 5523992
Test Records: 1382379


## Basic Text Characteristics

### Feature Creation

In [20]:
def avg_word(sentence):
    """Return the mean character length of the whitespace-separated words in ``sentence``.

    Parameters
    ----------
    sentence : str or None
        Text to measure.

    Returns
    -------
    float or None
        Mean word length. ``0.0`` when the text contains no words (empty or
        whitespace-only), and ``None`` when ``sentence`` is ``None`` so that a
        null input stays null when this function is used as a Spark UDF.
    """
    if sentence is None:
        return None
    words = sentence.split()
    if not words:
        # No words at all: avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [21]:
# Basic text features for the training split, one Python UDF per new column.
# review_text is nullable, so every UDF is guarded: a null review produces a
# null feature. Previously word_count turned None into the string "None" and
# reported 1 word, while the other UDFs raised on None.
# Each tuple is (column name, function of the review text, Spark return type).
_train_basic_features = [
    # Split on a single space (not on any whitespace) to keep existing values.
    ('word_count', lambda text: len(text.split(" ")), IntegerType()),
    ('character_count', len, IntegerType()),
    ('avg_word_length', avg_word, FloatType()),
    ('num_count', lambda text: sum(1 for token in text.split() if token.isdigit()), IntegerType()),
    ('uppercase_count', lambda text: sum(1 for token in text.split() if token.isupper()), IntegerType()),
    ('#_@_count', lambda text: sum(1 for token in text.split() if token.startswith(('#', '@'))), IntegerType()),
    ('sentence_count', textstat.sentence_count, IntegerType()),
    ('lexicon_count', textstat.lexicon_count, IntegerType()),
    ('syllable_count', textstat.syllable_count, IntegerType()),
]

for _feature_name, _feature_fn, _feature_type in _train_basic_features:
    # The default argument binds the current function immediately; a plain
    # closure would see whatever the loop variable holds when the UDF is
    # finally serialized and run.
    _guarded_fn = lambda text, _fn=_feature_fn: None if text is None else _fn(text)
    train = train.withColumn(_feature_name, F.udf(_guarded_fn, _feature_type)('review_text'))

In [22]:
# Basic text features for the test split, one Python UDF per new column.
# review_text is nullable, so every UDF is guarded: a null review produces a
# null feature. Previously word_count turned None into the string "None" and
# reported 1 word, while the other UDFs raised on None.
# Each tuple is (column name, function of the review text, Spark return type).
_test_basic_features = [
    # Split on a single space (not on any whitespace) to keep existing values.
    ('word_count', lambda text: len(text.split(" ")), IntegerType()),
    ('character_count', len, IntegerType()),
    ('avg_word_length', avg_word, FloatType()),
    ('num_count', lambda text: sum(1 for token in text.split() if token.isdigit()), IntegerType()),
    ('uppercase_count', lambda text: sum(1 for token in text.split() if token.isupper()), IntegerType()),
    ('#_@_count', lambda text: sum(1 for token in text.split() if token.startswith(('#', '@'))), IntegerType()),
    ('sentence_count', textstat.sentence_count, IntegerType()),
    ('lexicon_count', textstat.lexicon_count, IntegerType()),
    ('syllable_count', textstat.syllable_count, IntegerType()),
]

for _feature_name, _feature_fn, _feature_type in _test_basic_features:
    # The default argument binds the current function immediately; a plain
    # closure would see whatever the loop variable holds when the UDF is
    # finally serialized and run.
    _guarded_fn = lambda text, _fn=_feature_fn: None if text is None else _fn(text)
    test = test.withColumn(_feature_name, F.udf(_guarded_fn, _feature_type)('review_text'))

## Reading Level

In [23]:
# Reading level of each review via textstat.flesch_kincaid_grade.
# review_text is nullable, so a null review is passed through as a null
# grade_level and is never handed to textstat.
# One UDF object is built and applied to both splits.
_grade_level_udf = F.udf(
    lambda text: None if text is None else textstat.flesch_kincaid_grade(text),
    FloatType(),
)

train = train.withColumn('grade_level', _grade_level_udf('review_text'))
test = test.withColumn('grade_level', _grade_level_udf('review_text'))

## Sentiment Analysis

In [24]:
# TextBlob sentiment scores for each review: polarity and subjectivity.
# review_text is nullable and TextBlob needs a string, so a null review is
# passed through as null and is never handed to TextBlob.
# The two UDF objects are built once and applied to both splits.
_polarity_udf = F.udf(
    lambda text: None if text is None else TextBlob(text).sentiment.polarity,
    FloatType(),
)
_subjectivity_udf = F.udf(
    lambda text: None if text is None else TextBlob(text).sentiment.subjectivity,
    FloatType(),
)

train = (train.withColumn('polarity', _polarity_udf('review_text'))
         .withColumn('subjectivity', _subjectivity_udf('review_text')))
test = (test.withColumn('polarity', _polarity_udf('review_text'))
         .withColumn('subjectivity', _subjectivity_udf('review_text')))

In [25]:
# Print both schemas to check that the new feature columns were added to each split.
train.printSchema()
test.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)
 |-- target_ufc_count: long (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- character_count: integer (nullable = true)
 |-- avg_word_length: float (nullable = true)
 |-- num_count: integer (nullable = true)
 |-- uppercase_count: integer (nullable = true)
 |-- #_@_count: integer (nullable = true)
 |-- sentence_count: integer (nullable = true)
 |-- lexicon_count: integer (nullable = true)
 |-- syllable_count: integer (nullable = true)
 |-- grade_level: float (nullable = true)
 |-- polarity: float (nullable = true)
 |-- subjectivity: float (nullable = true)

root
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- review_text: string (nullable = true)
 |-- target_ufc_bool: string (nullable = true)
 |-- target_ufc_count: long (nullable = true)
 |-- word_coun

In [26]:
# Preview the first five training rows with the engineered feature columns.
train.show(5)

+--------------------+------------+--------------------+---------------+----------------+----------+---------------+---------------+---------+---------------+---------+--------------+-------------+--------------+-----------+-----------+------------+
|           review_id|review_stars|         review_text|target_ufc_bool|target_ufc_count|word_count|character_count|avg_word_length|num_count|uppercase_count|#_@_count|sentence_count|lexicon_count|syllable_count|grade_level|   polarity|subjectivity|
+--------------------+------------+--------------------+---------------+----------------+----------+---------------+---------------+---------+---------------+---------+--------------+-------------+--------------+-----------+-----------+------------+
|ajrHLNbs06pRlFcId...|         1.0|I was told by num...|          False|               0|       131|            680|      4.1984735|        2|             11|        0|            11|          130|           173|        4.4|-0.14007936|  0.33055556|


In [27]:
# Preview the first five test rows with the engineered feature columns.
test.show(5)

+--------------------+------------+--------------------+---------------+----------------+----------+---------------+---------------+---------+---------------+---------+--------------+-------------+--------------+-----------+------------+------------+
|           review_id|review_stars|         review_text|target_ufc_bool|target_ufc_count|word_count|character_count|avg_word_length|num_count|uppercase_count|#_@_count|sentence_count|lexicon_count|syllable_count|grade_level|    polarity|subjectivity|
+--------------------+------------+--------------------+---------------+----------------+----------+---------------+---------------+---------+---------------+---------+--------------+-------------+--------------+-----------+------------+------------+
|-19IRiVfPUFgTLS37...|         5.0|i love this place...|          False|               0|        60|            331|      4.5333333|        0|              0|        0|             1|           60|            79|       23.2|  0.51785713|  0.660714

## Save Data

### To File

In [1]:
# Optional export, disabled by default: uncomment to write the training frame
# as JSON through a single partition (coalesce(1)).
# train.coalesce(1).write.json(path='train_b.json')

### To AWS RDS

In [28]:
# Optional export, disabled by default: uncomment to write both frames back to
# the database. mode='overwrite' replaces the target tables if they exist.
# train.write.jdbc(url=db_url,table='text_data_train_b',mode='overwrite',properties=db_properties)
# test.write.jdbc(url=db_url,table='text_data_test_b',mode='overwrite',properties=db_properties)