In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.2.3'
# spark_version = 'spark-<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  Release
Hit:5 http://security.ubuntu.com/ubuntu focal-security InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NLPHeadline").getOrCreate()

In [3]:
from pyspark import SparkFiles
# Load in data into a DataFrame

url = "/content/reviews_nlp_input.csv" #enter correct address here
spark.sparkContext.addFile(url)

df = spark.read \
    .option("delimiter", "|") \
    .option("multiline", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .option("header", "true") \
    .csv(url)

# Show DataFrame
df.show()

+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------+-----------------+--------------+-------------------+----------+-------------+-----------+---------+----------+-------+--------------------+--------------------+--------------------+
|_c0|                firm|date_review|           job_title|             current|            location|overall_rating|work_life_balance|culture_values|diversity_inclusion|career_opp|comp_benefits|senior_mgmt|recommend|ceo_approv|outlook|            headline|                pros|                cons|
+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------+-----------------+--------------+-------------------+----------+-------------+-----------+---------+----------+-------+--------------------+--------------------+--------------------+
|  0|AFH-Wealth-Manage...| 2020-10-01| Office Administr...|Former Employee, ...|Bromsgrove, Engla...|  

Transform DataFrame to fit review_rating table

In [4]:
headline_df = df.select(["headline", "overall_rating", "date_review"])
headline_df.show()

+--------------------+--------------+-----------+
|            headline|overall_rating|date_review|
+--------------------+--------------+-----------+
|The people both m...|             2| 2020-10-01|
|   Very low salaries|             1| 2021-02-05|
|                Good|             4| 2021-02-07|
|          AFH Review|             3| 2021-02-07|
|Terrible- avoid l...|             1| 2021-05-12|
|         Steer clear|             1| 2021-05-13|
|    Mortgage Advisor|             5| 2021-05-13|
|         Nice office|             3| 2020-10-14|
|Good job, awful p...|             3| 2020-11-25|
|Shiny Office Cann...|             1| 2020-12-04|
|Low pay, progress...|             1| 2020-12-08|
| Great place to work|             4| 2020-12-11|
|               Run!!|             2| 2020-12-21|
|Looked after but ...|             4| 2021-01-05|
|Lots of progressi...|             4| 2021-01-09|
|             AJ Hell|             2| 2021-01-14|
|Promotion, growin...|             4| 2021-01-25|


In [5]:
from pyspark.sql.functions import regexp_extract, length
headline_df = df.withColumnRenamed("overall_rating", "label").select(["label", "date_review", "headline"])
headline_df = headline_df.withColumn('headline_length', length(headline_df['headline'])).dropna()
headline_df.cache()
headline_df.show()

+-----+-----------+--------------------+---------------+
|label|date_review|            headline|headline_length|
+-----+-----------+--------------------+---------------+
|    2| 2020-10-01|The people both m...|             43|
|    1| 2021-02-05|   Very low salaries|             17|
|    4| 2021-02-07|                Good|              4|
|    3| 2021-02-07|          AFH Review|             10|
|    1| 2021-05-12|Terrible- avoid l...|             31|
|    1| 2021-05-13|         Steer clear|             11|
|    5| 2021-05-13|    Mortgage Advisor|             16|
|    3| 2020-10-14|         Nice office|             11|
|    3| 2020-11-25|Good job, awful p...|             22|
|    1| 2020-12-04|Shiny Office Cann...|             39|
|    1| 2020-12-08|Low pay, progress...|             41|
|    4| 2020-12-11| Great place to work|             19|
|    2| 2020-12-21|               Run!!|              5|
|    4| 2021-01-05|Looked after but ...|             26|
|    4| 2021-01-09|Lots of prog

Create Data Pipeline

In [6]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
# Create all the features to the data set
tokenizer = Tokenizer(inputCol="headline", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors (merge idf_token and review_length)
clean_up = VectorAssembler(inputCols=['idf_token', 'headline_length'], outputCol='features')

In [8]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

Transform DataFrame

In [9]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(headline_df)
cleaned = cleaner.transform(headline_df)

In [10]:
# Show label of ham spam and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(262145,[70998,89...|
|    1|(262145,[210040,2...|
|    4|(262145,[113432,2...|
|    3|(262145,[109993,1...|
|    1|(262145,[84796,95...|
|    1|(262145,[94617,17...|
|    5|(262145,[82063,16...|
|    3|(262145,[22346,13...|
|    3|(262145,[7400,850...|
|    1|(262145,[15539,44...|
|    1|(262145,[67416,94...|
|    4|(262145,[27576,34...|
|    2|(262145,[97287,26...|
|    4|(262145,[9420,333...|
|    4|(262145,[17893,20...|
|    2|(262145,[249457,2...|
|    4|(262145,[34149,11...|
|    1|(262145,[121091,1...|
|    1|(262145,[249457,2...|
|    3|(262145,[261610,2...|
+-----+--------------------+
only showing top 20 rows



In [11]:
cleaned.printSchema()

root
 |-- label: string (nullable = true)
 |-- date_review: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- headline_length: integer (nullable = true)
 |-- token_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hash_token: vector (nullable = true)
 |-- idf_token: vector (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
from pyspark.sql.functions import col
cleaned = cleaned.withColumn("label", col("label").cast("int"))

Run NaiveBayes

In [13]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes() #labelCol='label', featuresCol='features'
predictor = nb.fit(training)

In [14]:
training.show()

+-----+-----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|date_review|            headline|headline_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+-----+-----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    1| 2020-09-12|Just a Name. Cont...|             68|[just, a, name., ...|[name., contains,...|(262144,[629,5078...|(262144,[629,5078...|(262145,[629,5078...|
|    1| 2020-09-14|Aucun respect du ...|             43|[aucun, respect, ...|[aucun, respect, ...|(262144,[37481,49...|(262144,[37481,49...|(262145,[37481,49...|
|    1| 2020-09-14|Bad culture, good...|             25|[bad, culture,, g...|[bad, culture,, g...|(262144,[52879,94...|(262144,[52879,94...|(262145,[52879,94...|
|    1| 2020-09-14|Good comp

In [15]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+-----+-----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|date_review|            headline|headline_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+-----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1| 2020-09-12| Abusive environment|             19|[abusive, environ...|[abusive, environ...|(262144,[24900,23...|(262144,[24900,23...|(262145,[24900,23...|[-161.43159411310...|[8.80106895293720...|       2.0|
|    1| 2020-09-13|           Worst Job|              9|        [worst, job]|        [worst, job]|(262144,[99197,14...|(262144,[99197,14...|

Predict accuracy of the model

In [16]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting headline_reviews was: %f" % acc)

Accuracy of model at predicting headline_reviews was: 0.201556
