In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.2.3'
# spark_version = 'spark-<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.39)] [Wa                                                                               Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
0% [Waiting for headers] [2 InRelease 14.2 kB/114 kB 12%] [Waiting for headers]                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Waiting for headers] [2 InRelease 14.2 kB/114 kB 12%] [Waiting for headers]                                                                               Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
0% [Waiting for headers] [2 InRelease 14.2 kB/114 kB 12%] [Waiting for headers]                                                                               H

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NLPPros").getOrCreate()

In [3]:
from pyspark import SparkFiles
# Load in data into a DataFrame

url = "/content/reviews_nlp_input.csv" #enter correct address here

spark.sparkContext.addFile(url)

df = spark.read \
    .option("delimiter", "|") \
    .option("multiline", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .option("header", "true") \
    .csv(url)

# Show DataFrame
df.show()

+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------+-----------------+--------------+-------------------+----------+-------------+-----------+---------+----------+-------+--------------------+--------------------+--------------------+
|_c0|                firm|date_review|           job_title|             current|            location|overall_rating|work_life_balance|culture_values|diversity_inclusion|career_opp|comp_benefits|senior_mgmt|recommend|ceo_approv|outlook|            headline|                pros|                cons|
+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------+-----------------+--------------+-------------------+----------+-------------+-----------+---------+----------+-------+--------------------+--------------------+--------------------+
|  0|AFH-Wealth-Manage...| 2020-10-01| Office Administr...|Former Employee, ...|Bromsgrove, Engla...|  

Transform DataFrame to fit review_rating table

In [4]:
pros_df = df.select(["pros", "overall_rating", "date_review"])
pros_df.show()

+--------------------+--------------+-----------+
|                pros|overall_rating|date_review|
+--------------------+--------------+-----------+
|Great people in s...|             2| 2020-10-01|
|Majority of the p...|             1| 2021-02-05|
|Nice environment,...|             4| 2021-02-07|
|-Great People\n-H...|             3| 2021-02-07|
|None, they lie ab...|             1| 2021-05-12|
|There are none to...|             1| 2021-05-13|
|Good company to w...|             5| 2021-05-13|
|good company to w...|             3| 2020-10-14|
|AJ Bell is an exc...|             3| 2020-11-25|
|If you're happy t...|             1| 2020-12-04|
|Nice offices and ...|             1| 2020-12-08|
|Great people and ...|             4| 2020-12-11|
|Great momentum af...|             2| 2020-12-21|
|Great employee be...|             4| 2021-01-05|
|Lots of internal ...|             4| 2021-01-09|
|everyone is in th...|             2| 2021-01-14|
|AJ Bell is one of...|             4| 2021-01-25|


In [5]:
from pyspark.sql.functions import regexp_extract, length
pros_df = df.withColumnRenamed("overall_rating", "label").select(["label", "date_review", "pros"])
pros_df = pros_df.withColumn('pros_length', length(pros_df['pros'])).dropna()
pros_df.cache()
pros_df.show()

+-----+-----------+--------------------+-----------+
|label|date_review|                pros|pros_length|
+-----+-----------+--------------------+-----------+
|    2| 2020-10-01|Great people in s...|         63|
|    1| 2021-02-05|Majority of the p...|         70|
|    4| 2021-02-07|Nice environment,...|         48|
|    3| 2021-02-07|-Great People\n-H...|         52|
|    1| 2021-05-12|None, they lie ab...|         61|
|    1| 2021-05-13|There are none to...|         26|
|    5| 2021-05-13|Good company to w...|         24|
|    3| 2020-10-14|good company to w...|         38|
|    3| 2020-11-25|AJ Bell is an exc...|         79|
|    1| 2020-12-04|If you're happy t...|        480|
|    1| 2020-12-08|Nice offices and ...|         44|
|    4| 2020-12-11|Great people and ...|        129|
|    2| 2020-12-21|Great momentum af...|         40|
|    4| 2021-01-05|Great employee be...|         64|
|    4| 2021-01-09|Lots of internal ...|        286|
|    2| 2021-01-14|everyone is in th...|      

Create Data Pipeline

In [6]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
# Create all the features to the data set
tokenizer = Tokenizer(inputCol="pros", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors (merge idf_token and review_length)
clean_up = VectorAssembler(inputCols=['idf_token', 'pros_length'], outputCol='features')

In [8]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

Transform DataFrame

In [9]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(pros_df)
cleaned = cleaner.transform(pros_df)

In [10]:
# Show label of ham spam and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(262145,[19208,78...|
|    1|(262145,[18700,58...|
|    4|(262145,[5923,108...|
|    3|(262145,[107107,1...|
|    1|(262145,[16989,18...|
|    1|(262145,[27576,58...|
|    5|(262145,[27576,34...|
|    3|(262145,[22346,27...|
|    3|(262145,[10446,16...|
|    1|(262145,[16551,16...|
|    1|(262145,[19208,22...|
|    4|(262145,[24980,47...|
|    2|(262145,[9420,641...|
|    4|(262145,[8500,177...|
|    4|(262145,[3000,100...|
|    2|(262145,[11104,27...|
|    4|(262145,[1578,160...|
|    1|(262145,[22371,95...|
|    1|(262145,[22346,24...|
|    3|(262145,[27576,74...|
+-----+--------------------+
only showing top 20 rows



In [11]:
cleaned.printSchema()

root
 |-- label: string (nullable = true)
 |-- date_review: string (nullable = true)
 |-- pros: string (nullable = true)
 |-- pros_length: integer (nullable = true)
 |-- token_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hash_token: vector (nullable = true)
 |-- idf_token: vector (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
from pyspark.sql.functions import col
cleaned = cleaned.withColumn("label", col("label").cast("int"))

Run NaiveBayes

In [13]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes() #labelCol='label', featuresCol='features'
predictor = nb.fit(training)

In [14]:
training.show()

+-----+-----------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|date_review|                pros|pros_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+-----+-----------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    1| 2020-09-12|None that I can c...|         28|[none, that, i, c...|        [none, come]|(262144,[19036,48...|(262144,[19036,48...|(262145,[19036,48...|
|    1| 2020-09-12|Work from home is...|         31|[work, from, home...|  [work, home, pros]|(262144,[17893,34...|(262144,[17893,34...|(262145,[17893,34...|
|    1| 2020-09-14|Lots of onboardin...|         53|[lots, of, onboar...|[lots, onboarding...|(262144,[27576,34...|(262144,[27576,34...|(262145,[27576,34...|
|    1| 2020-09-14|Rate of pay and f...|         27|

In [15]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+-----+-----------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|date_review|                pros|pros_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+-----------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1| 2020-09-13|I hated this job ...|         24|[i, hated, this, ...|  [hated, job, much]|(262144,[19036,76...|(262144,[19036,76...|(262145,[19036,76...|[-262.22966260422...|[0.99999953910250...|       0.0|
|    1| 2020-09-14|Aucun malheureuse...|         71|[aucun, malheureu...|[aucun, malheureu...|(262144,[26980,44...|(262144,[26980,44...|(262145,[26980,44...

Predict accuracy of the model

In [16]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting pros was: %f" % acc)

Accuracy of model at predicting pros was: 0.191763
