In [2]:
# Import Modules & Set Environment Variables:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T


In [3]:
# Initialize SparkSession:
# A new SparkSession is created with the name 'ex5_google_reviews', using a single local node.
spark = SparkSession.builder.master("local").appName('ex5_google_reviews').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/23 18:21:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Setting Up Sentiment Mapping:
# This sets up an array of Row objects to map the sentiment to respective ranks.
sentiment_arr = [Row(Sentiment='Positive', sentiment_rank=1),
Row(Sentiment='Neutral', sentiment_rank=0),
Row(Sentiment='Negative', sentiment_rank=-1)]
print(sentiment_arr)

[Row(Sentiment='Positive', sentiment_rank=1), Row(Sentiment='Neutral', sentiment_rank=0), Row(Sentiment='Negative', sentiment_rank=-1)]


In [6]:
# Data Loading
# Load the Google Reviews CSV data into a DataFrame.
google_reviews_df = spark.read.csv('s3a://spark/data/raw/google_reviews/', header=True)
google_reviews_df.show(6)

+--------------------+--------------------+--------------------+------------------+----------------------+
|                 App|   Translated_Review|           Sentiment|Sentiment_Polarity|Sentiment_Subjectivity|
+--------------------+--------------------+--------------------+------------------+----------------------+
|10 Best Foods for...|"I like eat delic...| also ""Best Befo...|          Positive|                   1.0|
|10 Best Foods for...|This help eating ...|            Positive|              0.25|   0.28846153846153844|
|10 Best Foods for...|                 nan|                 nan|               nan|                   nan|
|10 Best Foods for...|Works great espec...|            Positive|               0.4|                 0.875|
|10 Best Foods for...|        Best idea us|            Positive|               1.0|                   0.3|
|10 Best Foods for...|            Best way|            Positive|               1.0|                   0.3|
+--------------------+---------------

In [None]:
# Full code solution
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T
spark = SparkSession.builder.master("local").appName('ex5_google_reviews').getOrCreate()
sentiment_arr = [Row(Sentiment='Positive', sentiment_rank=1),
Row(Sentiment='Neutral', sentiment_rank=0),
Row(Sentiment='Negative', sentiment_rank=-1)]
print(sentiment_arr)
google_reviews_df = spark.read.csv('s3a://spark/data/raw/google_reviews/', header=True)
google_reviews_df.show(6)
sentiments_df = spark.createDataFrame(sentiment_arr)
sentiments_df.show()
joined_df = google_reviews_df.join(F.broadcast(sentiments_df), ['Sentiment'])
selected_df = joined_df \
.select(F.col('App').alias('application_name'),
F.col('Translated_Review').alias('translated_review'),
F.col('sentiment_rank'),
F.col('Sentiment_Polarity').cast(T.FloatType()).alias('sentiment_polarity'),
F.col('Sentiment_Subjectivity').cast(T.FloatType()).alias('sentiment_subjectivity'))
selected_df.show()
selected_df.printSchema()
selected_df.write.parquet('s3a://spark/data/source/google_reviews', mode='overwrite')
spark.stop()