In [1]:
# Import Modules & Set Environment Variables:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T


In [2]:
# Initialize SparkSession:
# A new SparkSession is created with the name 'ex5_google_reviews', using a single local node.
spark = SparkSession.builder.master("local").appName('ex5_google_reviews').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/24 06:35:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Setting Up Sentiment Mapping:
# This sets up an array of Row objects to map the sentiment to respective ranks.
sentiment_arr = [Row(Sentiment='Positive', sentiment_rank=1),
Row(Sentiment='Neutral', sentiment_rank=0),
Row(Sentiment='Negative', sentiment_rank=-1)]
print(sentiment_arr)

[Row(Sentiment='Positive', sentiment_rank=1), Row(Sentiment='Neutral', sentiment_rank=0), Row(Sentiment='Negative', sentiment_rank=-1)]


In [4]:
# Data Loading
# Load the Google Reviews CSV data into a DataFrame.
google_reviews_df = spark.read.csv('s3a://spark/data/raw/google_reviews/', header=True)
google_reviews_df.show(6)

24/09/24 06:35:13 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+--------------------+--------------------+--------------------+------------------+----------------------+
|                 App|   Translated_Review|           Sentiment|Sentiment_Polarity|Sentiment_Subjectivity|
+--------------------+--------------------+--------------------+------------------+----------------------+
|10 Best Foods for...|"I like eat delic...| also ""Best Befo...|          Positive|                   1.0|
|10 Best Foods for...|This help eating ...|            Positive|              0.25|   0.28846153846153844|
|10 Best Foods for...|                 nan|                 nan|               nan|                   nan|
|10 Best Foods for...|Works great espec...|            Positive|               0.4|                 0.875|
|10 Best Foods for...|        Best idea us|            Positive|               1.0|                   0.3|
|10 Best Foods for...|            Best way|            Positive|               1.0|                   0.3|
+--------------------+---------------

In [5]:
# Creating Sentiment DataFrame:
##Convert the sentiment array to a DataFrame .
# This section turns the sentiment mapping array into a DataFrame and displays its content.
sentiments_df = spark.createDataFrame(sentiment_arr)
sentiments_df.show()


+---------+--------------+
|Sentiment|sentiment_rank|
+---------+--------------+
| Positive|             1|
|  Neutral|             0|
| Negative|            -1|
+---------+--------------+



In [7]:
# Joining DataFrames:

# Join the sentiments_df with the main reviews DataFrame based on the 'Sentiment' column.

# The Google reviews data is joined with the sentiment mapping DataFrame. A broadcast join
# is utilized to replicate the smaller DataFrame (sentiments_df) across all nodes to speed up
# the join.

joined_df = google_reviews_df.join(F.broadcast(sentiments_df), ['Sentiment'])

In [8]:
# Data Transformation & Cleaning:

# Data is transformed to:
# 1.Extract and rename relevant columns. 2. Cast 'Sentiment_Polarity' and 'Sentiment_Subjectivity' columns to float type.

selected_df = joined_df \
.select(F.col('App').alias('application_name'),
F.col('Translated_Review').alias('translated_review') ,
F.col('sentiment_rank'),
F.col('Sentiment_Polarity').cast(T.FloatType()).alias('sentiment_polarity') ,
F.col('Sentiment_Subjectivity').cast(T.FloatType()).alias('sentiment_subjectivity'))


In [9]:
# Display & Save Data:

# Display the transformed data and its schema.
selected_df.show()
selected_df.printSchema()
# Save the processed data into a Parquet file.
selected_df.write.parquet('s3a://spark/data/source/google_reviews', mode='overwrite')
spark.stop()

''' 
This portion of the code presents the transformed data, its schema, and then writes the
results to a Parquet file. If the destination file already exists, it will be overwritten.
The SparkSession is closed to free up resources.

Summery:
This solution processes Google Reviews data, extracting sentiment analysis metrics, and
then saves the cleaned and transformed data into an optimized Parquet format.

'''

+--------------------+--------------------+--------------+------------------+----------------------+
|    application_name|   translated_review|sentiment_rank|sentiment_polarity|sentiment_subjectivity|
+--------------------+--------------------+--------------+------------------+----------------------+
|10 Best Foods for...|This help eating ...|             1|              0.25|            0.28846154|
|10 Best Foods for...|Works great espec...|             1|               0.4|                 0.875|
|10 Best Foods for...|        Best idea us|             1|               1.0|                   0.3|
|10 Best Foods for...|            Best way|             1|               1.0|                   0.3|
|10 Best Foods for...|             Amazing|             1|               0.6|                   0.9|
|10 Best Foods for...|Looking forward app,|             0|               0.0|                   0.0|
|10 Best Foods for...|It helpful site !...|             0|               0.0|              

                                                                                

In [None]:
# Full code solution
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql import types as T
spark = SparkSession.builder.master("local").appName('ex5_google_reviews').getOrCreate()
sentiment_arr = [Row(Sentiment='Positive', sentiment_rank=1),
Row(Sentiment='Neutral', sentiment_rank=0),
Row(Sentiment='Negative', sentiment_rank=-1)]
print(sentiment_arr)
google_reviews_df = spark.read.csv('s3a://spark/data/raw/google_reviews/', header=True)
google_reviews_df.show(6)
sentiments_df = spark.createDataFrame(sentiment_arr)
sentiments_df.show()
joined_df = google_reviews_df.join(F.broadcast(sentiments_df), ['Sentiment'])
selected_df = joined_df \
.select(F.col('App').alias('application_name'),
F.col('Translated_Review').alias('translated_review'),
F.col('sentiment_rank'),
F.col('Sentiment_Polarity').cast(T.FloatType()).alias('sentiment_polarity'),
F.col('Sentiment_Subjectivity').cast(T.FloatType()).alias('sentiment_subjectivity'))
selected_df.show()
selected_df.printSchema()
selected_df.write.parquet('s3a://spark/data/source/google_reviews', mode='overwrite')
spark.stop()