In [9]:

# Import Required Modules & Set Environment Variables
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from kafka import KafkaProducer

In [10]:
# Initialize SparkSession:
# This creates a new SparkSession with the name 'ex5_reviews_producer', running on a single
# local node.
spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName('taxi') \
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2') \
        .config("fs.s3a.endpoint", "http://minio:9000") \
        .config("fs.s3a.access.key", "minioadmin") \
        .config("fs.s3a.secret.key", "minioadmin") \
        .config("fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .getOrCreate()

24/09/26 16:31:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
# Load Data:
# Load data from Parquet file into a DataFrame .
# Here, the processed Google reviews data is loaded from a Parquet file into a DataFrame and
# displayed.
data_df = spark.read.parquet('s3a://spark/data/source/google_reviews/')
data_df.show(6)


+--------------------+--------------------+--------------+------------------+----------------------+
|    application_name|   translated_review|sentiment_rank|sentiment_polarity|sentiment_subjectivity|
+--------------------+--------------------+--------------+------------------+----------------------+
|10 Best Foods for...|This help eating ...|             1|              0.25|            0.28846154|
|10 Best Foods for...|Works great espec...|             1|               0.4|                 0.875|
|10 Best Foods for...|        Best idea us|             1|               1.0|                   0.3|
|10 Best Foods for...|            Best way|             1|               1.0|                   0.3|
|10 Best Foods for...|             Amazing|             1|               0.6|                   0.9|
|10 Best Foods for...|Looking forward app,|             0|               0.0|                   0.0|
+--------------------+--------------------+--------------+------------------+--------------

In [12]:
# Convert Data to JSON:
data = data_df.toJSON ()
print(data.take(6))

#Each record in the DataFrame is converted to a JSON string, which is a suitable format for
#sending messages to a Kafka topic. A sample of six JSON records is printed.

['{"application_name":"10 Best Foods for You","translated_review":"This help eating healthy exercise regular basis","sentiment_rank":1,"sentiment_polarity":0.25,"sentiment_subjectivity":0.28846154}', '{"application_name":"10 Best Foods for You","translated_review":"Works great especially going grocery store","sentiment_rank":1,"sentiment_polarity":0.4,"sentiment_subjectivity":0.875}', '{"application_name":"10 Best Foods for You","translated_review":"Best idea us","sentiment_rank":1,"sentiment_polarity":1.0,"sentiment_subjectivity":0.3}', '{"application_name":"10 Best Foods for You","translated_review":"Best way","sentiment_rank":1,"sentiment_polarity":1.0,"sentiment_subjectivity":0.3}', '{"application_name":"10 Best Foods for You","translated_review":"Amazing","sentiment_rank":1,"sentiment_polarity":0.6,"sentiment_subjectivity":0.9}', '{"application_name":"10 Best Foods for You","translated_review":"Looking forward app,","sentiment_rank":0,"sentiment_polarity":0.0,"sentiment_subjectivi

                                                                                

In [13]:
# Initialize Kafka Producer:
#Set up a Kafka producer .
producer = KafkaProducer(bootstrap_servers='course-kafka:9092', value_serializer=lambda v: v.encode('utf-8'))

# A KafkaProducer instance is initialized to send messages to the Kafka cluster. The producer
# will serialize the messages (JSON strings) into bytes using UTF-8 encoding.

In [14]:

# Produce Messages to Kafka Topic:
i = 0

for json_data in data.collect():
    i = i + 1
    producer.send(topic='gps-user-review-source', value=json_data)
    if i == 50:
        producer.flush()
        time.sleep(5)
        i = 0
'''
In this loop, each JSON string is sent as a message to the gps-user-review-source Kafka topic. After
sending 50 messages, the producer buffers are flushed, ensuring that all messages are sent.
Then the script waits for 10 seconds before resuming. This pattern is used to space out the
message production, sending 50 messages every 10 seconds.
'''        

KeyboardInterrupt: 

In [15]:
# Close Kafka Producer & Terminate SparkSession:
producer.close()
spark.stop()

''' 
After all messages have been sent, the Kafka producer is closed to release resources,
followed by terminating the SparkSession.
'''

' \nAfter all messages have been sent, the Kafka producer is closed to release resources,\nfollowed by terminating the SparkSession.\n'

In [None]:
# Summery

'''
    This solution reads the processed Google Reviews data, converts it to JSON, and then
    produces the messages to a Kafka topic in batches of 50, with a 10-second interval between
    batches.

'''