## Question 3. Connecting to the Kafka server

In [1]:
import json
import time 

from kafka import KafkaProducer

def json_serializer(data):
    """Serialize ``data`` to JSON text and return that text as UTF-8 bytes."""
    json_text = json.dumps(data)
    return json_text.encode('utf-8')

# Address (host:port) handed to the producer as its bootstrap server
server = 'localhost:9092'

# Producer whose message values are passed through json_serializer
producer = KafkaProducer(
    bootstrap_servers=[server],
    value_serializer=json_serializer
)

# Last expression of the cell, so its result is displayed as the cell output
producer.bootstrap_connected()

True

## Question 4. Sending data to the stream

In [2]:
# Compare the time spent handing messages to the producer with the time spent
# flushing them. time.perf_counter() is used for the measurements because it is
# a monotonic, high-resolution clock intended for short intervals, whereas
# time.time() follows the wall clock and can be coarse or jump.
topic_name = 'test-topic'
n_messages = 10  # drives both the loop and the average, so they cannot drift apart

t0 = time.perf_counter()
total_message_time = 0.0
for i in range(n_messages):
    message = {'number': i}
    t1 = time.perf_counter()
    # kafka-python's send() is asynchronous: it queues the record and returns,
    # so this interval measures the hand-off, not the delivery to the broker.
    producer.send(topic_name, value=message)
    t2 = time.perf_counter()
    print(f'Sending the messages took {(t2 - t1):.4f} seconds')
    total_message_time += t2 - t1
    print(f"Sent: {message}")
    time.sleep(0.05)

# flush() is timed separately from the sends above
t3 = time.perf_counter()
producer.flush()
t4 = time.perf_counter()
print(f'Average Sending the messages took {total_message_time / n_messages:.4f} seconds')
print(f'Flushing took {(t4 - t3):.4f} seconds')
# Total wall time of the cell, including the 0.05 s sleeps between messages
print(f'took {(t4 - t0):.2f} seconds')

Sending the messages took 0.0007 seconds
Sent: {'number': 0}
Sending the messages took 0.0003 seconds
Sent: {'number': 1}
Sending the messages took 0.0003 seconds
Sent: {'number': 2}
Sending the messages took 0.0003 seconds
Sent: {'number': 3}
Sending the messages took 0.0003 seconds
Sent: {'number': 4}
Sending the messages took 0.0003 seconds
Sent: {'number': 5}
Sending the messages took 0.0003 seconds
Sent: {'number': 6}
Sending the messages took 0.0003 seconds
Sent: {'number': 7}
Sending the messages took 0.0003 seconds
Sent: {'number': 8}
Sending the messages took 0.0004 seconds
Sent: {'number': 9}
Average Sending the messages took 0.0004 seconds
Flushing took 0.0001 seconds
took 0.51 seconds


> Answer:
```
Sending a message took 0.0004 s on average, while flushing took 0.0001 s, so more time was spent on sending the messages.
```

In [3]:
import pandas as pd
from kafka import KafkaProducer
import json
import time

# Function to serialize data to JSON
def json_serializer(data):
    """Return the JSON representation of ``data`` encoded as UTF-8 bytes."""
    return bytes(json.dumps(data), encoding='utf-8')

# Kafka producer configuration
server = 'localhost:9092'
topic = 'green-trips'

# Initialize Kafka producer
producer = KafkaProducer(
    bootstrap_servers=[server],
    value_serializer=json_serializer
)

# Columns of the CSV file that are read and forwarded to Kafka
selected_columns = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 
                    'PULocationID', 'DOLocationID', 'passenger_count', 
                    'trip_distance', 'tip_amount']
data_path = 'data/green_tripdata_2019-10.csv.gz'

# The measured time covers reading the CSV as well as sending every row
# and closing the producer.
start_time = time.time()  # Start time
try:
    # Read the CSV file into a DataFrame
    df_green = pd.read_csv(data_path, usecols=selected_columns, compression='gzip')

    # Iterate over the records in the DataFrame and send data to Kafka
    for row in df_green.itertuples(index=False):
        row_dict = {col: getattr(row, col) for col in row._fields}
        # Send the row data to Kafka
        producer.send(topic, value=row_dict)
finally:
    # Close the Kafka producer even if reading or sending fails, so the
    # connection opened above is not left dangling.
    producer.close()

# Calculate the time taken in seconds
end_time = time.time()
time_taken = round(end_time - start_time)
print("Time taken (seconds):", time_taken)


Time taken (seconds): 61


In [4]:
import pyspark
from pyspark.sql import SparkSession

# Build the Maven coordinate of the Kafka connector from the installed
# PySpark version.
pyspark_version = pyspark.__version__
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"

# Get (or create) a local Spark session configured with that package
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/murat/spark/spark-3.3.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/murat/.ivy2/cache
The jars for the packages stored in: /home/murat/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2e591b7b-00f9-485e-a579-0da735bb9fed;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloadi

24/03/14 06:12:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Streaming reader on the "green-trips" topic, starting from the earliest offsets
green_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "green-trips")
    .option("startingOffsets", "earliest")
    .load()
)

In [6]:
def peek(mini_batch, batch_id):
    """Print the first row of ``mini_batch``; print nothing if it has no rows.

    ``batch_id`` is accepted but not used.
    """
    rows = mini_batch.take(1)
    if not rows:
        return
    print(rows[0])

# Start a streaming query that passes each micro-batch to peek()
query = green_stream.writeStream.foreachBatch(peek).start()

24/03/14 06:14:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a4734efc-f1cb-4d27-ad2f-5296504a1f24. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/14 06:14:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

Row(key=None, value=bytearray(b'{"lpep_pickup_datetime": "2019-10-01 00:00:48", "lpep_dropoff_datetime": "2019-10-01 00:05:42", "PULocationID": 210, "DOLocationID": 108, "passenger_count": 2.0, "trip_distance": 1.03, "tip_amount": 2.19}'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 3, 14, 6, 5, 1, 450000), timestampType=0)


In [7]:
# Stop the streaming query currently bound to `query`
query.stop()

In [8]:
from pyspark.sql import types

# Field names and types used to parse the JSON message values
schema = (
    types.StructType()
    .add("lpep_pickup_datetime", types.StringType())
    .add("lpep_dropoff_datetime", types.StringType())
    .add("PULocationID", types.IntegerType())
    .add("DOLocationID", types.IntegerType())
    .add("passenger_count", types.DoubleType())
    .add("trip_distance", types.DoubleType())
    .add("tip_amount", types.DoubleType())
)

In [9]:
from pyspark.sql import functions as F

# Cast the message value to a string, parse it as JSON with `schema`, and
# expand the resulting struct into top-level columns.
# The result gets its own name instead of overwriting `green_stream`: the
# select below reads the raw `value` column, so rebinding `green_stream` to the
# parsed frame would make this cell fail when it is run a second time.
parsed_green_stream = (
    green_stream
    .select(F.from_json(F.col("value").cast('STRING'), schema).alias("data"))
    .select("data.*")
)

In [11]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types

# Maven coordinate of the Kafka connector, built from the installed PySpark version
pyspark_version = pyspark.__version__
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"

# Get (or create) the local Spark session with the Kafka package configured
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)

# Subscribe to the "green-trips" Kafka topic, reading from the earliest offsets
green_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "green-trips")
    .option("startingOffsets", "earliest")
    .load()
)

# Schema (field names and types) applied when parsing the JSON data
schema = (
    types.StructType()
    .add("lpep_pickup_datetime", types.StringType())
    .add("lpep_dropoff_datetime", types.StringType())
    .add("PULocationID", types.IntegerType())
    .add("DOLocationID", types.IntegerType())
    .add("passenger_count", types.DoubleType())
    .add("trip_distance", types.DoubleType())
    .add("tip_amount", types.DoubleType())
)

# Cast the message value to a string, parse it as JSON using `schema`, then
# expand the parsed struct into one column per field
parsed_green_stream = (
    green_stream
    .select(F.from_json(F.col("value").cast('STRING'), schema).alias("data"))
    .select("data.*")
)

# Start processing the stream: append each parsed micro-batch to the console
query = (
    parsed_green_stream
    .writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# awaitTermination() blocks until the query terminates; in this notebook it is
# ended by interrupting the kernel. The finally clause stops the query in that
# case as well, so it does not keep running after the cell has been interrupted.
try:
    query.awaitTermination()
finally:
    query.stop()


24/03/14 06:16:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-54608275-d28e-449f-adcc-d4dae943e082. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/14 06:16:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+---------------------+------------+------------+---------------+-------------+----------+
|lpep_pickup_datetime|lpep_dropoff_datetime|PULocationID|DOLocationID|passenger_count|trip_distance|tip_amount|
+--------------------+---------------------+------------+------------+---------------+-------------+----------+
| 2019-10-01 00:00:48|  2019-10-01 00:05:42|         210|         108|            2.0|         1.03|      2.19|
| 2019-10-01 00:45:08|  2019-10-01 01:04:28|          83|          36|            2.0|         5.75|       0.0|
| 2019-10-01 00:32:44|  2019-10-01 00:46:53|          92|         260|            2.0|         6.01|      4.16|
| 2019-10-01 00:05:09|  2019-10-01 00:18:34|          75|         119|            1.0|         1.08|      2.36|
| 2019-10-01 00:18:16|  2019-10-01 00:53:16|          29|          92|            1.0|        21.39|       0.0|
| 2019-

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/murat/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/murat/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/murat/anaconda3/envs/myenv/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Stop the streaming query currently bound to `query`
query.stop()

In [12]:
# Add a "timestamp" column filled by current_timestamp()
stream_with_timestamp = parsed_green_stream.withColumn(
    "timestamp",
    F.current_timestamp(),
)

# Count rows per 5-minute window of "timestamp" and per "DOLocationID",
# ordered by descending count
popular_destinations = (
    stream_with_timestamp
    .groupBy(F.window(F.col("timestamp"), "5 minutes"), "DOLocationID")
    .count()
    .orderBy(F.desc("count"))
)

# Start the query to print the full, untruncated result table to the console
query = (
    popular_destinations
    .writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

# awaitTermination() blocks until the query terminates; in this notebook it is
# ended by interrupting the kernel. The finally clause stops the query in that
# case as well, so it does not keep running after the cell has been interrupted.
try:
    query.awaitTermination()
finally:
    query.stop()


24/03/14 06:19:44 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d67f5ac7-b256-4df2-97a9-bc76f14d32ae. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/14 06:19:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+------------+-----+
|window                                    |DOLocationID|count|
+------------------------------------------+------------+-----+
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|74          |35474|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|42          |31877|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|41          |28119|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|75          |25677|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|129         |23852|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|7           |23060|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|166         |21688|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|236         |15826|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|223         |15082|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|238         |14636|
|{2024-03-14 06:15:00, 2024-03-14 06:20:00}|82          |14577|
|{2024-

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/murat/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/murat/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/murat/anaconda3/envs/myenv/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 