In [None]:
# !pip install jupyter_contrib_nbextensions

- cd /mnt/d/zde/data-engineering-zoomcamp/cohorts/2024/06-streaming
- docker-compose up -d
- docker ps
- docker exec -it redpanda-1 bash

# Question 1: Redpanda version

- rpk help 
- rpk version


# Question 2. Creating a topic

- rpk topic create test-topic


# Question 3. Connecting to the Kafka server

- pip install kafka-python

In [None]:
import json
import time 

from kafka import KafkaProducer

def json_serializer(data):
    """Serialize ``data`` to a JSON document and return it as UTF-8 bytes."""
    payload = json.dumps(data)
    return bytes(payload, 'utf-8')

# Address (host:port) of the broker the producer bootstraps from.
server = 'localhost:9092'

# Values passed to this producer are encoded by json_serializer before sending.
producer = KafkaProducer(value_serializer=json_serializer, bootstrap_servers=[server])

# Bare last expression so the notebook shows the result of the connection check.
producer.bootstrap_connected()

**Answer 3 : True**

# Question 4. Sending data to the stream

In [None]:
# Send ten small JSON messages and time the send loop separately from the
# final flush. perf_counter() is a monotonic clock intended for measuring
# elapsed time; the wall clock from time.time() can jump if the system time
# is adjusted while the cell is running.
t0 = time.perf_counter()

topic_name = 'test-topic'

for i in range(10):
    message = {'number': i}
    producer.send(topic_name, value=message)
    print(f"Sent: {message}")
    # Pause between messages; this time is included in the send measurement.
    time.sleep(0.05)

t01 = time.perf_counter()

# Wait for the producer to finish delivering whatever is still buffered.
producer.flush()

t1 = time.perf_counter()
print(f'Sending message took {(t01 - t0):.2f} seconds')
print(f'Flush took {(t1 - t01):.2f} seconds')

How much time did it take? Where did it spend most of the time?

- Sending the messages
- Flushing
- Both took approximately the same amount of time

**Answer 4 : Sending the messages**

# Question 5: Sending the Trip Data

## Sending the taxi data

Now let's send our actual data:

Read the green csv.gz file

We will only need these columns:
- 'lpep_pickup_datetime',
- 'lpep_dropoff_datetime',
- 'PULocationID',
- 'DOLocationID',
- 'passenger_count',
- 'trip_distance',
- 'tip_amount'

Iterate over the records in the dataframe


In [None]:
# Read dataset and define dataFrame
import pandas as pd

# The only columns that are sent to the stream.
columns_to_list = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
]

# usecols makes pandas parse just these columns instead of loading the whole
# file and discarding most of it. The trailing selection pins the column order
# to columns_to_list, because usecols alone keeps the file's own column order.
df_green = pd.read_csv(
    "green_tripdata_2019-10.csv.gz",
    usecols=columns_to_list,
)[columns_to_list]

In [None]:
# Publish every trip record to the 'green-trips' topic and time the send loop
# separately from the final flush. perf_counter() is used because it is a
# monotonic clock intended for measuring elapsed time.
t0 = time.perf_counter()

topic_name = 'green-trips'

for row in df_green.itertuples(index=False):
    # Each row is a namedtuple; _asdict() yields the column -> value mapping.
    row_dict = row._asdict()
    # Actually send the record (the loop previously only printed it, so
    # nothing reached the topic). No per-row print: it would flood the cell
    # output and be counted in the measured send time.
    producer.send(topic_name, value=row_dict)

t01 = time.perf_counter()
print(f'Sending data took {(t01 - t0):.2f} seconds')

# Wait for the producer to finish delivering whatever is still buffered.
producer.flush()

t1 = time.perf_counter()
print(f'Flushing took {(t1 - t01):.2f} seconds')

- Create a topic green-trips and send the data there
- How much time in seconds did it take? (You can round it to a whole number)
- Make sure you don't include sleeps in your code

**Answer 5 : 7.32 seconds**


# Question 6. Parsing the data

## Creating the PySpark consumer

Now let's read the data with PySpark.

Spark needs a library (jar) to be able to connect to Kafka, so we need to tell PySpark that it needs to use it:

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Build the Kafka connector coordinate from the installed PySpark version so
# the requested jar version matches the running PySpark.
pyspark_version = pyspark.__version__
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"

# Local session on all cores, with the Kafka connector requested as a package.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)

Now we can connect to the stream:

In [None]:
# Streaming reader over the 'green-trips' topic, starting from the earliest
# available offset.
green_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "green-trips")
    .option("startingOffsets", "earliest")
    .load()
)

In order to test that we can consume from the stream, let's see what will be the first record there.

In Spark streaming, the stream is represented as a sequence of small batches, each batch being a small RDD (or a small dataframe).

So we can execute a function over each mini-batch. Let's run take(1) there to see what we have in the stream:

In [None]:
def peek(mini_batch, batch_id):
    """Print the first record of a mini-batch, or nothing if the batch is empty.

    Args:
        mini_batch: object exposing ``take(n)``; the result of ``take(1)`` is
            checked for truthiness and indexed at position 0.
        batch_id: identifier of the batch; accepted but not used.
    """
    rows = mini_batch.take(1)
    if not rows:
        return
    print(rows[0])

# Run peek on every mini-batch; keep the handle so the query can be stopped.
query = (
    green_stream.writeStream
    .foreachBatch(peek)
    .start()
)

You should see a record like this:

Row(key=None, value=bytearray(b'{"lpep_pickup_datetime": "2019-10-01 00:26:02", "lpep_dropoff_datetime": "2019-10-01 00:39:58", "PULocationID": 112, "DOLocationID": 196, "passenger_count": 1.0, "trip_distance": 5.88, "tip_amount": 0.0}'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 4, 7, 20, 34, 5, 862000), timestampType=0)

Now let's stop the query, so it doesn't keep consuming messages from the stream

In [None]:
# Stop the running streaming query so it no longer consumes from the stream.
query.stop()

The data is JSON, but currently it's in binary format. We need to parse it and turn it into a streaming dataframe with proper columns.

Similarly to PySpark, we define the schema

In [None]:
from pyspark.sql import types

# Schema of the JSON payload carried in each Kafka message value.
schema = types.StructType([
    types.StructField("lpep_pickup_datetime", types.StringType()),
    types.StructField("lpep_dropoff_datetime", types.StringType()),
    types.StructField("PULocationID", types.IntegerType()),
    types.StructField("DOLocationID", types.IntegerType()),
    types.StructField("passenger_count", types.DoubleType()),
    types.StructField("trip_distance", types.DoubleType()),
    types.StructField("tip_amount", types.DoubleType()),
])

And apply this schema:

In [None]:
from pyspark.sql import functions as F

# Decode the binary message value as a string, parse it as JSON with the
# schema, then expand the resulting struct into top-level columns.
# NOTE: this rebinds green_stream, so the cell expects the raw stream that
# still has a "value" column as its input.
green_stream = (
    green_stream
    .select(F.from_json(F.col("value").cast('STRING'), schema).alias("data"))
    .select("data.*")
)

How does the record look after parsing? Copy the output.

In [None]:
# Bare expression: the notebook displays the repr of the parsed streaming
# DataFrame (its column names and types).
green_stream

**Answer 6** :

**DataFrame[lpep_pickup_datetime: string, lpep_dropoff_datetime: string, PULocationID: int, DOLocationID: int, passenger_count: double, trip_distance: double, tip_amount: double]**

# Question 7: Most popular destination

Now let's finally do some streaming analytics. We will see what's the most popular destination currently based on our stream of data (which ideally we should have sent with delays like we did in workshop 2)

This is how you can do it:

- Add a column "timestamp" using the current_timestamp function
- Group by:
    - 5 minutes window based on the timestamp column (F.window(col("timestamp"), "5 minutes"))
    - "DOLocationID"
- Order by count

You can print the output to the console using this code

# do some streaming analytics.
from pyspark.sql.functions import col
from pyspark.sql.functions import current_timestamp

# Stamp each record with the current timestamp so it can be windowed.
df_green_stream = green_stream.withColumn("timestamp", current_timestamp())

# Count records per 5-minute window and drop-off location, keep the row with
# the highest count, and write it to the console. start() returns the handle
# of the running query, which is what popular_destinations holds.
popular_destinations = (
    df_green_stream
    .groupBy(F.window(col("timestamp"), "5 minutes"), df_green_stream.DOLocationID)
    .count()
    .sort(col("count").desc())
    .limit(1)
    .writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "true")
    .start()
)

# Block this cell until the streaming query terminates.
popular_destinations.awaitTermination()

In [None]:
# see most_popular_destination.ipynb script

**Answer 7 :**

- LocationID : 74
- Zone : East Harlem North
- number_dropoff : 17741

# Calling another ipynb script

Example : **call most_popular_destination.ipynb** script

Reference : https://stackoverflow.com/questions/20186344/importing-an-ipynb-file-from-another-ipynb-file

In [None]:
# !pip install import-ipynb

In [None]:
import import_ipynb

In [None]:
import most_popular_destination