In [None]:
import json
import time
import pandas as pd 

from kafka import KafkaProducer

def json_serializer(data):
    """Encode ``data`` as JSON text and return it as UTF-8 bytes.

    Passed to ``KafkaProducer`` as its ``value_serializer`` so that every
    message value is converted to bytes before it is sent.
    """
    json_text = json.dumps(data)
    return bytes(json_text, 'utf-8')

# Address (host:port) of the Kafka broker the producer bootstraps from.
server = 'localhost:9092'

# Producer whose message values are converted to bytes by json_serializer.
producer = KafkaProducer(bootstrap_servers=[server], value_serializer=json_serializer)

# Last expression of the cell, so its result is shown as the cell output.
producer.bootstrap_connected()

In [None]:
# Time how long it takes to hand ten small test messages to the producer.
t0 = time.time()

topic_name = 'test-topic'

for i in range(10):
    message = {'number': i}
    producer.send(topic_name, value=message)
    print(f"Sent: {message}")
    # Short pause between messages (0.05 s each).
    time.sleep(0.05)

t1 = time.time()
print(f'took {(t1 - t0):.2f} seconds')

# Time the flush separately from the send loop above.
t0 = time.time()
producer.flush()
t1 = time.time()
# Elapsed time is end minus start; the reversed subtraction printed a negative value.
print(f'took {(t1 - t0):.2f} seconds')

In [None]:
# Green-taxi trip data file (2019-10, per the file name), stored as a gzip-compressed CSV.
green_trips_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz'

# Download and parse the whole file into a DataFrame.
df_green = pd.read_csv(green_trips_url, compression='gzip')


In [None]:
# Preview the first five rows to check that the file loaded as expected.
df_green.head(n=5)

In [None]:
topic_name = 'green-trips'

# Only these columns are published. Selecting them up front avoids building a
# dict of every DataFrame column for each row.
trip_columns = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
]

t0 = time.time()

# name=None yields plain tuples, so the message keys come straight from
# trip_columns rather than from namedtuple field names.
for values in df_green[trip_columns].itertuples(index=False, name=None):
    message = dict(zip(trip_columns, values))
    # No per-row print: one output line per DataFrame row floods the notebook
    # and adds console overhead to the time being measured.
    producer.send(topic_name, value=message)

# Wait for buffered messages to be sent before stopping the clock, so the
# timing covers delivery and nothing is left pending in the producer.
producer.flush()

t1 = time.time()

# Elapsed time is end minus start; the reversed subtraction printed a negative value.
print(f'took {(t1 - t0):.2f} seconds')
    

In [None]:
import pyspark
from pyspark.sql import SparkSession

# The Kafka connector artifact is published per Spark version and per Scala
# build, so both parts of the coordinate are derived from the installed PySpark.
pyspark_version = pyspark.__version__

# Spark 4 artifacts are built for Scala 2.13; earlier PySpark releases keep the
# original 2.12 suffix, so the coordinate is unchanged for them.
scala_version = "2.13" if int(pyspark_version.split(".")[0]) >= 4 else "2.12"
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{pyspark_version}"

# Local Spark session using all available cores, with the Kafka connector
# requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)