In [1]:
import os

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build (or reuse, via getOrCreate) the notebook's SparkSession: local mode on
# all cores, Hive support enabled, Kafka source package and MySQL JDBC jar
# configured. The metastore URI and jar path below are placeholders.
spark = (
    SparkSession.builder
    .appName("KafkaSparkStreaming")
    .master("local[*]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    # The key must be spelled "spark.jars.packages"; a misspelled key is
    # ignored and the Kafka source is never downloaded.
    # NOTE(review): the Scala suffix (_2.12) and version (3.1.2) must match the
    # installed Spark build -- confirm against `spark.version`.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .config("hive.metastore.uris", "thrift://your-hive-metastore-uri:port")
    .config("spark.jars", "/path/to/mysql-connector-java-x.x.xx.jar")
    .config("spark.driver.allowMultipleContexts", "true")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Options passed to the Kafka source when the input stream is opened.
kafka_input_config = dict([
    ("kafka.bootstrap.servers", "localhost:9092"),  # adjust as necessary
    ("subscribe", "ylc_topic"),  # your topic name
    ("startingOffsets", "latest"),
    ("failOnDataLoss", "false"),
])

In [4]:
# Schema used to parse the JSON payload of each message. Every field is
# nullable; the two datetime fields are declared as strings here.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("VendorID", DecimalType(10, 0)),
        ("tpep_pickup_datetime", StringType()),
        ("tpep_dropoff_datetime", StringType()),
        ("passenger_count", DecimalType(10, 0)),
        ("trip_distance", DecimalType(10, 2)),
        ("RatecodeID", DecimalType(10, 0)),
        ("store_and_fwd_flag", StringType()),
        ("PULocationID", DecimalType(10, 0)),
        ("DOLocationID", DecimalType(10, 0)),
        ("payment_type", DecimalType(10, 0)),
        ("fare_amount", DecimalType(10, 2)),
        ("extra", DecimalType(10, 2)),
        ("mta_tax", DecimalType(10, 2)),
        ("tip_amount", DecimalType(10, 2)),
        ("tolls_amount", DecimalType(10, 2)),
        ("improvement_surcharge", DecimalType(10, 2)),
        ("total_amount", DecimalType(10, 2)),
        ("congestion_surcharge", DecimalType(10, 2)),
        ("Airport_fee", DecimalType(10, 2)),
    ]
])


In [5]:
# JDBC connection settings for the MySQL sink.
# Credentials are read from the MYSQL_USER / MYSQL_PASSWORD environment
# variables so real secrets need not be stored in the notebook; when a
# variable is unset the original value is used.
mysql_url = "jdbc:mysql://localhost:3306/ylc"
mysql_properties = {
    "user": os.environ.get("MYSQL_USER", "student"),
    "password": os.environ.get("MYSQL_PASSWORD", "student"),
    "driver": "com.mysql.jdbc.Driver"
}
def sqlinsert(df, epoch_id):
    """Append one batch DataFrame to the ``nyc_taxi_trips`` table over JDBC.

    Args:
        df: DataFrame to write.
        epoch_id: Accepted as the second positional argument but not used.

    The connection URL and properties come from the module-level
    ``mysql_url`` and ``mysql_properties``.
    """
    batch_writer = df.write.mode("append")
    batch_writer.jdbc(mysql_url, "nyc_taxi_trips", properties=mysql_properties)

In [6]:
# Open a streaming DataFrame on the Kafka source, configured by
# kafka_input_config.
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_input_config)
    .load()
)

In [7]:
# Cast the Kafka `value` column to a string so it can be parsed as JSON.
value_df = df.selectExpr("CAST(value AS STRING) as value")

# Parse the JSON string into a struct using `schema`.
ylc_df = value_df.withColumn("value", from_json(col("value"), schema))

# Flatten the struct into top-level columns and convert both datetime strings
# to timestamps. Each column is parsed into itself, so the dropoff time no
# longer overwrites the pickup time.
ylc_df = ylc_df.select("value.*") \
    .withColumn("tpep_pickup_datetime", to_timestamp("tpep_pickup_datetime", "yyyy-MM-dd HH:mm:ss")) \
    .withColumn("tpep_dropoff_datetime", to_timestamp("tpep_dropoff_datetime", "yyyy-MM-dd HH:mm:ss"))


In [None]:
# Start the streaming query: each micro-batch of ylc_df is handed to
# sqlinsert, with checkpoint data kept under /Data/hive_checkpoint/YLC.
# awaitTermination() then blocks this cell until the query stops.
myStream = (
    ylc_df.writeStream
    .outputMode("append")
    .option("checkpointLocation", "/Data/hive_checkpoint/YLC")
    .foreachBatch(sqlinsert)
    .start()
)
myStream.awaitTermination()