# Real-Time Taxi Trip Analytics 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, unix_timestamp, hour, avg, to_date
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import findspark 



### Initialize a SparkSession

In [2]:

spark = SparkSession.builder \
    .appName("TaxiTripProcessor") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1") \
    .config("spark.sql.shuffle.partitions", "2") \
    .config("spark.streaming.stopGracefullyOnShutdown", "true") \
    .getOrCreate()

24/07/25 00:09:39 WARN Utils: Your hostname, Kenneths-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.100.15 instead (on interface en0)
24/07/25 00:09:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/kenneth/.ivy2/cache
The jars for the packages stored in: /Users/kenneth/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0329bf7b-d736-411f-b792-5f37cf1b252a;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central


:: loading settings :: url = jar:file:/opt/anaconda3/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 267ms :: artifacts dl 8ms
	:: modules in use:
	com.google.code.findbugs#jsr305;3.0.0 from central in [default]
	commons-logging#commons-logging;1.1.3 from central in [default]
	org.apache.commons#commons-pool2;2.11.1 from central in [default]
	org.apache.hadoop#hadoop-client-api;3.3.4 from central in [default]
	org.apache.hadoop#hadoop-client-runtime;3.3.4 from central in [default]
	org.apache.ka

### Schema Definition

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType

schema = StructType([
    StructField("VendorID", IntegerType()),
    StructField("tpep_pickup_datetime", TimestampType()),
    StructField("tpep_dropoff_datetime", TimestampType()),
    StructField("passenger_count", DoubleType()),
    StructField("trip_distance", DoubleType()),
    StructField("RatecodeID", IntegerType()),
    StructField("store_and_fwd_flag", StringType()),
    StructField("PULocationID", IntegerType()),
    StructField("DOLocationID", IntegerType()),
    StructField("payment_type", IntegerType()),
    StructField("fare_amount", DoubleType()),
    StructField("extra", DoubleType()),
    StructField("mta_tax", DoubleType()),
    StructField("tip_amount", DoubleType()),
    StructField("tolls_amount", DoubleType()),
    StructField("improvement_surcharge", DoubleType()),
    StructField("total_amount", DoubleType()),
    StructField("congestion_surcharge", DoubleType()),
    StructField("Airport_fee", DoubleType())
])


### Read Stream from Kafka

In [4]:
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "trips_data") \
    .option("startingOffsets", "earliest") \
    .load() \
    .selectExpr("CAST(value AS STRING) as json_string")

df = df.select(from_json("json_string", schema).alias("data")).select("data.*")


### Data Transformation

In [5]:
# Calculate trip duration in minutes
df = df.withColumn("trip_duration", 
                   (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 60)


## Real-Time Analytics

#### Real-Time Analytics

In [6]:
# Average trip duration by hour
avg_duration_by_hour = df.withColumn("hour", hour("tpep_pickup_datetime")) \
    .groupBy("hour") \
    .agg(avg("trip_duration").alias("avg_duration"))

duration_query = avg_duration_by_hour.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()


24/07/25 00:09:42 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/fw/93khyvld0hv9qrmrc7k8kfqm0000gn/T/temporary-80b6a01b-7a91-4255-a952-efdb0757948a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/07/25 00:09:42 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


#### Cost Efficiency Analysis

In [7]:
# Average cost per mile
cost_per_mile = df.withColumn("cost_per_mile", col("total_amount") / col("trip_distance")) \
    .groupBy(to_date("tpep_pickup_datetime").alias("date")) \
    .agg(avg("cost_per_mile").alias("avg_cost_per_mile"))

cost_query = cost_per_mile.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()


24/07/25 00:09:42 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/fw/93khyvld0hv9qrmrc7k8kfqm0000gn/T/temporary-cd4c93d2-f7d1-4b04-a05a-cff083a2301f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/07/25 00:09:42 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


#### Passenger Trends

In [8]:
# Count trips by passenger count
passenger_trends = df.groupBy("passenger_count").count()

passenger_query = passenger_trends.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()


24/07/25 00:09:42 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/fw/93khyvld0hv9qrmrc7k8kfqm0000gn/T/temporary-5ca7ad34-d71e-4f7f-ab00-dad74c404584. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/07/25 00:09:42 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [9]:
spark.streams.awaitAnyTermination()


24/07/25 00:09:42 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/25 00:09:42 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/25 00:09:42 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
-------------------------------------------
Batch: 0
-------------------------------------------
-------------------------------------------
Batch: 0
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  230|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            2.0|   31|
|            NaN|   15|
|            0.0|    4|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-31| 8.321010777708171|
|2024-01-24| 11.53962063086104|
|2024-01-20| 14.16867214194149|
|2024-01-19|11.686192102247402|
|2024-01-22| 15.37956791975897|
|2024-01-06| 11.25129183312016|
|2024-01-27|   9.2495112671592|
|2024-01-08|12.909075176268676|
|2024-01-02| 9.295067083312716|
|2024-01-14| 15.72558049256103|
|2024-01-11|12.9113124291466

                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
-------------------------------------------
Batch: 14
-------------------------------------------
-------------------------------------------
Batch: 14
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  240|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            2.0|   35|
|            NaN|   15|
|            0.0|    4|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.583333333333332|
|  13|11.760714285714284|
|  12|11.409523809523808|
|  14|14.238888888888889|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|15.021052631578948|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|13.724074074074073|
|   8|21.001587301587303|
|  17| 19.62916666666667|
|   6|19.594444444444445|
|  20|15.740476190476187|

                                                                                

-------------------------------------------
Batch: 23
-------------------------------------------
-------------------------------------------
Batch: 23
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  248|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            2.0|   35|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.583333333333332|
|  13|11.760714285714284|
|  12|11.409523809523808|
|  14|           13.5875|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|14.779166666666669|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|13.724074074074073|
|   8|21.001587301587303|
|  17|19.400000000000002|
|   6|16.241666666666667|
|  20|15.740476190476187|
|  19|11.450925925925922|
|  11|21.312962962962967|
|   7| 20.59444444444445|
|  15|             

                                                                                

-------------------------------------------
Batch: 24
-------------------------------------------
-------------------------------------------
Batch: 24
-------------------------------------------
-------------------------------------------
Batch: 24
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  248|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            2.0|   36|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24| 11.53962063086104|
|2024-01-20| 14.16867214194149|
|2024-01-19|11.686192102247402|
|2024-01-06| 11.25129183312016|
|2024-01-08|12.940389880864029|
|2024-01-02| 9.534499487372312|
|2024-01-31| 9.443609610354292|
|2024-01-22| 15.37956791975897|
|2024-01-27|  9.91546923135237|
|2024-01-14|14.464416978971725|
|2024-01-11|12.5915644997

                                                                                

-------------------------------------------
Batch: 29
-------------------------------------------
-------------------------------------------
Batch: 29
-------------------------------------------
+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.583333333333332|
|  13|11.760714285714284|
|  12| 15.29583333333333|
|  14|14.415873015873016|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|14.779166666666669|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|13.724074074074073|
|   8|21.001587301587303|
|  17| 19.16166666666667|
|   6|16.241666666666667|
|  20|15.740476190476187|
|  19|11.450925925925922|
|  11|21.312962962962967|
|   7| 20.59444444444445|
|  15|               8.3|
|  16|12.761458333333335|
|  21|16.870512820512822|
+----+------------------+
only showing top 20 rows

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24| 11.53962063086104|
|2024-01-20| 14.1686721419

                                                                                

-------------------------------------------
Batch: 31
-------------------------------------------
-------------------------------------------
Batch: 31
-------------------------------------------
-------------------------------------------
Batch: 31
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  253|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            5.0|    1|
|            2.0|   37|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.583333333333332|
|  13|11.760714285714284|
|  12| 15.29583333333333|
|  14|14.415873015873016|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|14.779166666666669|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|13.724074074074073|
|   8|21.001587301587303|
|  17| 19.16166666666667|
|   6|16.241666666666667|
|

                                                                                

-------------------------------------------
Batch: 32
-------------------------------------------
-------------------------------------------
Batch: 32
-------------------------------------------
-------------------------------------------
Batch: 32
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  254|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            5.0|    1|
|            2.0|   37|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.583333333333332|
|  13|11.760714285714284|
|  12| 15.29583333333333|
|  14|14.415873015873016|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|14.779166666666669|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|13.724074074074073|
|   8|21.001587301587303|
|  17| 19.16166666666667|
|   6|16.241666666666667|
|

                                                                                

-------------------------------------------
Batch: 33
-------------------------------------------
-------------------------------------------
Batch: 33
-------------------------------------------
-------------------------------------------
Batch: 33
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  255|
|            4.0|    3|
|            3.0|    6|
|            6.0|    2|
|            5.0|    1|
|            2.0|   37|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24| 11.53962063086104|
|2024-01-20|13.906013192718232|
|2024-01-19|11.686192102247402|
|2024-01-06| 11.25129183312016|
|2024-01-08|12.940389880864029|
|2024-01-02| 9.534499487372312|
|2024-01-31|10.493332046970346|
|2024-01-22| 15.37956791975897|
|2024-01-27|  9.91546923135237|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 39
-------------------------------------------
-------------------------------------------
Batch: 39
-------------------------------------------
-------------------------------------------
Batch: 39
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  258|
|            4.0|    3|
|            3.0|    8|
|            6.0|    2|
|            5.0|    1|
|            2.0|   38|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24| 11.53962063086104|
|2024-01-20|13.906013192718232|
|2024-01-19|11.686192102247402|
|2024-01-06| 11.25129183312016|
|2024-01-08|13.061582098376778|
|2024-01-02| 9.534499487372312|
|2024-01-31|10.493332046970346|
|2024-01-22| 15.37956791975897|
|2024-01-27|  9.91546923135237|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 40
-------------------------------------------
-------------------------------------------
Batch: 40
-------------------------------------------
-------------------------------------------
Batch: 40
-------------------------------------------
+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.278124999999998|
|  13|11.760714285714284|
|  12|14.972549019607841|
|  14|14.415873015873016|
|   2|10.473333333333333|
|  10|13.554166666666667|
|  18|14.445652173913043|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|12.544999999999998|
|   8|21.001587301587303|
|  17| 19.16166666666667|
|   6|16.241666666666667|
|  20|15.740476190476187|
|  19|11.450925925925922|
|  11|21.312962962962967|
|   7| 20.59444444444445|
|  15|         8.4015625|
|  16|13.779824561403512|
|  21|16.741666666666667|
+----+------------------+
only showing top 20 rows

+----------+------------------+
|      date| avg_cost_pe

                                                                                

-------------------------------------------
Batch: 41
-------------------------------------------
-------------------------------------------
Batch: 41
-------------------------------------------
-------------------------------------------
Batch: 41
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  260|
|            4.0|    3|
|            3.0|    8|
|            6.0|    2|
|            5.0|    1|
|            2.0|   38|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24| 11.53962063086104|
|2024-01-20|13.906013192718232|
|2024-01-19|11.686192102247402|
|2024-01-06| 11.25129183312016|
|2024-01-08|13.061582098376778|
|2024-01-02| 9.534499487372312|
|2024-01-31|10.493332046970346|
|2024-01-22| 15.37956791975897|
|2024-01-27|  9.91546923135237|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 47
-------------------------------------------
-------------------------------------------
Batch: 47
-------------------------------------------
-------------------------------------------
Batch: 47
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  264|
|            4.0|    3|
|            3.0|    8|
|            6.0|    2|
|            5.0|    1|
|            2.0|   40|
|            NaN|   15|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.278124999999998|
|  13|11.760714285714284|
|  12|14.972549019607841|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|12.720512820512822|
|  18|14.445652173913043|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|12.544999999999998|
|   8|21.001587301587303|
|  17| 19.16166666666667|
|   6|16.241666666666667|
|

                                                                                

-------------------------------------------
Batch: 49
-------------------------------------------
-------------------------------------------
Batch: 49
-------------------------------------------
-------------------------------------------
Batch: 49
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  264|
|            4.0|    3|
|            3.0|    9|
|            6.0|    2|
|            5.0|    1|
|            2.0|   40|
|            NaN|   16|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22|13.278124999999998|
|  13|11.760714285714284|
|  12|14.972549019607841|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|12.720512820512822|
|  18|14.445652173913043|
|   3|21.616666666666667|
|   1| 9.503333333333332|
|   9|12.544999999999998|
|   8|21.001587301587303|
|  17|18.588888888888892|
|   6|16.241666666666667|
|

                                                                                

-------------------------------------------
Batch: 53
-------------------------------------------
-------------------------------------------
Batch: 53
-------------------------------------------
-------------------------------------------
Batch: 53
-------------------------------------------
+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22| 12.96862745098039|
|  13|11.760714285714284|
|  12|14.972549019607841|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|12.720512820512822|
|  18|14.445652173913043|
|   3|13.583333333333334|
|   1| 9.503333333333332|
|   9|12.544999999999998|
|   8|21.001587301587303|
|  17|18.588888888888892|
|   6|16.241666666666667|
|  20|15.272222222222219|
|  19|11.419298245614032|
|  11|21.065151515151516|
|   7|19.330000000000002|
|  15|         8.4015625|
|  16|16.815000000000005|
|  21|16.047777777777778|
+----+------------------+
only showing top 20 rows

+----------+------------------+
|      date| avg_cost_pe

                                                                                

-------------------------------------------
Batch: 55
-------------------------------------------
-------------------------------------------
Batch: 55
-------------------------------------------
-------------------------------------------
Batch: 55
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  268|
|            4.0|    3|
|            3.0|   11|
|            6.0|    2|
|            5.0|    1|
|            2.0|   40|
|            NaN|   16|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24|12.994113007174995|
|2024-01-20|13.906013192718232|
|2024-01-19|11.686192102247402|
|2024-01-06|10.729994196309324|
|2024-01-08|13.061582098376778|
|2024-01-02| 9.534499487372312|
|2024-01-31|10.493332046970346|
|2024-01-22| 15.37956791975897|
|2024-01-27|10.541335664865791|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 62
-------------------------------------------
-------------------------------------------
Batch: 62
-------------------------------------------
-------------------------------------------
Batch: 62
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  275|
|            4.0|    3|
|            3.0|   11|
|            6.0|    2|
|            5.0|    1|
|            2.0|   40|
|            NaN|   16|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24|12.994113007174995|
|2024-01-20|13.906013192718232|
|2024-01-19|11.686192102247402|
|2024-01-06|10.729994196309324|
|2024-01-08|13.061582098376778|
|2024-01-02| 9.534499487372312|
|2024-01-31|11.187582700862366|
|2024-01-22| 15.37956791975897|
|2024-01-27|10.541335664865791|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 67
-------------------------------------------
-------------------------------------------
Batch: 67
-------------------------------------------
-------------------------------------------
Batch: 67
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  277|
|            4.0|    3|
|            3.0|   12|
|            6.0|    2|
|            5.0|    1|
|            2.0|   41|
|            NaN|   17|
|            0.0|    5|
+---------------+-----+

+----------+------------------+
|      date| avg_cost_per_mile|
+----------+------------------+
|2024-01-24|12.994113007174995|
|2024-01-20| 13.96826821382862|
|2024-01-19|11.686192102247402|
|2024-01-06|10.729994196309324|
|2024-01-08|13.061582098376778|
|2024-01-02| 9.534499487372312|
|2024-01-31|11.187582700862366|
|2024-01-22| 15.37956791975897|
|2024-01-27|10.541335664865791|
|2024-01-14|14.464416978971725|
|

                                                                                

-------------------------------------------
Batch: 68
-------------------------------------------
-------------------------------------------
Batch: 68
-------------------------------------------
-------------------------------------------
Batch: 68
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  277|
|            4.0|    3|
|            3.0|   12|
|            6.0|    2|
|            5.0|    2|
|            2.0|   41|
|            NaN|   17|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22| 12.96862745098039|
|  13|11.760714285714284|
|  12|15.269444444444442|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|14.058888888888891|
|  18|14.112666666666666|
|   3|10.611111111111112|
|   1|11.361111111111109|
|   9|11.951515151515151|
|   8|21.001587301587303|
|  17|18.588888888888892|
|   6|16.241666666666667|
|

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 69
-------------------------------------------
-------------------------------------------
Batch: 69
-------------------------------------------
-------------------------------------------
Batch: 69
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  277|
|            4.0|    3|
|            3.0|   12|
|            6.0|    2|
|            5.0|    2|
|            2.0|   42|
|            NaN|   17|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22| 12.96862745098039|
|  13|11.760714285714284|
|  12|15.269444444444442|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|14.058888888888891|
|  18|14.112666666666666|
|   3|10.611111111111112|
|   1|11.361111111111109|
|   9|11.951515151515151|
|   8|21.001587301587303|
|  17|18.008333333333336|
|   6|16.241666666666667|
|

                                                                                

-------------------------------------------
Batch: 70
-------------------------------------------
-------------------------------------------
Batch: 70
-------------------------------------------
-------------------------------------------
Batch: 70
-------------------------------------------
+---------------+-----+
|passenger_count|count|
+---------------+-----+
|            1.0|  278|
|            4.0|    3|
|            3.0|   12|
|            6.0|    2|
|            5.0|    2|
|            2.0|   42|
|            NaN|   17|
|            0.0|    5|
+---------------+-----+

+----+------------------+
|hour|      avg_duration|
+----+------------------+
|  22| 12.96862745098039|
|  13|11.760714285714284|
|  12|15.269444444444442|
|  14|14.193939393939395|
|   2|10.473333333333333|
|  10|14.058888888888891|
|  18|14.112666666666666|
|   3|10.611111111111112|
|   1|11.361111111111109|
|   9|11.951515151515151|
|   8|21.001587301587303|
|  17|18.008333333333336|
|   6|16.241666666666667|
|

In [None]:
spark.stop()