In [27]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window



spark = SparkSession \
    .builder \
    .master("spark://spark-master:7077") \
    .appName("ejemplo_DF") \
    .getOrCreate()

df = spark.read\
 .option("header", "true")\
 .csv("hdfs://namenode:9000/input/data/bus_trips.csv")

# Filter for only regular services
df = df.filter(F.col("trip_type") == "regular")

In [18]:
df.groupby("trip_code").count().filter(F.col("count") > 1).count()

0

In [21]:
df.groupby("line_number").count().filter(F.col("count") > 0).count()

2327

In [28]:
from pyspark.sql.window import Window

w = Window.partitionBy("line_number")

# Convert necessary columns to numeric types
df = df.withColumn("trip_duration_hours", F.coalesce(F.col("trip_duration_hours"), F.avg("trip_duration_hours").over(w)).cast("float")) \
       .withColumn("delay_start_minutes", F.coalesce(F.col("delay_start_minutes"), F.lit(0)).cast("float")) \
       .withColumn("delay_end_minutes", F.coalesce(F.col("delay_end_minutes"), F.lit(0)).cast("float")) \
       .withColumn("travelled_distance_km", F.coalesce(F.col("travelled_distance_km"), F.avg("trip_duration_hours").over(w)).cast("float"))

# Calculate total delay, delay percentage, and average speed
df = df.withColumn("total_delay_minutes", F.col("delay_start_minutes") + F.col("delay_end_minutes")) \
       .withColumn("delay_percentage", (F.col("total_delay_minutes") / (F.col("trip_duration_hours") * 60)) * 100) \
       .withColumn("speed_kmh", F.round(F.col("travelled_distance_km") / F.col("trip_duration_hours"), 2))

# Fake the dates. If year 2020 we use month = 8 If year 2019, month = 7 and drop year
df = df.withColumn("month", F.when(F.col("year") == 2019, 7).otherwise(8))
df = df.withColumn("year", F.lit(2024))
df = df.withColumn("date", F.make_date(F.col("year"), F.col("month"), F.col("day")))
df = df.drop("trip_type", "year", "day")

# Save the intermediate table with partitioning by `day`
df.coalesce(1).write.mode("overwrite").partitionBy("date").bucketBy(4, "line_number").format("parquet").option("path", "hdfs://namenode:9000/output/bus_trips/intermediate").saveAsTable("intermediate")

                                                                                

In [24]:
# Create a standardized key for each line to combine two-way trips
df_lines = df.withColumn("line_key", F.when(F.col("origin") < F.col("destination"),
                                            F.concat_ws("-", F.col("origin"), F.col("destination")))
                                 .otherwise(F.concat_ws("-", F.col("destination"), F.col("origin"))))

# Aggregate data to keep only unique two-way records per line
lines_df = df_lines.select("company", "line_number", "line_key").distinct()

# Save this as a second intermediate table
lines_df.write.mode("overwrite").parquet("hdfs://namenode:9000/output/bus_trips/lines")

# Show results for verification
lines_df.show()

                                                                                

+--------+-----------+--------------------+
| company|line_number|            line_key|
+--------+-----------+--------------------+
|808be8bd|   7d1c7a7b|    FORTALEZA-RECIFE|
|1c31a7b3|   9cf36911| CARANGOLA-ITAPERUNA|
|1dffde3c|   e8acc322|RIO DE JANEIRO-VA...|
|94179ee0|   3435ac8b|TEOFILO OTONI-VIT...|
|1dffde3c|   98d99196|APARECIDA-VOLTA R...|
|68f76547|   148fbe87|BELO HORIZONTE-CO...|
|bf040c87|   9afa9729| CURITIBA-RIO DO SUL|
|bf040c87|   57776c2b|   BLUMENAU-CURITIBA|
|94179ee0|   cb59a396|GOVERNADOR VALADA...|
|d7951af4|   4ef44285|  FORTALEZA-TERESINA|
|95480e8f|   c2583349|    CAMPINAS-MARINGA|
|a4b92c82|   c0c0f650|  GOIANIA-UBERLANDIA|
|a4b92c82|   0cd593a5|     GOIANIA-UBERABA|
|808be8bd|   2879bded|CAMPINA GRANDE-FO...|
|94179ee0|   3c150c3f|LINHARES-TEIXEIRA...|
|9bcede42|   fb2e26bd|   DOURADOS-LONDRINA|
|bf040c87|   a82134b3|FLORIANOPOLIS-SAO...|
|68f76547|   e8905f50|ITAPETINGA-RIO DE...|
|663c9c5c|   56fb9224|ANDRELANDIA-SAO P...|
|975b6e95|   d9dc491d|   ALFENAS

In [6]:
lines_df.count()

2328

In [29]:
df = spark.read.parquet("hdfs://namenode:9000/output/bus_trips/intermediate")

# Define a window across all lines for comparison
window_spec = Window.partitionBy("line_number").rowsBetween(Window.unboundedPreceding, 0)

df = df.groupby("date", "line_number").agg(
    F.avg("delay_percentage").alias("delay_percentage"),
    F.avg("trip_duration_hours").alias("trip_duration_hours"),
)
# Join overall statistics with trip summary and calculate ratios
trip_summary_ratios_df = df.withColumn("avg_delay_ratio", F.col("delay_percentage") / F.avg("delay_percentage").over(window_spec)) \
                           .withColumn("max_delay_ratio", F.col("delay_percentage") / F.max("delay_percentage").over(window_spec)) \
                           .withColumn("avg_trip_duration_hours", F.col("trip_duration_hours") / F.avg("trip_duration_hours").over(window_spec))\
                           .withColumn("max_trip_duration_hours", F.col("trip_duration_hours") / F.max("trip_duration_hours").over(window_spec))

trip_summary_ratios_df = trip_summary_ratios_df.drop("delay_percentage", "trip_duration_hours")

# Save results as the final output
trip_summary_ratios_df.write.partitionBy("date").mode("overwrite").parquet("hdfs://namenode:9000/output/bus_trips/stats")

24/10/27 21:23:44 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
24/10/27 21:23:44 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [8]:
trip_summary_ratios_df.count()

                                                                                

35516

In [11]:
trip_summary_ratios_df.show()

[Stage 38:===>                                                    (1 + 15) / 16]

+----------+-----------+--------------------+--------------------+-----------------------+-----------------------+
|      date|line_number|     avg_delay_ratio|     max_delay_ratio|avg_trip_duration_hours|max_trip_duration_hours|
+----------+-----------+--------------------+--------------------+-----------------------+-----------------------+
|2024-08-18|   000cfbc6|-2.14241513705786...|-2.14241513705786...|     0.8206744409596496|    0.29435483211967384|
|2024-08-03|   000cfbc6|0.039168163744122575|0.039168163744122575|     0.8546818490119048|    0.30655241545898365|
|2024-08-29|   000cfbc6| 0.30704400634318335| 0.30704400634318335|     0.1840896380990262|      0.066028222414538|
|2024-08-16|   000cfbc6|-0.00852008703537...|-0.00852008703537...|     1.0356799015148972|     0.3714717655683966|
|2024-08-06|   000cfbc6|-0.00232726779529...|-0.00232726779529...|     0.9211508388142824|    0.33039313396797537|
|2024-08-13|   000cfbc6|-5.64202669274409...|-5.64202669274409...|     2.7880447

                                                                                