In [0]:
# Load the taxi zone lookup CSV (first row is the header, column types are
# inferred by Spark) and render it for a quick visual check.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/mnt/robot-dreams-source-mount/Lecture_3/nyc_taxi/taxi_zone_lookup.csv')
)
display(df)



In [0]:
%sql
-- Create the catalog that holds every table written by this notebook.
-- IF NOT EXISTS keeps the cell re-runnable: without it a second run fails
-- because the catalog already exists.
CREATE CATALOG IF NOT EXISTS igor_zabelkin_nyc_catalog
MANAGED LOCATION 's3://izabelkin-emr-databricks/igor_zabelkin_nyc_catalog/';


In [0]:
%sql
-- Make igor_zabelkin_nyc_catalog the current catalog for the statements that follow.
USE CATALOG igor_zabelkin_nyc_catalog;


In [0]:
%sql
-- Create the schema for the trip tables in the current catalog.
-- IF NOT EXISTS keeps the cell re-runnable: without it a second run fails
-- because the schema already exists.
CREATE SCHEMA IF NOT EXISTS trips_schema;


In [0]:
%sql
-- Make trips_schema the current schema for the statements that follow.
USE SCHEMA trips_schema;

In [0]:
# Source locations on the mounted bucket.
YELLOW_PATH = "/mnt/robot-dreams-source-mount/home-work-1-unified/nyc_taxi/yellow/"
GREEN_PATH = "/mnt/robot-dreams-source-mount/home-work-1-unified/nyc_taxi/green/"
# ZONES_PATH is defined here for the zone lookup CSV but is not read in this cell.
ZONES_PATH = "/mnt/robot-dreams-source-mount/home-work-1-unified/nyc_taxi/taxi_zone_lookup.csv"

# Read every parquet file found under each root, descending into subdirectories.
yellow_df = (
    spark.read
    .format("parquet")
    .option("recursiveFileLookup", "true")
    .load(YELLOW_PATH)
)
green_df = (
    spark.read
    .format("parquet")
    .option("recursiveFileLookup", "true")
    .load(GREEN_PATH)
)


In [0]:
from pyspark.sql.functions import col, lit

# Stamp each source frame with a constant `taxi_type` label so the origin of
# every row stays known after the two frames are combined.
yellow_df, green_df = (
    frame.withColumn("taxi_type", lit(label))
    for frame, label in ((yellow_df, "yellow"), (green_df, "green"))
)


In [0]:
# Stack yellow and green trips into one frame. Columns are matched by name
# rather than position; allowMissingColumns=False (the default) means both
# frames must expose the same set of column names.
trips_df = yellow_df.unionByName(green_df, allowMissingColumns=False)


In [0]:
from pyspark.sql.functions import unix_timestamp

# Trip duration in minutes: difference of the two timestamps in epoch seconds, divided by 60.
pickup_secs = unix_timestamp("tpep_pickup_datetime")
dropoff_secs = unix_timestamp("tpep_dropoff_datetime")
trips_df = trips_df.withColumn("duration_min", (dropoff_secs - pickup_secs) / 60)

# Keep only rows that pass all three thresholds: distance >= 0.1,
# fare >= 2.0 and duration >= 1 minute. Chained `where` calls are ANDed together.
trips_df = (
    trips_df
    .where(col("trip_distance") >= 0.1)
    .where(col("fare_amount") >= 2.0)
    .where(col("duration_min") >= 1.0)
)


In [0]:
from pyspark.sql.functions import hour, date_format

# Derive two time features from the pickup timestamp: the hour of day and the
# day of week formatted with the "E" datetime pattern.
trips_df = (
    trips_df
    .withColumn("pickup_hour", hour("tpep_pickup_datetime"))
    .withColumn("pickup_day_of_week", date_format("tpep_pickup_datetime", "E"))
)


In [0]:
# Load the taxi zone lookup (header row present; all columns are read as strings).
taxi_zones_df = spark.read.option("header", True).csv(
    "s3a://robot-dreams-source-data/Lecture_3/nyc_taxi/taxi_zone_lookup.csv"
)

# Keep only the key and the zone name. The lookup is joined twice below, so any
# other column it carries (presumably Borough / service_zone, as in the public
# TLC file - confirm against the CSV) would end up in trips_df twice, and the
# Delta write would then be rejected for duplicate column names.
zone_names_df = taxi_zones_df.select("LocationID", "Zone")

# Make the cell safe to re-run: drop zone columns left over from a previous
# execution (a no-op on the first run, when they do not exist yet).
trips_df = trips_df.drop("tpep_pickup_zone", "tpep_dropoff_zone")

# JOIN for the pickup zone: a left join keeps trips whose PULocationID has no match.
pickup_zones_df = (
    zone_names_df
    .withColumnRenamed("LocationID", "PULocationID")
    .withColumnRenamed("Zone", "tpep_pickup_zone")
)
trips_df = trips_df.join(pickup_zones_df, on="PULocationID", how="left")

# JOIN for the dropoff zone: a left join keeps trips whose DOLocationID has no match.
dropoff_zones_df = (
    zone_names_df
    .withColumnRenamed("LocationID", "DOLocationID")
    .withColumnRenamed("Zone", "tpep_dropoff_zone")
)
trips_df = trips_df.join(dropoff_zones_df, on="DOLocationID", how="left")




In [0]:
# Persist the enriched trips as a Delta table, replacing any existing contents.
(
    trips_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("igor_zabelkin_nyc_catalog.trips_schema.raw_trips")
)




In [0]:
%sql
-- Preview ten rows of the persisted trips table.
SELECT *
  FROM igor_zabelkin_nyc_catalog.trips_schema.raw_trips
 LIMIT 10;


In [0]:
# max/min are imported under private aliases (matching the existing `_sum`
# convention) so that Python's builtin max()/min() are not shadowed for the
# rest of the notebook.
from pyspark.sql.functions import (
    col, count, avg, max as _max, min as _min, sum as _sum, expr
)

# Per-pickup-zone summary: trip volume, average distance / total / tip,
# distance and tip extremes, total revenue, and the share of yellow vs green trips.
zone_summary_df = trips_df.groupBy("tpep_pickup_zone").agg(
    count("*").alias("total_trips"),
    avg("trip_distance").alias("avg_trip_distance"),
    avg("total_amount").alias("avg_total_amount"),
    avg("tip_amount").alias("avg_tip_amount"),
    # Per-type trip counts; only needed to derive the shares below.
    _sum(expr("CASE WHEN taxi_type = 'yellow' THEN 1 ELSE 0 END")).alias("yellow_trips"),
    _sum(expr("CASE WHEN taxi_type = 'green' THEN 1 ELSE 0 END")).alias("green_trips"),
    _max("trip_distance").alias("max_trip_distance"),
    _min("tip_amount").alias("min_tip_amount"),
    _sum("total_amount").alias("total_trip_amount")
).withColumn(
    "yellow_share", col("yellow_trips") / col("total_trips")
).withColumn(
    "green_share", col("green_trips") / col("total_trips")
).drop("yellow_trips", "green_trips")  # helper counts are not part of the output


In [0]:
# Persist the per-zone summary as a Delta table, replacing any existing contents.
(
    zone_summary_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("igor_zabelkin_nyc_catalog.trips_schema.zone_summary")
)


In [0]:
%sql
-- Preview ten rows of the per-zone summary table.
SELECT *
  FROM igor_zabelkin_nyc_catalog.trips_schema.zone_summary
 LIMIT 10;

In [0]:
from pyspark.sql.functions import col, when, count, avg, sum as _sum

# Add the is_high_fare column: 1 when fare_amount exceeds 30, otherwise 0.
high_fare_flag = when(col("fare_amount") > 30, 1).otherwise(0)
trips_df = trips_df.withColumn("is_high_fare", high_fare_flag)

# Aggregate by pickup_day_of_week: trip count, mean duration, and the
# fraction of trips flagged as high fare.
# NOTE(review): the alias "avg_duration_per_zone" is computed per day of week,
# not per zone; the name is kept as-is so the output table schema is unchanged.
day_metrics = [
    count("*").alias("total_trips_per_day"),
    avg("duration_min").alias("avg_duration_per_zone"),
    (_sum("is_high_fare") / count("*")).alias("high_fare_share"),
]
agg_by_day_df = trips_df.groupBy("pickup_day_of_week").agg(*day_metrics)


In [0]:
# Persist the per-weekday summary as a Delta table, replacing any existing contents.
(
    agg_by_day_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("igor_zabelkin_nyc_catalog.trips_schema.zone_days_summary")
)


In [0]:
%sql
-- Preview ten rows of the per-weekday summary table.
SELECT *
  FROM igor_zabelkin_nyc_catalog.trips_schema.zone_days_summary
 LIMIT 10;