In [None]:
from pyspark.sql import SparkSession, functions as fn
from pyspark import SparkFiles

In [None]:
# Obtain the Spark session for this notebook: reuses an existing session when
# one is already active, otherwise creates one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Display the version of Spark this session is running on.
spark.version

In [None]:
# Display the local path Spark uses for the trip-data file.
# NOTE(review): this cell runs before the `addFile` call in the following cell,
# so on a fresh kernel the file has not been downloaded to this path yet —
# confirm whether this cell is still needed.
SparkFiles.get('yellow_tripdata_2024-10.parquet')

In [None]:

# Download the October 2024 Yellow Taxi trip data and load it as a DataFrame.
data_file = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet"
spark.sparkContext.addFile(data_file)
# The added file is looked up by its file name, i.e. the last path segment of
# the URL; derive it from `data_file` so the two cannot drift apart.
filePath = "file://" + SparkFiles.get(data_file.rsplit("/", 1)[-1])
# Parquet files carry their own schema, so the CSV-only `header` and
# `inferSchema` options are not passed to the reader.
yellow_trips = spark.read.parquet(filePath)

In [None]:
# Download the taxi zone lookup table (CSV) and load it as a DataFrame.
zone_lookup_file = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
spark.sparkContext.addFile(zone_lookup_file)
filePath = 'file://' + SparkFiles.get('taxi_zone_lookup.csv')
# The first row holds the column names; column types are inferred from the data.
zone_lookup = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(filePath)
)

In [None]:
# Print the column names and types of the zone lookup table.
zone_lookup.printSchema()

In [None]:
# Print the column names and types of the trip data.
yellow_trips.printSchema()

In [None]:
# Redistribute the trips into 4 partitions and write them out as Parquet,
# replacing any output left by a previous run.
(
    yellow_trips
    .repartition(4)
    .write
    .mode("overwrite")
    .parquet("partitioned.parquet")
)

In [None]:
# List the files written to the output directory with human-readable sizes.
! ls -lh partitioned.parquet/

## Question 3: Count records 

How many trips were started on the 15th of October?

In [None]:
# Count the trips that started on 2024-10-15. Only the pickup timestamp is
# checked: a trip that began on the 15th still counts when its drop-off falls
# after midnight, so no condition is placed on the drop-off time.
(
    yellow_trips
    .filter(fn.date_trunc("day", "tpep_pickup_datetime") == fn.lit("2024-10-15"))
    .count()
)

## Question 5: User Interface

On which local port does Spark’s User Interface, which shows the application's dashboard, run?

In [None]:
# Display the URL of the Spark web UI for this application; the port is the
# number after the last colon.
spark.sparkContext.uiWebUrl


## Question 4: Longest trip

What is the length of the longest trip in the dataset in hours?


In [None]:
# Add the trip length in hours: the difference between the drop-off and pickup
# times in epoch seconds, divided by 3600.
pickup_seconds = fn.unix_timestamp("tpep_pickup_datetime")
dropoff_seconds = fn.unix_timestamp("tpep_dropoff_datetime")
yellow_trips = yellow_trips.withColumn(
    "trip_duration",
    (dropoff_seconds - pickup_seconds) / 3600,
)

In [None]:
# Show the trips sorted from longest to shortest duration; the first row is
# the longest trip.
yellow_trips.orderBy("trip_duration", ascending=False).show()

## Question 6: Least frequent pickup location zone

Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone?

In [None]:
# Attach zone names to both ends of every trip. The lookup table is projected
# twice, once keyed for the pickup location and once for the drop-off location,
# so each join can match on the trip column of the same name. Left joins keep
# trips whose location ID has no entry in the lookup table.
pickup_zones = zone_lookup.select(
    fn.col("locationID").alias("PULocationID"),
    fn.col("Zone").alias("pickup_zone"),
)
dropoff_zones = zone_lookup.select(
    fn.col("locationID").alias("DOLocationID"),
    fn.col("Zone").alias("dropoff_zone"),
)
yellow_trips_enriched = (
    yellow_trips
    .join(pickup_zones, on="PULocationID", how="left")
    .join(dropoff_zones, on="DOLocationID", how="left")
)

In [None]:
# Count the trips per pickup zone and list the zones from fewest trips upward,
# without truncating long zone names.
trips_per_pickup_zone = (
    yellow_trips_enriched
    .groupBy("pickup_zone")
    .agg(fn.count("*").alias("no_trips"))
    .orderBy("no_trips")
)
trips_per_pickup_zone.show(truncate=False)

In [None]:
# Show the individual trips that were picked up in the zone named below.
target_zone = "Governor's Island/Ellis Island/Liberty Island"
yellow_trips_enriched.where(fn.col("pickup_zone") == target_zone).show()