## NYC Yellow Taxi 2019

In [0]:
# List the yellow-taxi trip files available under the Databricks datasets mount.
# (To browse the root folder instead, run `display(dbutils.fs.ls("/databricks-datasets/"))`.)
display(
    dbutils.fs.ls("/databricks-datasets/nyctaxi/tripdata/yellow/")
)

In [0]:
from pyspark.sql.functions import min, max, col, unix_timestamp, date_format, hour

class NYCTaxiData:
    """One month of NYC yellow-taxi trip records, both raw and cleaned.

    Attributes:
        month: Zero-padded month string, e.g. "08".
        year: Year string, e.g. "2019".
        df: DataFrame exactly as read from the month's CSV file.
        clean_df: ``df`` after ``clean_data`` has been applied.
    """

    def __init__(self, month, year):
        """Read and clean the trip data for the given month.

        Args:
            month: Month number 1-12, as an int or a string ("8" or "08").
            year: Calendar year, as an int or a string.

        Raises:
            ValueError: If ``month`` is not a number between 1 and 12.
        """
        month_number = int(month)
        if not 1 <= month_number <= 12:
            raise ValueError(f"month must be between 1 and 12, got {month!r}")
        # Normalise so the file name and the date literals built from these
        # values are well formed regardless of how the caller spelled them.
        self.month = f"{month_number:02d}"
        self.year = str(year)
        self.df = self.read_data()
        self.clean_df = self.clean_data()

    def read_data(self):
        """Load the month's gzipped CSV, using the header row and an inferred schema."""
        return spark.read \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .csv(f"dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_{self.year}-{self.month}.csv.gz")

    def _next_month(self):
        """Return ``(month, year)`` strings for the month following this one.

        December rolls over to January of the following year; the month is
        always zero-padded to two digits.
        """
        month_number = int(self.month)
        year_number = int(self.year)
        if month_number == 12:
            return "01", str(year_number + 1)
        return f"{month_number + 1:02d}", str(year_number)

    def clean_data(self):
        """Return ``self.df`` restricted to plausible trips within the month.

        Steps applied, in order:
          * drop rows missing the pickup or dropoff datetime;
          * keep only trips whose pickup AND dropoff both fall inside the
            month (first day inclusive, first day of next month exclusive);
          * keep only trips lasting between 0 and 86400 seconds (24 hours),
            both ends inclusive;
          * add ``pickup_date`` / ``dropoff_date`` (``yyyy-MM-dd``) and
            ``pickup_hour`` / ``dropoff_hour`` (0-23) columns.
        """
        next_month, year_of_next_month = self._next_month()
        print(f"Cleaning data: The next_month is: {next_month}, year of the next month is: {year_of_next_month}\n")

        month_start = f"{self.year}-{self.month}-01"
        next_month_start = f"{year_of_next_month}-{next_month}-01"

        pickup = col("tpep_pickup_datetime")
        dropoff = col("tpep_dropoff_datetime")
        trip_seconds = unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")

        clean_df = self.df\
            .na.drop(subset=["tpep_pickup_datetime", "tpep_dropoff_datetime"])\
            .filter(
                (pickup >= month_start) &
                (pickup < next_month_start) &
                (dropoff >= month_start) &
                (dropoff < next_month_start)
            )\
            .filter(trip_seconds.between(0, 86400))\
            .withColumn("pickup_date", date_format(pickup, "yyyy-MM-dd"))\
            .withColumn("pickup_hour", hour(pickup))\
            .withColumn("dropoff_date", date_format(dropoff, "yyyy-MM-dd"))\
            .withColumn("dropoff_hour", hour(dropoff))

        return clean_df

    def len(self):
        """Return the number of rows in the cleaned data."""
        return self.clean_df.count()

    def original_len(self):
        """Return the number of rows in the raw data, before cleaning."""
        return self.df.count()

    def pickups_per_time_interval(self, time_interval, top_n):
        """Return the ``top_n`` pickup dates or hours with the most pickups.

        Args:
            time_interval: "date" or "hour"; selects the ``pickup_date`` or
                ``pickup_hour`` column of the cleaned data to group by.
            top_n: Maximum number of rows to return.

        Returns:
            A DataFrame with the grouping column and ``num_pickups``,
            ordered by ``num_pickups`` descending.
        """
        pickups_per_time_interval = self.clean_df\
            .groupby(f"pickup_{time_interval}")\
            .count()\
            .withColumnRenamed("count", "num_pickups") \
            .orderBy("num_pickups", ascending=False)\
            .limit(top_n)
        return pickups_per_time_interval


In [0]:
# Load and clean the August 2019 trips, then report how many rows remain after cleaning.
nyc_2019_08_df = NYCTaxiData(month="08", year="2019")
print(f"Number of rows: {nyc_2019_08_df.len()}")

In [0]:
# Ten pickup hours (0-23) with the most pickups across August 2019.
nyc_2019_08_df.pickups_per_time_interval(time_interval="hour", top_n=10).show()

In [0]:
# Five dates in August 2019 with the most pickups.
nyc_2019_08_df.pickups_per_time_interval(time_interval="date", top_n=5).show()

In [0]:
# Load and clean the September 2019 trips, then report how many rows remain after cleaning.
nyc_2019_09_df = NYCTaxiData(month="09", year="2019")
print(f"Number of rows: {nyc_2019_09_df.len()}")

In [0]:
# Five dates in September 2019 with the most pickups.
nyc_2019_09_df.pickups_per_time_interval(time_interval="date", top_n=5).show()