In [10]:
import polars as pl

from capstone_project_1.config import EXTERNAL_DATA_DIR, REPORTS_DIR

## Data

About the data:

- fhv: **For-Hire Vehicle** trip records
- fhvhv: **High Volume For-Hire Vehicle** trip records
- green: **Green** taxi trip records
- yellow: **Yellow** taxi trip records

Official Data Dictionaries:

- https://www.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf

In [None]:
# Load the data
# One parquet file per service type, all for 2024-11, read from
# EXTERNAL_DATA_DIR / "ny-taxi-data". The files are read in the order listed.
fhv, fhvhv, green, yellow = (
    pl.read_parquet(
        EXTERNAL_DATA_DIR / "ny-taxi-data" / f"{service}_tripdata_2024-11.parquet"
    )
    for service in ("fhv", "fhvhv", "green", "yellow")
)

In [3]:
# Preview the first rows of the FHV frame to inspect its columns and dtypes.
# NOTE(review): in the rows displayed, PUlocationID and SR_Flag are all null and
# DOlocationID is mostly null — check null rates before relying on these columns.
fhv.head()

dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
str,datetime[μs],datetime[μs],i64,i64,i64,str
"""B00008""",2024-11-01 00:24:00,2024-11-01 00:44:00,,,,"""B00008"""
"""B00008""",2024-11-01 00:30:00,2024-11-01 01:00:00,,,,"""B00008"""
"""B00013""",2024-11-01 00:30:00,2024-11-01 01:15:00,,,,"""B00381"""
"""B00014""",2024-11-01 00:01:00,2024-11-01 00:40:00,,,,"""B01875"""
"""B00112""",2024-11-01 00:47:42,2024-11-01 00:52:11,,14.0,,"""B00112"""


In [4]:

# Downcast selected FHVHV columns to narrower dtypes; every column not named
# here keeps the dtype it was loaded with.
# NOTE(review): UInt16 tops out at 65535 — this assumes location IDs and
# trip_time never exceed that; confirm for other months.
# NOTE(review): congestion_surcharge is not in the Float32 group and stays
# Float64 — confirm whether that is intentional.
fhvhv = fhvhv.with_columns(
    # Unsigned 16-bit integers: location IDs and trip duration.
    pl.col(["DOLocationID", "PULocationID", "trip_time"]).cast(pl.UInt16),
    # 32-bit floats: distance and monetary amounts.
    pl.col(
        [
            "trip_miles",
            "base_passenger_fare",
            "tolls",
            "bcf",
            "sales_tax",
            "airport_fee",
            "tips",
            "driver_pay",
        ]
    ).cast(pl.Float32),
)


In [6]:
# Show a single row of the FHVHV frame; the dtype row of the output reflects
# the downcast applied in the previous cell (u16 / f32 columns).
fhvhv.head(1)

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[μs],datetime[μs],datetime[μs],datetime[μs],u16,u16,f32,u16,f32,f32,f32,f32,f64,f32,f32,f32,str,str,str,str,str
"""HV0003""","""B03404""","""B03404""",2024-10-31 23:53:04,2024-11-01 00:02:38,2024-11-01 00:02:38,2024-11-01 00:27:46,87,170,4.79,1508,29.379999,0.0,0.86,2.77,2.75,0.0,0.0,31.889999,"""N""","""N""","""N""","""N""","""N"""


In [7]:
# Preview the first rows of the green-taxi frame to inspect its columns and dtypes.
# NOTE(review): ehail_fee is null in every row displayed — check whether the
# column carries any data at all.
green.head()

VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
i32,datetime[μs],datetime[μs],str,i64,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64
2,2024-11-01 00:09:19,2024-11-01 00:49:44,"""N""",5,97,50,1,6.68,65.0,0.0,0.0,0.0,0.0,,1.0,68.75,1,2,2.75
2,2024-11-01 00:28:47,2024-11-01 00:34:16,"""N""",1,166,41,1,1.44,8.6,1.0,0.5,3.0,0.0,,1.0,14.1,1,1,0.0
2,2024-11-01 00:24:54,2024-11-01 00:26:55,"""N""",1,129,82,1,0.21,4.4,1.0,0.5,0.0,0.0,,1.0,6.9,2,1,0.0
1,2024-11-01 00:03:21,2024-11-01 00:43:38,"""N""",1,66,164,2,6.6,38.7,3.75,1.5,10.0,0.0,,1.0,53.95,1,1,2.75
2,2024-11-01 00:05:54,2024-11-01 00:12:42,"""N""",1,83,83,1,0.97,8.6,1.0,0.5,2.22,0.0,,1.0,13.32,1,1,0.0


In [8]:
# Preview the first rows of the yellow-taxi frame to inspect its columns and dtypes.
# NOTE(review): the first two rows displayed share the same timestamps but carry
# opposite-sign amounts (e.g. fare_amount -12.8 / 12.8) — negative charges exist
# in this data; decide how to treat them before aggregating.
yellow.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
i32,datetime[μs],datetime[μs],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2024-11-01 00:46:24,2024-11-01 00:57:17,1,1.93,1,"""N""",239,262,2,-12.8,-1.0,-0.5,0.0,0.0,-1.0,-17.8,-2.5,0.0
2,2024-11-01 00:46:24,2024-11-01 00:57:17,1,1.93,1,"""N""",239,263,2,12.8,1.0,0.5,0.0,0.0,1.0,17.8,2.5,0.0
1,2024-11-01 00:37:36,2024-11-01 01:28:36,1,34.3,5,"""N""",219,265,1,259.0,0.0,0.0,15.0,0.0,1.0,275.0,0.0,0.0
2,2024-11-01 00:12:55,2024-11-01 00:22:17,2,0.93,1,"""N""",186,107,1,10.0,1.0,0.5,1.0,0.0,1.0,16.0,2.5,0.0
2,2024-11-01 00:54:45,2024-11-01 00:59:47,1,0.38,1,"""N""",79,79,1,6.5,1.0,0.5,1.0,0.0,1.0,12.5,2.5,0.0


## Automated Exploratory Data Analysis

In [None]:
from ydata_profiling import ProfileReport


def _build_profile(frame, title, service):
    """Create a ProfileReport for one trip-record frame.

    Args:
        frame: polars DataFrame to profile; converted with ``to_pandas()``.
        title: Title passed to the report.
        service: File-name prefix ("fhv", "fhvhv", "green", "yellow") used to
            form the source URL stored in the report's ``dataset`` metadata.

    Returns:
        The constructed ``ProfileReport``.
    """
    source_url = (
        f"https://d37ci6vzurychx.cloudfront.net/trip-data/{service}_tripdata_2024-11.parquet"
    )
    return ProfileReport(frame.to_pandas(), title=title, dataset={"url": source_url})


# Profile the data: one report per service type, built in the same order as loaded.
fhv_profile = _build_profile(fhv, "For-Hire Vehicle (FHV) Data", "fhv")
fhvhv_profile = _build_profile(fhvhv, "High Volume For-Hire Vehicle (FHVHV) Data", "fhvhv")
green_profile = _build_profile(green, "Green Taxi Data", "green")
yellow_profile = _build_profile(yellow, "Yellow Taxi Data", "yellow")

In [9]:
# Save the reports
save_path = REPORTS_DIR / "ny-taxi-data"
if not save_path.exists():
    save_path.mkdir(parents=True)

fhv_profile.to_file(save_path / "fhv_profile.html")
fhvhv_profile.to_file(save_path / "fhvhv_profile.html")
green_profile.to_file(save_path / "green_profile.html")
yellow_profile.to_file(save_path / "yellow_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]