Let's create a function to download one file of raw data from NYC data from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [12]:
import os
import requests
from pathlib import Path

# src/data.py
def download_one_file_of_raw_data(year: int, month: int) -> Path:

    URL = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet"
    response = requests.get(URL)

    if response.status_code == 200:
        os.makedirs("../data/raw", exist_ok=True)
        path = f"../data/raw/rides_{year}-{month:02d}.parquet"
        with open(path, 'wb') as f:
            f.write(response.content)
        return path
    else:
        raise Exception(f"{URL} is not available")

In [13]:
download_one_file_of_raw_data(year=2022, month=1)

'../data/raw/rides_2022-01.parquet'

Loading the downloaded data

In [14]:
import pandas as pd

rides = pd.read_parquet("../data/raw/rides_2022-01.parquet")

rides.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


Picking only the needed columns

In [15]:
rides = rides[['tpep_pickup_datetime', 'PULocationID']].copy()

Renaming the columns

In [16]:
rides = rides.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id'
})

rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68


The data downloaded has some data from outside January 2022, and it shouldn't. Let's validate it by filtering only rides from January 2022.

In [17]:
rides['pickup_datetime'].describe()

count                          2463931
mean     2022-01-17 01:19:51.689726208
min                2008-12-31 22:23:09
25%                2022-01-09 15:37:41
50%                2022-01-17 12:11:45
75%         2022-01-24 13:49:37.500000
max                2022-05-18 20:41:57
Name: pickup_datetime, dtype: object

In [18]:
# validating raw data by filtering it
rides = rides[rides['pickup_datetime'] >= '2022-01-01']
rides = rides[rides['pickup_datetime'] < '2022-02-01']
rides['pickup_datetime'].describe()

count                          2463879
mean     2022-01-17 01:58:40.393673472
min                2022-01-01 00:00:08
25%                2022-01-09 15:37:56
50%                2022-01-17 12:11:54
75%                2022-01-24 13:49:37
max                2022-01-31 23:59:58
Name: pickup_datetime, dtype: object

In [21]:
# src/data.py
def validate_raw_data(
    rides: pd.DataFrame,
    year: int,
    month: int,
) -> pd.DataFrame:
    """
    Removes rows with pickup_datetimes outside their valid range
    """
    # keep only rides for this month
    this_month_start = f'{year}-{month:02d}-01'
    next_month_start = f'{year}-{month+1:02d}-01' if month < 12 else f'{year+1}-01-01'
    rides = rides[rides.pickup_datetime >= this_month_start]
    rides = rides[rides.pickup_datetime < next_month_start]
    
    return rides

In [23]:
validate_raw_data(rides, year=2022, month=1).describe()

Unnamed: 0,pickup_datetime,pickup_location_id
count,2463879,2463879.0
mean,2022-01-17 01:58:40.393673472,166.0769
min,2022-01-01 00:00:08,1.0
25%,2022-01-09 15:37:56,132.0
50%,2022-01-17 12:11:54,162.0
75%,2022-01-24 13:49:37,234.0
max,2022-01-31 23:59:58,265.0
std,,65.46809


In [24]:
os.makedirs('../data/transformed', exist_ok=True)
rides.to_parquet('../data/transformed/validated_rides_2022_01.parquet')