# Downloading Dataset from NYC Trip Data
The NYC Taxi & Limousine Commission (TLC) website provides historical taxi trip data for NYC. It can be accessed at https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [10]:
from pathlib import Path
import requests

def download_one_file_of_raw_data(year: int, month: int) -> Path:
    """
    Download one month of yellow-taxi trip records from the NYC trip data CDN.

    The parquet file is fetched over HTTP and written to
    ``../data/raw/rides_{year}--{month:02d}.parquet`` (relative to the current
    working directory). The target folder is created when it does not exist.

    Args:
        year: Year of the file to download, e.g. 2023.
        month: Month number; rendered as two digits in the URL and file name.

    Returns:
        Path of the file written to disk.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'  # month:02d -> zero-padded 2-digit month
    # requests has no default timeout; without one a stalled connection would block forever
    response = requests.get(URL, timeout=60)

    if response.status_code != 200:  # anything but 200 means the file could not be fetched
        raise Exception(f'{URL} is not available')

    path = Path(f'../data/raw/rides_{year}--{month:02d}.parquet')
    # Make the function usable on a fresh checkout where the data folder is missing
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes opens the file in binary mode and closes it again
    path.write_bytes(response.content)
    return path


In [None]:
# Fetch the January 2023 yellow-taxi file; the cell output is the path of the saved file
download_one_file_of_raw_data(year=2023, month=1)

In [14]:
import pandas as pd

# Read the raw January 2023 parquet file into a DataFrame
raw_rides_file = '../data/raw/rides_2023--01.parquet'
rides = pd.read_parquet(raw_rides_file)

# Preview the first five rows (head() defaults to n=5)
rides.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [15]:
# Keep only the pickup timestamp and the pickup zone id
required_columns = ['tpep_pickup_datetime', 'PULocationID']
rides = rides[required_columns]

# Preview the first five rows (head() defaults to n=5)
rides.head()

Unnamed: 0,tpep_pickup_datetime,PULocationID
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107


In [17]:
# Renaming the columns
# `rename` returns a new frame; re-binding `rides` avoids mutating in place
# (in-place edits on a frame derived from a column selection can trigger
# pandas' SettingWithCopyWarning).

rides = rides.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id',
})

rides.head(5)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107


In [18]:
# Summarise the pickup timestamps to check the time span covered by the file
rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 3066766
unique                1610975
top       2023-01-11 19:22:56
freq                       12
first     2008-12-31 23:01:42
last      2023-02-01 00:56:53
Name: pickup_datetime, dtype: object

In [19]:
# Keep only rides picked up in January 2023: from 2023-01-01 (inclusive) up to 2023-02-01 (exclusive)
pickup_times = rides['pickup_datetime']
in_january_2023 = (pickup_times >= '2023-01-01') & (pickup_times < '2023-02-01')
rides = rides[in_january_2023]

# Re-check the time span after filtering
rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 3066718
unique                1610927
top       2023-01-11 19:22:56
freq                       12
first     2023-01-01 00:00:00
last      2023-01-31 23:59:59
Name: pickup_datetime, dtype: object

In [20]:
# Persist the filtered rides as a parquet file in the transformed-data folder
rides.to_parquet('../data/transformed/validated_rides_2023_01.parquet')