In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import geopandas as gpd
import re
import os
import zipfile
import duckdb

## Retrieving the data links

In [2]:
# make a request to the website - NYC Taxi & Limousine Commission TLC Trip Record Data
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a")

In [3]:
# extract all the yellow taxi link from the webpage
taxi_links = list()
for link in links:
    cur_link = link.get("href")
    data_link = re.findall(r".+\.parquet$", cur_link)
    if data_link:
        cur_link2 = data_link[0]
        is_yellow = re.findall(r".*yellow.*", cur_link2)
        if is_yellow:
            taxi_links.append(is_yellow[0])

## Viewing sample data

In [4]:
# sample data
df1 = pd.read_parquet("./temp-data/taxi-2021-1.parquet", engine="pyarrow")

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[ns]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64       
 11  extra                  1369769 non-null  float64       
 12  mta_tax                13697

In [6]:
df1.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,


## Data Analysis using DuckDB

### References:
1. [PySpark @ RealPython](https://realpython.com/pyspark-intro/)
2. [NYC Data Dictionary](https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)
3. [DuckDB - Query Syntax](https://duckdb.org/docs/sql/functions/timestamp.html)

- Questions:
    - What is the yearly/daily/hourly taxi traffic for each taxi zone in NYC?
    - When is a driver more likely to get higher amount of tip?
    - When/where a driver is more likely to get a customer(s)?
    - For each taxi zone, how far a taxi driver is likely to drive? Is there any particular zone with particularly higher overall trip distance?

### Reading in Each Month's Data (parquet files)

1. Thought about compressing parquet files, but [it is inefficient](https://stackoverflow.com/questions/60774906/how-to-write-a-parquet-bytes-object-as-zipfile-to-disk).
2. Tried combining all of them into a single SQL database, but it takes up too much storage.
3. Tried reading and analyzing monthly data individually from 2011 to 2022.
   - Changed the scope to from 2016 to 2022 due to column specifiation (DOLocationID & PULocationID)
4. Thought about using API, but it's much more time consuming (working with a generator with millions of rows).
5. **Currently trying to work with `pandas` and `duckdb`**
    - `dask` does not work when trying to read parquet files from the web due to "encoding error."

## Preparing Analysis

In [7]:
taxi_links[:5]

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet']

In [8]:
# update taxi_links: only select data between 2016 and 2022
taxi_links_11_22 = list()
for link in taxi_links:
    cur_year = link.split("_")[-1].split("-")[0]
    if 2011 <= int(cur_year) and int(cur_year) <= 2022:
        taxi_links_11_22.append(link)
taxi_links_11_22[-5:]

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-08.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-09.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-10.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-11.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-12.parquet']

In [9]:
print(len(taxi_links), ",", len(taxi_links_11_22))

166 , 142


### Efficiently Load Data from Large Parquet Files

1. Load less data by specifying desired columns
2. Change data type into more efficient formats
3. Use `duckdb` to query each dataframe

In [10]:
columns_to_use = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "total_amount", "tip_amount", "trip_distance"]
jan_2011 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2011-01.parquet", columns = columns_to_use)
jan_2011

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance
0,2011-01-01 00:10:00,2011-01-01 00:12:00,145,4.18,0.28,0.0
1,2011-01-01 00:04:00,2011-01-01 00:13:00,264,6.94,0.24,0.0
2,2011-01-01 00:14:00,2011-01-01 00:16:00,264,5.01,1.11,0.0
3,2011-01-01 00:04:00,2011-01-01 00:06:00,146,3.90,0.00,0.0
4,2011-01-01 00:08:00,2011-01-01 00:08:00,146,3.61,0.11,0.0
...,...,...,...,...,...,...
13464992,2011-01-31 23:38:17,2011-01-31 23:40:39,114,4.30,0.00,0.3
13464993,2011-01-31 23:49:55,2011-02-01 00:01:00,249,8.70,0.00,1.8
13464994,2011-01-31 23:23:48,2011-01-31 23:34:27,87,8.70,0.00,1.8
13464995,2011-01-31 23:48:06,2011-01-31 23:59:51,13,11.50,0.00,3.1


In [11]:
jan_2011.memory_usage(deep=True)

Index                          128
tpep_pickup_datetime     107719976
tpep_dropoff_datetime    107719976
PULocationID             107719976
total_amount             107719976
tip_amount               107719976
trip_distance            107719976
dtype: int64

In [12]:
jan_2011[["PULocationID", "total_amount", "tip_amount", "trip_distance"]] = jan_2011[["PULocationID", "total_amount", "tip_amount", "trip_distance"]].apply(pd.to_numeric, downcast='float')

In [13]:
jan_2011.memory_usage(deep=True)

Index                          128
tpep_pickup_datetime     107719976
tpep_dropoff_datetime    107719976
PULocationID              53859988
total_amount              53859988
tip_amount                53859988
trip_distance             53859988
dtype: int64

In [14]:
jan_2011.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance
0,2011-01-01 00:10:00,2011-01-01 00:12:00,145.0,4.18,0.28,0.0
1,2011-01-01 00:04:00,2011-01-01 00:13:00,264.0,6.94,0.24,0.0
2,2011-01-01 00:14:00,2011-01-01 00:16:00,264.0,5.01,1.11,0.0
3,2011-01-01 00:04:00,2011-01-01 00:06:00,146.0,3.9,0.0,0.0
4,2011-01-01 00:08:00,2011-01-01 00:08:00,146.0,3.61,0.11,0.0


## Understand the data

In order to understand the data more in depth, it is crucial to know how they are recorded. First, make sure to read the data description from the [NYC TLC data dictionary](https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)
and study how [taximeter works](https://www.staxi.nl/en/how-taximeters-work/#:~:text=How%20a%20taximeter%20works,taxi%20travels%20a%20certain%20distance.). There are few irregular rows and deciding what to do with them heavily depends on these domain knowledge. 

## Clean Data

Filtered out the followings rows:

1. Rides with `trip_distance` of 0.
2. Rides with `trip_time` <= 1.
3. Rides that would require the speed of more than 100 mph.

Let's create a new column `trip_time` which indicates the total time spent per ride. Notice there are few rides with negative `trip_time` which does not make sense. Let's filter and take a look at those rows.

In [15]:
jan_2011 = duckdb.query("""
    SELECT *, DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) AS trip_time
    FROM jan_2011
""").df()

In [16]:
duckdb.query("""
    SELECT *
    FROM jan_2011
    WHERE trip_time < 0
""").df()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time
0,2011-01-02 15:40:54,2011-01-02 15:39:58,264.0,7.25,1.45,0.8,-1
1,2011-01-07 17:52:22,2011-01-07 17:16:44,264.0,7.65,1.25,0.0,-36
2,2011-01-07 17:52:28,2011-01-07 17:44:28,264.0,7.00,1.00,0.0,-8
3,2011-01-07 22:57:36,2011-01-07 21:41:45,264.0,7.05,1.15,0.0,-76
4,2011-01-08 23:10:00,2011-01-08 20:31:47,264.0,11.40,1.90,0.0,-159
...,...,...,...,...,...,...,...
402,2011-01-31 23:32:45,2011-01-31 22:41:28,264.0,15.70,2.60,0.0,-51
403,2011-01-31 23:34:05,2011-01-31 23:24:19,264.0,11.85,1.95,0.0,-10
404,2011-01-31 23:21:30,2011-01-31 21:16:03,264.0,11.85,1.95,0.0,-125
405,2011-01-31 23:21:32,2011-01-31 21:33:32,264.0,12.50,1.00,0.0,-108


Notice nearly all of them are from `PULocatonID` of 264 which is unknown borough according to [NYC Taxi Zone Code](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).

In [17]:
duckdb.query("""
    SELECT PULocationID, COUNT(*) AS cnt
    FROM jan_2011
    WHERE DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) < 0
    GROUP BY PULocationID
""").df()

Unnamed: 0,PULocationID,cnt
0,264.0,406
1,145.0,1


There is a single row whose `PULocationID` is not 264 (Unknown location doe) and has a negative `trip_time`. Notice its `trip_distance` is also 0.

In [18]:
duckdb.query("""
    SELECT *
    FROM jan_2011
    WHERE DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) < 0 AND PULocationID = 145.0
""").df()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time
0,2011-01-12 02:19:31,2011-01-11 18:21:50,145.0,45.5,0.0,0.0,-478


Let's see how many rows are of `trip_distance` 0. There are quite a few of them (n=76093). 

In [19]:
duckdb.query("""
    SELECT *, DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) AS trip_time
    FROM jan_2011
    WHERE trip_distance = 0
""").df()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time,trip_time_2
0,2011-01-01 00:10:00,2011-01-01 00:12:00,145.0,4.18,0.28,0.0,2,2
1,2011-01-01 00:04:00,2011-01-01 00:13:00,264.0,6.94,0.24,0.0,9,9
2,2011-01-01 00:14:00,2011-01-01 00:16:00,264.0,5.01,1.11,0.0,2,2
3,2011-01-01 00:04:00,2011-01-01 00:06:00,146.0,3.90,0.00,0.0,2,2
4,2011-01-01 00:08:00,2011-01-01 00:08:00,146.0,3.61,0.11,0.0,0,0
...,...,...,...,...,...,...,...,...
76088,2011-01-31 23:51:55,2011-01-31 23:55:25,186.0,5.10,0.00,0.0,4,4
76089,2011-01-31 23:34:41,2011-01-31 23:34:58,132.0,3.50,0.00,0.0,0,0
76090,2011-01-31 23:47:35,2011-01-31 23:47:40,132.0,3.50,0.00,0.0,0,0
76091,2011-01-31 23:32:41,2011-01-31 23:33:17,163.0,3.50,0.00,0.0,1,1


What are the `trip_time` of those trips whose `trip_distance` is 0?

- Notice many for them have `trip_time` of 0, 1, or negative values. Let's filter those rows out.

In [20]:
duckdb.query("""
    SELECT trip_time, COUNT(*) AS cnt
    FROM
        (SELECT *, DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) AS trip_time
        FROM jan_2011
        WHERE trip_distance = 0) AS t1
    GROUP BY trip_time
""").df()

Unnamed: 0,trip_time,cnt
0,2,3686
1,9,1193
2,0,27442
3,3,1456
4,1,22762
...,...,...
319,-25,1
320,-142,1
321,127,1
322,-52,1


In [21]:
jan_2011 = duckdb.query("""
    SELECT *
    FROM jan_2011
    WHERE trip_distance != 0
""").df()
jan_2011.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time
0,2011-01-01 00:58:10,2011-01-01 01:15:35,138.0,21.1,0.0,8.0,17
1,2011-01-01 00:23:27,2011-01-01 00:39:39,170.0,10.3,0.0,1.6,16
2,2011-01-01 00:42:08,2011-01-01 00:51:50,237.0,9.1,0.0,2.5,9
3,2011-01-01 00:53:36,2011-01-01 01:17:43,170.0,18.280001,2.38,3.9,24
4,2011-01-01 00:37:47,2011-01-01 00:41:20,90.0,5.1,0.0,0.6,4


Rides with `trip_time` less than 0, 0, or 1 does not make any sense. Let's filter those rides out too.

In [22]:
duckdb.query("""
    SELECT *
    FROM jan_2011
    WHERE trip_time <= 0
""").df().head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time
0,2011-01-01 00:11:25,2011-01-01 00:11:25,233.0,3.5,0.0,17.6,0
1,2011-01-01 00:06:12,2011-01-01 00:06:45,239.0,3.9,0.0,0.1,0
2,2011-01-01 00:48:15,2011-01-01 00:48:56,246.0,3.5,0.0,0.3,0
3,2011-01-01 00:04:32,2011-01-01 00:04:42,48.0,3.5,0.0,8.3,0
4,2011-01-01 00:06:04,2011-01-01 00:06:58,148.0,5.9,2.0,0.1,0


In [23]:
jan_2011 = duckdb.query("""
    SELECT *
    FROM jan_2011
    WHERE trip_time > 0
""").df()

jan_2011.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time
0,2011-01-01 00:58:10,2011-01-01 01:15:35,138.0,21.1,0.0,8.0,17
1,2011-01-01 00:23:27,2011-01-01 00:39:39,170.0,10.3,0.0,1.6,16
2,2011-01-01 00:42:08,2011-01-01 00:51:50,237.0,9.1,0.0,2.5,9
3,2011-01-01 00:53:36,2011-01-01 01:17:43,170.0,18.280001,2.38,3.9,24
4,2011-01-01 00:37:47,2011-01-01 00:41:20,90.0,5.1,0.0,0.6,4


Few rows have really short `trip_time` but long `trip_distance`. Let's filter those rows out too.

1. Convert `trip_time` to hour. [(Do the conversion first)](https://stackoverflow.com/questions/34504497/division-not-giving-my-answer-in-postgresql).
2. Calculate the `avg_mph` column (the average speed of each ride).
3. Filter out rows with `avg_mph` greater than 100.

In [24]:
duckdb.query("""
    SELECT *
    FROM
        (SELECT
            *,
            ROUND(CAST(trip_time AS decimal) / 60, 2) AS trip_time_hour,
            trip_distance / trip_time_hour AS avg_mph
        FROM jan_2011) AS t1
    WHERE avg_mph >= 100
    ORDER BY avg_mph 
""").df()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time,trip_time_hour,avg_mph
0,2011-01-05 22:26:48,2011-01-05 22:27:30,230.0,3.500000,0.0,2.000000,1,0.02,100.000000
1,2011-01-10 11:23:52,2011-01-10 11:24:49,262.0,3.000000,0.0,2.000000,1,0.02,100.000000
2,2011-01-13 11:04:51,2011-01-13 11:05:28,138.0,3.000000,0.0,2.000000,1,0.02,100.000000
3,2011-01-17 18:26:56,2011-01-17 18:27:03,141.0,3.000000,0.0,2.000000,1,0.02,100.000000
4,2011-01-20 19:59:59,2011-01-20 20:00:32,164.0,51.500000,0.0,2.000000,1,0.02,100.000000
...,...,...,...,...,...,...,...,...,...
700,2011-01-03 18:56:40,2011-01-03 18:57:14,236.0,18.000000,3.0,31.500000,1,0.02,1575.000000
701,2011-01-06 22:58:47,2011-01-06 22:59:07,265.0,160.000000,0.0,32.400002,1,0.02,1620.000076
702,2011-01-26 01:57:39,2011-01-26 01:58:07,265.0,108.000000,0.0,38.900002,1,0.02,1945.000076
703,2011-01-07 14:21:00,2011-01-07 14:24:00,264.0,34.599998,0.0,100.000000,3,0.05,2000.000000


Let's also filter out rides with extremely slow speed (less than 1).

In [25]:
duckdb.query("""
    SELECT *
    FROM
        (SELECT
            *,
            ROUND(CAST(trip_time AS decimal) / 60, 2) AS trip_time_hour,
            trip_distance / trip_time_hour AS avg_mph
        FROM jan_2011) AS t1
    WHERE avg_mph < 1
    ORDER BY avg_mph 
""").df().head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time,trip_time_hour,avg_mph
0,2011-01-06 17:17:41,2011-03-10 17:24:49,264.0,5.2,0.0,0.7,90727,1512.12,0.000463
1,2011-01-04 15:31:07,2011-03-01 14:45:22,264.0,7.4,0.0,1.2,80594,1343.23,0.000893
2,2011-01-15 14:25:00,2011-01-15 16:06:00,226.0,3.0,0.0,0.01,101,1.68,0.005952
3,2011-01-14 11:08:00,2011-01-14 12:49:00,48.0,3.0,0.0,0.01,101,1.68,0.005952
4,2011-01-22 02:51:00,2011-01-22 04:25:00,233.0,40.700001,0.0,0.01,94,1.57,0.006369


In [26]:
jan_2011 = duckdb.query("""
    SELECT *
    FROM
        (SELECT
            *,
            ROUND(CAST(trip_time AS decimal) / 60, 2) AS trip_time_hour,
            trip_distance / trip_time_hour AS avg_mph
        FROM jan_2011) AS t1
    WHERE avg_mph > 1 AND avg_mph < 100
    ORDER BY tpep_pickup_datetime
""").df()

jan_2011.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time,trip_time_hour,avg_mph
0,2011-01-01 00:00:00,2011-01-01 00:14:08,186.0,9.9,0.0,1.2,14,0.23,5.217392
1,2011-01-01 00:00:01,2011-01-01 00:08:43,79.0,10.3,0.0,3.3,8,0.13,25.384615
2,2011-01-01 00:00:03,2011-01-01 00:04:36,144.0,5.9,0.0,0.7,4,0.07,10.0
3,2011-01-01 00:00:04,2011-01-01 00:04:59,233.0,6.3,0.0,1.2,4,0.07,17.142858
4,2011-01-01 00:00:07,2011-01-01 00:19:19,140.0,12.3,0.0,2.1,19,0.32,6.5625


### Create A New Column `PUDate`

For hourly analysis, let's truncate `tpep_pickup_datetime` to hour and save it in the `PUDate` column. 

In [27]:
jan_2011_v2 = duckdb.query("""
    SELECT *, DATE_TRUNC('hour', tpep_pickup_datetime) AS PUDate
    FROM jan_2011
""").df()

In [28]:
# change data type into more efficient ones for newly created columns
jan_2011_v2[["trip_time"]] = jan_2011_v2[["trip_time"]].apply(pd.to_numeric, downcast='float')

In [29]:
jan_2011_v2.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,total_amount,tip_amount,trip_distance,trip_time,trip_time_hour,avg_mph,PUDate
0,2011-01-01 00:00:00,2011-01-01 00:14:08,186.0,9.9,0.0,1.2,14.0,0.23,5.217392,2011-01-01
1,2011-01-01 00:00:01,2011-01-01 00:08:43,79.0,10.3,0.0,3.3,8.0,0.13,25.384615,2011-01-01
2,2011-01-01 00:00:03,2011-01-01 00:04:36,144.0,5.9,0.0,0.7,4.0,0.07,10.0,2011-01-01
3,2011-01-01 00:00:04,2011-01-01 00:04:59,233.0,6.3,0.0,1.2,4.0,0.07,17.142858,2011-01-01
4,2011-01-01 00:00:07,2011-01-01 00:19:19,140.0,12.3,0.0,2.1,19.0,0.32,6.5625,2011-01-01


In [30]:
# drop unused columns
jan_2011_v3 = duckdb.query("""
    SELECT PUDate, PULocationID, total_amount, tip_amount, trip_time, trip_distance
    FROM jan_2011_v2
""").df()

In [31]:
jan_2011_v3.head()

Unnamed: 0,PUDate,PULocationID,total_amount,tip_amount,trip_time,trip_distance
0,2011-01-01,186.0,9.9,0.0,14.0,1.2
1,2011-01-01,79.0,10.3,0.0,8.0,3.3
2,2011-01-01,144.0,5.9,0.0,4.0,0.7
3,2011-01-01,233.0,6.3,0.0,4.0,1.2
4,2011-01-01,140.0,12.3,0.0,19.0,2.1


In [32]:
jan_2011_v3.memory_usage()

Index                  128
PUDate           106498024
PULocationID      53249012
total_amount      53249012
tip_amount        53249012
trip_time         53249012
trip_distance     53249012
dtype: int64

## Trend

For NYC TLC (New York City Taxi and Limousine Commission), it would be important to know the overall traffic trend of yellow taxi by several time periods: hourly, daily, monhtly, and yearly.
Let's analyze the overall traffic, total fare amount, and tip amount for each time period using previously created columns: `pickup_year`, `pickup_month`, `pickup_wday`, `pickup_hour`






In [33]:
jan_2011_v4 = duckdb.query("""
    SELECT
        PUDate, PULocationID,
        COUNT(*) AS total_rides,
        AVG(total_amount) AS avg_total_fare,
        AVG(tip_amount) AS avg_tip,
        AVG(trip_time) AS avg_trip_time,
        AVG(trip_distance) AS avg_trip_distance
    FROM jan_2011_v3
    GROUP BY PUDate, PULocationID
    ORDER BY PULocationID ASC, PUDate ASC
""").df()

jan_2011_v4.head()

Unnamed: 0,PUDate,PULocationID,total_rides,avg_total_fare,avg_tip,avg_trip_time,avg_trip_distance
0,2011-01-01 03:00:00,1.0,1,5.5,0.0,3.0,1.0
1,2011-01-01 05:00:00,1.0,1,80.5,35.0,2.0,0.3
2,2011-01-01 10:00:00,1.0,1,4.6,0.0,4.0,0.6
3,2011-01-01 15:00:00,1.0,1,129.600006,21.6,1.0,0.6
4,2011-01-02 07:00:00,1.0,1,90.0,13.7,38.0,25.6


## Spatial Data for Visualization

Certain columns are numerically coded including `PULocationID` and `pickup_wday`. Let's use another dataframe that contain geospatial data
and Tableau to convert those values properly.

In [46]:
# https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
location_shape = gpd.read_file("./taxi_zones/taxi_zones.shp")
location_shape.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."


---

## Iterating Processing Procedure for All Data from 2011 to 2022

### Read Data

In [8]:
def read_data(link):
    
    columns_to_use = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "total_amount", "tip_amount", "trip_distance"]
    
    df = pd.read_parquet(link, columns = columns_to_use)
    
    df[["PULocationID", "total_amount", "tip_amount", "trip_distance"]] = df[["PULocationID", "total_amount", "tip_amount", "trip_distance"]].apply(pd.to_numeric, downcast='float')
    
    return df

### Clean Data

In [9]:
def clean_data(df):
    # New column : trip_time
    # Filter out rows with `trip_distance` = 0
    # Filter out rows with `trip_time` <= 0
    df1 = duckdb.query("""
        SELECT *, DATE_DIFF('minute', tpep_pickup_datetime, tpep_dropoff_datetime) AS trip_time
        FROM df
        WHERE trip_distance != 0 AND trip_time > 0
    """).df()
    
    # Filter out rides with average mph > 100 or mph < 1
    df2 = duckdb.query("""
        SELECT *
        FROM
            (SELECT
                *,
                ROUND(CAST(trip_time AS decimal) / 60, 2) AS trip_time_hour,
                trip_distance / trip_time_hour AS avg_mph
            FROM df1) AS t1
        WHERE avg_mph > 1 AND avg_mph < 100
        ORDER BY tpep_pickup_datetime
    """).df()
    
    # New Columns : Convert datetime to date for group analysis
    df3 = duckdb.query("""
        SELECT *, date_trunc('hour', tpep_pickup_datetime) as PUDate
        FROM df2
    """).df()

    df3[["trip_time"]] = df3[["trip_time"]].apply(pd.to_numeric, downcast='float')
    
    # Drop unused columns
    df4 = duckdb.query("""
        SELECT PUDate, PULocationID, total_amount, tip_amount, trip_time, trip_distance
        FROM df3
    """).df()
    
    return df4

## Grouped Analysis

- Analysis Grouped by `LocationID` and `PUDate` (pickup date)

In [10]:
def trend(df):
    df1 = duckdb.query("""
        SELECT
            PUDate, PULocationID,
            COUNT(*) AS total_rides,
            AVG(total_amount) AS avg_total_fare,
            AVG(tip_amount) AS avg_tip,
            AVG(trip_time) AS avg_trip_time,
            AVG(trip_distance) AS avg_trip_distance
        FROM df
        GROUP BY PUDate, PULocationID
        ORDER BY PULocationID ASC, PUDate ASC
    """).df()
    
    return df1

In [17]:
# first few rows of the links of the data
taxi_links_11_22[:5]

['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet',
 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet']

In [16]:
# specifying required columns for the data analysis
data_list = list()
total_row = 0
for link in reversed(taxi_links_11_22):
    cur_year = link.split("_")[-1].split("-")[0]
    cur_month = link.split("-")[-1].split(".")[0]
    
    # read raw data with columns specified
    data = read_data(link)
    
    # clean data
    cleaned_df = clean_data(data)
    
    # trend analysis per location_id
    trend_df = trend(cleaned_df)
    
    data_list.append(trend_df)
    
    total_row += trend_df.shape[0]
    
    print(f">> {cur_year}-{cur_month} Data Processed, Total Row: ", total_row)

>> 2011-12 Data Processed, Total Row:  104695
>> 2011-11 Data Processed, Total Row:  206923
>> 2011-10 Data Processed, Total Row:  315818
>> 2011-09 Data Processed, Total Row:  427225
>> 2011-08 Data Processed, Total Row:  534977
>> 2011-07 Data Processed, Total Row:  644151
>> 2011-06 Data Processed, Total Row:  750436
>> 2011-05 Data Processed, Total Row:  859713
>> 2011-04 Data Processed, Total Row:  962151
>> 2011-03 Data Processed, Total Row:  1069789
>> 2011-02 Data Processed, Total Row:  1166291
>> 2011-01 Data Processed, Total Row:  1268030
>> 2012-12 Data Processed, Total Row:  1365748
>> 2012-11 Data Processed, Total Row:  1460889
>> 2012-10 Data Processed, Total Row:  1558664
>> 2012-09 Data Processed, Total Row:  1659004
>> 2012-08 Data Processed, Total Row:  1764154
>> 2012-07 Data Processed, Total Row:  1868176
>> 2012-06 Data Processed, Total Row:  1970560
>> 2012-05 Data Processed, Total Row:  2073919
>> 2012-04 Data Processed, Total Row:  2172068
>> 2012-03 Data Proces

In [47]:
# temporarily saving the current dataframe
if not os.path.exists(os.path.join(".", "data")):
    os.mkdir(os.path.join(".", "data"))
temp_taxi_df = pd.concat(data_list, ignore_index=True)
temp_taxi_df.to_csv(os.path.join(".", "data", "temp_taxi.csv"))

Notice there are few rows with a suspicious pick up date. Let's filter out those rows.

In [28]:
duckdb.query("""
    SELECT *
    FROM  temp_taxi_df
    WHERE YEAR(PUDate) < 2011 OR YEAR(PUDate) > 2022
""").df()

Unnamed: 0,PUDate,PULocationID,total_rides,avg_total_fare,avg_tip,avg_trip_time,avg_trip_distance
0,2008-12-31 23:00:00,13.0,1,9.80,0.00,11.0,1.33
1,2009-01-01 01:00:00,41.0,1,54.36,8.80,28.0,13.44
2,2009-01-01 23:00:00,50.0,1,10.80,0.00,18.0,1.97
3,2008-12-31 11:00:00,70.0,1,49.27,8.21,40.0,9.53
4,2009-01-01 01:00:00,79.0,1,21.80,0.00,31.0,3.89
...,...,...,...,...,...,...,...
1608,2008-12-31 23:00:00,41.0,1,5.80,0.00,3.0,0.72
1609,2009-01-01 00:00:00,137.0,1,9.30,0.00,5.0,1.02
1610,2009-01-01 09:00:00,140.0,1,8.80,0.00,9.0,0.64
1611,2008-12-31 23:00:00,161.0,1,10.80,0.00,7.0,1.55


In [31]:
temp_taxi_v2 = duckdb.query("""
    SELECT *
    FROM temp_taxi_df
    WHERE YEAR(PUDate) >= 2011 AND YEAR(PUDate) <= 2022
""").df()

# type conversion for Tableau (using PULocationID as a key to join the spatial)
# turns out PULocationID can't be used as a key to joinning if it's float data type
temp_taxi_v2[["PULocationID"]] = temp_taxi_v2[["PULocationID"]].astype(int)
temp_taxi_v2.to_csv(os.path.join(".", "data", "taxi_df.csv"), index=False)

In [45]:
# RUN ONLY ONCE : Writing a huge data set (1.8GB).
# taxi_df.to_csv(os.path.join(".", "data", "taxi_df.csv"), index=False)

---