# **NYC Taxi Data Analysis - Yellow Taxis**

Exploratory analysis on NYC Yellow Taxis looking into customer segmentation and factors that may impact tipping behaviors across the 5 boroughs

--- 

### **Imports & Setup**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# setting display options for pandas
pd.set_option('display.max_columns', None) # displays all columns in the dataframe
pd.set_option('display.max_rows', 100) # sets the max number of rows to 100

### **Loading Data**

In [3]:
# look up tables
zone = pd.read_csv('../data/taxi_zone_lookup.csv') 
vendor = pd.read_csv('../data/taxi_vendor_lookup.csv')
payment = pd.read_csv('../data/payment_lookup.csv')
ratecode = pd.read_csv('../data/ratecode_lookup.csv')

# yellow taxi data
df = pd.read_parquet('../data/yellow_tripdata_2025-05.parquet')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4591845 entries, 0 to 4591844
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [5]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
count,4591845.0,4591845,4591845,3395669.0,4591845.0,3395669.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,4591845.0,3395669.0,3395669.0,4591845.0
mean,1.875653,2025-05-16 07:15:25.300312,2025-05-16 07:33:20.003034,1.29527,7.653422,2.432585,161.1871,161.1876,0.912372,18.35794,1.164838,0.4774078,2.858122,0.511476,0.9563765,26.88033,2.197677,0.1531993,0.5278734
min,1.0,2009-01-01 00:20:39,2009-01-01 00:20:49,0.0,0.0,1.0,1.0,1.0,0.0,-998.0,-17.39,-0.5,-90.44,-148.17,-1.0,-1147.17,-2.5,-1.75,-0.75
25%,2.0,2025-05-08 18:23:12,2025-05-08 18:42:50,1.0,1.06,1.0,114.0,107.0,0.0,8.6,0.0,0.5,0.0,0.0,1.0,15.54,2.5,0.0,0.0
50%,2.0,2025-05-16 00:49:45,2025-05-16 01:05:55,1.0,1.89,1.0,161.0,162.0,1.0,14.15,0.0,0.5,2.0,0.0,1.0,21.42,2.5,0.0,0.75
75%,2.0,2025-05-23 13:28:04,2025-05-23 13:47:48,1.0,3.77,1.0,233.0,233.0,1.0,23.0,2.5,0.5,4.0,0.0,1.0,30.98,2.5,0.0,0.75
max,7.0,2025-06-01 00:04:31,2025-06-04 11:17:10,9.0,263104.0,99.0,265.0,265.0,4.0,1583.6,133.6,22.14,443.21,148.17,1.0,1614.29,2.5,6.75,1.25
std,0.7239395,,,0.7252927,653.3947,11.33495,66.55046,70.58759,0.7590985,19.82062,1.845559,0.1380384,4.043892,2.144886,0.2750487,24.19524,0.9446371,0.5431268,0.36076


In [6]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-05-01 00:07:06,2025-05-01 00:24:15,1.0,3.7,1.0,N,140,202,1,18.4,4.25,0.5,4.85,0.0,1.0,29.0,2.5,0.0,0.75
1,2,2025-05-01 00:07:44,2025-05-01 00:14:27,1.0,1.03,1.0,N,234,161,1,8.6,1.0,0.5,4.3,0.0,1.0,18.65,2.5,0.0,0.75
2,2,2025-05-01 00:15:56,2025-05-01 00:23:53,1.0,1.57,1.0,N,161,234,2,10.0,1.0,0.5,0.0,0.0,1.0,15.75,2.5,0.0,0.75
3,2,2025-05-01 00:00:09,2025-05-01 00:25:29,1.0,9.48,1.0,N,138,90,1,40.8,6.0,0.5,11.7,6.94,1.0,71.94,2.5,1.75,0.75
4,2,2025-05-01 00:45:07,2025-05-01 00:52:45,1.0,1.8,1.0,N,90,231,1,10.0,1.0,0.5,1.5,0.0,1.0,17.25,2.5,0.0,0.75


---
### **Data Assumptions**
* Each row represents a single trip record
* No explicit primary key column
* Each row's uniqueness could be composed of a combination of columns (pickup, dropoff, vendor) 
  
*Note: for this exploratory analysis, a primary key is not critical, as much of the analysis is focused on aggregation patterns rather than uniquely identifying rows* 
> If necessary, could implement 'ride_id' as a primary key using row index 

---
### **Data Cleaning**

In [7]:
# shows the number of missing values per column
df.isna().sum() 


VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          1196176
trip_distance                  0
RatecodeID               1196176
store_and_fwd_flag       1196176
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     1196176
Airport_fee              1196176
cbd_congestion_fee             0
dtype: int64

In [8]:
# checking to see the unique values in the passenger_count column 
df['passenger_count'].unique() 

array([ 1.,  0.,  2.,  3.,  4.,  5.,  6.,  9.,  8., nan])

In [9]:
# it's odd that there are so many 0's for passenger_count, doesn't make sense, investigate further
df[df['passenger_count'] == 0]

# doesn't seem like there was an actual reason for it, will replace 0s with nulls for now
# also replace np.nan with NA to standardize
df['passenger_count'] = df['passenger_count'].replace({0: pd.NA, np.nan: pd.NA})

In [10]:
# dropping store_and_fwd_flag because it's not relevant for this specific analysis
df.drop(columns=['store_and_fwd_flag'], inplace=True)

In [11]:
df.isna().sum()

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          1220143
trip_distance                  0
RatecodeID               1196176
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     1196176
Airport_fee              1196176
cbd_congestion_fee             0
dtype: int64

In [12]:
df['RatecodeID'].unique()

array([ 1.,  2., 99.,  5.,  4.,  3.,  6., nan])

In [13]:
# there are nulls for ratecodeid, according to the source dict 99 stands for null/unknown
df[df['RatecodeID'] == 99]

# replace nulls with 99
df['RatecodeID'] = df['RatecodeID'].replace(np.nan, 99)

In [14]:
# checking for duplicates
df.duplicated().sum()

0

In [15]:
# checking lookup tables individually
zone.head()
payment.head()
ratecode.head() 
vendor.head()

Unnamed: 0,VendorID,vendor_name,Unnamed: 2
0,1,"Creative Mobile Technologies, LLC",
1,2,"Curb Mobility, LLC",
2,6,Myle Technologies Inc,
3,7,Helix,


In [16]:
# vendor has 2 unnamed columns, let's filter those out
vendor = vendor.loc[:, ~vendor.columns.str.contains('^Unnamed')] # ~ removes columns that start with 'Unnamed'
vendor.head()

Unnamed: 0,VendorID,vendor_name
0,1,"Creative Mobile Technologies, LLC"
1,2,"Curb Mobility, LLC"
2,6,Myle Technologies Inc
3,7,Helix


---
### **Exploratory**
*A few notes:*
* Will be exploring the data based off each of the 5 different boroughs of NYC
* Dataframes labeled with a borough name are strictly for rides in that specific borough, both pickup & drpoff
* Dataframes that have '_overall' attached to it shows any rides that include the borough for either pickup OR dropoff
* The reason for this is because some questions require a broader or more specific scope depending on what it's asking for
  

In [17]:
# merging lookup tables with the taxi data
# using left joins because we want to retain all records from the taxi data

# saving location information based off PULocationID & DOLocationID
merged = df.merge(zone, how='left', left_on='PULocationID', right_on='LocationID').rename(columns={'Borough': 'PU_Borough', 'Zone': 'PU_Zone', 'service_zone': 'PU_service_zone'})
merged = merged.merge(zone, how='left', left_on='DOLocationID', right_on='LocationID').rename(columns={'Borough': 'DO_Borough', 'Zone': 'DO_Zone', 'service_zone': 'DO_service_zone'})

# dropping the LocationID columns as they are no longer needed
merged = merged.drop(columns=['LocationID_x', 'LocationID_y'])

# merging the rest of the lookups
merged = merged.merge(vendor, how='left', on='VendorID').merge(payment, how='left', on='payment_type').merge(ratecode, how='left', on='RatecodeID')
merged.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,PU_Zone,PU_service_zone,DO_Borough,DO_Zone,DO_service_zone,vendor_name,payment_method,Rate
0,1,2025-05-01 00:07:06,2025-05-01 00:24:15,1.0,3.7,1.0,140,202,1,18.4,4.25,0.5,4.85,0.0,1.0,29.0,2.5,0.0,0.75,Manhattan,Lenox Hill East,Yellow Zone,Manhattan,Roosevelt Island,Boro Zone,"Creative Mobile Technologies, LLC",Credit card,Standard
1,2,2025-05-01 00:07:44,2025-05-01 00:14:27,1.0,1.03,1.0,234,161,1,8.6,1.0,0.5,4.3,0.0,1.0,18.65,2.5,0.0,0.75,Manhattan,Union Sq,Yellow Zone,Manhattan,Midtown Center,Yellow Zone,"Curb Mobility, LLC",Credit card,Standard
2,2,2025-05-01 00:15:56,2025-05-01 00:23:53,1.0,1.57,1.0,161,234,2,10.0,1.0,0.5,0.0,0.0,1.0,15.75,2.5,0.0,0.75,Manhattan,Midtown Center,Yellow Zone,Manhattan,Union Sq,Yellow Zone,"Curb Mobility, LLC",Cash,Standard
3,2,2025-05-01 00:00:09,2025-05-01 00:25:29,1.0,9.48,1.0,138,90,1,40.8,6.0,0.5,11.7,6.94,1.0,71.94,2.5,1.75,0.75,Queens,LaGuardia Airport,Airports,Manhattan,Flatiron,Yellow Zone,"Curb Mobility, LLC",Credit card,Standard
4,2,2025-05-01 00:45:07,2025-05-01 00:52:45,1.0,1.8,1.0,90,231,1,10.0,1.0,0.5,1.5,0.0,1.0,17.25,2.5,0.0,0.75,Manhattan,Flatiron,Yellow Zone,Manhattan,TriBeCa/Civic Center,Yellow Zone,"Curb Mobility, LLC",Credit card,Standard


#### **Understanding the Data (Geographic & Trip Characteristics)**
>How often people request rides & where/when they request the most?
* Focusing on datetime & location to perform frequency based aggregation

In [18]:
merged.isna().sum() #after merging, there are some nulls in the lookup columns, which is expected since not all records will have a corresponding entry in the lookup tables

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          1220143
trip_distance                  0
RatecodeID                     0
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     1196176
Airport_fee              1196176
cbd_congestion_fee             0
PU_Borough                  2091
PU_Zone                     7596
PU_service_zone             9687
DO_Borough                 20490
DO_Zone                     9800
DO_service_zone            30290
vendor_name                    0
payment_method                 0
Rate                           0
dtype: int64

In [19]:
# checking unique values in the merged columns
merged['PU_Zone'].unique() 
merged['DO_Zone'].unique()
merged['PU_Borough'].unique() 
merged['DO_Borough'].unique() 

# there are 'Unknown' and 'NaN' values in PU_Borough & DO_Borough, replace na with 'Unknown' for consistency
merged['PU_Borough'].replace(np.nan, 'Unknown', inplace=True)
merged['DO_Borough'].replace(np.nan, 'Unknown', inplace=True)

In [20]:
ride_combination = merged.groupby(['PU_Borough', 'DO_Borough']).size().reset_index(name='count')  # count occurrences of rides requested from PU_Borough and DO_Borough
ride_combination.sort_values(by='count',ascending=False, inplace=True)  # sort by count in descending order
ride_combination.head() # by a good chunk, the most frequent combination is from Manhattan to Manhattan

Unnamed: 0,PU_Borough,DO_Borough,count
23,Manhattan,Manhattan,3657462
30,Queens,Manhattan,225823
31,Queens,Queens,136701
24,Manhattan,Queens,114579
8,Brooklyn,Brooklyn,104316


*As shown above:*
> The results shows Manhattan having the most ride frequency, let's investigate Manhattan a bit further:
  * Could there be a trend since most rides are within Manhattan? 
    * Maybe yellow taxis are commonly used for short trips strictly within Manhattan, which could be why we see a high number of rides from __Manhattan to Manhattan__
    * Let's take a closer look at the datetime & trip distance to confirm the durations of these rides



In [21]:
# create a column with the difference between pickup and dropoff datetime as 'duration_mins'
merged['duration_mins'] = (merged['tpep_dropoff_datetime'] - merged['tpep_pickup_datetime']).dt.total_seconds() / 60 # duration in minutes

In [22]:
# strictly Manhattan (PU_Borough and DO_Borough both equal 'Manhattan')
manhattan = merged[(merged['PU_Borough'] == 'Manhattan') & (merged['DO_Borough'] == 'Manhattan')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# reindexing to have the columns in a more readable order
manhattan_datetime_cols = ['PU_datetime', 'DO_datetime', 'duration_mins', 'trip_distance', 'passenger_count', 'PU_Zone', 'PU_service_zone', 'DO_Zone', 'DO_service_zone']
manhattan = manhattan[manhattan_datetime_cols + [c for c in manhattan.columns if c not in manhattan_datetime_cols]]
manhattan.head()  

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
0,2025-05-01 00:07:06,2025-05-01 00:24:15,17.15,3.7,1.0,Lenox Hill East,Yellow Zone,Roosevelt Island,Boro Zone,1,1.0,140,202,1,18.4,4.25,0.5,4.85,0.0,1.0,29.0,2.5,0.0,0.75,Manhattan,Manhattan,"Creative Mobile Technologies, LLC",Credit card,Standard
1,2025-05-01 00:07:44,2025-05-01 00:14:27,6.716667,1.03,1.0,Union Sq,Yellow Zone,Midtown Center,Yellow Zone,2,1.0,234,161,1,8.6,1.0,0.5,4.3,0.0,1.0,18.65,2.5,0.0,0.75,Manhattan,Manhattan,"Curb Mobility, LLC",Credit card,Standard
2,2025-05-01 00:15:56,2025-05-01 00:23:53,7.95,1.57,1.0,Midtown Center,Yellow Zone,Union Sq,Yellow Zone,2,1.0,161,234,2,10.0,1.0,0.5,0.0,0.0,1.0,15.75,2.5,0.0,0.75,Manhattan,Manhattan,"Curb Mobility, LLC",Cash,Standard
4,2025-05-01 00:45:07,2025-05-01 00:52:45,7.633333,1.8,1.0,Flatiron,Yellow Zone,TriBeCa/Civic Center,Yellow Zone,2,1.0,90,231,1,10.0,1.0,0.5,1.5,0.0,1.0,17.25,2.5,0.0,0.75,Manhattan,Manhattan,"Curb Mobility, LLC",Credit card,Standard
6,2025-05-01 00:18:14,2025-05-01 00:27:38,9.4,1.5,,Lenox Hill East,Yellow Zone,Yorkville West,Yellow Zone,1,1.0,140,263,1,11.4,3.5,0.5,4.05,0.0,1.0,20.45,2.5,0.0,0.0,Manhattan,Manhattan,"Creative Mobile Technologies, LLC",Credit card,Standard


In [23]:
# what is the % of manhattan to manhattan rides that have short durations or trip distances?
# first, lets take a look at the distribution of trip durations and distances 
manhattan[['duration_mins', 'trip_distance']].describe()  

Unnamed: 0,duration_mins,trip_distance
count,3657462.0,3657462.0
mean,14.06164,4.643753
std,24.14157,523.6386
min,-2.183333,0.0
25%,7.55,0.95
50%,12.08333,1.58
75%,18.16667,2.56
max,7664.717,263104.0


In [24]:
# we can use values <= the 25th percentile of the overall df to define short rides by durations or distances
duration_25_all = merged['duration_mins'].quantile(0.25)
distance_25_all = merged['trip_distance'].quantile(0.25)

# defining short rides in Manhattan
m_short_rides = manhattan[(manhattan['duration_mins'] <= duration_25_all) | (manhattan['trip_distance'] <= distance_25_all)] # all rides with short durations or distances
m_short_rides_count = m_short_rides.shape[0]
total_rides_count = manhattan.shape[0]
m_short_rides_percentage = (m_short_rides_count / total_rides_count) * 100
print(f"% of short rides in Manhattan (by duration or distance): {m_short_rides_percentage:.2f}%")

% of short rides in Manhattan (by duration or distance): 38.83%


>It doesn't necessarily seem like yellow taxis are used for short trips strictly in Manhattan
* Let's look from a broader perspective and display zones pairs where rides are frequently being called from and to

In [25]:
# rides involving Manhattan (either PU_Borough or DO_Borough equal 'Manhattan')
manhattan_overall = merged[(merged['PU_Borough'] == 'Manhattan') | (merged['DO_Borough'] == 'Manhattan')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})
manhattan_overall.groupby(['PU_Zone', 'DO_Zone']).size().reset_index(name='count').sort_values(by='count', ascending=False).head(10)  # top 10 most frequent ride combinations involving Manhattan

Unnamed: 0,PU_Zone,DO_Zone,count
18782,Upper East Side South,Upper East Side North,29651
18547,Upper East Side North,Upper East Side South,25399
18783,Upper East Side South,Upper East Side South,20890
18546,Upper East Side North,Upper East Side North,19259
12306,Midtown Center,Upper East Side South,13813
18711,Upper East Side South,Midtown Center,12992
12305,Midtown Center,Upper East Side North,10849
10345,Lincoln Square East,Upper West Side South,10146
18712,Upper East Side South,Midtown East,9970
18479,Upper East Side North,Midtown Center,9769


>Could be useful to look into Same-Zone Rides in Manhattan vs. Manhattan-Involved Rides

In [26]:
# Checking the amount of rides within the same zone for strictly Manhattan 
m_same_zone_count = manhattan[manhattan['PU_Zone'] == manhattan['DO_Zone']].shape[0]
m_same_zone_percentage = (m_same_zone_count/manhattan.shape[0])*100  # percentage of rides within the same zone in Manhattan
print(f"% of same zone rides in Manhattan: {m_same_zone_percentage:.2f}%")

% of same zone rides in Manhattan: 4.70%


>It's a low percentage:
  * Maybe there are less local taxi rides because the borough is more walkable?
    * Considering the numbers are on the lower end for duration and distance
    * Could indicate most taxi rides are cross-neighborhood or intra-borough
    * Will compare with other boroughs to see if Manhattan is unique in this behavior

In [27]:
# Checking the amount of rides within the same zone for all Manhattan-involved rides
m_same_zone_percentage = (m_same_zone_count/manhattan_overall.shape[0])*100  # percentage of rides within the same zone in Manhattan
print(f"% of same zone rides in Manhattan: {m_same_zone_percentage:.2f}%")

% of same zone rides in Manhattan: 4.09%


> The percentage is low for both, and even lower when considering all Manhattan-involved rides
* This demonstrates that most yellow taxi rides in Manhattan are not hyper-local, but instead cross zones or neighborhoods.
* This pattern supports the idea that Manhattan's walkability reduces the need for taxis for very short, local trips.

>Analyzing the rest of the boroughs: Queens

In [28]:
# strictly Queens (PU_Borough and DO_Borough both equal 'Queens')
queens = merged[(merged['PU_Borough'] == 'Queens') & (merged['DO_Borough'] == 'Queens')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# reindexing to have the columns in a more readable order
queens_datetime_cols = ['PU_datetime', 'DO_datetime', 'duration_mins', 'trip_distance', 'passenger_count', 'PU_Zone', 'PU_service_zone', 'DO_Zone', 'DO_service_zone']
queens = queens[queens_datetime_cols + [c for c in queens.columns if c not in queens_datetime_cols]]
queens.head()

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
5,2025-05-01 00:09:24,2025-05-01 00:22:04,12.666667,5.11,1.0,LaGuardia Airport,Airports,Sunnyside,Boro Zone,2,1.0,138,226,1,22.6,6.0,0.5,6.02,0.0,1.0,37.87,0.0,1.75,0.0,Queens,Queens,"Curb Mobility, LLC",Credit card,Standard
54,2025-05-01 00:39:32,2025-05-01 01:13:48,34.266667,16.51,1.0,JFK Airport,Airports,JFK Airport,Airports,2,1.0,132,132,1,66.0,1.0,0.5,17.12,0.0,1.0,85.62,0.0,0.0,0.0,Queens,Queens,"Curb Mobility, LLC",Credit card,Standard
126,2025-05-01 00:11:11,2025-05-01 00:24:54,13.716667,4.53,2.0,JFK Airport,Airports,Richmond Hill,Boro Zone,2,1.0,132,197,1,20.5,1.0,0.5,6.9,0.0,1.0,31.65,0.0,1.75,0.0,Queens,Queens,"Curb Mobility, LLC",Credit card,Standard
148,2025-05-01 00:29:36,2025-05-01 01:06:47,37.183333,9.18,2.0,JFK Airport,Airports,Hollis,Boro Zone,2,1.0,132,122,2,43.6,1.0,0.5,0.0,0.0,1.0,47.85,0.0,1.75,0.0,Queens,Queens,"Curb Mobility, LLC",Cash,Standard
172,2025-05-01 00:13:08,2025-05-01 00:30:13,17.083333,7.2,1.0,LaGuardia Airport,Airports,Kew Gardens,Boro Zone,1,1.0,138,134,2,29.6,7.75,0.5,0.0,0.0,1.0,38.85,0.0,1.75,0.0,Queens,Queens,"Creative Mobile Technologies, LLC",Cash,Standard


> It seems like there's a lot of service for airports (LaGuardia & JFK) strictly within Queens
* Worth noting that both airports are located in Queens, could explain the high volume of airport type services

In [29]:
# checking the amount of rides in each service_zone
q_PU_service_zones = queens['PU_service_zone'].value_counts() 
q_DO_service_zones = queens['DO_service_zone'].value_counts()

pd.concat([q_PU_service_zones, q_DO_service_zones], axis=1) # side-by-side counts of PU and DO service zones for Queens

Unnamed: 0,count,count.1
Boro Zone,70585,114926
Airports,66116,21775


> Looks like there's more airport services at pickup compared to dropoff

In [30]:
# rides involving Queens (either PU_Borough or DO_Borough equal 'Queens')
queens_overall = merged[(merged['PU_Borough'] == 'Queens') | (merged['DO_Borough'] == 'Queens')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# the percentage of rides that are airport-related
q_airport_rides = queens_overall[queens_overall['PU_service_zone'].str.contains('Airport') | queens_overall['DO_service_zone'].str.contains('Airport')].shape[0]
q_airport_percentage = (q_airport_rides / queens_overall.shape[0]) * 100
print(f"% of airport-related rides in queens: {q_airport_percentage:.2f}%")

% of airport-related rides in queens: 67.23%


In [31]:
# Let's take a look at how many of manhattan rides are airport-related at pickup and dropoff
m_airport_rides = manhattan_overall[manhattan_overall['PU_service_zone'].str.contains('Airport') | manhattan_overall['DO_service_zone'].str.contains('Airport')].shape[0]
m_airport_percentage = (m_airport_rides / manhattan_overall.shape[0]) * 100
print(f"% of airport-related rides in manhattan: {m_airport_percentage:.2f}%")

% of airport-related rides in manhattan: 6.20%


* In total, over half the rides involving queens are airport-related
* Could be that queens uses taxi services for lesser casual means
  * Longer trips

In [32]:
# checking the distribution of durations for rides strictly Queens
queens[['duration_mins','trip_distance']].describe()  

Unnamed: 0,duration_mins,trip_distance
count,136701.0,136701.0
mean,18.028301,16.183759
std,40.859405,1189.735896
min,-0.833333,0.0
25%,8.65,1.67
50%,15.333333,4.4
75%,23.566667,8.19
max,6011.45,232507.92


*As shown above:*
* The duration is much higher than that of Manhattan, which could be due to the fact that Queens is less densely populated and has longer distances between locations
* Which could mean that Queens uses taxi services for longer trips, possibly for airport-related services or other longer-distance travel needs
* Meaning queens is less walkable and has longer distances between locations

>Analyzing the rest of the boroughs: Brooklyn

In [33]:
# strictly Brooklyn (PU_Borough and DO_Borough both equal 'Brooklyn')
brooklyn = merged[(merged['PU_Borough'] == 'Brooklyn') & (merged['DO_Borough'] == 'Brooklyn')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# rides involving Brooklyn (either PU_Borough or DO_Borough equal 'Brooklyn')
brooklyn_overall = merged[(merged['PU_Borough'] == 'Brooklyn') | (merged['DO_Borough'] == 'Brooklyn')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# reindexing to have the columns in a more readable order
brooklyn_datetime_cols = ['PU_datetime', 'DO_datetime', 'duration_mins', 'trip_distance', 'passenger_count', 'PU_Zone', 'PU_service_zone', 'DO_Zone', 'DO_service_zone']
brooklyn = brooklyn[brooklyn_datetime_cols + [c for c in brooklyn.columns if c not in brooklyn_datetime_cols]]
brooklyn.head()

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
266,2025-05-01 00:00:07,2025-05-01 00:22:57,22.833333,3.91,1.0,Boerum Hill,Boro Zone,Sunset Park West,Boro Zone,2,1.0,25,228,2,24.0,1.0,0.5,0.0,0.0,1.0,26.5,0.0,0.0,0.0,Brooklyn,Brooklyn,"Curb Mobility, LLC",Cash,Standard
313,2025-05-01 00:28:24,2025-05-01 00:34:01,5.616667,1.0,1.0,Williamsburg (South Side),Boro Zone,Williamsburg (South Side),Boro Zone,1,1.0,256,256,1,7.2,1.0,0.5,4.0,0.0,1.0,13.7,0.0,0.0,0.0,Brooklyn,Brooklyn,"Creative Mobile Technologies, LLC",Credit card,Standard
484,2025-05-01 00:50:08,2025-05-01 00:56:34,6.433333,1.03,1.0,Williamsburg (North Side),Boro Zone,East Williamsburg,Boro Zone,2,1.0,255,80,1,7.9,1.0,0.5,3.12,0.0,1.0,13.52,0.0,0.0,0.0,Brooklyn,Brooklyn,"Curb Mobility, LLC",Credit card,Standard
567,2025-05-01 00:12:59,2025-05-01 00:26:24,13.416667,2.8,1.0,Prospect Park,Boro Zone,Flatbush/Ditmas Park,Boro Zone,2,1.0,190,89,1,15.6,1.0,0.5,3.62,0.0,1.0,21.72,0.0,0.0,0.0,Brooklyn,Brooklyn,"Curb Mobility, LLC",Credit card,Standard
760,2025-05-01 00:54:25,2025-05-01 01:10:29,16.066667,4.51,1.0,Williamsburg (North Side),Boro Zone,Carroll Gardens,Boro Zone,2,1.0,255,40,1,21.9,1.0,0.5,4.88,0.0,1.0,29.28,0.0,0.0,0.0,Brooklyn,Brooklyn,"Curb Mobility, LLC",Credit card,Standard


In [34]:
brooklyn[['duration_mins', 'trip_distance']].describe()  

Unnamed: 0,duration_mins,trip_distance
count,104316.0,104316.0
mean,20.421525,18.990859
std,14.60704,1126.48235
min,-9.066667,0.0
25%,11.816667,1.53
50%,18.1,2.83
75%,26.170833,4.4
max,1419.316667,125465.4


> It seems within brooklyn, the duration and distance are generally longer than manhattan & queens
* What are the most common pickup and dropoff zones
* Dig deeper into short vs. long rides

In [35]:
# checking the paired zones in Brooklyn
brooklyn.groupby(['PU_Zone','DO_Zone']).size().reset_index(name='counts').sort_values(by='counts', ascending=False).head(10)  # top 10 most frequent pickup service zones in Brooklyn

Unnamed: 0,PU_Zone,DO_Zone,counts
1570,East New York,East New York,655
1098,Crown Heights North,Crown Heights North,434
2652,Park Slope,Park Slope,403
747,Canarsie,Canarsie,400
2120,Greenpoint,Greenpoint,349
3272,Williamsburg (North Side),Greenpoint,338
45,Bay Ridge,Bay Ridge,326
3248,Williamsburg (North Side),Bushwick South,323
1808,Flatbush/Ditmas Park,Flatbush/Ditmas Park,277
3293,Williamsburg (North Side),Williamsburg (North Side),276


In [36]:
# numbers seem small
# checking the amount of rides within the same zone for strictly Brooklyn 
b_same_zone_count = brooklyn[brooklyn['PU_Zone'] == brooklyn['DO_Zone']].shape[0]
b_same_zone_percentage = (b_same_zone_count/brooklyn.shape[0])*100  # percentage of rides within the same zone in Brooklyn
print(f"% of same zone rides in Brooklyn: {b_same_zone_percentage:.2f}%")

% of same zone rides in Brooklyn: 7.45%


> Could be that Brooklyn is a borough that's considered "walkable" due to the low % 
* But if that's the case, why are the numbers so different than Manhattan?
  * Most of the rides may be for intra-borough trips, I'd assume brooklyn's neighborhoods would be further apart due to the larger numbers for brooklyn

In [37]:
# let's compare short rides vs. long rides in Brooklyn
# first, we can use the same 25th percentile values for duration and distance as we did for Manhattan

# defining short rides in Brooklyn
b_short_rides = brooklyn[(brooklyn['duration_mins'] <= duration_25_all) | (brooklyn['trip_distance'] <= distance_25_all)] # all rides with short durations or distances
b_short_rides_count = b_short_rides.shape[0]
total_brooklyn_rides = brooklyn.shape[0]
b_short_rides_percentage = (b_short_rides_count / total_brooklyn_rides) * 100
print(f"% of short rides in Brooklyn (by duration or distance): {b_short_rides_percentage:.2f}%")

# defining long rides in Brooklyn
duration_75_all = merged['duration_mins'].quantile(0.75)
distance_75_all = merged['trip_distance'].quantile(0.75)

# using & instead of | to define long rides to take into account impact of delays and traffic
b_long_rides = brooklyn[(brooklyn['duration_mins'] >= duration_75_all) & (brooklyn['trip_distance'] >= distance_75_all)] # all rides with long durations and distances
b_long_rides_count = b_long_rides.shape[0]
b_long_rides_percentage = (b_long_rides_count / total_brooklyn_rides) * 100
print(f"% of long rides in Brooklyn (by duration and distance): {b_long_rides_percentage:.2f}%")

% of short rides in Brooklyn (by duration or distance): 21.08%
% of long rides in Brooklyn (by duration and distance): 25.00%


In [38]:
# comparing % of long rides in Brooklyn to Manhattan
# defining long rides in Manhattan using the same 75th percentile values
m_long_rides = manhattan[(manhattan['duration_mins'] >= duration_75_all) & (manhattan['trip_distance'] >= distance_75_all)] # all rides with long durations and distances
m_long_rides_count = m_long_rides.shape[0]
m_long_rides_percentage = (m_long_rides_count / total_rides_count) * 100
print(f"% of long rides in Manhattan (by duration and distance): {m_long_rides_percentage:.2f}%")

% of long rides in Manhattan (by duration and distance): 6.67%


> Brooklyn having a bigger % of longer rides show that, on average, it's likely to be more spaced out and less "walkable". The smaller % of short rides also reinforces that taxi trips are likely to be longer than Manhattan

In [39]:
# Checking for non airport-related rides in Brooklyn
br_is_airport = (brooklyn_overall['PU_service_zone'].str.contains('Airport') | brooklyn_overall['DO_service_zone'].str.contains('Airport'))
non_airport_rides = brooklyn_overall[~br_is_airport]  # number of rides that are not airport-related
print(f"% of non-airport rides in Brooklyn: {non_airport_rides.shape[0]/brooklyn_overall.shape[0] * 100:.2f}%")

% of non-airport rides in Brooklyn: 85.81%


>Analyzing the rest of the boroughs: Bronx

In [40]:
# strictly Bronx (PU_Borough and DO_Borough both equal 'Bronx')
bronx = merged[(merged['PU_Borough'] == 'Bronx') & (merged['DO_Borough'] == 'Bronx')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# reindexing to have the columns in a more readable order
bronx_datetime_cols = ['PU_datetime', 'DO_datetime', 'duration_mins', 'trip_distance', 'passenger_count', 'PU_Zone', 'PU_service_zone', 'DO_Zone', 'DO_service_zone']
bronx = bronx[bronx_datetime_cols + [c for c in bronx.columns if c not in bronx_datetime_cols]]
bronx.head()  

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
3809,2025-05-01 02:53:48,2025-05-01 02:53:58,0.166667,0.02,1.0,West Concourse,Boro Zone,West Concourse,Boro Zone,2,5.0,247,247,4,-42.2,-1.0,-0.5,0.0,0.0,-1.0,-47.95,-2.5,0.0,-0.75,Bronx,Bronx,"Curb Mobility, LLC",Dispute,Negotiated fare
3810,2025-05-01 02:53:48,2025-05-01 02:53:58,0.166667,0.02,1.0,West Concourse,Boro Zone,West Concourse,Boro Zone,2,5.0,247,247,4,42.2,1.0,0.5,0.0,0.0,1.0,47.95,2.5,0.0,0.75,Bronx,Bronx,"Curb Mobility, LLC",Dispute,Negotiated fare
4388,2025-05-01 04:58:21,2025-05-01 05:08:25,10.066667,4.28,1.0,Hunts Point,Boro Zone,Mount Hope,Boro Zone,2,1.0,126,169,1,18.4,1.0,0.5,0.0,0.0,1.0,20.9,0.0,0.0,0.0,Bronx,Bronx,"Curb Mobility, LLC",Credit card,Standard
6589,2025-05-01 06:56:33,2025-05-01 08:05:18,68.75,9.2,1.0,Morrisania/Melrose,Boro Zone,Van Nest/Morris Park,Boro Zone,1,99.0,167,242,1,27.5,0.0,0.5,0.0,0.0,0.0,28.0,0.0,0.0,0.0,Bronx,Bronx,"Creative Mobile Technologies, LLC",Credit card,Null/Unknown
6856,2025-05-01 06:49:55,2025-05-01 07:44:07,54.2,6.3,1.0,Morrisania/Melrose,Boro Zone,Norwood,Boro Zone,1,99.0,167,174,1,28.5,0.0,0.5,0.0,0.0,0.0,29.0,0.0,0.0,0.0,Bronx,Bronx,"Creative Mobile Technologies, LLC",Credit card,Null/Unknown


In [47]:
bronx[['duration_mins', 'trip_distance']].describe()

Unnamed: 0,duration_mins,trip_distance
count,19315.0,19315.0
mean,16.502069,81.086826
std,10.145838,2776.613192
min,-0.55,0.0
25%,10.583333,1.6
50%,15.1,3.11
75%,20.666667,5.1
max,519.533333,132178.78


In [41]:
# rides involving Bronx (either PU_Borough or DO_Borough equal 'Bronx')
bronx_overall = merged[(merged['PU_Borough'] == 'Bronx') | (merged['DO_Borough'] == 'Bronx')].rename(columns={'tpep_pickup_datetime': 'PU_datetime', 'tpep_dropoff_datetime': 'DO_datetime'})

# reindexing to have the columns in a more readable order
bronx_datetime_cols = ['PU_datetime', 'DO_datetime', 'duration_mins', 'trip_distance', 'passenger_count', 'PU_Zone', 'PU_service_zone', 'DO_Zone', 'DO_service_zone']
bronx_overall = bronx_overall[bronx_datetime_cols + [c for c in bronx_overall.columns if c not in bronx_datetime_cols]]
bronx_overall.head()  

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
71,2025-05-01 00:32:50,2025-05-01 01:02:09,29.316667,14.4,1.0,East Village,Yellow Zone,Riverdale/North Riverdale/Fieldston,Boro Zone,1,1.0,79,200,1,54.8,4.25,0.5,15.95,3.18,1.0,79.68,2.5,0.0,0.75,Manhattan,Bronx,"Creative Mobile Technologies, LLC",Credit card,Standard
100,2025-05-01 00:25:54,2025-05-01 00:50:54,25.0,11.37,1.0,East Chelsea,Yellow Zone,Spuyten Duyvil/Kingsbridge,Boro Zone,2,1.0,68,220,1,46.4,1.0,0.5,4.0,3.18,1.0,59.33,2.5,0.0,0.75,Manhattan,Bronx,"Curb Mobility, LLC",Credit card,Standard
147,2025-05-01 00:12:09,2025-05-01 00:21:09,9.0,0.0,1.0,East Harlem North,Boro Zone,Mott Haven/Port Morris,Boro Zone,1,1.0,74,168,1,17.5,0.0,0.5,0.0,0.0,1.0,19.0,0.0,0.0,0.0,Manhattan,Bronx,"Creative Mobile Technologies, LLC",Credit card,Standard
153,2025-05-01 00:17:39,2025-05-01 00:37:53,20.233333,10.11,1.0,Clinton East,Yellow Zone,Claremont/Bathgate,Boro Zone,2,1.0,48,47,1,41.5,1.0,0.5,9.45,0.0,1.0,56.7,2.5,0.0,0.75,Manhattan,Bronx,"Curb Mobility, LLC",Credit card,Standard
203,2025-05-01 00:15:58,2025-05-01 00:38:01,22.05,6.9,1.0,Midtown East,Yellow Zone,Morrisania/Melrose,Boro Zone,1,99.0,162,167,1,29.5,0.0,0.5,0.0,0.0,1.0,31.0,0.0,0.0,0.0,Manhattan,Bronx,"Creative Mobile Technologies, LLC",Credit card,Null/Unknown


---
*Before going into analysis with fares & tips, it's important to note that there may be values that are negative, but they might just be cancelled fares, there should be a duplicate value but in a positive value*

#### **1. Location Segmentation & Tipping Behavior**  
  * Specifically with the specific zones in the top and bottom boroughs 
    * Recall the datetime dataframes of each borough 

---
#### **2.Time Segmentation & Tipping Behavior**
* Let's take a closer look at whether there are datetime trends:
  * Weekends (people could be going out more)
  * Maybe there are activities after working hours (events, personal outings)
  * Time of day: morning, afternoon, night

In [42]:
manhattan['duration_mins'].describe()  # checking the values

count    3.657462e+06
mean     1.406164e+01
std      2.414157e+01
min     -2.183333e+00
25%      7.550000e+00
50%      1.208333e+01
75%      1.816667e+01
max      7.664717e+03
Name: duration_mins, dtype: float64

In [43]:
# the minimum duration is a negative value, which doesn't make sense, let's look into that
manhattan[manhattan['duration_mins'] <= 0].head()  # checking the rows with negative duration  

Unnamed: 0,PU_datetime,DO_datetime,duration_mins,trip_distance,passenger_count,PU_Zone,PU_service_zone,DO_Zone,DO_service_zone,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,PU_Borough,DO_Borough,vendor_name,payment_method,Rate
9,2025-05-01 00:22:31,2025-05-01 00:22:31,0.0,1.09,1.0,Sutton Place/Turtle Bay North,Yellow Zone,Central Park,Yellow Zone,7,1.0,229,43,1,8.6,0.0,0.5,2.87,0.0,1.0,17.22,2.5,0.0,0.75,Manhattan,Manhattan,Helix,Credit card,Standard
15,2025-05-01 00:01:22,2025-05-01 00:01:22,0.0,5.01,1.0,Lower East Side,Yellow Zone,Lincoln Square East,Yellow Zone,7,1.0,148,142,1,19.8,0.0,0.5,6.39,0.0,1.0,31.94,2.5,0.0,0.75,Manhattan,Manhattan,Helix,Credit card,Standard
60,2025-05-01 00:16:34,2025-05-01 00:16:34,0.0,7.2,1.0,Greenwich Village North,Yellow Zone,Bloomingdale,Yellow Zone,7,1.0,113,24,1,35.9,0.0,0.5,14.0,0.0,1.0,55.65,2.5,0.0,0.75,Manhattan,Manhattan,Helix,Credit card,Standard
170,2025-05-01 00:08:45,2025-05-01 00:08:45,0.0,0.88,1.0,West Chelsea/Hudson Yards,Yellow Zone,Clinton West,Yellow Zone,7,1.0,246,50,1,5.8,0.0,0.5,2.31,0.0,1.0,13.86,2.5,0.0,0.75,Manhattan,Manhattan,Helix,Credit card,Standard
171,2025-05-01 00:36:31,2025-05-01 00:36:31,0.0,2.75,1.0,West Chelsea/Hudson Yards,Yellow Zone,Upper West Side South,Yellow Zone,7,1.0,246,239,2,14.2,0.0,0.5,0.0,0.0,1.0,19.95,2.5,0.0,0.75,Manhattan,Manhattan,Helix,Cash,Standard


In [44]:
# there's positive trip distances with negative durations, which also doesn't make sense, lets check how many there are
manhattan[(manhattan['duration_mins'] <= 0) & (manhattan['trip_distance'] > 0)].shape[0]  

# it seems these rows are most likely due to data entry errors or system glitches, we can drop these rows for now
# it won't affect the overall analysis as they are a small percentage of the data
manhattan  = manhattan[~((manhattan['duration_mins'] < 0) & (manhattan['trip_distance'] > 0))]

In [45]:
# assumming these rides are cancelled or not completed, we can check how many there are
manhattan[(manhattan['duration_mins'] == 0) & (manhattan['trip_distance'] == 0)].size

19111

In [46]:
# Let's create a column categorizing whether the ride was on a weekend or not