# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import warnings

# Notebook-wide settings: show floats with four decimals and silence every warning.
pd.options.display.float_format = "{:.4f}".format
warnings.simplefilter("ignore")

In [2]:
# Load the three raw trip extracts; the first CSV column of each file becomes the index.
fhv_df, yellow_df, green_df = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        "data/FHVHV_Trip_Data.csv",
        "data/Yellow_Trip_Data.csv",
        "data/Green_Trip_Data.csv",
    )
)

In [3]:
# Zone lookup table: lower-case the column headers and key the rows by location id.
taxi_zones_df = (
    pd.read_csv("data/Taxi_Zones_Data.csv")
    .rename(columns=str.lower)
    .set_index("locationid")
)

In [4]:
taxi_zones_df.head()

Unnamed: 0_level_0,borough,zone,service_zone
locationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone


# For-Hire Dataset

In [5]:
fhv_df.duplicated().sum()

np.int64(0)

In [6]:
fhv_df["hvfhs_license_num"].value_counts()

hvfhs_license_num
HV0003    10521027
HV0005     3908330
HV0004       70643
Name: count, dtype: int64

In [7]:
# Swap each licence code for the name of the company that holds it
license_to_company = {
    "HV0002": "Juno",
    "HV0003": "Uber",
    "HV0004": "Via",
    "HV0005": "Lyft",
}
fhv_df["hvfhs_license_num"] = fhv_df["hvfhs_license_num"].replace(license_to_company)
fhv_df["hvfhs_license_num"].value_counts()

hvfhs_license_num
Uber    10521027
Lyft     3908330
Via        70643
Name: count, dtype: int64

In [8]:
# Keep only Uber and Lyft trips.
# An explicit allow-list (instead of "everything except Via") guarantees that any
# other operator present in the data -- e.g. Juno, which the mapping above also
# knows about -- is excluded too, so the code matches the stated intent.
# reset_index is chained rather than applied in place so that fhv_df is rebound to
# a fresh frame; later in-place edits then never act on a filtered slice.
fhv_df = fhv_df[fhv_df["hvfhs_license_num"].isin(["Uber", "Lyft"])].reset_index(drop=True)

In [9]:
# The two base-number columns are not needed, so leave them out
bad_cols = ["dispatching_base_num", "originating_base_num"]
fhv_df = fhv_df.drop(columns=bad_cols)

In [10]:
# Columns with Null values
fhv_df.isna().any()

hvfhs_license_num       False
request_datetime        False
on_scene_datetime        True
pickup_datetime         False
dropoff_datetime        False
pulocationid            False
dolocationid            False
trip_miles              False
trip_time               False
base_passenger_fare     False
tolls                   False
bcf                     False
sales_tax               False
congestion_surcharge    False
airport_fee              True
tips                    False
driver_pay              False
shared_request_flag     False
shared_match_flag       False
access_a_ride_flag      False
wav_request_flag        False
wav_match_flag          False
dtype: bool

In [11]:
# A missing airport fee is recorded as zero
fhv_df["airport_fee"] = fhv_df["airport_fee"].fillna(0.0)

In [12]:
# Translate pickup / drop-off location ids into borough and zone names
borough_lookup = taxi_zones_df["borough"]
zone_lookup = taxi_zones_df["zone"]
pickup_ids = fhv_df["pulocationid"]
dropoff_ids = fhv_df["dolocationid"]

pick_up_borough = pickup_ids.map(borough_lookup)
pick_up_zone = pickup_ids.map(zone_lookup)
drop_off_borough = dropoff_ids.map(borough_lookup)
drop_off_zone = dropoff_ids.map(zone_lookup)

# Add the four name columns at consecutive positions, beginning at position 7
new_location_columns = {
    "pick_up_borough": pick_up_borough,
    "pick_up_zone": pick_up_zone,
    "drop_off_borough": drop_off_borough,
    "drop_off_zone": drop_off_zone,
}
for position, (column_name, column_values) in enumerate(new_location_columns.items(), start=7):
    fhv_df.insert(loc=position, column=column_name, value=column_values)

In [13]:
# The numeric location ids have been replaced by names and can go
fhv_df = fhv_df.drop(columns=["pulocationid", "dolocationid"])

In [14]:
# Give two terse columns readable names
readable_names = {
    "hvfhs_license_num": "business",
    "bcf": "black_car_fund",
}
fhv_df = fhv_df.rename(columns=readable_names)

In [15]:
fhv_df.head()

Unnamed: 0,business,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,pick_up_borough,pick_up_zone,drop_off_borough,drop_off_zone,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,Uber,2020-01-21 18:46:40,2020-01-21 18:47:21,2020-01-21 18:47:57,2020-01-21 18:58:42,Manhattan,Two Bridges/Seward Park,Brooklyn,Williamsburg (North Side),2.54,...,1.18,2.75,0.0,3.0,8.11,N,N,,N,N
1,Uber,2020-01-17 19:27:42,2020-01-17 19:28:38,2020-01-17 19:29:55,2020-01-17 19:45:27,Manhattan,East Village,Manhattan,Union Sq,1.66,...,1.15,2.75,0.0,1.0,9.5,N,N,,N,N
2,Lyft,2020-01-18 11:57:57,,2020-01-18 12:00:50,2020-01-18 12:08:26,Manhattan,Midtown East,Manhattan,Lenox Hill West,1.394,...,0.64,2.75,0.0,0.0,5.39,N,N,N,N,N
3,Lyft,2020-01-13 19:27:35,,2020-01-13 19:31:23,2020-01-13 19:42:23,Manhattan,Central Harlem North,Bronx,West Concourse,1.647,...,0.61,0.0,0.0,1.0,7.24,N,N,N,N,N
4,Lyft,2020-01-24 03:56:08,,2020-01-24 03:59:28,2020-01-24 04:11:37,Queens,LaGuardia Airport,Queens,Elmhurst,3.859,...,0.91,0.0,0.0,1.0,10.24,N,N,N,N,N


In [16]:
fhv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14429357 entries, 0 to 14429356
Data columns (total 24 columns):
 #   Column                Dtype  
---  ------                -----  
 0   business              object 
 1   request_datetime      object 
 2   on_scene_datetime     object 
 3   pickup_datetime       object 
 4   dropoff_datetime      object 
 5   pick_up_borough       object 
 6   pick_up_zone          object 
 7   drop_off_borough      object 
 8   drop_off_zone         object 
 9   trip_miles            float64
 10  trip_time             int64  
 11  base_passenger_fare   float64
 12  tolls                 float64
 13  black_car_fund        float64
 14  sales_tax             float64
 15  congestion_surcharge  float64
 16  airport_fee           float64
 17  tips                  float64
 18  driver_pay            float64
 19  shared_request_flag   object 
 20  shared_match_flag     object 
 21  access_a_ride_flag    object 
 22  wav_request_flag      object 
 23  wav_m

In [17]:
# Persist the cleaned for-hire trips; index=False leaves the row index out of the file.
fhv_df.to_csv("data/FHVHV_Trip_Data_Clean.csv", index=False)

# Yellow Taxi Dataset

In [18]:
# Rename columns; several take the names already used by the for-hire data
cols_replace = {
    "tpep_pickup_datetime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
    "trip_distance": "trip_miles",
    "tolls_amount": "tolls",
    "tip_amount": "tips",
    "ratecodeid": "trip_category",
}
yellow_df = yellow_df.rename(columns=cols_replace)

In [19]:
# Columns with Null values
yellow_df.isna().any()

vendorid                 False
pickup_datetime          False
dropoff_datetime         False
passenger_count           True
trip_miles               False
trip_category             True
store_and_fwd_flag        True
pulocationid             False
dolocationid             False
payment_type             False
fare_amount              False
extra                    False
mta_tax                  False
tips                     False
tolls                    False
improvement_surcharge    False
total_amount             False
congestion_surcharge      True
airport_fee               True
dtype: bool

In [20]:
# A missing surcharge / fee is recorded as zero
for fee_column in ("congestion_surcharge", "airport_fee"):
    yellow_df[fee_column] = yellow_df[fee_column].fillna(0.0)

In [21]:
yellow_df["trip_category"].value_counts()

trip_category
1.0000     12494638
2.0000       391082
5.0000        81291
99.0000       53138
3.0000        32362
4.0000        20871
6.0000           73
Name: count, dtype: int64

In [22]:
yellow_df["store_and_fwd_flag"].value_counts()

store_and_fwd_flag
N    12944407
Y      129048
Name: count, dtype: int64

In [23]:
yellow_df["passenger_count"].value_counts()

passenger_count
1.0000    9802523
2.0000    1897912
3.0000     481916
0.0000     249524
5.0000     244913
4.0000     231891
6.0000     164651
7.0000         53
8.0000         53
9.0000         19
Name: count, dtype: int64

In [24]:
# Turn the numeric rate codes into descriptive labels (missing values stay missing)
rate_code_labels = {
    1: "Standard Rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated Fare",
    6: "Group Ride",
    99: "Unknown",
}
trip_category = yellow_df["trip_category"].replace(rate_code_labels)
yellow_df["trip_category"] = trip_category

In [25]:
# Translate pickup / drop-off location ids into borough and zone names
borough_lookup = taxi_zones_df["borough"]
zone_lookup = taxi_zones_df["zone"]
pickup_ids = yellow_df["pulocationid"]
dropoff_ids = yellow_df["dolocationid"]

pick_up_borough = pickup_ids.map(borough_lookup)
pick_up_zone = pickup_ids.map(zone_lookup)
drop_off_borough = dropoff_ids.map(borough_lookup)
drop_off_zone = dropoff_ids.map(zone_lookup)

# Add the four name columns at consecutive positions, beginning at position 7
new_location_columns = {
    "pick_up_borough": pick_up_borough,
    "pick_up_zone": pick_up_zone,
    "drop_off_borough": drop_off_borough,
    "drop_off_zone": drop_off_zone,
}
for position, (column_name, column_values) in enumerate(new_location_columns.items(), start=7):
    yellow_df.insert(loc=position, column=column_name, value=column_values)

# The id columns and the vendor id are no longer required
yellow_df = yellow_df.drop(columns=["pulocationid", "dolocationid", "vendorid"])

In [26]:
# Turn the numeric payment codes into descriptive labels.
# NOTE(review): replace() leaves any code outside 1-6 untouched -- confirm none occur.
payment_labels = {
    1: "Credit Card",
    2: "Cash",
    3: "No Charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided Trip",
}
yellow_df["payment_type"] = yellow_df["payment_type"].replace(payment_labels)

In [27]:
yellow_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13836755 entries, 0 to 13836754
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   pickup_datetime        object 
 1   dropoff_datetime       object 
 2   passenger_count        float64
 3   trip_miles             float64
 4   trip_category          object 
 5   store_and_fwd_flag     object 
 6   pick_up_borough        object 
 7   pick_up_zone           object 
 8   drop_off_borough       object 
 9   drop_off_zone          object 
 10  payment_type           object 
 11  fare_amount            float64
 12  extra                  float64
 13  mta_tax                float64
 14  tips                   float64
 15  tolls                  float64
 16  improvement_surcharge  float64
 17  total_amount           float64
 18  congestion_surcharge   float64
 19  airport_fee            float64
dtypes: float64(11), object(9)
memory usage: 2.2+ GB


In [28]:
yellow_df.duplicated().sum()

np.int64(0)

In [29]:
yellow_df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,trip_miles,trip_category,store_and_fwd_flag,pick_up_borough,pick_up_zone,drop_off_borough,drop_off_zone,payment_type,fare_amount,extra,mta_tax,tips,tolls,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2020-01-12 03:00:14,2020-01-12 03:10:18,1.0,3.4,Standard Rate,N,Manhattan,TriBeCa/Civic Center,Manhattan,East Village,Credit Card,12.0,0.5,0.5,3.16,0.0,0.3,18.96,2.5,0.0
1,2020-01-15 18:39:38,2020-01-15 18:50:17,1.0,2.01,Standard Rate,N,Manhattan,Meatpacking/West Village West,Manhattan,Clinton East,Credit Card,9.5,1.0,0.5,2.76,0.0,0.3,16.56,2.5,0.0
2,2020-01-30 07:34:31,2020-01-30 07:48:24,1.0,1.88,Standard Rate,N,Manhattan,Lenox Hill West,Manhattan,Lincoln Square East,Credit Card,10.5,0.0,0.5,2.76,0.0,0.3,16.56,2.5,0.0
3,2020-01-20 17:16:38,2020-01-20 17:23:34,2.0,1.54,Standard Rate,N,Manhattan,Clinton East,Manhattan,Lincoln Square East,Credit Card,7.0,1.0,0.5,2.26,0.0,0.3,13.56,2.5,0.0
4,2020-01-14 11:48:39,2020-01-14 12:03:40,1.0,2.4,Standard Rate,N,Manhattan,Midtown Center,Manhattan,Yorkville West,Credit Card,11.5,2.5,0.5,2.96,0.0,0.3,17.76,2.5,0.0


In [30]:
# Summary statistics for the numeric columns.
# NOTE(review): the captured output shows negative minima and extreme maxima
# (e.g. fare_amount min -133391414, trip_miles max 351613.36); no cell in this
# section filters such rows before the frame is saved.
yellow_df.describe()

Unnamed: 0,passenger_count,trip_miles,fare_amount,extra,mta_tax,tips,tolls,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,13073455.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0,13836755.0
mean,1.391,5.2884,6.1314,1.1944,0.4882,12.3619,0.4654,0.5566,22.8663,2.1306,0.081
std,0.9662,534.2129,35860.2324,1.5148,0.0993,35860.0243,1.9561,0.3584,117.2293,0.9295,0.3495
min,0.0,-29.06,-133391414.0,-39.17,-0.5,-333.32,-114.2,-1.0,-1000.0,-2.5,-1.75
25%,1.0,1.05,7.2,0.0,0.5,0.0,0.0,0.3,12.74,2.5,0.0
50%,1.0,1.8,11.0,0.5,0.5,2.15,0.0,0.3,17.16,2.5,0.0
75%,1.0,3.35,17.7,2.5,0.5,3.5,0.0,1.0,25.0,2.5,0.0
max,9.0,351613.36,429496.72,90.06,42.17,133391363.53,700.17,1.0,429562.25,3.0,1.75


In [31]:
# Persist the cleaned yellow-taxi trips; index=False leaves the row index out of the file.
yellow_df.to_csv("data/Yellow_Trip_Data_Clean.csv", index=False)

In [43]:
# Read the file written by the previous cell back in to inspect the saved result.
# NOTE(review): this parses the complete CSV just for a preview, and the cell's
# execution count (43) is out of sequence with its neighbours -- re-run the
# notebook top to bottom (or drop this check) before sharing.
pd.read_csv("data/Yellow_Trip_Data_Clean.csv")

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,trip_miles,trip_category,store_and_fwd_flag,pick_up_borough,pick_up_zone,drop_off_borough,drop_off_zone,payment_type,fare_amount,extra,mta_tax,tips,tolls,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2020-01-12 03:00:14,2020-01-12 03:10:18,1.0000,3.4000,Standard Rate,N,Manhattan,TriBeCa/Civic Center,Manhattan,East Village,Credit Card,12.0000,0.5000,0.5000,3.1600,0.0000,0.3000,18.9600,2.5000,0.0000
1,2020-01-15 18:39:38,2020-01-15 18:50:17,1.0000,2.0100,Standard Rate,N,Manhattan,Meatpacking/West Village West,Manhattan,Clinton East,Credit Card,9.5000,1.0000,0.5000,2.7600,0.0000,0.3000,16.5600,2.5000,0.0000
2,2020-01-30 07:34:31,2020-01-30 07:48:24,1.0000,1.8800,Standard Rate,N,Manhattan,Lenox Hill West,Manhattan,Lincoln Square East,Credit Card,10.5000,0.0000,0.5000,2.7600,0.0000,0.3000,16.5600,2.5000,0.0000
3,2020-01-20 17:16:38,2020-01-20 17:23:34,2.0000,1.5400,Standard Rate,N,Manhattan,Clinton East,Manhattan,Lincoln Square East,Credit Card,7.0000,1.0000,0.5000,2.2600,0.0000,0.3000,13.5600,2.5000,0.0000
4,2020-01-14 11:48:39,2020-01-14 12:03:40,1.0000,2.4000,Standard Rate,N,Manhattan,Midtown Center,Manhattan,Yorkville West,Credit Card,11.5000,2.5000,0.5000,2.9600,0.0000,0.3000,17.7600,2.5000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13836750,2024-10-28 13:30:42,2024-10-28 13:52:28,2.0000,5.3800,Standard Rate,N,Manhattan,West Village,Manhattan,Manhattan Valley,Credit Card,26.8000,0.0000,0.5000,7.7000,0.0000,1.0000,38.5000,2.5000,0.0000
13836751,2024-10-30 22:20:28,2024-10-30 22:37:53,1.0000,3.5800,Standard Rate,N,Manhattan,West Village,Manhattan,Lenox Hill West,Credit Card,19.1000,1.0000,0.5000,6.0300,0.0000,1.0000,30.1300,2.5000,0.0000
13836752,2024-10-05 20:49:33,2024-10-05 21:09:48,1.0000,10.4900,Standard Rate,N,Manhattan,Lincoln Square East,Bronx,Riverdale/North Riverdale/Fieldston,Credit Card,42.2000,1.0000,0.5000,10.0800,3.1800,1.0000,60.4600,2.5000,0.0000
13836753,2024-10-30 09:27:50,2024-10-30 09:38:25,1.0000,1.1000,Standard Rate,N,Manhattan,Yorkville West,Manhattan,Lenox Hill East,Credit Card,10.0000,2.5000,0.5000,2.8000,0.0000,1.0000,16.8000,2.5000,0.0000


# Green Taxi Dataset

In [32]:
# Rename columns; several take the names already used by the for-hire data
cols_replace = {
    "lpep_pickup_datetime": "pickup_datetime",
    "lpep_dropoff_datetime": "dropoff_datetime",
    "trip_distance": "trip_miles",
    "tolls_amount": "tolls",
    "tip_amount": "tips",
    "ratecodeid": "trip_category",
}
green_df = green_df.rename(columns=cols_replace)

In [33]:
green_df.isna().any()

vendorid                 False
pickup_datetime          False
dropoff_datetime         False
store_and_fwd_flag        True
trip_category             True
pulocationid             False
dolocationid             False
passenger_count           True
trip_miles               False
fare_amount              False
extra                    False
mta_tax                  False
tips                     False
tolls                    False
ehail_fee                 True
improvement_surcharge    False
total_amount             False
payment_type              True
trip_type                 True
congestion_surcharge      True
dtype: bool

In [34]:
# A missing congestion surcharge is recorded as zero
green_df["congestion_surcharge"] = green_df["congestion_surcharge"].fillna(0.0)

In [35]:
green_df["trip_category"].value_counts()

trip_category
1.0000     1716374
5.0000       75579
2.0000        5103
4.0000        1856
3.0000        1095
99.0000         96
6.0000          19
Name: count, dtype: int64

In [36]:
green_df["payment_type"].value_counts()

payment_type
1.0000    1132550
2.0000     651926
3.0000      12129
4.0000       3469
5.0000         48
Name: count, dtype: int64

In [37]:
# Turn the numeric rate codes and payment codes into descriptive labels
# (missing values stay missing)
rate_code_labels = {
    1: "Standard Rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated Fare",
    6: "Group Ride",
    99: "Unknown",
}
trip_category = green_df["trip_category"].replace(rate_code_labels)
green_df["trip_category"] = trip_category

payment_labels = {
    1: "Credit Card",
    2: "Cash",
    3: "No Charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided Trip",
}
payment_type = green_df["payment_type"].replace(payment_labels)
green_df["payment_type"] = payment_type

In [38]:
# Translate pickup / drop-off location ids into borough and zone names
borough_lookup = taxi_zones_df["borough"]
zone_lookup = taxi_zones_df["zone"]
pickup_ids = green_df["pulocationid"]
dropoff_ids = green_df["dolocationid"]

pick_up_borough = pickup_ids.map(borough_lookup)
pick_up_zone = pickup_ids.map(zone_lookup)
drop_off_borough = dropoff_ids.map(borough_lookup)
drop_off_zone = dropoff_ids.map(zone_lookup)

# Add the four name columns at consecutive positions, beginning at position 5
new_location_columns = {
    "pick_up_borough": pick_up_borough,
    "pick_up_zone": pick_up_zone,
    "drop_off_borough": drop_off_borough,
    "drop_off_zone": drop_off_zone,
}
for position, (column_name, column_values) in enumerate(new_location_columns.items(), start=5):
    green_df.insert(loc=position, column=column_name, value=column_values)

# The id columns and the vendor id are no longer required
green_df = green_df.drop(columns=["pulocationid", "dolocationid", "vendorid"])

In [39]:
# Nothing to do here: green_df["payment_type"] was already converted from numeric
# codes to labels earlier in this section (together with trip_category), so the
# replace() that used to be repeated in this cell had no numeric codes left to
# translate and has been removed.

In [40]:
green_df.duplicated().sum()

np.int64(0)

In [41]:
# Persist the cleaned green-taxi trips; index=False leaves the row index out of the file.
green_df.to_csv("data/Green_Trip_Data_Clean.csv", index=False)