# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Raw trip extracts, in the order: for-hire (FHVHV), yellow taxi, green taxi.
# The first CSV column is used as the row index for each frame.
trip_csv_paths = (
    "data/FHVHV_Trip_Data.csv",
    "data/Yellow_Trip_Data.csv",
    "data/Green_Trip_Data.csv",
)
fhv_df, yellow_df, green_df = (
    pd.read_csv(csv_path, index_col=[0]) for csv_path in trip_csv_paths
)

In [3]:
# Zone lookup table: lower-case the headers, then key the rows by location id
# so the table can be used directly as a mapping from id -> borough / zone.
taxi_zones_df = (
    pd.read_csv("data/Taxi_Zones_Data.csv")
    .rename(columns=str.lower)
    .set_index("locationid")
)

In [4]:
# Preview the first five rows of the zone lookup table
taxi_zones_df.iloc[:5]

Unnamed: 0_level_0,borough,zone,service_zone
locationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone


# For-Hire Dataset

In [5]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted)
fhv_df.duplicated(keep="first").sum()

np.int64(0)

In [6]:
# Trip counts per license code, before the codes are translated to names
fhv_df.loc[:, "hvfhs_license_num"].value_counts()

hvfhs_license_num
HV0003    10521027
HV0005     3908330
HV0004       70643
Name: count, dtype: int64

In [7]:
# Translate each license code into the company name it stands for
license_to_company = {
    "HV0002": "Juno",
    "HV0003": "Uber",
    "HV0004": "Via",
    "HV0005": "Lyft",
}
fhv_df["hvfhs_license_num"] = fhv_df["hvfhs_license_num"].replace(license_to_company)

# Confirm the counts per company are unchanged by the relabelling
fhv_df["hvfhs_license_num"].value_counts()

hvfhs_license_num
Uber    10521027
Lyft     3908330
Via        70643
Name: count, dtype: int64

In [8]:
# Filter to only Uber and Lyft.
# An allow-list is used instead of excluding "Via" so that Juno or any
# unmapped license code is dropped as well, matching the stated intent.
# reset_index is chained (not in-place) so the result is a fresh frame with a
# clean 0..n-1 index rather than an in-place edit of a filtered slice.
fhv_df = fhv_df[fhv_df["hvfhs_license_num"].isin(["Uber", "Lyft"])].reset_index(drop=True)

In [9]:
# Base-number columns are not used downstream, so remove them
bad_cols = ["dispatching_base_num", "originating_base_num"]
fhv_df = fhv_df.drop(labels=bad_cols, axis="columns")

In [20]:
# Flag every column that contains at least one missing value.
# NOTE(review): the saved output for this cell lists columns that are only
# created by later cells (e.g. "business", "pick_up_borough"), so it was
# produced out of order -- re-run the notebook top to bottom to refresh it.
fhv_df.isnull().any(axis=0)

business                False
request_datetime        False
on_scene_datetime        True
pickup_datetime         False
dropoff_datetime        False
pick_up_borough          True
pick_up_zone            False
drop_off_borough         True
drop_off_zone           False
trip_miles              False
trip_time               False
base_passenger_fare     False
tolls                   False
black_car_fund          False
sales_tax               False
congestion_surcharge    False
airport_fee             False
tips                    False
driver_pay              False
shared_request_flag     False
shared_match_flag       False
access_a_ride_flag      False
wav_request_flag        False
wav_match_flag          False
dtype: bool

In [11]:
# A missing airport fee is treated as no fee charged
fhv_df["airport_fee"] = fhv_df["airport_fee"].fillna(0.0)

In [12]:
# Lookups from location id -> borough / zone name (indexed by locationid)
borough_by_id = taxi_zones_df["borough"]
zone_by_id = taxi_zones_df["zone"]

# Translate the pick-up and drop-off location ids into readable names.
# Ids that are absent from the lookup come back as NaN.
# NOTE(review): the saved null check reports missing values in both borough
# columns -- presumably unmatched ids or blank boroughs in the lookup; confirm.
pick_up_borough = fhv_df["pulocationid"].map(borough_by_id)
pick_up_zone = fhv_df["pulocationid"].map(zone_by_id)
drop_off_borough = fhv_df["dolocationid"].map(borough_by_id)
drop_off_zone = fhv_df["dolocationid"].map(zone_by_id)

# Insert the four new columns side by side at positions 7-10
new_location_columns = [
    ("pick_up_borough", pick_up_borough),
    ("pick_up_zone", pick_up_zone),
    ("drop_off_borough", drop_off_borough),
    ("drop_off_zone", drop_off_zone),
]
for position, (column_name, column_values) in enumerate(new_location_columns, start=7):
    fhv_df.insert(loc=position, column=column_name, value=column_values)

In [13]:
# The raw location ids are redundant once borough/zone names are in place
fhv_df = fhv_df.drop(labels=["pulocationid", "dolocationid"], axis="columns")

In [14]:
# Give two terse columns more descriptive names
column_renames = {"hvfhs_license_num": "business", "bcf": "black_car_fund"}
fhv_df = fhv_df.rename(columns=column_renames)

In [15]:
# Preview the first five cleaned for-hire trips
fhv_df.iloc[:5]

Unnamed: 0,business,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,pick_up_borough,pick_up_zone,drop_off_borough,drop_off_zone,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,Uber,2020-01-21 18:46:40,2020-01-21 18:47:21,2020-01-21 18:47:57,2020-01-21 18:58:42,Manhattan,Two Bridges/Seward Park,Brooklyn,Williamsburg (North Side),2.54,...,1.18,2.75,0.0,3.0,8.11,N,N,,N,N
1,Uber,2020-01-17 19:27:42,2020-01-17 19:28:38,2020-01-17 19:29:55,2020-01-17 19:45:27,Manhattan,East Village,Manhattan,Union Sq,1.66,...,1.15,2.75,0.0,1.0,9.5,N,N,,N,N
2,Lyft,2020-01-18 11:57:57,,2020-01-18 12:00:50,2020-01-18 12:08:26,Manhattan,Midtown East,Manhattan,Lenox Hill West,1.394,...,0.64,2.75,0.0,0.0,5.39,N,N,N,N,N
3,Lyft,2020-01-13 19:27:35,,2020-01-13 19:31:23,2020-01-13 19:42:23,Manhattan,Central Harlem North,Bronx,West Concourse,1.647,...,0.61,0.0,0.0,1.0,7.24,N,N,N,N,N
4,Lyft,2020-01-24 03:56:08,,2020-01-24 03:59:28,2020-01-24 04:11:37,Queens,LaGuardia Airport,Queens,Elmhurst,3.859,...,0.91,0.0,0.0,1.0,10.24,N,N,N,N,N


In [16]:
# Summarise the cleaned frame's columns and dtypes.
# NOTE(review): per the saved output, the *_datetime columns are still plain
# object dtype at this point -- convert them if time arithmetic is needed later.
fhv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14429357 entries, 0 to 14429356
Data columns (total 24 columns):
 #   Column                Dtype  
---  ------                -----  
 0   business              object 
 1   request_datetime      object 
 2   on_scene_datetime     object 
 3   pickup_datetime       object 
 4   dropoff_datetime      object 
 5   pick_up_borough       object 
 6   pick_up_zone          object 
 7   drop_off_borough      object 
 8   drop_off_zone         object 
 9   trip_miles            float64
 10  trip_time             int64  
 11  base_passenger_fare   float64
 12  tolls                 float64
 13  black_car_fund        float64
 14  sales_tax             float64
 15  congestion_surcharge  float64
 16  airport_fee           float64
 17  tips                  float64
 18  driver_pay            float64
 19  shared_request_flag   object 
 20  shared_match_flag     object 
 21  access_a_ride_flag    object 
 22  wav_request_flag      object 
 23  wav_m

In [17]:
# Persist the cleaned for-hire trips; the RangeIndex is not written out
clean_fhv_path = "data/FHVHV_Trip_Data_Clean.csv"
fhv_df.to_csv(clean_fhv_path, index=False)