In [1]:
import os
import warnings

import numpy as np
import pandas as pd

# to bypass warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the 2020 trips CSV. Set the NYC_TAXI_TRIPS_CSV environment
# variable to point at a different copy; when it is unset the original
# machine-specific path is used, so existing runs behave as before.
file_path = os.environ.get(
    "NYC_TAXI_TRIPS_CSV",
    "/home/dcti-02-11/projects/data/NYC_Taxi_Trips/data/2020_taxi_trips.csv",
)

# Load the raw trip records into a DataFrame.
trips_2020 = pd.read_csv(file_path)

In [10]:
# Print a summary of the frame: index range, column dtypes and non-null counts.
trips_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1205954 entries, 0 to 1730575
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1205954 non-null  float64
 1   lpep_pickup_datetime   1205954 non-null  object 
 2   lpep_dropoff_datetime  1205954 non-null  object 
 3   store_and_fwd_flag     1205954 non-null  bool   
 4   RatecodeID             1205954 non-null  float64
 5   PULocationID           1205954 non-null  int64  
 6   DOLocationID           1205954 non-null  int64  
 7   passenger_count        1205954 non-null  float64
 8   trip_distance          1205954 non-null  float64
 9   fare_amount            1205954 non-null  float64
 10  extra                  1205954 non-null  float64
 11  mta_tax                1205954 non-null  float64
 12  tip_amount             1205954 non-null  float64
 13  tolls_amount           1205954 non-null  float64
 14  improvement_surcha

In [4]:
# Discard every row that has a missing value in any column.
trips_2020 = trips_2020.dropna(axis=0, how='any')

In [21]:
# Preview the first five rows.
trips_2020.head(5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,trip_type,congestion_surcharge,trip_distance_km,time_diff,pickup_date,dropoff_date,pickup_month,dropoff_month,pickup_day,dropoff_day
0,2.0,2020-01-01 06:47:28,2020-01-01 06:52:54,False,1.0,74,75,1,1.47,6.5,...,1.0,0.0,2.365736,0 days 00:05:26,2020-01-01,2020-01-01,1,1,Wednesday,Wednesday
1,2.0,2020-01-01 13:25:34,2020-01-01 13:30:43,False,1.0,74,75,1,1.49,6.5,...,1.0,0.0,2.397923,0 days 00:05:09,2020-01-01,2020-01-01,1,1,Wednesday,Wednesday
2,2.0,2020-01-01 14:20:35,2020-01-01 14:26:25,False,1.0,74,75,1,1.31,6.5,...,1.0,0.0,2.108241,0 days 00:05:50,2020-01-01,2020-01-01,1,1,Wednesday,Wednesday
3,2.0,2020-01-02 06:56:47,2020-01-02 07:03:03,False,1.0,74,75,1,1.43,6.5,...,1.0,0.0,2.301362,0 days 00:06:16,2020-01-02,2020-01-02,1,1,Thursday,Thursday
4,2.0,2020-01-02 09:34:46,2020-01-02 09:41:02,False,1.0,74,75,1,1.1,6.5,...,1.0,0.0,1.770278,0 days 00:06:16,2020-01-02,2020-01-02,1,1,Thursday,Thursday


## Replace and convert columns to the appropriate data types

In [11]:
# Map the 'Y'/'N' flag values onto booleans: 'Y' -> True, 'N' -> False
trips_2020['store_and_fwd_flag'] = trips_2020['store_and_fwd_flag'].replace(
    to_replace={'N':False, 'Y':True}
)

In [12]:
# cast the columns listed below to the pandas 'category' dtype
list_convert = ['VendorID','RatecodeID','payment_type','trip_type']
trips_2020 = trips_2020.astype(dict.fromkeys(list_convert, 'category'))

In [13]:
# passenger counts are whole numbers: cast the column to int
trips_2020 = trips_2020.astype({'passenger_count': int})

In [14]:
# parse the pickup and dropoff timestamp columns into datetime values
trips_2020 = trips_2020.assign(
    lpep_pickup_datetime=pd.to_datetime(trips_2020['lpep_pickup_datetime']),
    lpep_dropoff_datetime=pd.to_datetime(trips_2020['lpep_dropoff_datetime']),
)

## Adding new Columns

In [15]:
# express the trip distance in kilometres (1 mile = 1.609344 km)
trips_2020['trip_distance_km'] = trips_2020['trip_distance'].mul(1.609344)

In [16]:
# trip duration: dropoff timestamp minus pickup timestamp
trips_2020['time_diff'] = trips_2020['lpep_dropoff_datetime'].sub(
    trips_2020['lpep_pickup_datetime']
)

In [17]:
# Keep only the calendar date of each pickup and dropoff, stored as datetime64
# at midnight. .dt.normalize() does this in one vectorised step; going through
# .dt.date builds a Python date object per row and then re-parses them all.
trips_2020['pickup_date'] = trips_2020['lpep_pickup_datetime'].dt.normalize()
trips_2020['dropoff_date'] = trips_2020['lpep_dropoff_datetime'].dt.normalize()

In [18]:
# month number of the pickup and dropoff dates
trips_2020 = trips_2020.assign(
    pickup_month=trips_2020['pickup_date'].dt.month,
    dropoff_month=trips_2020['dropoff_date'].dt.month,
)

In [19]:
# weekday name (e.g. 'Wednesday') of the pickup and dropoff dates
trips_2020 = trips_2020.assign(
    pickup_day=trips_2020['pickup_date'].dt.day_name(),
    dropoff_day=trips_2020['dropoff_date'].dt.day_name(),
)

## Remove trips with non-positive distances or durations

In [32]:
# keep only rows whose trip distance is strictly positive
# (this drops zero-distance trips as well as negative ones)
trips_2020 = trips_2020.loc[trips_2020['trip_distance_km'].gt(0)]

In [40]:
# keep only rows whose duration is strictly positive
# (this drops zero-length trips as well as negative ones)
trips_2020 = trips_2020.loc[trips_2020['time_diff'].gt(pd.Timedelta(0))]

In [None]:
# How often each distinct fare amount occurs, most frequent first.
trips_2020.fare_amount.value_counts()

In [None]:
# How often each distinct distance (km) occurs, most frequent first.
trips_2020['trip_distance_km'].value_counts()

In [38]:
# Longest remaining trip duration.
trips_2020.time_diff.max()

Timedelta('0 days 23:59:58')

In [39]:
# Preview the last five rows.
trips_2020.tail(5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,trip_type,congestion_surcharge,trip_distance_km,time_diff,pickup_date,dropoff_date,pickup_month,dropoff_month,pickup_day,dropoff_day
1730033,2.0,2020-12-09 15:22:43,2020-12-09 15:31:23,False,1.0,244,220,1,3.65,12.0,...,1.0,0.0,5.874106,0 days 00:08:40,2020-12-09,2020-12-09,12,12,Wednesday,Wednesday
1730034,2.0,2020-12-11 10:55:38,2020-12-11 11:05:31,False,1.0,244,220,1,4.06,13.5,...,1.0,0.0,6.533937,0 days 00:09:53,2020-12-11,2020-12-11,12,12,Friday,Friday
1730035,2.0,2020-12-16 14:35:14,2020-12-16 14:48:01,False,1.0,244,220,1,3.27,13.0,...,1.0,0.0,5.262555,0 days 00:12:47,2020-12-16,2020-12-16,12,12,Wednesday,Wednesday
1730036,2.0,2020-12-16 14:03:48,2020-12-16 14:13:28,False,1.0,244,220,1,3.87,13.0,...,1.0,0.0,6.228161,0 days 00:09:40,2020-12-16,2020-12-16,12,12,Wednesday,Wednesday
1730037,2.0,2020-12-22 15:51:20,2020-12-22 16:00:48,False,1.0,244,220,1,3.5,13.0,...,1.0,0.0,5.632704,0 days 00:09:28,2020-12-22,2020-12-22,12,12,Tuesday,Tuesday


In [41]:
# Print a summary of the frame: index range, column dtypes and non-null counts.
trips_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1146566 entries, 0 to 1730037
Data columns (total 27 columns):
 #   Column                 Non-Null Count    Dtype          
---  ------                 --------------    -----          
 0   VendorID               1146566 non-null  category       
 1   lpep_pickup_datetime   1146566 non-null  datetime64[ns] 
 2   lpep_dropoff_datetime  1146566 non-null  datetime64[ns] 
 3   store_and_fwd_flag     1146566 non-null  bool           
 4   RatecodeID             1146566 non-null  category       
 5   PULocationID           1146566 non-null  int64          
 6   DOLocationID           1146566 non-null  int64          
 7   passenger_count        1146566 non-null  int64          
 8   trip_distance          1146566 non-null  float64        
 9   fare_amount            1146566 non-null  float64        
 10  extra                  1146566 non-null  float64        
 11  mta_tax                1146566 non-null  float64        
 12  tip_amount    

In [None]:
# Percentage of missing values in each column.
trips_2020.isna().mean().mul(100)

In [45]:
# Non-null count of every column, grouped by pickup weekday.
trips_2020.groupby('pickup_day').count()

Unnamed: 0_level_0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,payment_type,trip_type,congestion_surcharge,trip_distance_km,time_diff,pickup_date,dropoff_date,pickup_month,dropoff_month,dropoff_day
pickup_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Friday,188044,188044,188044,188044,188044,188044,188044,188044,188044,188044,...,188044,188044,188044,188044,188044,188044,188044,188044,188044,188044
Monday,152201,152201,152201,152201,152201,152201,152201,152201,152201,152201,...,152201,152201,152201,152201,152201,152201,152201,152201,152201,152201
Saturday,167063,167063,167063,167063,167063,167063,167063,167063,167063,167063,...,167063,167063,167063,167063,167063,167063,167063,167063,167063,167063
Sunday,129712,129712,129712,129712,129712,129712,129712,129712,129712,129712,...,129712,129712,129712,129712,129712,129712,129712,129712,129712,129712
Thursday,179228,179228,179228,179228,179228,179228,179228,179228,179228,179228,...,179228,179228,179228,179228,179228,179228,179228,179228,179228,179228
Tuesday,158021,158021,158021,158021,158021,158021,158021,158021,158021,158021,...,158021,158021,158021,158021,158021,158021,158021,158021,158021,158021
Wednesday,172297,172297,172297,172297,172297,172297,172297,172297,172297,172297,...,172297,172297,172297,172297,172297,172297,172297,172297,172297,172297


In [46]:
# Per-weekday totals of the numeric (and boolean) columns.
# numeric_only=True is passed explicitly: the frame also holds datetime,
# timedelta, category and string columns, and newer pandas versions no longer
# drop such columns silently when summing.
trips_2020.groupby(['pickup_day']).sum(numeric_only=True)

Unnamed: 0_level_0,store_and_fwd_flag,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,trip_distance_km,pickup_month,dropoff_month
pickup_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Friday,629,18820034,24013929,243523,558818.76,2326032.5,90057.5,91546.0,234250.18,33718.63,55788.3,2909285.51,87510.75,899331.6,701932,702100
Monday,489,15378125,19881830,196703,517113.43,1903194.57,66537.75,74095.5,190920.65,29724.05,45138.6,2381657.92,81443.75,832213.4,624647,624669
Saturday,508,16814222,21333662,217613,530732.38,2074780.18,38125.0,81021.6,203444.49,27587.4,49494.9,2545071.92,78390.75,854131.0,632670,632865
Sunday,386,13143679,16755656,170797,400339.52,1606442.89,30563.26,62876.05,164318.1,22611.45,38455.5,1983173.8,64805.5,644284.0,481420,481429
Thursday,577,18065228,23176846,233665,520409.85,2218944.33,84622.0,87262.0,230534.81,32772.43,53173.8,2789931.22,92823.25,837518.5,684092,683988
Tuesday,486,15867832,20582638,205137,468059.86,1976475.32,72546.25,77073.65,201934.87,30383.95,46892.4,2481107.84,85612.0,753269.3,648246,648158
Wednesday,551,17263655,22357098,224268,755252.7,2164403.9,79347.0,83839.5,223383.06,33575.22,51110.1,2717573.73,91526.5,1215461.0,678787,678804


In [54]:
# Largest total amount, tip and congestion surcharge seen in each pickup month.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated in pandas 1.x and rejected by pandas 2.x.
trips_2020.groupby(['pickup_month'])[['total_amount', 'tip_amount', 'congestion_surcharge']].max()

Unnamed: 0_level_0,total_amount,tip_amount,congestion_surcharge
pickup_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,753.8,450.0,2.75
2,651.5,641.2,2.75
3,462.27,449.97,2.75
4,398.76,100.0,2.75
5,655.3,160.0,2.75
6,498.8,480.0,2.75
7,803.8,88.0,2.75
8,550.8,200.0,2.75
9,537.85,89.06,2.75
10,257.92,111.6,2.75
