In [14]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [16]:
# Read the July 2019 taxi file, loading only the five columns used below and
# parsing the pickup and dropoff timestamps as datetimes.
filename = '../data/nyc_taxi_2019-07.csv'

timestamp_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
measure_columns = ['trip_distance', 'passenger_count', 'total_amount']

df = pd.read_csv(
    filename,
    usecols=timestamp_columns + measure_columns,
    parse_dates=timestamp_columns,
)

df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


In [17]:
# Inspect the dtype of every loaded column, e.g. to check how the two columns
# passed to parse_dates were read.
df.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
total_amount                    float64
dtype: object

In [18]:
# Add `trip_time`: how long each ride lasted (dropoff timestamp minus pickup timestamp).
df['trip_time'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])

# Preview the first few durations.
df['trip_time'].head()

0   0 days 00:00:29
1   0 days 00:19:42
2   0 days 00:35:47
3   0 days 00:41:55
4   0 days 00:12:10
Name: trip_time, dtype: timedelta64[ns]

In [19]:
# How many rides lasted less than 1 minute?
# Summing the boolean mask counts its True entries.
(df['trip_time'] < '1 minute').sum()


np.int64(70212)

In [20]:
# What was the mean total_amount for rides that lasted less than 1 minute?
df['total_amount'][df['trip_time'] < '1 minute'].mean()

np.float64(30.397584031219733)

In [21]:
# What percentage of rides lasted less than 1 minute?
# Numerator: rides under 1 minute; denominator: rides with a non-missing trip_time.
(df['trip_time'] < '1 minute').sum() / df['trip_time'].count() * 100

np.float64(1.1126361022936828)

In [22]:
# How many rides lasted more than 10 hours?
# Summing the boolean mask counts its True entries.
(df['trip_time'] > '10 hours').sum()

np.int64(16698)

In [23]:
# Create a new column, `trip_time_group`, labelling each ride by its duration:
#   `short`  - from 0 seconds up to and including 10 minutes
#   `medium` - more than 10 minutes, up to and including 1 hour
#   `long`   - more than 1 hour, up to and including 100 hours
#
# pd.cut closes each bin on the right by default, which is why a ride of exactly
# 10 minutes is `short` and one of exactly 1 hour is `medium`.
# include_lowest=True also closes the first bin on the left, so a ride whose
# trip_time is exactly 0 seconds is labelled `short` instead of being left as NaN.
# Rides with a negative trip_time, or one above 100 hours, match no bin and get NaN.

df['trip_time_group'] = (
    pd.cut(
           df['trip_time'],
           bins=[pd.to_timedelta(arg)
                 for arg in ['0 seconds', '10 minutes',
                        '1 hour', '100 hours']],
           labels=['short', 'medium', 'long'],
           include_lowest=True)
)

In [24]:
# For each trip_time_group, what was the average number of passengers?
# observed=False keeps every category of the grouping column in the result.
(
    df
    .groupby('trip_time_group', observed=False)['passenger_count']
    .mean()
)

trip_time_group
short     1.552411
medium    1.585806
long      1.700859
Name: passenger_count, dtype: float64

In [25]:
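# NOTE: disabled scratch example, unrelated to the taxi data. It builds a small
# frame of day/month/year strings and combines the three columns into datetimes
# with pd.to_datetime. If re-enabled as written it would rebind `df`, so give
# the frame a different name first.
# df = DataFrame(
#     [s.split('-')        
#         for s in ['14-07-1970', '01-03-1971', '16-12-2000', '17-12-2002','31-10-2005']],
#         columns='day month year'.split())

# pd.to_datetime(df[['year', 'month', 'day']])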


In [26]:
# Load the January and July 2019 files, reading only the pickup timestamp
# (parsed as a datetime), passenger count, trip distance and the individual
# charge columns, then stack the two frames into one.
filenames = ['../data/nyc_taxi_2019-01.csv', '../data/nyc_taxi_2019-07.csv']

all_dfs = [pd.read_csv(one_filename,
            usecols=['tpep_pickup_datetime', 'passenger_count', 'trip_distance',
                    'fare_amount','extra','mta_tax','tip_amount','tolls_amount',
                    'improvement_surcharge','total_amount','congestion_surcharge'],
           parse_dates=['tpep_pickup_datetime'])
           for one_filename in filenames
           ]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file keeps its own 0-based row labels, so the same label would refer to
# one row from each file and label-based lookups would be ambiguous.
df = pd.concat(all_dfs, ignore_index=True)

df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [27]:
# Add `pre_tip_amount`: the row-wise sum of the six charge columns listed below
# (tip_amount is deliberately left out). Missing values are skipped by sum(),
# so a row with no congestion_surcharge still gets a total.
pre_tip_columns = ['fare_amount', 'extra', 'mta_tax', 'tolls_amount',
                   'improvement_surcharge', 'congestion_surcharge']
df['pre_tip_amount'] = df[pre_tip_columns].sum(axis=1)

In [28]:
# Add `tip_percentage`: tip_amount divided by pre_tip_amount.
# Despite the name, the value is a fraction (0.15 means 15%), not a 0-100 figure.
df['tip_percentage'] = df['tip_amount'].div(df['pre_tip_amount'])

In [29]:
# Mean of the per-ride tip_percentage values (a fraction, not a 0-100 figure).
df['tip_percentage'].mean()

np.float64(0.13003974566357937)

In [30]:
# What was the overall tip percentage?
# Computed as the unweighted mean of the per-ride tip_percentage values, so every
# ride counts equally; the result is a fraction (0.13 means 13%).
df['tip_percentage'].mean()

np.float64(0.13003974566357937)

In [31]:
# How many times did people tip more than 100%?
# tip_percentage is stored as a fraction, so "more than 100%" means a value above 1.
# The True row of the result is the answer.
df['tip_percentage'].gt(1).value_counts()

tip_percentage
False    13970379
True         7832
Name: count, dtype: int64

In [32]:
# What share of taxi rides had no tip at all?
# normalize=True reports proportions (0-1) rather than counts; the True row is
# the share of rides whose tip_percentage is exactly 0.
df['tip_percentage'].eq(0).value_counts(normalize=True)

tip_percentage
False    0.67923
True     0.32077
Name: proportion, dtype: float64

In [33]:
# On which day of the week do people tip the greatest percentage?
# Index key: Mon = 0, Tues = 1, Wed = 2, Thu = 3, Fri = 4, Sat = 5, Sun = 6
# Rides are grouped by the pickup timestamp's weekday; the highest mean comes first.
(
    df
    .groupby(df['tpep_pickup_datetime'].dt.day_of_week)['tip_percentage']
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
3    0.133970
2    0.132221
1    0.131424
4    0.129136
0    0.128723
6    0.126634
5    0.125801
Name: tip_percentage, dtype: float64

In [34]:
# At which hour do people tip the greatest percentage?
# Rides are grouped by the hour (0-23) of the pickup timestamp; the highest mean comes first.
(
    df
    .groupby(df['tpep_pickup_datetime'].dt.hour)['tip_percentage']
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
22    0.138816
20    0.138160
21    0.137685
8     0.137116
19    0.135174
23    0.134978
18    0.133292
9     0.133017
7     0.132134
0     0.131490
2     0.130914
1     0.130710
17    0.128640
10    0.127200
11    0.125022
16    0.124655
13    0.124567
12    0.124376
14    0.123727
15    0.123547
3     0.121053
6     0.119915
4     0.118987
5     0.112028
Name: tip_percentage, dtype: float64