In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [22]:
# Load the NYC taxi trips for January and July 2019 into a single data frame.
# Only the pickup timestamp, passenger count, trip distance, and the
# fare/charge columns named in `wanted_columns` are read from each file.

filenames = ['../data/nyc_taxi_2019-01.csv', '../data/nyc_taxi_2019-07.csv']

wanted_columns = [
    'tpep_pickup_datetime', 'passenger_count', 'trip_distance',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount', 'congestion_surcharge',
]

# One frame per file, with the pickup timestamp parsed as a datetime.
all_dfs = [
    pd.read_csv(
        csv_path,
        usecols=wanted_columns,
        parse_dates=['tpep_pickup_datetime'],
    )
    for csv_path in filenames
]

# Stack the monthly frames; each one keeps its own row labels.
df = pd.concat(all_dfs)

df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [28]:
# Add pre_tip_amount: the row-wise total of every charge except the tip.
# sum() skips missing values by default, so a missing congestion_surcharge
# simply contributes nothing to the total.
df['pre_tip_amount'] = df[
    ['fare_amount', 'extra', 'mta_tax',
     'tolls_amount', 'improvement_surcharge', 'congestion_surcharge']
].sum(axis=1)

In [29]:
# Add tip_percentage: the tip as a share of pre_tip_amount (not of fare_amount alone).
# The value is stored as a fraction (0.15 means 15%); it is not multiplied by 100.
df['tip_percentage'] = df['tip_amount'].div(df['pre_tip_amount'])

In [30]:
# What was the overall tip percentage?
# Computed as the mean of the per-ride tip fractions.
df['tip_percentage'].agg('mean')

0.13003974566357937

In [31]:
# How many times did people tip more than 100% (a tip fraction above 1)?
# True counts the rides above the threshold, False everything else.
df['tip_percentage'].gt(1).value_counts()

tip_percentage
False    13970379
True         7832
Name: count, dtype: int64

In [32]:
# What share of rides had no tip at all (a tip fraction of exactly 0)?
# normalize=True reports proportions instead of raw counts.
# Rows whose fraction is NaN compare unequal to 0 and land in the False group.
df['tip_percentage'].eq(0).value_counts(normalize=True)

tip_percentage
False    0.67923
True     0.32077
Name: proportion, dtype: float64

In [33]:
# On which day of the week is the mean tip fraction highest?
# Day codes: Mon = 0, Tues = 1, Wed = 2, Thu = 3, Fri = 4, Sat = 5, Sun = 6

(
    df
    .groupby(df['tpep_pickup_datetime'].dt.day_of_week)
    ['tip_percentage']
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
3    0.133970
2    0.132221
1    0.131424
4    0.129136
0    0.128723
6    0.126634
5    0.125801
Name: tip_percentage, dtype: float64

In [34]:
# At which pickup hour (0-23) is the mean tip fraction highest?
(
    df
    .groupby(df['tpep_pickup_datetime'].dt.hour)
    ['tip_percentage']
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
22    0.138816
20    0.138160
21    0.137685
8     0.137116
19    0.135174
23    0.134978
18    0.133292
9     0.133017
7     0.132134
0     0.131490
2     0.130914
1     0.130710
17    0.128640
10    0.127200
11    0.125022
16    0.124655
13    0.124567
12    0.124376
14    0.123727
15    0.123547
3     0.121053
6     0.119915
4     0.118987
5     0.112028
Name: tip_percentage, dtype: float64

In [35]:
# Do people tip more, on average, in January (1) or July (7)?
# The files also contain some pickups dated outside those two months,
# so other month numbers appear as extra groups in the result.
(
    df
    .groupby(df['tpep_pickup_datetime'].dt.month)
    ['tip_percentage']
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
5     0.200000
8     0.158099
3     0.148046
9     0.141431
1     0.137011
2     0.132224
7     0.121570
12    0.109367
6     0.107354
10    0.100000
4     0.074877
11    0.046026
Name: tip_percentage, dtype: float64

In [36]:
# What was the 1-day period in our data set when people tipped the greatest percentage?

# Move the pickup timestamp into the index so that resample() can bin by day.
# The guard keeps this cell re-runnable: once the column has become the index,
# a second set_index() call would raise KeyError.
if 'tpep_pickup_datetime' in df.columns:
    df = df.set_index('tpep_pickup_datetime')

# Daily mean tip fraction, ten highest days first.
(
    df
    .resample('1D')
    ['tip_percentage']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

tpep_pickup_datetime
2019-02-13    0.358127
2019-02-25    0.250000
2019-08-20    0.241865
2019-11-27    0.200000
2019-08-15    0.200000
2019-05-20    0.200000
2019-08-10    0.200000
2019-09-22    0.200000
2019-09-24    0.200000
2019-09-25    0.200000
Name: tip_percentage, dtype: float64

In [41]:
# Try again, keeping only the two months the files are meant to cover.

# Sort by pickup time before slicing the index by date strings.
df = df.sort_index()

# Keep January 2019 and July 2019; both ends of each slice are inclusive.
df = pd.concat([
    df.loc[first_day:last_day]
    for first_day, last_day in [('2019-01-01', '2019-01-31'),
                                ('2019-07-01', '2019-07-31')]
])

# Daily mean tip fraction, ten highest days first.
(
    df
    .resample('1D')
    ['tip_percentage']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

tpep_pickup_datetime
2019-01-31    0.144351
2019-01-30    0.143530
2019-01-24    0.143434
2019-01-22    0.142769
2019-01-15    0.142329
2019-01-29    0.141330
2019-01-10    0.141291
2019-01-16    0.141147
2019-01-17    0.140356
2019-01-23    0.140309
Name: tip_percentage, dtype: float64

In [42]:
# Mean tip fraction per calendar month, labelled by month end.
# The month-end offset object is passed instead of the 'M' string alias:
# the alias is deprecated in newer pandas releases (renamed to 'ME'),
# while the offset object is accepted regardless of that rename.
# dropna() removes the empty months between January and July.
df.resample(pd.offsets.MonthEnd())['tip_percentage'].mean().dropna()

tpep_pickup_datetime
2019-01-31    0.137012
2019-07-31    0.121570
Name: tip_percentage, dtype: float64