In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load the July 2019 taxi data (see the file name below) into a data frame.
# Only five columns are read: `tpep_pickup_datetime`, `tpep_dropoff_datetime`,
# `passenger_count`, `trip_distance`, and `total_amount`.
# The two timestamp columns are parsed as `datetime` values while reading.

filename = '../data/nyc_taxi_2019-07.csv'

df = pd.read_csv(
    filename,
    usecols=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'trip_distance',
        'passenger_count',
        'total_amount',
    ],
    parse_dates=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
    ],
)

df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


In [3]:
# Inspect the dtype of every column, e.g. to confirm that the two
# timestamp columns were parsed as datetimes rather than left as strings.
df.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
total_amount                    float64
dtype: object

In [4]:
# Add a `trip_time` column holding each ride's duration:
# dropoff timestamp minus pickup timestamp (a timedelta per row).
df['trip_time'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
df['trip_time'].head()

0   0 days 00:00:29
1   0 days 00:19:42
2   0 days 00:35:47
3   0 days 00:41:55
4   0 days 00:12:10
Name: trip_time, dtype: timedelta64[ns]

- `trip_time` 이 음수인 것은 이상치로 간주하여 제거.
- 자료형을 맞춘 후 부울 인덱싱 적용

In [5]:
# Boolean mask of rides whose duration is not negative.
# `pd.to_timedelta('0 seconds')` is an equivalent way to spell the zero duration.
df['trip_time'] >= pd.Timedelta(0)

0          True
1          True
2          True
3          True
4          True
           ... 
6310414    True
6310415    True
6310416    True
6310417    True
6310418    True
Name: trip_time, Length: 6310419, dtype: bool

In [6]:
# Show the rides whose dropoff is recorded earlier than the pickup
# (negative duration).
df.loc[df['trip_time'] < pd.Timedelta(0)]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
775861,2019-07-05 16:26:04,2019-07-05 16:25:33,1.0,0.4,9.35,-1 days +23:59:29
5236711,2019-07-26 21:42:37,2019-07-25 17:08:50,1.0,3.5,33.35,-2 days +19:26:13
6279078,2019-07-02 19:12:10,2019-07-02 19:12:00,,0.0,-27.86,-1 days +23:59:50
6286977,2019-07-11 11:30:10,2019-07-11 11:30:00,,0.0,-23.84,-1 days +23:59:50
6289536,2019-07-13 14:56:10,2019-07-13 14:56:00,,0.04,-26.93,-1 days +23:59:50
6293349,2019-07-16 18:49:10,2019-07-16 18:49:00,,0.08,-21.06,-1 days +23:59:50
6294823,2019-07-18 15:15:10,2019-07-18 15:15:00,,0.0,-12.82,-1 days +23:59:50
6304692,2019-07-26 06:04:10,2019-07-26 06:04:00,,0.01,-24.7,-1 days +23:59:50
6304694,2019-07-26 06:00:10,2019-07-26 06:00:00,,0.14,-24.14,-1 days +23:59:50
6307658,2019-07-30 07:01:10,2019-07-30 07:01:00,,0.0,-17.69,-1 days +23:59:50


In [7]:
# Remove outliers: keep only rides whose duration is zero or positive.
# `.copy()` makes the filtered frame an independent object, so columns
# assigned to `df` in later cells do not trigger SettingWithCopyWarning
# (the plain `.loc[...]` result is tracked by pandas as a selection of the
# original frame when copy-on-write is not enabled).
df = df.loc[df['trip_time'] >= pd.Timedelta(0)].copy()

In [8]:
# How many rides lasted less than 1 minute?
# Select the `trip_time` values under one minute and count them.
(
    df
    .loc[df['trip_time'] < pd.Timedelta(minutes=1), 'trip_time']
    .count()
)

np.int64(70202)

In [9]:
# What percentage of rides lasted less than 1 minute?
# (rides under one minute) / (all rides with a trip_time) * 100
(
    df.loc[df['trip_time'] < pd.Timedelta(minutes=1), 'trip_time'].count()
    / df['trip_time'].count()
    * 100
)

np.float64(1.1124793971357483)

In [10]:
# Average `total_amount` paid for rides that lasted less than 1 minute.
(
    df
    .loc[df['trip_time'] < pd.Timedelta(minutes=1), 'total_amount']
    .mean()
)

np.float64(30.403856157944215)

In [11]:
# How many rides lasted more than 10 hours?
# Select the `trip_time` values over ten hours and count them.
(
    df
    .loc[df['trip_time'] > pd.Timedelta(hours=10), 'trip_time']
    .count()
)

np.int64(16698)

In [12]:
# What percentage of rides lasted more than 10 hours?
# (rides over ten hours) / (all rides with a trip_time) * 100
(
    df.loc[df['trip_time'] > pd.Timedelta(hours=10), 'trip_time'].count()
    / df['trip_time'].count()
    * 100
)

np.float64(0.2646104238251435)

In [13]:
# Create a new column, `trip_time_group`, labelling each ride by duration:
#   `short`  - less than 10 minutes
#   `medium` - from 10 minutes up to and including 1 hour
#   `long`   - more than 1 hour
#
# Notes on the bin edges (pd.cut bins are closed on the right by default):
# - The first edge is zero and `include_lowest=True` keeps zero-length rides
#   in `short`.
# - The second edge sits 1 ns below 10 minutes, so `short` is strictly
#   "< 10 minutes" whatever the resolution of the timestamps.
# - The last edge is `pd.Timedelta.max`, so every ride longer than 1 hour is
#   labelled `long`; a finite cap such as 100 hours would silently turn
#   longer rides into NaN.

df['trip_time_group'] = pd.cut(
    df['trip_time'],
    bins=[
        pd.Timedelta(0),
        pd.Timedelta(minutes=10) - pd.Timedelta(1, unit='ns'),
        pd.Timedelta(hours=1),
        pd.Timedelta.max,
    ],
    labels=['short', 'medium', 'long'],
    include_lowest=True,
)

In [14]:
# For each value of `trip_time_group`, what was the average number of passengers?
# (This computes the mean `passenger_count` per group, not the share of rides
# in each group; `observed=False` keeps every category in the result.)
df.groupby('trip_time_group', observed=False)['passenger_count'].mean()

trip_time_group
short     1.551222
medium    1.585768
long      1.700859
Name: passenger_count, dtype: float64