In [2]:
import pandas as pd

In [3]:
# Load the three raw tables. Date-like columns are parsed to datetime on read
# so later cells can work with them directly.
driver_id = pd.read_csv(
    "driver_ids.csv",
    parse_dates=["driver_onboard_date"],
)
ride_ids = pd.read_csv("ride_ids.csv")
ride_timestamps = pd.read_csv(
    "ride_timestamps.csv",
    parse_dates=["timestamp"],
)

### First inspection

In [4]:
# Preview the first rows of the driver table.
driver_id.head()

Unnamed: 0,driver_id,driver_onboard_date
0,002be0ffdc997bd5c50703158b7c2491,2016-03-29
1,007f0389f9c7b03ef97098422f902e62,2016-03-29
2,011e5c5dfc5c2c92501b8b24d47509bc,2016-04-05
3,0152a2f305e71d26cc964f8d4411add9,2016-04-23
4,01674381af7edd264113d4e6ed55ecda,2016-04-29


In [5]:
# Preview the first rows of the rides table.
ride_ids.head()

Unnamed: 0,driver_id,ride_id,ride_distance,ride_duration,ride_prime_time
0,002be0ffdc997bd5c50703158b7c2491,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50
1,002be0ffdc997bd5c50703158b7c2491,01b522c5c3a756fbdb12e95e87507eda,3362,809,0
2,002be0ffdc997bd5c50703158b7c2491,029227c4c2971ce69ff2274dc798ef43,3282,572,0
3,002be0ffdc997bd5c50703158b7c2491,034e861343a63ac3c18a9ceb1ce0ac69,65283,3338,25
4,002be0ffdc997bd5c50703158b7c2491,034f2e614a2f9fc7f1c2f77647d1b981,4115,823,100


In [6]:
# Preview the first rows of the ride-event timestamps table.
ride_timestamps.head()

Unnamed: 0,ride_id,event,timestamp
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05


In [7]:
# Summary of the driver table: row count, number of unique values, and the
# range of onboarding dates.
driver_id.describe()

Unnamed: 0,driver_id,driver_onboard_date
count,937,937
unique,937,49
top,9494350df132e6748afca3bc5d138dcc,2016-04-05 00:00:00
freq,1,36
first,,2016-03-28 00:00:00
last,,2016-05-15 00:00:00


In [8]:
# Column dtypes and non-null counts for the driver table.
driver_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937 entries, 0 to 936
Data columns (total 2 columns):
driver_id              937 non-null object
driver_onboard_date    937 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 14.7+ KB


**`driver_id` is stored as an object (string) column, not an integer**

In [9]:
# Column dtypes and non-null counts for the rides table.
ride_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193502 entries, 0 to 193501
Data columns (total 5 columns):
driver_id          193502 non-null object
ride_id            193502 non-null object
ride_distance      193502 non-null int64
ride_duration      193502 non-null int64
ride_prime_time    193502 non-null int64
dtypes: int64(3), object(2)
memory usage: 7.4+ MB


In [10]:
# Summary statistics of the ride_prime_time column.
ride_ids['ride_prime_time'].describe()

count    193502.000000
mean         17.305893
std          30.825800
min           0.000000
25%           0.000000
50%           0.000000
75%          25.000000
max         500.000000
Name: ride_prime_time, dtype: float64

In [11]:
# Look at the first 20 ride_prime_time values to see what individual entries look like.
ride_ids['ride_prime_time'].head(20)

0      50
1       0
2       0
3      25
4     100
5     100
6       0
7      25
8      75
9      50
10    100
11      0
12      0
13     25
14      0
15      0
16      0
17      0
18      0
19     25
Name: ride_prime_time, dtype: int64

In [12]:
# Column dtypes and non-null counts for the event timestamps; a non-null count
# below the number of entries means that column has missing values.
ride_timestamps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970405 entries, 0 to 970404
Data columns (total 3 columns):
ride_id      970405 non-null object
event        970405 non-null object
timestamp    970404 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 22.2+ MB


In [13]:
# Summary of the event timestamps, including the earliest and latest values.
ride_timestamps["timestamp"].describe()

count                  970404
unique                 865826
top       2016-04-30 22:09:15
freq                        7
first     2016-03-28 05:48:18
last      2016-06-27 00:50:50
Name: timestamp, dtype: object

**The ride events span March 28 to June 27, 2016**

In [14]:
# List the distinct event types recorded in the timestamps table.
ride_timestamps["event"].unique()

array(['requested_at', 'accepted_at', 'arrived_at', 'picked_up_at',
       'dropped_off_at'], dtype=object)

In [15]:
# Locate the row(s) whose timestamp is missing so they can be inspected.
# isna() already returns a boolean mask, so it is used directly as the row filter.
na_filter = ride_timestamps["timestamp"].isna()
# Index labels of the rows with a missing timestamp.
na_index = ride_timestamps.loc[na_filter].index
print(na_index)

Int64Index([434222], dtype='int64')


**There is one missing `timestamp` value (row index 434222)**

### Merge rides and driver id

In [16]:
# Attach each ride to its driver's onboarding date.
# Inner join: rides whose driver_id has no matching row in the driver table
# (and drivers with no rides) are dropped from the result.
# validate="one_to_many" asserts that the driver table has at most one row per
# driver_id, so the join raises instead of silently duplicating rides.
lyft_data = pd.merge(
    driver_id,
    ride_ids,
    how="inner",
    on="driver_id",
    validate="one_to_many",
)
# check dimensions: (rows, columns) of the merged frame
lyft_data.shape

(185891, 6)

### Merge with timestamp data

In [17]:
# Join the ride events onto the driver/ride frame, producing one row per
# (ride, event). Inner join: rides without any timestamp rows are dropped.
# NOTE(review): this rebinds lyft_data, so re-running only this cell would merge
# the timestamps a second time; run the notebook top to bottom instead.
lyft_data = lyft_data.merge(ride_timestamps, how="inner", on="ride_id")

In [18]:
# investigate dimensions: (rows, columns) after joining the event timestamps
lyft_data.shape

(921045, 8)

In [19]:
# Preview the first 10 rows of the merged frame (one row per ride event).
lyft_data.head(10)

Unnamed: 0,driver_id,driver_onboard_date,ride_id,ride_distance,ride_duration,ride_prime_time,event,timestamp
0,002be0ffdc997bd5c50703158b7c2491,2016-03-29,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50,requested_at,2016-04-23 02:13:50
1,002be0ffdc997bd5c50703158b7c2491,2016-03-29,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50,accepted_at,2016-04-23 02:14:15
2,002be0ffdc997bd5c50703158b7c2491,2016-03-29,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50,arrived_at,2016-04-23 02:16:36
3,002be0ffdc997bd5c50703158b7c2491,2016-03-29,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50,picked_up_at,2016-04-23 02:16:40
4,002be0ffdc997bd5c50703158b7c2491,2016-03-29,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50,dropped_off_at,2016-04-23 02:22:07
5,002be0ffdc997bd5c50703158b7c2491,2016-03-29,01b522c5c3a756fbdb12e95e87507eda,3362,809,0,requested_at,2016-03-29 19:00:49
6,002be0ffdc997bd5c50703158b7c2491,2016-03-29,01b522c5c3a756fbdb12e95e87507eda,3362,809,0,accepted_at,2016-03-29 19:00:52
7,002be0ffdc997bd5c50703158b7c2491,2016-03-29,01b522c5c3a756fbdb12e95e87507eda,3362,809,0,arrived_at,2016-03-29 19:03:57
8,002be0ffdc997bd5c50703158b7c2491,2016-03-29,01b522c5c3a756fbdb12e95e87507eda,3362,809,0,picked_up_at,2016-03-29 19:04:01
9,002be0ffdc997bd5c50703158b7c2491,2016-03-29,01b522c5c3a756fbdb12e95e87507eda,3362,809,0,dropped_off_at,2016-03-29 19:17:30


# Insights we are interested in, based on the data

- % of successful pick-ups after acceptance
    - Over time, has this percentage changed? By how much?
- Mean difference between request and accept times per group of prime time
- Drivers with the fewest rides over the period covered by this dataset
- 

## Percent of successful pick ups

In [20]:
# Boolean masks over the merged event rows:
# True where the row records a request being accepted ...
accepted = lyft_data["event"].eq('accepted_at')
# ... and True where the row records the rider being picked up.
picked_up = lyft_data["event"].eq('picked_up_at')

In [21]:
# Share of accepted requests that ended in a pick-up, as a percentage.
# Summing a boolean mask counts its True entries.
percent_completion = picked_up.sum() / accepted.sum() * 100
percent_completion

100.0

## Percent of requests that were accepted

In [22]:
# Boolean mask: True where the row records a ride being requested.
requests = lyft_data["event"].eq("requested_at")

In [23]:
# Percent of requests that were accepted: accepted events divided by request
# events (requests are the denominator), times 100.
# Summing a boolean mask counts its True entries.
accept_percentage = (accepted.sum() / requests.sum()) * 100
accept_percentage

100.0

**It seems this dataset contains only completed rides: the request, acceptance, and pick-up counts all match**

## Mean difference in time between request and acceptance for different prime time groups