In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import time
from datetime import datetime

In [2]:
# Load the ride records and derive the columns the simulation relies on.
df = (
    pd.read_csv("../boston_kaggle/cab_rides.csv")
    # Scale time_stamp down by 1000 (presumably epoch milliseconds -> seconds;
    # the scaled values format as late-2018 dates via datetime.fromtimestamp).
    .assign(time_stamp=lambda rides: rides['time_stamp'] / 1000)
    # Later cells pick "the first matching ride", so keep rows in time order.
    .sort_values(by="time_stamp")
    # Estimated trip duration in seconds, assuming a constant 10 mph
    # (distance / 10 gives hours, * 60 * 60 gives seconds).
    # NOTE(review): the previous comment claimed 40 mph, but the divisor is 10.
    .assign(duration=lambda rides: rides['distance'] / 10 * 60 * 60)
    .reset_index(drop=True)
    # Every missing value, in any column, is replaced by 0.
    .fillna(0)
)

In [3]:
# Keep an independent copy of the preprocessed frame under a separate name.
old_df = df.copy()

In [4]:
# Show which columns are available after preprocessing.
df.columns

Index(['distance', 'cab_type', 'time_stamp', 'destination', 'source', 'price',
       'surge_multiplier', 'id', 'product_id', 'name', 'duration'],
      dtype='object')

In [5]:
# Distinct values of the destination column.
set(df.destination)

{'Back Bay',
 'Beacon Hill',
 'Boston University',
 'Fenway',
 'Financial District',
 'Haymarket Square',
 'North End',
 'North Station',
 'Northeastern University',
 'South Station',
 'Theatre District',
 'West End'}

In [6]:
# Plain-text dump of the frame.
# NOTE(review): a bare `df.head()` would give a shorter, richer display.
print(df)

        distance cab_type    time_stamp              destination  \
0           3.03     Lyft  1.543204e+09         Theatre District   
1           1.57     Uber  1.543204e+09                North End   
2           1.30     Uber  1.543204e+09         Theatre District   
3           2.71     Uber  1.543204e+09                   Fenway   
4           2.43     Lyft  1.543204e+09              Beacon Hill   
5           2.71     Uber  1.543204e+09                   Fenway   
6           2.71     Uber  1.543204e+09                   Fenway   
7           2.19     Uber  1.543204e+09                North End   
8           3.05     Uber  1.543204e+09                   Fenway   
9           2.19     Uber  1.543204e+09                North End   
10          2.19     Uber  1.543204e+09                North End   
11          2.22     Lyft  1.543204e+09  Northeastern University   
12          4.46     Lyft  1.543204e+09       Financial District   
13          4.46     Lyft  1.543204e+09       Fi

In [7]:
# Inspect a single ride record (positional row 100).
df.iloc[100]

distance                                            1.45
cab_type                                            Lyft
time_stamp                                   1.54321e+09
destination                                       Fenway
source                                          Back Bay
price                                               10.5
surge_multiplier                                       1
id                  e57bd38c-cb92-414a-be2f-322beb26e8ad
product_id                                     lyft_plus
name                                             Lyft XL
duration                                             522
Name: 100, dtype: object

'2018-12-18 14:15:10.943000'

In [94]:
def _sample_wait_seconds():
    """Draw one random wait time in seconds.

    The wait is 5 minutes plus standard-normal noise scaled by 3 minutes,
    clipped below at zero so it can never be negative.
    """
    return max(0, np.random.normal(0, 1) * 60 * 3 + 5 * 60)


def simulate_drive(df, time_start, time_end, source = "Haymarket Square", verbose = True, driver_share = 0.3):
    """Simulate one driving shift by chaining rides taken from ``df``.

    Starting at ``source``, the driver repeatedly takes the first row of
    ``df`` whose ``source`` equals the current location and whose
    ``time_stamp`` is later than the current simulated time (and earlier than
    ``time_end``). After each ride the simulated clock advances by the ride's
    ``duration`` plus a random wait, and the ride's ``destination`` becomes the
    new location. The shift ends when no such row remains.

    "First" means first in row order, so ``df`` should already be sorted by
    ``time_stamp``. Random waits come from the global NumPy RNG; seed it with
    ``np.random.seed`` for reproducible runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Ride records with columns ``time_stamp``, ``source``, ``destination``,
        ``price`` and ``duration`` (``time_stamp`` and ``duration`` in seconds).
    time_start, time_end : float
        Shift window as epoch seconds. A random initial wait is added to
        ``time_start`` before the first ride is looked up.
    source : str
        Location where the driver starts the shift.
    verbose : bool
        When True, print one line per ride plus the shift totals.
    driver_share : float
        Fraction of the total fare counted as the driver's earnings.

    Returns
    -------
    (running_price, earnings)
        Sum of the fares of all rides taken, and that sum times
        ``driver_share``.
    """
    # Initial wait before the driver is ready for the first ride.
    time_start = time_start + _sample_wait_seconds()

    # total price of all the rides
    running_price = 0

    # Only rides strictly inside the shift window are eligible. A ride may
    # still finish after time_end; only its time_stamp is checked.
    in_window = df[(df['time_stamp'] > time_start) & (df['time_stamp'] < time_end)]

    def rides_from(origin, after):
        """Rides in the window leaving ``origin`` later than ``after``."""
        return in_window[(in_window['time_stamp'] > after) & (in_window['source'] == origin)]

    drives = rides_from(source, time_start)

    while len(drives) > 0:
        # Wait that follows this ride; drawn first so it can be reported with it.
        wait_noise = _sample_wait_seconds()
        if verbose:
            print("\t[Time_Start : %s] $%s [%.2f (%.2f wait)] [%s] to [%s]" %(str(datetime.fromtimestamp(time_start)).split(".")[0],
                                                        drives['price'].iloc[0],
                                                        drives['duration'].iloc[0]/60,
                                                        wait_noise/60,
                                                        drives['source'].iloc[0],
                                                        drives['destination'].iloc[0]
                                                        ))
        running_price += drives['price'].iloc[0]

        # note - everything below is after the trip has been taken already

        # advance the clock by the trip duration + the wait for the next trip
        time_start = time_start + drives['duration'].iloc[0] + wait_noise

        # the next source is the current drive's destination
        source = drives['destination'].iloc[0]
        drives = rides_from(source, time_start)

    earnings = running_price * driver_share
    if verbose:
        print("\tPrice $%.2f" % running_price)
        print("\tEarned $%.2f" % earnings)
    return running_price, earnings

# Example run: a one-hour window that begins at the timestamp of row 101,
# with the driver starting at "Back Bay".
time_start = df['time_stamp'].iloc[101]
time_end = time_start + 60 * 60    
simulate_drive(df, time_start, time_end, "Back Bay")


	[Time_Start : 2018-11-25 23:40:55] $38.5 [18.24 (4.01 wait)] [Back Bay] to [North End]
	[Time_Start : 2018-11-26 00:03:10] $5.0 [7.56 (6.42 wait)] [North End] to [West End]
	[Time_Start : 2018-11-26 00:17:09] $3.5 [4.50 (2.34 wait)] [West End] to [Haymarket Square]
	[Time_Start : 2018-11-26 00:23:59] $26.0 [2.94 (2.17 wait)] [Haymarket Square] to [North Station]
	[Time_Start : 2018-11-26 00:29:06] $24.0 [18.06 (7.20 wait)] [North Station] to [Boston University]
	Price $97.00
	Earned $29.10


(97.0, 29.099999999999998)

In [113]:
# Calendar grid shape: 24 hourly rows by 7 day-of-week columns.
CALENDAR_SIZE = (24, 7)

def generate_calendar_matrix(arr = None):
    '''
    Build the weekly calendar as a DataFrame.

    Rows are indexed 0..CALENDAR_SIZE[0]-1 and columns are the day labels
    "mon" through "sun". When `arr` is omitted the calendar is filled with
    zeros; otherwise `arr` supplies the cell values.
    '''
    day_labels = ["mon", "tues", "wed", "thur", "fri", "sat", "sun"]
    n_rows = CALENDAR_SIZE[0]
    values = np.zeros(CALENDAR_SIZE) if arr is None else arr
    return pd.DataFrame(values, index = range(n_rows), columns = day_labels)

def to_datestring(s):
    """Format epoch seconds `s` as a local-time 'YYYY-MM-DD HH:MM:SS' string.

    The sub-second part of the converted datetime is discarded.
    """
    moment = datetime.fromtimestamp(s)
    return str(moment.replace(microsecond=0))

In [132]:
# Seed the global NumPy RNG so the random calendar below is reproducible.
np.random.seed(8)

# Random 0/1 calendar: a cell is 1 when its uniform [0, 1) draw exceeds 0.75.
cdf = generate_calendar_matrix((np.random.random(CALENDAR_SIZE) > 0.75).astype(int))


def get_start_end_times(df, base_date = '11/26/18'):
    """Turn a 0/1 availability calendar into shift start and end timestamps.

    Each vertical run of consecutive 1-cells within a column is one shift.
    Its start is the beginning of the run's first hour and its end is the end
    of the run's last hour. Runs never continue from one column into the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Calendar with one row per hour and one column per day, holding 0/1
        values (for example the output of ``generate_calendar_matrix``).
    base_date : str
        Date of the first column, formatted ``'%m/%d/%y'``. It is interpreted
        as local midnight; the default is the notebook's first Monday.

    Returns
    -------
    (start_times, end_times)
        Two 1-D arrays of epoch seconds, each sorted ascending and of equal
        length, so ``start_times[i]`` pairs with ``end_times[i]``.
    """
    cmatrix = df.values
    n_hours, n_days = cmatrix.shape

    # One extra row, so a run that reaches the last hour still has a cell
    # in which its end can be marked.
    orig = np.zeros((n_hours + 1, n_days))
    shift = np.zeros((n_hours + 1, n_days))

    orig[1:, :] = cmatrix
    shift[:-1, :] = cmatrix

    # diff[h] = calendar[h] - calendar[h - 1], with zeros beyond both edges:
    # +1 marks the hour a run starts, -1 the hour right after it ends.
    diff = shift - orig

    start_times_bool = diff == 1
    end_times_bool = diff == -1

    base_timestamp = datetime.timestamp(datetime.strptime(base_date, '%m/%d/%y'))

    # Timestamp of every (hour, day) cell boundary, same shape as diff.
    hour_offsets = np.arange(n_hours + 1)[:, None] * (60 * 60)
    day_offsets = np.arange(n_days)[None, :] * (24 * 60 * 60)
    date_arr = base_timestamp + hour_offsets + day_offsets

    # Boolean indexing walks the grid row by row, so sort to get time order.
    start_times = np.sort(date_arr[start_times_bool])
    end_times = np.sort(date_arr[end_times_bool])

    return start_times, end_times


# Convert the calendar into shift start/end timestamps, then show the calendar.
start_times, end_times = get_start_end_times(cdf)
print(cdf)

    mon  tues  wed  thur  fri  sat  sun
0     1     1    1     0    0    0    0
1     0     0    0     0    0    1    0
2     0     0    0     1    0    0    0
3     1     0    0     0    0    0    1
4     0     1    0     0    0    1    1
5     0     0    0     1    0    0    0
6     1     0    0     1    1    0    0
7     0     0    0     0    0    0    0
8     0     1    1     0    0    0    0
9     0     0    0     0    0    0    0
10    0     0    0     0    1    0    0
11    0     1    0     1    0    0    0
12    0     0    1     0    1    0    0
13    0     0    1     1    0    0    0
14    0     1    0     1    1    0    1
15    1     0    0     1    0    0    0
16    0     0    1     1    0    0    0
17    0     0    0     1    0    0    0
18    0     0    0     1    0    0    1
19    0     0    0     0    0    0    0
20    0     0    1     1    0    1    1
21    0     0    0     0    0    0    0
22    0     0    0     0    1    0    0
23    0     0    0     0    0    0    0


In [133]:
# Grand total of the calendar: per-column sums, then the sum of those.
cdf.sum().sum()

39

In [134]:
# Run the simulator once per scheduled shift and accumulate the totals.
running_price = 0
running_earnings = 0
n_shifts = len(start_times)

for i, start in enumerate(start_times):
    # start_times[i] pairs with end_times[i]
    end = end_times[i]

    print("Ride %d of %d [%d%%] \tStart: %s \tEnd: %s"%(i, n_shifts, 100 * i/n_shifts, to_datestring(start), to_datestring(end)))

    price, earnings = simulate_drive(df, start, end)
    running_price += price
    running_earnings += earnings

# Totals over all shifts: fares first, then driver earnings.
print(running_price)
print(running_earnings)

Ride 0 of 31 [0%] 	Start: 2018-11-26 00:00:00 	End: 2018-11-26 01:00:00
	[Time_Start : 2018-11-26 00:00:40] $16.5 [6.84 (7.85 wait)] [Haymarket Square] to [Theatre District]
	[Time_Start : 2018-11-26 00:15:22] $26.0 [4.08 (0.42 wait)] [Theatre District] to [South Station]
	[Time_Start : 2018-11-26 00:19:52] $10.5 [15.06 (8.41 wait)] [South Station] to [Beacon Hill]
	[Time_Start : 2018-11-26 00:43:20] $20.5 [11.82 (6.45 wait)] [Beacon Hill] to [Northeastern University]
	Price $73.50
	Earned $22.05
Ride 1 of 31 [3%] 	Start: 2018-11-26 03:00:00 	End: 2018-11-26 04:00:00
	[Time_Start : 2018-11-26 03:01:59] $3.0 [14.28 (4.04 wait)] [Haymarket Square] to [Back Bay]
	[Time_Start : 2018-11-26 03:20:19] $10.0 [13.20 (5.47 wait)] [Back Bay] to [Haymarket Square]
	Price $13.00
	Earned $3.90
Ride 2 of 31 [6%] 	Start: 2018-11-26 06:00:00 	End: 2018-11-26 07:00:00
	[Time_Start : 2018-11-26 06:06:04] $0.0 [6.96 (5.77 wait)] [Haymarket Square] to [Theatre District]
	[Time_Start : 2018-11-26 06:18:48] 

	[Time_Start : 2018-11-29 13:44:44] $22.5 [14.40 (5.53 wait)] [Beacon Hill] to [South Station]
	[Time_Start : 2018-11-29 14:04:40] $22.5 [19.68 (1.60 wait)] [South Station] to [Back Bay]
	[Time_Start : 2018-11-29 14:25:57] $13.5 [8.64 (3.04 wait)] [Back Bay] to [Fenway]
	[Time_Start : 2018-11-29 14:37:38] $12.0 [18.42 (6.78 wait)] [Fenway] to [North Station]
	[Time_Start : 2018-11-29 15:02:49] $11.0 [3.36 (1.43 wait)] [North Station] to [Haymarket Square]
	[Time_Start : 2018-11-29 15:07:37] $9.5 [3.72 (3.05 wait)] [Haymarket Square] to [West End]
	[Time_Start : 2018-11-29 15:14:23] $10.5 [17.40 (7.21 wait)] [West End] to [Fenway]
	[Time_Start : 2018-11-29 15:39:00] $0.0 [23.52 (4.83 wait)] [Fenway] to [North Station]
	[Time_Start : 2018-11-29 16:07:21] $26.0 [18.06 (2.48 wait)] [North Station] to [Boston University]
	[Time_Start : 2018-11-29 16:27:53] $12.0 [19.32 (5.85 wait)] [Boston University] to [Beacon Hill]
	[Time_Start : 2018-11-29 16:53:03] $9.5 [8.94 (8.87 wait)] [Beacon Hill]

In [108]:
# Display the preprocessed ride frame.
# NOTE(review): `df.head()` would keep the saved output small.
df

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,duration
0,3.03,Lyft,1.543204e+09,Theatre District,Boston University,34.0,1.00,ef4771c2-c88d-4730-aaf7-a95751e9d27e,lyft_luxsuv,Lux Black XL,1090.8
1,1.57,Uber,1.543204e+09,North End,Theatre District,0.0,1.00,9962f244-8fce-4ae9-a583-139d5d7522e1,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi,565.2
2,1.30,Uber,1.543204e+09,Theatre District,South Station,18.5,1.00,00ea74ea-2c49-416c-bfc5-f7877025f6eb,6c84fd89-3f11-4782-9b50-97c468b19529,Black,468.0
3,2.71,Uber,1.543204e+09,Fenway,Theatre District,19.5,1.00,8682f9bf-5cc0-4dfc-b8fe-4e22070d1684,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,975.6
4,2.43,Lyft,1.543204e+09,Beacon Hill,Northeastern University,10.5,1.00,edfc7f44-97e1-48cd-930c-e4fe20e88ac8,lyft,Lyft,874.8
5,2.71,Uber,1.543204e+09,Fenway,Theatre District,32.0,1.00,6172077a-22de-481b-aae2-b5763c87a6c4,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,975.6
6,2.71,Uber,1.543204e+09,Fenway,Theatre District,0.0,1.00,2ee597c2-b685-450e-b3a2-2df42fb18d49,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi,975.6
7,2.19,Uber,1.543204e+09,North End,Beacon Hill,8.0,1.00,bb3f969d-3190-4bb8-9a84-dff2deba0a98,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,788.4
8,3.05,Uber,1.543204e+09,Fenway,North Station,10.5,1.00,f67b0a6b-08f9-43bb-b47d-efad7310d4c7,9a0e7b09-b92b-4c41-9779-2ad22b4d779d,WAV,1098.0
9,2.19,Uber,1.543204e+09,North End,Beacon Hill,13.0,1.00,cec0651f-2280-48ac-86bf-cb0152cada9e,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,788.4
