In [1]:
import math
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

from model import *

In [2]:
# Average driving speed assumed when deriving a ride's duration from its distance.
# NOTE(review): the previous comment here said 40 mph, but the arithmetic has always
# divided by 10, i.e. 10 mph. The value is kept at 10 so existing results do not change;
# confirm which speed was actually intended.
ASSUMED_SPEED_MPH = 10

df = pd.read_csv("boston_kaggle/cab_rides.csv")
# Scale the raw timestamps down by 1000 (presumably epoch milliseconds -> epoch seconds;
# the scaled values are later passed to datetime.fromtimestamp).
df['time_stamp'] = df['time_stamp']/1000
df = df.sort_values(by = "time_stamp")
# Ride duration in seconds: distance / speed gives hours, * 60 * 60 converts to seconds.
df['duration'] = df['distance'] / ASSUMED_SPEED_MPH * 60 * 60
df = df.reset_index(drop=True)
# Replace every missing value, in every column, with 0.
# NOTE(review): if `price` has missing entries they become $0 fares in the simulation — confirm intended.
df = df.fillna(0)

In [3]:
# Keep an independent copy of the prepared ride log.
# NOTE(review): `old_df` is not referenced by any other cell visible in this notebook.
old_df = df.copy()

In [4]:
# Inspect the column names of the prepared ride log.
df.columns

Index(['distance', 'cab_type', 'time_stamp', 'destination', 'source', 'price',
       'surge_multiplier', 'id', 'product_id', 'name', 'duration'],
      dtype='object')

In [5]:
# Distinct values of the `destination` column.
set(df.destination)

{'Back Bay',
 'Beacon Hill',
 'Boston University',
 'Fenway',
 'Financial District',
 'Haymarket Square',
 'North End',
 'North Station',
 'Northeastern University',
 'South Station',
 'Theatre District',
 'West End'}

In [6]:
# Plain-text dump of the ride log.
# NOTE(review): this prints the whole frame; `df.head()` would give a short, rich preview instead.
print(df)

        distance cab_type    time_stamp              destination  \
0           3.03     Lyft  1.543204e+09         Theatre District   
1           1.57     Uber  1.543204e+09                North End   
2           1.30     Uber  1.543204e+09         Theatre District   
3           2.71     Uber  1.543204e+09                   Fenway   
4           2.43     Lyft  1.543204e+09              Beacon Hill   
5           2.71     Uber  1.543204e+09                   Fenway   
6           2.71     Uber  1.543204e+09                   Fenway   
7           2.19     Uber  1.543204e+09                North End   
8           3.05     Uber  1.543204e+09                   Fenway   
9           2.19     Uber  1.543204e+09                North End   
10          2.19     Uber  1.543204e+09                North End   
11          2.22     Lyft  1.543204e+09  Northeastern University   
12          4.46     Lyft  1.543204e+09       Financial District   
13          4.46     Lyft  1.543204e+09       Fi

In [7]:
def simulate_drive(df, time_start, time_end, source = "Haymarket Square", verbose = True,
                   driver_share = 0.3, rng = None):
    '''
    Simulates one driving shift by chaining rides taken from a ride log.

    Starting at `source`, the driver repeatedly takes the first ride in `df`
    (in `df`'s row order) that was requested after the current simulated time,
    departs from the driver's current location and was requested strictly
    inside the (time_start, time_end) window. After each ride the clock
    advances by the ride's duration plus a random wait, and the ride's
    destination becomes the new location. The shift ends when no such ride
    is left.

    Parameters
    ----------
    df : DataFrame with columns 'time_stamp', 'source', 'destination',
        'price' and 'duration' (time_stamp and duration in seconds).
    time_start, time_end : shift window as POSIX timestamps in seconds.
    source : location the driver starts from.
    verbose : when True, prints one line per ride plus the totals.
    driver_share : fraction of the total fares counted as the driver's
        earnings. Defaults to the previously hard-coded 0.3.
    rng : optional object with a `normal(loc, scale)` method (for example
        `np.random.default_rng(seed)`) used for the waiting times. Defaults
        to NumPy's global `np.random`, which is what this function always used.

    Returns
    -------
    (running_price, earnings) : total fares collected and the driver's share.
    '''
    if rng is None:
        rng = np.random

    # Random delay before the first pickup: normal around 5 minutes (sd 3 minutes), floored at 0.
    time_start = time_start + max(0, rng.normal(0, 1) * 60 * 3 + 5 * 60)

    # Only rides requested strictly inside the shift window are ever candidates.
    in_window = df[(df['time_stamp'] > time_start) & (df['time_stamp'] < time_end)]

    def candidates(after, origin):
        # Rides requested after `after` that depart from `origin`.
        matches = in_window[(in_window['time_stamp'] > after) & (in_window['source'] == origin)]
        return matches.reset_index(drop = True)

    #total price of all the rides
    running_price = 0
    drives = candidates(time_start, source)

    while len(drives) > 0:
        # Wait before the *next* pickup: normal around 10 minutes (sd 3 minutes), floored at 0.
        # Drawn first so the sequence of random draws matches the original implementation.
        wait_noise = max(0, rng.normal(0, 1) * 60 * 3 + 10 * 60)

        price = drives['price'].iloc[0]
        duration = drives['duration'].iloc[0]
        destination = drives['destination'].iloc[0]

        if verbose:
            print("\t[Time_Start : %s] $%s [%.2f (%.2f wait)] [%s] to [%s]" %(str(datetime.fromtimestamp(time_start)).split(".")[0],
                                                        price,
                                                        duration/60,
                                                        wait_noise/60,
                                                        drives['source'].iloc[0],
                                                        destination
                                                        ))
        running_price += price

        # Everything below happens after the trip: advance the clock by the ride
        # plus the wait, and continue from where the passenger was dropped off.
        time_start = time_start + duration + wait_noise
        source = destination
        drives = candidates(time_start, source)

    earnings = running_price * driver_share
    if verbose:
        print("\tPrice $%.2f" % running_price)
        print("\tEarned $%.2f" % earnings)
    return running_price, earnings

# Demo: simulate a single one-hour shift that starts in Back Bay.
# The shift opens at the timestamp of row 101 of the ride log (an arbitrary pick)
# and closes 60 * 60 seconds later. The returned (price, earnings) tuple is the cell output.
time_start = df['time_stamp'].iloc[101]
time_end = time_start + 60 * 60    
simulate_drive(df, time_start, time_end, "Back Bay")


	[Time_Start : 2018-11-25 23:44:55] $7.5 [6.60 (8.48 wait)] [Back Bay] to [Northeastern University]
	[Time_Start : 2018-11-26 00:00:00] $23.0 [19.32 (11.47 wait)] [Northeastern University] to [North Station]
	[Time_Start : 2018-11-26 00:30:47] $13.5 [12.72 (14.82 wait)] [North Station] to [South Station]
	Price $44.00
	Earned $13.20


(44.0, 13.2)

In [8]:
CALENDAR_SIZE = (24, 7) #calendar grid: 24 hourly rows by 7 daily columns

def generate_calendar_matrix(arr = None): 
    '''
    Builds a calendar DataFrame with one row per hour
    (0 .. CALENDAR_SIZE[0] - 1) and one column per weekday
    ("mon" .. "sun").

    arr: optional array-like holding the cell values; when
    omitted the calendar is filled with zeros.
    '''
    weekday_labels = ["mon", "tues", "wed", "thur", "fri", "sat", "sun"]
    hour_labels = range(CALENDAR_SIZE[0])
    values = np.zeros(CALENDAR_SIZE) if arr is None else arr
    return pd.DataFrame(values, index = hour_labels, columns = weekday_labels)

def to_datestring(s): 
    '''
    Formats a POSIX timestamp (seconds) as a local-time
    "YYYY-MM-DD HH:MM:SS" string, dropping any fractional seconds.
    '''
    moment = datetime.fromtimestamp(s)
    return moment.isoformat(sep = " ", timespec = "seconds")

In [9]:
# Random 0/1 test calendar: each hour slot is switched on when its uniform draw exceeds 0.75.
# Note: despite the name, `cdf` here is a plain NumPy integer array.
cdf = (np.random.random(CALENDAR_SIZE) > 0.75).astype(int)


def get_start_end_times(cmatrix, base_date = '11/26/18'): 
    '''
    Converts a 0/1 availability calendar into shift start and end timestamps.

    cmatrix is an (hours x days) grid whose cell [h, d] is 1 when hour h of
    day d is worked. Every run of consecutive 1s within a single day column
    is one shift: it starts at the beginning of its first hour and ends at
    the beginning of the hour after its last one. Shifts never span two
    columns, so work on both sides of midnight yields two shifts.

    The grid size is taken from cmatrix itself (previously 24 x 7 was
    hard-coded), so calendars of other sizes work too.

    Parameters
    ----------
    cmatrix : 2-D array-like of 0/1 values, hours along axis 0, days along axis 1.
    base_date : '%m/%d/%y' date of column 0, interpreted as local midnight.

    Returns
    -------
    (start_times, end_times) : two ascending float arrays of POSIX
    timestamps in seconds; the i-th start pairs with the i-th end.

    Raises
    ------
    ValueError : if cmatrix is not two-dimensional or base_date does not
    match the expected format.
    '''
    schedule = np.asarray(cmatrix)
    n_hours, n_days = schedule.shape

    # Surround every day with an off-duty hour so that shifts touching the
    # first or last hour of the day still produce a 0->1 / 1->0 transition.
    padded = np.zeros((n_hours + 2, n_days))
    padded[1:-1, :] = schedule

    # transitions[h, d] = schedule[h, d] - schedule[h - 1, d], for h in 0 .. n_hours
    transitions = np.diff(padded, axis = 0)
    start_times_bool = transitions == 1
    end_times_bool = transitions == -1

    # Timestamp of the beginning of every hour boundary 0 .. n_hours on every day.
    base_timestamp = datetime.timestamp(datetime.strptime(base_date, '%m/%d/%y'))
    hour_offsets = np.arange(n_hours + 1).reshape(-1, 1) * 60 * 60
    day_offsets = np.arange(n_days).reshape(1, -1) * 24 * 60 * 60
    date_arr = base_timestamp + hour_offsets + day_offsets

    # Boolean indexing walks the grid hour-by-hour; sorting restores chronological order.
    start_times = np.sort(date_arr[start_times_bool])
    end_times = np.sort(date_arr[end_times_bool])

    return start_times, end_times


# Demo: derive the shift boundaries of the random calendar and print the calendar
# followed by the start and end timestamp arrays.
start_times, end_times = get_start_end_times(cdf)
print(cdf)
print(start_times, end_times)

[[0 1 1 0 1 1 0]
 [1 0 0 0 0 0 0]
 [0 0 1 1 1 0 1]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 0 0 0 1]
 [0 0 0 0 1 0 0]
 [1 0 0 0 1 0 1]
 [0 1 1 0 0 0 1]
 [0 0 0 0 0 1 0]
 [1 0 0 0 1 0 1]
 [0 0 0 1 0 0 0]
 [0 0 0 1 1 0 0]
 [0 0 0 1 0 0 0]
 [0 0 1 0 0 0 0]
 [0 1 1 0 0 0 1]
 [0 0 0 0 0 1 0]
 [1 0 1 0 0 0 0]
 [1 1 0 0 0 0 0]
 [0 0 0 1 0 1 0]
 [1 0 0 0 0 1 0]
 [0 0 0 0 0 0 0]]
[1.5432120e+09 1.5432264e+09 1.5432408e+09 1.5432516e+09 1.5432768e+09
 1.5432876e+09 1.5432948e+09 1.5433128e+09 1.5433308e+09 1.5433560e+09
 1.5433668e+09 1.5433812e+09 1.5433884e+09 1.5434028e+09 1.5434172e+09
 1.5434388e+09 1.5434496e+09 1.5434748e+09 1.5435144e+09 1.5435432e+09
 1.5435540e+09 1.5435612e+09 1.5435828e+09 1.5435972e+09 1.5436044e+09
 1.5436404e+09 1.5436800e+09 1.5437052e+09 1.5437160e+09 1.5437340e+09
 1.5437520e+09 1.5437592e+09 1.5437700e+09 1.5437880e+09] [1.5432156e+09 1.5432300e+09 1.5432444e+09 1.5432552e+09 1.5432840e+09
 1.5432912e+09 1.5432984e+09 1.54331

In [10]:
# Total number of switched-on hour slots in the calendar.
# NOTE(review): if `cdf` is the NumPy array built above, the first .sum() already
# returns the grand total and the second is a no-op; two sums are only needed for a DataFrame.
cdf.sum().sum()

42

In [24]:
def schedule_model(cmatrix, trials = 1, verbose = True, rides = None): 
    '''
    Estimates the earnings of a schedule by simulating every shift in it.

    The calendar is converted to shift (start, end) pairs with
    get_start_end_times, each shift is simulated with simulate_drive
    (using its default starting location), and the driver earnings of all
    shifts are summed. The whole schedule is simulated `trials` times.

    Parameters
    ----------
    cmatrix : calendar accepted by get_start_end_times.
    trials : number of independent simulations of the full schedule.
    verbose : when True, prints a progress line per shift and lets
        simulate_drive print its per-ride details.
    rides : ride log passed to simulate_drive. Defaults to the notebook-level
        `df`, which is what this function always used.

    Returns
    -------
    list with one entry per trial: the summed earnings rounded to 2 decimals.
    '''
    if rides is None:
        rides = df

    start_times, end_times = get_start_end_times(cmatrix)
    n_shifts = len(start_times)

    earningslist = []
    for _ in range(trials): 
        running_earnings = 0
        # A separate index name is used here; the original reused `i` for both loops.
        for shift_idx in range(n_shifts): 
            start = start_times[shift_idx]
            end = end_times[shift_idx]
            if verbose: 
                print("Ride %d of %d [%d%%] \tStart: %s \tEnd: %s"%(shift_idx, n_shifts, 100 * shift_idx/n_shifts, to_datestring(start), to_datestring(end)))
            # Only the driver's share is needed; the total fare is discarded.
            _, earnings = simulate_drive(rides, start, end, verbose = verbose)
            running_earnings += earnings

        earningslist.append(np.round(running_earnings, 2))
    return earningslist    

In [36]:
#set availability: rows 7..22 (hours 7 through 22) are available on every day, all other hours are not
mask = np.zeros(CALENDAR_SIZE)
mask[7:23, :] = 1
mask = mask.astype(int)


# One date string per calendar column; passed through to simple_model.
dates = ["2018-11-26", "2018-11-27", "2018-11-28", "2018-11-29", "2018-11-30", "2018-12-01", "2018-12-02"]

# NOTE(review): 5000 trials per arm makes this cell expensive, and no random seed is set,
# so the numbers differ between runs.
trials = 5000
trial_earnings_recs = []
#generate recommendation-based-earnings
# NOTE(review): simple_model is not defined in this notebook — presumably it comes from `from model import *`.
for i in range(trials):
    # Target number of working hours and traffic preference are redrawn for every trial.
    hours = np.random.normal(20, 3)
    traffic_preference = np.random.normal(1, 0.5)
    weights = {'traffic': traffic_preference}
    if i%100 == 0: 
        print("%d of %d [%d%%]"%(i, trials, 100*i/trials)) 
    cmatrix = simple_model(weights, mask, hours, CALENDAR_SIZE, dates, verbose = False)
    # schedule_model runs its default single trial, so the mean is over a one-element list.
    trial_earnings_recs.append(np.mean(schedule_model(cmatrix, verbose = False)))

trial_earnings_rand = []
#generate random-schedule earnings (baseline to compare the recommendations against)
for i in range(trials): 
    hours = np.random.normal(20, 3)
    # Random score per slot; unavailable slots are forced to 0 by the mask.
    cdf = (np.random.random(CALENDAR_SIZE))
    cdf = (cdf * mask)
    if i%100 == 0: 
        print("%d of %d [%d%%]"%(i, trials, 100*i/trials)) 
    # Keep the slots scoring above the (24*7 - hours)/(24*7) quantile, i.e. roughly the top `hours` slots.
    cmatrix = (cdf > np.quantile(cdf, (24*7 - hours)/(24*7))).astype(int)
    trial_earnings_rand.append(np.mean(schedule_model(cmatrix, verbose = False)))
    
# NOTE(review): these print both full result lists (one value per trial).
print(trial_earnings_recs)
print(trial_earnings_rand)

0 of 5000 [0%]
100 of 5000 [2%]
200 of 5000 [4%]
300 of 5000 [6%]
400 of 5000 [8%]
500 of 5000 [10%]
600 of 5000 [12%]
700 of 5000 [14%]
800 of 5000 [16%]
900 of 5000 [18%]
1000 of 5000 [20%]
1100 of 5000 [22%]
1200 of 5000 [24%]
1300 of 5000 [26%]
1400 of 5000 [28%]
1500 of 5000 [30%]
1600 of 5000 [32%]
1700 of 5000 [34%]
1800 of 5000 [36%]
1900 of 5000 [38%]
2000 of 5000 [40%]
2100 of 5000 [42%]
2200 of 5000 [44%]
2300 of 5000 [46%]
2400 of 5000 [48%]
2500 of 5000 [50%]
2600 of 5000 [52%]
2700 of 5000 [54%]
2800 of 5000 [56%]
2900 of 5000 [58%]
3000 of 5000 [60%]
3100 of 5000 [62%]
3200 of 5000 [64%]
3300 of 5000 [66%]
3400 of 5000 [68%]
3500 of 5000 [70%]
3600 of 5000 [72%]
3700 of 5000 [74%]
3800 of 5000 [76%]
3900 of 5000 [78%]
4000 of 5000 [80%]
4100 of 5000 [82%]
4200 of 5000 [84%]
4300 of 5000 [86%]
4400 of 5000 [88%]
4500 of 5000 [90%]
4600 of 5000 [92%]
4700 of 5000 [94%]
4800 of 5000 [96%]
4900 of 5000 [98%]
0 of 5000 [0%]
100 of 5000 [2%]
200 of 5000 [4%]
300 of 5000 [6%]
4

In [43]:
# Persist both result lists as text.
# The entries are NumPy scalars (they come from np.mean); with NumPy >= 2 the string form
# of such a list contains "np.float64(...)" wrappers, so they are converted to plain
# Python floats first to keep the files as a simple list of numbers.
# Mode "x" is kept on purpose: it refuses to overwrite an existing results file and
# raises FileExistsError instead, so delete or rename old files before re-running.
with open("recs.txt", "x") as f: 
    f.write(str([float(value) for value in trial_earnings_recs]))
with open("rand.txt", "x") as f: 
    f.write(str([float(value) for value in trial_earnings_rand]))

In [None]:
# Scratch cell: build one random schedule the same way as the random baseline
# (random score per slot, masked to the available hours, top ~`hours` slots kept)
# and show it as an image. Relies on `mask` from the earnings-comparison cell.
hours = np.random.normal(20, 3)
cdf = (np.random.random(CALENDAR_SIZE))
cdf = (cdf * mask)
cmatrix = (cdf > np.quantile(cdf, (24*7 - hours)/(24*7))).astype(int)
plt.imshow(cmatrix)

In [None]:

# Scratch cell: random 0/1 calendar wrapped in a labelled DataFrame.
# NOTE(review): this rebinds `cdf` from a NumPy array to a DataFrame.
cdf = generate_calendar_matrix((np.random.random(CALENDAR_SIZE) > 0.75).astype(int))

In [35]:
# Scratch cell: one sample from the traffic-preference distribution (mean 1, sd 0.5).
np.random.normal(1, 0.5)

1.527185786153098

In [44]:
# Two-sample t-test on the per-trial earnings: recommended schedules vs. random schedules.
# `ttest_ind` is imported in the notebook's import cell.
# Defaults are used: two-sided, equal variances assumed. A negative statistic
# means the first sample (recommended schedules) has the lower mean.
ttest_ind(np.array(trial_earnings_recs), np.array(trial_earnings_rand))

Ttest_indResult(statistic=-2.859303688621242, pvalue=0.004254506739589373)

In [48]:
# Average simulated earnings per trial for the recommended schedules.
np.asarray(trial_earnings_recs).mean()

265.77207000000004

In [49]:
# Average simulated earnings per trial for the random schedules.
np.asarray(trial_earnings_rand).mean()

268.4343