<h1>Prepare Data</h1>

In [47]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score

SECONDS_IN_YEAR = (dt.datetime(2021, 1, 1) - dt.datetime(2020, 1, 1)).total_seconds()

def get_population_density(location):
    if location == "Kenosha;WI":
        return 1
    elif location == "Hackensack;NJ":
        return 2
    elif location == "St. Louis;MO":
        return 3
    elif location == "Ellicott City;MD":
        return 4
    elif location == "Baltimore;MD":
        return 5
    elif location == "Catonsville;MD":
        return 6
    elif location == "North Bethesda;MD":
        return 7
    elif location == "Los Angeles;CA":
        return 8

def get_SINE(date):
    seconds = (date - dt.datetime(2020, 1, 1, 0)).total_seconds()
    return np.sin(2 * np.pi * seconds / SECONDS_IN_YEAR)

def get_COSINE(date):
    seconds = (date - dt.datetime(2020, 1, 1, 0)).total_seconds()
    return np.cos(2 * np.pi * seconds / SECONDS_IN_YEAR)

def get_hour_difference(start_datetime, end_datetime):
    return (end_datetime - start_datetime).total_seconds() / 3600

# Import file
#import_file = "DoorDash_Kenosha_WI_V2.csv"
#import_file = "DoorDash_Hackensack_NJ_V2.csv"
#import_file = "DoorDash_St._Louis_MO_V2.csv"
#import_file = "DoorDash_MD_V2.csv"
#import_file = "DoorDash_Los_Angeles_CA_V2.csv"
import_file = "DoorDash_Combined_V2.csv"
#columns = ["Location", "Start_Datetime", "End_Datetime", "DPH", "TEPH"]
columns = ["Start_Datetime", "Deliveries", "Location", "Total_Earnings", "DPH", "ID", "End_Datetime", "TEPH"]
df = pd.read_csv(import_file, names=columns)

df.describe()

# Change to numerical data
df["Location"] = df["Location"].map(get_population_density)
df["Start_Datetime"] = pd.to_datetime(df["Start_Datetime"], format="%Y-%m-%dT%H:%MZ", errors="coerce")
df["DayOfWeek"] = df["Start_Datetime"].dt.dayofweek
df["Start_Datetime"] = (df["Start_Datetime"] - dt.datetime(2020, 1, 1)).dt.total_seconds()
df["Start_Datetime_SINE"] = np.sin(2 * np.pi * df["Start_Datetime"] / SECONDS_IN_YEAR)
df["Start_Datetime_COSINE"] = np.cos(2 * np.pi * df["Start_Datetime"] / SECONDS_IN_YEAR)
df["End_Datetime"] = pd.to_datetime(df["End_Datetime"], format="%Y-%m-%dT%H:%MZ", errors="coerce")
df["End_Datetime"] = (df["End_Datetime"] - dt.datetime(2020, 1, 1)).dt.total_seconds()
df["End_Datetime_SINE"] = np.sin(2 * np.pi * df["End_Datetime"] / SECONDS_IN_YEAR)
df["End_Datetime_COSINE"] = np.cos(2 * np.pi * df["End_Datetime"] / SECONDS_IN_YEAR)
df = df[["Location", "Start_Datetime_SINE", "Start_Datetime_COSINE", "End_Datetime_SINE", "End_Datetime_COSINE", "DayOfWeek", "DPH", "TEPH"]]

# Separate features and classes
feature_names = ["Location", "Start_Datetime_SINE", "Start_Datetime_COSINE", "End_Datetime_SINE", "End_Datetime_COSINE", "DayOfWeek"]
all_features = df[feature_names].values
all_classes_TEPH = df["TEPH"].values

# Normalize data
scaler = preprocessing.StandardScaler()
all_features_scaled = scaler.fit_transform(all_features)

# Test inputs
location = get_population_density("Kenosha;WI")
start_datetime = dt.datetime(2020, 10, 5, 15, 0)
end_datetime = dt.datetime(2020, 10, 5, 16, 0)
start_datetime_SINE = get_SINE(start_datetime)
start_datetime_COSINE = get_COSINE(start_datetime)
end_datetime_SINE = get_SINE(end_datetime)
end_datetime_COSINE = get_COSINE(end_datetime)
dayOfWeek = start_datetime.weekday()
hour_difference = get_hour_difference(start_datetime, end_datetime)

df.describe()

Unnamed: 0,Location,Start_Datetime_SINE,Start_Datetime_COSINE,End_Datetime_SINE,End_Datetime_COSINE,DayOfWeek,DPH,TEPH
count,340.0,343.0,343.0,343.0,343.0,343.0,343.0,343.0
mean,2.397059,0.284093,-0.595033,0.283465,-0.595847,2.623907,2.053294,17.180379
std,1.766116,0.630999,0.410756,0.631107,0.40984,1.867296,0.660985,6.346039
min,1.0,-0.996916,-0.999993,-0.99699,-0.999999,0.0,0.25,1.72
25%,1.0,-0.225284,-0.958547,-0.224593,-0.959161,1.0,1.675,13.2
50%,2.0,0.32488,-0.731646,0.323944,-0.729693,3.0,2.07,17.02
75%,3.0,0.921236,-0.330484,0.920311,-0.331887,4.0,2.39,20.93
max,7.0,1.0,0.398202,0.99998,0.395138,6.0,6.0,45.0


In [45]:
import math

TEPH_mean = df["TEPH"].mean()
print("TEPH Calculation:", TEPH_mean * hour_difference)

TEPH_std = df["TEPH"].std()
TEPH_variance = TEPH_std * TEPH_std
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

TEPH Calculation: 17.180379008746357
TEPH STD Calculation: 6.3460392977205125


<h1>Random Forest</h1>

In [46]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=10, random_state=1)

# Return random forest estimates and k-fold cross-validation scores
def get_random_forest(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek):
    test_input = [[location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek]]
    # TEPH
    cv_scores = cross_val_score(clf, all_features_scaled, all_classes_TEPH, cv=10)
    TEPH_k = cv_scores.mean()
    clf.fit(all_features_scaled, all_classes_TEPH)
    TEPH = clf.predict(test_input)[0]
    TEPH_array = [TEPH, TEPH_k, "TEPH"]
    return [TEPH_array]

estimates = get_random_forest(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek)
print(estimates)

print("TEPH Calculation:", estimates[0][0] * hour_difference)
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').



ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

<h1>Linear SVM</h1>

In [36]:
from sklearn import svm

# Hyperparameters
C = 1.0

svr = svm.SVR(kernel="linear", C=C)

# Return linear SVM estimates and k-fold cross-validation scores
def get_linear_SVM(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek):
    test_input = [[location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek]]
    # TEPH
    cv_scores = cross_val_score(svr, all_features_scaled, all_classes_TEPH, cv=10)
    TEPH_k = cv_scores.mean()
    svr.fit(all_features_scaled, all_classes_TEPH)
    TEPH = svr.predict(test_input)[0]
    TEPH_array = [TEPH, TEPH_k, "TEPH"]
    return [TEPH_array]

estimates = get_linear_SVM(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek)
print(estimates)

print("TEPH Calculation:", estimates[0][0] * hour_difference)
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

[[14.62761104066157, 0.00018257027584018816, 'TEPH']]
TEPH Calculation: 14.62761104066157
TEPH STD Calculation: 6.178973095657914


<h1>RBF SVM</h1>

In [37]:
from sklearn import svm

# Hyperparameters
C = 1.0

svr = svm.SVR(kernel="rbf", C=C)

# Return RBF SVM estimates and k-fold cross-validation scores
def get_RBF_SVM(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek):
    test_input = [[location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek]]
    # TEPH
    cv_scores = cross_val_score(svr, all_features_scaled, all_classes_TEPH, cv=10)
    TEPH_k = cv_scores.mean()
    svr.fit(all_features_scaled, all_classes_TEPH)
    TEPH = svr.predict(test_input)[0]
    TEPH_array = [TEPH, TEPH_k, "TEPH"]
    return [TEPH_array]

estimates = get_RBF_SVM(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek)
print(estimates)

print("TEPH Calculation:", estimates[0][0] * hour_difference)
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

[[14.25249904389894, 0.03852328932240066, 'TEPH']]
TEPH Calculation: 14.25249904389894
TEPH STD Calculation: 6.178973095657914


<h1>K-Nearest Neighbors</h1>

In [38]:
from sklearn import neighbors

clf = neighbors.KNeighborsRegressor(n_neighbors=10)

# Find optimal value for neighbors
#for n in range(1, 50):
#    clf = neighbors.KNeighborsRegressor(n_neighbors=n)
#    cv_scores = cross_val_score(clf, all_features_scaled, all_classes_TEPH, cv=10)
#    print(n, cv_scores.mean())

# Return KNN estimates and k-fold cross-validation scores
def get_KNN(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek):
    test_input = [[location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek]]
    # TEPH
    cv_scores = cross_val_score(clf, all_features_scaled, all_classes_TEPH, cv=10)
    TEPH_k = cv_scores.mean()
    clf.fit(all_features_scaled, all_classes_TEPH)
    TEPH = clf.predict(test_input)[0]
    TEPH_array = [TEPH, TEPH_k, "TEPH"]
    return [TEPH_array]

estimates = get_KNN(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek)
print(estimates)

print("TEPH Calculation:", estimates[0][0] * hour_difference)
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

[[13.425999999999998, -0.021429989476138177, 'TEPH']]
TEPH Calculation: 13.425999999999998
TEPH STD Calculation: 6.178973095657914


<h1>Neural Network</h1>

In [42]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Create model
def create_model():
    model = Sequential()
    model.add(Dense(6, input_dim=6, kernel_initializer="normal", activation="relu"))
    model.add(Dense(3, kernel_initializer="normal", activation="relu"))
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [43]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

estimator = KerasRegressor(build_fn=create_model, epochs=100, verbose=0)

# Return neural network estimates and k-fold cross-validation scores
def get_neural_network(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek):
    test_input = [[location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek]]
    # TEPH
    cv_scores = cross_val_score(estimator, all_features_scaled, all_classes_TEPH, cv=10)
    TEPH_k = cv_scores.mean()
    estimator.fit(all_features_scaled, all_classes_TEPH)
    TEPH = estimator.predict(test_input)
    TEPH_array = [TEPH, TEPH_k, "TEPH"]
    return [TEPH_array]

estimates = get_neural_network(location, start_datetime_SINE, start_datetime_COSINE, end_datetime_SINE, end_datetime_COSINE, dayOfWeek)
print(estimates)

print("TEPH Calculation:", estimates[0][0] * hour_difference)
TEPH_test_std = math.sqrt(TEPH_variance * hour_difference)
print("TEPH STD Calculation:", TEPH_test_std)

ValueError: A target array with shape (271, 1) was passed for an output of shape (None, 3) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

ValueError: A target array with shape (272, 1) was passed for an output of shape (None, 3) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.



ValueError: A target array with shape (302, 1) was passed for an output of shape (None, 3) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.