In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window
from sklearn import preprocessing # https://github.com/Snowflake-Labs/snowpark-python-demos/tree/main/sp4py_utilities
from snowflake.snowpark.functions import col
import snowflake
import snowflake.snowpark

import getpass
import pandas as pd
import matplotlib.pyplot as plt
import decimal
import joblib 
from datetime import datetime, timedelta
import random

In [2]:
# accountname = getpass.getpass() # ORGNAME-ACCOUNTNAME (separated by minus sign)
# username = getpass.getpass()    # SNOWFLAKE-USERNAME
# password = getpass.getpass()    # SNOWFLAKE-PASSWORD

accountname = "HIIOYKL-IX77996" # ORGNAME-ACCOUNTNAME (separated by minus sign)
username =  "Nathan" # SNOWFLAKE-USERNAME
password = "Nathan5!"# SNOWFLAKE-PASSWORD

In [3]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTES",

    "warehouse": "COMPUTE_WH"
}

session = Session.builder.configs(connection_parameters).create()

In [4]:
algo_table = session.table('ROUTING."ALGO_DATA(With Year)"')
algo_result = algo_table.collect()

In [5]:
algo_df = pd.DataFrame(algo_result)

In [6]:
algo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540201 entries, 0 to 540200
Data columns (total 39 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   TRUCK_ID                                  540201 non-null  int64  
 1   MONTH                                     540201 non-null  int64  
 2   HOUR                                      540201 non-null  int64  
 3   DOW                                       540201 non-null  int64  
 4   DAY                                       540201 non-null  int64  
 5   PUBLIC_HOLIDAY                            540201 non-null  int64  
 6   SUM(ORDER_TOTAL)                          540201 non-null  float64
 7   LAT                                       540201 non-null  float64
 8   LONG                                      540201 non-null  float64
 9   LOCATION_ID                               540201 non-null  int64  
 10  SUM_DAY_OF_WEEK_AVG_

In [7]:
unique_combinations = algo_df.groupby(['YEAR', 'DAY', 'MONTH']).size().reset_index().rename(columns={0: 'Count'})
num_unique_combinations = len(unique_combinations)
num_unique_combinations

latest_date_row = algo_df.sort_values(['YEAR', 'MONTH', 'DAY'], ascending=False).iloc[0]

In [8]:
latest_date_row.loc[['DAY','MONTH','YEAR']]

DAY        30.0
MONTH      10.0
YEAR     2022.0
Name: 1776, dtype: float64

In [9]:
latest_date = {'YEAR': 2022.0, 'MONTH': 10.0, 'DAY': 30.0}

# Calculate the date 1 year before the latest date
one_year_before = {'YEAR': latest_date['YEAR'] - 1, 'MONTH': latest_date['MONTH'], 'DAY': latest_date['DAY']}

# Filter the DataFrame to exclude data within the last year
holdout_df_year = algo_df[(algo_df['YEAR'] < one_year_before['YEAR']) | 
                      (algo_df['YEAR'] == one_year_before['YEAR']) & (algo_df['MONTH'] < one_year_before['MONTH']) |
                      (algo_df['YEAR'] == one_year_before['YEAR']) & (algo_df['MONTH'] == one_year_before['MONTH']) & (algo_df['DAY'] <= one_year_before['DAY'])]

holdout_df_year

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,SUM(ORDER_TOTAL),LAT,LONG,LOCATION_ID,...,CITY_SEATTLE_ENCODED,CITY_DENVER_ENCODED,CITY_San Mateo_encoded,CITY_New York City_encoded,CITY_BOSTON_ENCODED,REGION_NY_ENCODED,REGION_MA_ENCODED,REGION_CO_ENCODED,REGION_WA_ENCODED,REGION_CA_ENCODED
0,17,9,16,4,10,0,1305.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
1,17,9,17,4,10,0,610.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
2,17,9,18,4,10,0,3461.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
3,17,9,19,4,10,0,5221.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
4,17,9,20,4,10,0,8929.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540196,21,6,18,4,4,0,3517.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
540197,21,6,19,4,4,0,2466.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
540198,21,6,20,4,4,0,2297.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0
540199,21,6,21,4,4,0,2289.0,39.748742,-104.972158,3304,...,0,1,0,0,0,0,0,1,0,0


In [10]:
algo_df.rename(columns={"SUM(ORDER_TOTAL)": "Revenue"},inplace=True)

In [11]:
model = joblib.load(open('model.joblib',"rb"))

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



In [12]:
X_final_scaled = holdout_df_year.copy()

Routing Build

In [13]:
# Get unique location IDs
unique_location_ids = X_final_scaled['LOCATION_ID'].unique()
print("Unique Location IDs:", unique_location_ids)

# Create a list to store the table data
table_data = []


# Create a DataFrame to store the table data
df_unique_locations_lat_long = pd.DataFrame(columns=["Location ID", "Latitude", "Longitude"])

# Iterate over each unique location ID
for location_id in unique_location_ids:
    location = X_final_scaled[X_final_scaled['LOCATION_ID'] == location_id]
    latitude = location['LAT'].values[0]
    longitude = location['LONG'].values[0]
    df_unique_locations_lat_long = pd.concat([df_unique_locations_lat_long, pd.DataFrame({"Location ID": [location_id],
                                                  "Latitude": [latitude],
                                                  "Longitude": [longitude]})],
                         ignore_index=True)

# Print the DataFrame
df_unique_locations_lat_long

Unique Location IDs: [ 3304  3156 15428 ... 14861 14820  3595]


Unnamed: 0,Location ID,Latitude,Longitude
0,3304,39.748742,-104.972158
1,3156,39.758079,-104.974347
2,15428,42.326432,-71.075696
3,15295,42.288406,-71.101457
4,15296,42.288024,-71.099891
...,...,...,...
1873,3580,39.686778,-104.908275
1874,3329,40.761161,-73.989227
1875,14861,39.790958,-104.805624
1876,14820,39.792662,-104.841885


In [14]:
X_final_scaled.rename(columns={"SUM(ORDER_TOTAL)": "Revenue"},inplace=True)

In [15]:
#Add date column
X_final_scaled['Date'] = pd.to_datetime(X_final_scaled[['YEAR', 'MONTH', 'DAY']])

Part 2 Start Again

In [16]:
date = '2021-8-23'
datetime_object = datetime.strptime(date, '%Y-%m-%d')

weekadd = timedelta(days=14)

two_week=datetime_object-weekadd

sql_string='select DAY,MONTH,YEAR,TRUCK_ID,"SUM(ORDER_TOTAL)" from ROUTING."ALGO_DATA(With Year)"\
where (DAY>={} and YEAR>={} and MONTH>={}) \
and (DAY<={} and YEAR<={} and MONTH<={})'.format(two_week.day,two_week.year,two_week.month,datetime_object.day,datetime_object.year,datetime_object.month)


full_sql='select Truck_ID,SUM("SUM(ORDER_TOTAL)") as Revenue from ({}) group by TRUCK_ID  having TRUCK_ID  in (27,28,43,44,46,47)'.format(sql_string)


session.sql(full_sql).show()

past_2_weeks_rows = session.sql(full_sql).collect()
past_2_weeks_df = pd.DataFrame(past_2_weeks_rows)

past_2_weeks_df_sorted = past_2_weeks_df.sort_values(by='REVENUE', ascending=False)
past_2_weeks_df_sorted

--------------------------
|"TRUCK_ID"  |"REVENUE"  |
--------------------------
|28          |669889.0   |
|47          |1974438.0  |
|46          |894930.0   |
|43          |1405972.0  |
|27          |913659.0   |
--------------------------



Unnamed: 0,TRUCK_ID,REVENUE
1,47,1974438.0
3,43,1405972.0
4,27,913659.0
2,46,894930.0
0,28,669889.0


In [17]:
working_days = 6
truck_ids = [27, 28, 43, 44, 46, 47] 

start_date = datetime.strptime('2021-08-23', '%Y-%m-%d')
dates = [start_date + timedelta(days=i) for i in range(working_days)]

truck_data = []


for i in range(len(truck_ids)):
    num_of_locs = random.randrange(2, 5)
    each_location_travel_distance = random.randrange(8, 12)
    max_total_travel_distance = each_location_travel_distance * num_of_locs

    truck_data.append({
        'Truck_ID': truck_ids[i],
        'Date': dates[i],
        'Starting_Hour': random.randrange(8, 12),
        'Ending_Hour': random.randrange(18, 24),
        'Num_of_locs': num_of_locs,
        'each_location_travel_distance': each_location_travel_distance,
        'Max_Total_Travel_Distance': max_total_travel_distance
    })

truck_df = pd.DataFrame(truck_data)

In [18]:
starting_locations = {}

for truck in truck_ids:
    truck_locations = X_final_scaled[X_final_scaled['TRUCK_ID'] == truck]['LOCATION_ID'].values
    starting_location = truck_locations[0] if len(truck_locations) > 0 else None
    starting_locations[truck] = starting_location

truck_df['Starting_Location'] = truck_df['Truck_ID'].map(starting_locations)

In [19]:
truck_df

Unnamed: 0,Truck_ID,Date,Starting_Hour,Ending_Hour,Num_of_locs,each_location_travel_distance,Max_Total_Travel_Distance,Starting_Location
0,27,2021-08-23,10,21,3,11,33,3304
1,28,2021-08-24,9,19,2,9,18,3304
2,43,2021-08-25,10,20,2,11,22,4121
3,44,2021-08-26,11,19,2,8,16,1917
4,46,2021-08-27,8,19,2,8,16,15428
5,47,2021-08-28,10,23,4,10,40,15428


In [20]:
truck_ids = [27, 28, 43, 44, 46, 47]
Starting_Location = [3304,3304,4121,1917,15428]
date = '2021-08-23'
datetime_object = datetime.strptime(date, '%Y-%m-%d')

In [21]:
truck_cols=['TRUCK_ID','MENU_TYPE_GYROS_ENCODED', 'MENU_TYPE_CREPES_ENCODED',
       'MENU_TYPE_BBQ_ENCODED', 'MENU_TYPE_SANDWICHES_ENCODED',
       'MENU_TYPE_Mac & Cheese_encoded', 'MENU_TYPE_POUTINE_ENCODED',
       'MENU_TYPE_ETHIOPIAN_ENCODED', 'MENU_TYPE_TACOS_ENCODED',
       'MENU_TYPE_Ice Cream_encoded', 'MENU_TYPE_Hot Dogs_encoded',
       'MENU_TYPE_CHINESE_ENCODED', 'MENU_TYPE_Grilled Cheese_encoded',
       'MENU_TYPE_VEGETARIAN_ENCODED', 'MENU_TYPE_INDIAN_ENCODED',
       'MENU_TYPE_RAMEN_ENCODED']
location_cols=[ 'CITY_SEATTLE_ENCODED',
       'CITY_DENVER_ENCODED', 'CITY_San Mateo_encoded',
       'CITY_New York City_encoded', 'CITY_BOSTON_ENCODED',
       'REGION_NY_ENCODED', 'REGION_MA_ENCODED', 'REGION_CO_ENCODED',
       'REGION_WA_ENCODED', 'REGION_CA_ENCODED', 'LAT', 'LONG', 'LOCATION_ID','HOUR']

In [22]:
def generate_date_data(df,datetime_object):
    df['date'] = pd.to_datetime(datetime_object)
    df['MONTH'] = df['date'].dt.month
    df['DOW'] = df['date'].dt.weekday
    df['DAY'] = df['date'].dt.day
    df['WOM'] = (df['DAY'] - 1) // 7 + 1
    df['YEAR'] = df['date'].dt.year

    public_holidays = [
     {'Month': 7, 'Day': 4, 'DOW': None, 'WOM': None},  # 4th of July
   {'Month': 12, 'Day': 24, 'DOW': None, 'WOM': None},  # Christmas Eve
    {'Month': 12, 'Day': 25, 'DOW': None, 'WOM': None},  # Christmas Day
    {'Month': 10, 'Day': None, 'DOW': '0', 'WOM': 2},  # Columbus Day (second Monday in October)
    {'Month': 6, 'Day': 19, 'DOW': None, 'WOM': None},  # Juneteenth
    {'Month': 9, 'Day': None, 'DOW': '0', 'WOM': 1},  # Labor Day (first Monday in September)
    {'Month': 1, 'Day': None, 'DOW': '0', 'WOM': 3},  # Martin Luther King, Jr. Day (third Monday in January)
    {'Month': 5, 'Day': None, 'DOW': '0', 'WOM': -1},  # Memorial Day (last Monday in May)
    {'Month': 1, 'Day': 1, 'DOW': None, 'WOM': None},  # New Year's Day
    {'Month': 12, 'Day': 31, 'DOW': None, 'WOM': None},  # New Year's Eve
    {'Month': 11, 'Day': None, 'DOW': '3', 'WOM': 4},  # Thanksgiving Day (fourth Thursday in November)
    {'Month': 11, 'Day': None, 'DOW': '2', 'WOM': 4},  # Thanksgiving Eve (fourth Wednesday in November)
    {'Month': 2, 'Day': 14, 'DOW': None, 'WOM': None},  # Valentine's Day
    {'Month': 11, 'Day': 11, 'DOW': None, 'WOM': None},  # Veterans Day
    {'Month': 10, 'Day': 31, 'DOW': None, 'WOM': None},  # Halloween
    {'Month': 3, 'Day': 17, 'DOW': None, 'WOM': None},  # St. Patrick's Day
    {'Month': 11, 'Day': 25, 'DOW': '4', 'WOM': None},  # Black Friday
    {'Month': 12, 'Day': 26, 'DOW': None, 'WOM': None},  # Boxing Day
    ]

# Iterate over the public holidays and create the 'public_holiday' column
    df['PUBLIC_HOLIDAY'] = 0  # Initialize the column with 0 (not a public holiday)
    for holiday in public_holidays:
        month_mask = df['date'].dt.month == holiday['Month']
        day_mask = df['date'].dt.day == holiday['Day']
        dow_mask = df['date'].dt.dayofweek == int(holiday['DOW']) if holiday['DOW'] is not None else True
        wom_mask = (df['date'].dt.day - 1) // 7 + 1 == holiday['WOM'] if holiday['WOM'] is not None else True

        mask = month_mask & day_mask & dow_mask & wom_mask
        df.loc[mask, 'PUBLIC_HOLIDAY'] = 1
    return df

In [25]:
df_drop=algo_df.drop("YEAR",axis=1)
final_df=df_drop.drop("Revenue",axis=1)

In [26]:
Sales_Forecast_Training_Data_row=session.table('ANALYTICS."Sales_Forecast_Training_Data"')
Sales_Forecast_Training_Data_df = Sales_Forecast_Training_Data_row.to_pandas()

In [27]:
ml_df=final_df[list(Sales_Forecast_Training_Data_df.drop("Profit",axis=1).columns)]

In [28]:
df = pd.DataFrame()

df=ml_df[location_cols].drop_duplicates()

df=generate_date_data(df,datetime_object)

In [29]:
trc_df = pd.DataFrame()

trc_df=ml_df[truck_cols].drop_duplicates()

trc_df=generate_date_data(trc_df,datetime_object)

In [30]:
trc_df.drop(["MONTH","DOW","DAY","WOM","YEAR","PUBLIC_HOLIDAY"],axis=1,inplace=True)
merge_df=pd.merge(df, trc_df, how='inner', on="date") 

In [31]:
merge_df

Unnamed: 0,CITY_SEATTLE_ENCODED,CITY_DENVER_ENCODED,CITY_San Mateo_encoded,CITY_New York City_encoded,CITY_BOSTON_ENCODED,REGION_NY_ENCODED,REGION_MA_ENCODED,REGION_CO_ENCODED,REGION_WA_ENCODED,REGION_CA_ENCODED,...,MENU_TYPE_POUTINE_ENCODED,MENU_TYPE_ETHIOPIAN_ENCODED,MENU_TYPE_TACOS_ENCODED,MENU_TYPE_Ice Cream_encoded,MENU_TYPE_Hot Dogs_encoded,MENU_TYPE_CHINESE_ENCODED,MENU_TYPE_Grilled Cheese_encoded,MENU_TYPE_VEGETARIAN_ENCODED,MENU_TYPE_INDIAN_ENCODED,MENU_TYPE_RAMEN_ENCODED
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115145,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2115146,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2115147,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2115148,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
wdf=session.sql("Select * from ANALYTICS.WEATHER_DATA_API")

wdf=wdf.withColumn("H",F.substring(wdf["TIME"], 12, 2).cast("integer"))

wdf=wdf.withColumn("DATE",F.substring(wdf["TIME"], 0, 10))

wdf=wdf.select("WEATHERCODE","LOCATION_ID","H","DATE" ).to_pandas()

In [33]:
wdf['DATE'] = pd.to_datetime(wdf['DATE'])

In [34]:
wdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46629264 entries, 0 to 46629263
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   WEATHERCODE  int8          
 1   LOCATION_ID  int16         
 2   H            int8          
 3   DATE         datetime64[ns]
dtypes: datetime64[ns](1), int16(1), int8(2)
memory usage: 533.6 MB


In [35]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2115150 entries, 0 to 2115149
Data columns (total 37 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   CITY_SEATTLE_ENCODED              int64         
 1   CITY_DENVER_ENCODED               int64         
 2   CITY_San Mateo_encoded            int64         
 3   CITY_New York City_encoded        int64         
 4   CITY_BOSTON_ENCODED               int64         
 5   REGION_NY_ENCODED                 int64         
 6   REGION_MA_ENCODED                 int64         
 7   REGION_CO_ENCODED                 int64         
 8   REGION_WA_ENCODED                 int64         
 9   REGION_CA_ENCODED                 int64         
 10  LAT                               float64       
 11  LONG                              float64       
 12  LOCATION_ID                       int64         
 13  HOUR                              int64         
 14  date              

In [36]:
merge_df['HOUR'].value_counts()

16    140850
17    140850
18    140850
19    140850
20    140850
21    140850
22    140850
8     140850
9     140850
10    140850
11    140850
12    140850
13    140850
14    140850
15    140850
23      2400
Name: HOUR, dtype: int64

In [37]:
truck_df

Unnamed: 0,Truck_ID,Date,Starting_Hour,Ending_Hour,Num_of_locs,each_location_travel_distance,Max_Total_Travel_Distance,Starting_Location
0,27,2021-08-23,10,21,3,11,33,3304
1,28,2021-08-24,9,19,2,9,18,3304
2,43,2021-08-25,10,20,2,11,22,4121
3,44,2021-08-26,11,19,2,8,16,1917
4,46,2021-08-27,8,19,2,8,16,15428
5,47,2021-08-28,10,23,4,10,40,15428


In [38]:
wdf.rename(columns = {'H':'HOUR'}, inplace = True)

In [39]:
weadf = pd.merge(wdf, merge_df, right_on=['LOCATION_ID','date',"HOUR"], left_on=['LOCATION_ID','DATE',"HOUR"])

In [40]:
weadf = weadf.drop(['date'], axis=1)
weadf = weadf.drop(['WOM'], axis=1)
weadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116275 entries, 0 to 2116274
Data columns (total 37 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   WEATHERCODE                       int8          
 1   LOCATION_ID                       int16         
 2   HOUR                              int8          
 3   DATE                              datetime64[ns]
 4   CITY_SEATTLE_ENCODED              int64         
 5   CITY_DENVER_ENCODED               int64         
 6   CITY_San Mateo_encoded            int64         
 7   CITY_New York City_encoded        int64         
 8   CITY_BOSTON_ENCODED               int64         
 9   REGION_NY_ENCODED                 int64         
 10  REGION_MA_ENCODED                 int64         
 11  REGION_CO_ENCODED                 int64         
 12  REGION_WA_ENCODED                 int64         
 13  REGION_CA_ENCODED                 int64         
 14  LAT               

In [41]:
weadf['HOUR'].value_counts()

8     140925
9     140925
10    140925
11    140925
12    140925
13    140925
14    140925
15    140925
16    140925
17    140925
18    140925
19    140925
20    140925
21    140925
22    140925
23      2400
Name: HOUR, dtype: int64

In [42]:
X_final_scaled.rename(columns = {'Date':'DATE'}, inplace = True)
X_final_scaled_revenue = X_final_scaled.copy()
X_final_scaled = X_final_scaled.drop(['Revenue'], axis=1)
X_final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242811 entries, 0 to 540200
Data columns (total 39 columns):
 #   Column                                    Non-Null Count   Dtype         
---  ------                                    --------------   -----         
 0   TRUCK_ID                                  242811 non-null  int64         
 1   MONTH                                     242811 non-null  int64         
 2   HOUR                                      242811 non-null  int64         
 3   DOW                                       242811 non-null  int64         
 4   DAY                                       242811 non-null  int64         
 5   PUBLIC_HOLIDAY                            242811 non-null  int64         
 6   LAT                                       242811 non-null  float64       
 7   LONG                                      242811 non-null  float64       
 8   LOCATION_ID                               242811 non-null  int64         
 9   SUM_DAY_OF_WEEK

In [43]:
df1_columns = set(X_final_scaled.columns)
df2_columns = set(weadf.columns)

columns_only_in_df1 = df1_columns - df2_columns
columns_only_in_df2 = df2_columns - df1_columns

print("Columns only in df1:", columns_only_in_df1)
print("Columns only in df2:", columns_only_in_df2)

Columns only in df1: {'SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 'SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE'}
Columns only in df2: set()


In [44]:
merged_df = X_final_scaled.merge(weadf, on=['CITY_BOSTON_ENCODED',
 'CITY_DENVER_ENCODED',
 'CITY_New York City_encoded',
 'CITY_SEATTLE_ENCODED',
 'CITY_San Mateo_encoded',
 'DATE',
 'DAY',
 'DOW',
 'HOUR',
 'LAT',
 'LOCATION_ID',
 'LONG',
 'MENU_TYPE_BBQ_ENCODED',
 'MENU_TYPE_CHINESE_ENCODED',
 'MENU_TYPE_CREPES_ENCODED',
 'MENU_TYPE_ETHIOPIAN_ENCODED',
 'MENU_TYPE_GYROS_ENCODED',
 'MENU_TYPE_Grilled Cheese_encoded',
 'MENU_TYPE_Hot Dogs_encoded',
 'MENU_TYPE_INDIAN_ENCODED',
 'MENU_TYPE_Ice Cream_encoded',
 'MENU_TYPE_Mac & Cheese_encoded',
 'MENU_TYPE_POUTINE_ENCODED',
 'MENU_TYPE_RAMEN_ENCODED',
 'MENU_TYPE_SANDWICHES_ENCODED',
 'MENU_TYPE_TACOS_ENCODED',
 'MENU_TYPE_VEGETARIAN_ENCODED',
 'MONTH',
 'PUBLIC_HOLIDAY',
 'REGION_CA_ENCODED',
 'REGION_CO_ENCODED',
 'REGION_MA_ENCODED',
 'REGION_NY_ENCODED',
 'REGION_WA_ENCODED',
 'TRUCK_ID',
 'WEATHERCODE',
 'YEAR'], how='outer')

In [45]:
merged_df.isnull().sum()

TRUCK_ID                                          0
MONTH                                             0
HOUR                                              0
DOW                                               0
DAY                                               0
PUBLIC_HOLIDAY                                    0
LAT                                               0
LONG                                              0
LOCATION_ID                                       0
SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE          2116275
SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE    2116275
YEAR                                              0
WEATHERCODE                                       0
MENU_TYPE_BBQ_ENCODED                             0
MENU_TYPE_CREPES_ENCODED                          0
MENU_TYPE_GYROS_ENCODED                           0
MENU_TYPE_Mac & Cheese_encoded                    0
MENU_TYPE_SANDWICHES_ENCODED                      0
MENU_TYPE_ETHIOPIAN_ENCODED                       0
MENU_TYPE_PO

In [46]:
merged_df = merged_df.fillna({ 'SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE':(merged_df['SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE'].mean())})
merged_df = merged_df.fillna({ 'SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE':(merged_df['SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE'].mean())})

In [47]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2359086 entries, 0 to 2359085
Data columns (total 39 columns):
 #   Column                                    Dtype         
---  ------                                    -----         
 0   TRUCK_ID                                  int64         
 1   MONTH                                     int64         
 2   HOUR                                      int64         
 3   DOW                                       int64         
 4   DAY                                       int64         
 5   PUBLIC_HOLIDAY                            int64         
 6   LAT                                       float64       
 7   LONG                                      float64       
 8   LOCATION_ID                               int64         
 9   SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE        float64       
 10  SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE  float64       
 11  YEAR                                      int64         
 12  WEATHERCODE   

In [48]:
predicted_df = merged_df[['TRUCK_ID', 'MONTH', 'HOUR', 'DOW', 'DAY', 'PUBLIC_HOLIDAY', 'LAT',
       'LONG', 'LOCATION_ID', 'SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE',
       'SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 'WEATHERCODE',
       'MENU_TYPE_GYROS_ENCODED', 'MENU_TYPE_CREPES_ENCODED',
       'MENU_TYPE_BBQ_ENCODED', 'MENU_TYPE_SANDWICHES_ENCODED',
       'MENU_TYPE_Mac & Cheese_encoded', 'MENU_TYPE_POUTINE_ENCODED',
       'MENU_TYPE_ETHIOPIAN_ENCODED', 'MENU_TYPE_TACOS_ENCODED',
       'MENU_TYPE_Ice Cream_encoded', 'MENU_TYPE_Hot Dogs_encoded',
       'MENU_TYPE_CHINESE_ENCODED', 'MENU_TYPE_Grilled Cheese_encoded',
       'MENU_TYPE_VEGETARIAN_ENCODED', 'MENU_TYPE_INDIAN_ENCODED',
       'MENU_TYPE_RAMEN_ENCODED', 'CITY_SEATTLE_ENCODED',
       'CITY_DENVER_ENCODED', 'CITY_San Mateo_encoded',
       'CITY_New York City_encoded', 'CITY_BOSTON_ENCODED',
       'REGION_NY_ENCODED', 'REGION_MA_ENCODED', 'REGION_CO_ENCODED',
       'REGION_WA_ENCODED', 'REGION_CA_ENCODED']]
predicted_df

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,LAT,LONG,LOCATION_ID,SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE,...,CITY_SEATTLE_ENCODED,CITY_DENVER_ENCODED,CITY_San Mateo_encoded,CITY_New York City_encoded,CITY_BOSTON_ENCODED,REGION_NY_ENCODED,REGION_MA_ENCODED,REGION_CO_ENCODED,REGION_WA_ENCODED,REGION_CA_ENCODED
0,17,9,16,4,10,0,39.748742,-104.972158,3304,-0.276074,...,0,1,0,0,0,0,0,1,0,0
1,17,9,17,4,10,0,39.748742,-104.972158,3304,-0.276074,...,0,1,0,0,0,0,0,1,0,0
2,17,9,18,4,10,0,39.748742,-104.972158,3304,-0.276074,...,0,1,0,0,0,0,0,1,0,0
3,17,9,19,4,10,0,39.748742,-104.972158,3304,-0.276074,...,0,1,0,0,0,0,0,1,0,0
4,17,9,20,4,10,0,39.748742,-104.972158,3304,-0.276074,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359081,3,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,0,1,0,0,0,0,0,0,1
2359082,13,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,0,1,0,0,0,0,0,0,1
2359083,10,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,0,1,0,0,0,0,0,0,1
2359084,5,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,0,1,0,0,0,0,0,0,1


In [49]:
training_data=session.table('ANALYTICS."Sales_Forecast_Training_Data"').to_pandas()
columns_for_model=final_df[list(training_data.drop("Profit",axis=1).columns)]
columns_for_model.columns

Index(['TRUCK_ID', 'MONTH', 'HOUR', 'DOW', 'DAY', 'PUBLIC_HOLIDAY', 'LAT',
       'LONG', 'LOCATION_ID', 'SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE',
       'SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 'WEATHERCODE',
       'MENU_TYPE_GYROS_ENCODED', 'MENU_TYPE_CREPES_ENCODED',
       'MENU_TYPE_BBQ_ENCODED', 'MENU_TYPE_SANDWICHES_ENCODED',
       'MENU_TYPE_Mac & Cheese_encoded', 'MENU_TYPE_POUTINE_ENCODED',
       'MENU_TYPE_ETHIOPIAN_ENCODED', 'MENU_TYPE_TACOS_ENCODED',
       'MENU_TYPE_Ice Cream_encoded', 'MENU_TYPE_Hot Dogs_encoded',
       'MENU_TYPE_CHINESE_ENCODED', 'MENU_TYPE_Grilled Cheese_encoded',
       'MENU_TYPE_VEGETARIAN_ENCODED', 'MENU_TYPE_INDIAN_ENCODED',
       'MENU_TYPE_RAMEN_ENCODED', 'CITY_SEATTLE_ENCODED',
       'CITY_DENVER_ENCODED', 'CITY_San Mateo_encoded',
       'CITY_New York City_encoded', 'CITY_BOSTON_ENCODED',
       'REGION_NY_ENCODED', 'REGION_MA_ENCODED', 'REGION_CO_ENCODED',
       'REGION_WA_ENCODED', 'REGION_CA_ENCODED'],
      dtype='object')

In [50]:
predictions = model.predict(predicted_df)

In [51]:
predictions

array([3011.4856 , 5846.3574 , 7421.3857 , ...,  968.0405 , 1096.6093 ,
        960.98267], dtype=float32)

In [52]:
predicted_df['Predicted'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] = predictions


## Routing Build

In [53]:
truck_df

Unnamed: 0,Truck_ID,Date,Starting_Hour,Ending_Hour,Num_of_locs,each_location_travel_distance,Max_Total_Travel_Distance,Starting_Location
0,27,2021-08-23,10,21,3,11,33,3304
1,28,2021-08-24,9,19,2,9,18,3304
2,43,2021-08-25,10,20,2,11,22,4121
3,44,2021-08-26,11,19,2,8,16,1917
4,46,2021-08-27,8,19,2,8,16,15428
5,47,2021-08-28,10,23,4,10,40,15428


In [54]:
past_2_weeks_df_sorted

Unnamed: 0,TRUCK_ID,REVENUE
1,47,1974438.0
3,43,1405972.0
4,27,913659.0
2,46,894930.0
0,28,669889.0


In [55]:
# Step 2: Extract the sorted truck IDs from revenue_df
sorted_truck_ids = past_2_weeks_df_sorted['TRUCK_ID'].tolist()

# Step 3: Use the sorted truck IDs to sort the truck_df DataFrame
truck_df = truck_df.sort_values(by='Truck_ID')
truck_df = truck_df.set_index('Truck_ID')
truck_df = truck_df.loc[sorted_truck_ids].reset_index()

In [56]:
predicted_df

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,LAT,LONG,LOCATION_ID,SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE,...,CITY_DENVER_ENCODED,CITY_San Mateo_encoded,CITY_New York City_encoded,CITY_BOSTON_ENCODED,REGION_NY_ENCODED,REGION_MA_ENCODED,REGION_CO_ENCODED,REGION_WA_ENCODED,REGION_CA_ENCODED,Predicted
0,17,9,16,4,10,0,39.748742,-104.972158,3304,-0.276074,...,1,0,0,0,0,0,1,0,0,3011.485596
1,17,9,17,4,10,0,39.748742,-104.972158,3304,-0.276074,...,1,0,0,0,0,0,1,0,0,5846.357422
2,17,9,18,4,10,0,39.748742,-104.972158,3304,-0.276074,...,1,0,0,0,0,0,1,0,0,7421.385742
3,17,9,19,4,10,0,39.748742,-104.972158,3304,-0.276074,...,1,0,0,0,0,0,1,0,0,8012.751953
4,17,9,20,4,10,0,39.748742,-104.972158,3304,-0.276074,...,1,0,0,0,0,0,1,0,0,7661.839844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359081,3,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,1,0,0,0,0,0,0,1,1077.091919
2359082,13,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,1,0,0,0,0,0,0,1,956.508362
2359083,10,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,1,0,0,0,0,0,0,1,968.040527
2359084,5,8,22,0,23,0,37.548803,-122.317398,1397,-0.080459,...,0,1,0,0,0,0,0,0,1,1096.609253


In [57]:
def calculate_distances(df_predictions, starting_location_id):
    def haversine_distance(lat1, lon1, lat2, lon2):
        # Convert latitude and longitude from degrees to radians
        lat1_rad = math.radians(lat1)
        lon1_rad = math.radians(lon1)
        lat2_rad = math.radians(lat2)
        lon2_rad = math.radians(lon2)

        # Haversine formula
        dlon = lon2_rad - lon1_rad
        dlat = lat2_rad - lat1_rad
        a = math.sin(dlat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
        distance = 6371 * c  # Radius of the Earth in kilometers
        return distance

    # Find the reference location based on the starting location ID
    reference_location = df_predictions[df_predictions['LOCATION_ID'] == starting_location_id]
    reference_latitude = reference_location['LAT'].values[0]
    reference_longitude = reference_location['LONG'].values[0]

    # List of other locations with their respective location IDs, latitudes, and longitudes
    other_locations = df_predictions[['LOCATION_ID', 'LAT', 'LONG']].drop_duplicates().values.tolist()

    # Calculate distances between starting location and other locations
    distances = []
    for location in other_locations:
        location_id = location[0]
        latitude = location[1]
        longitude = location[2]
        distance = haversine_distance(reference_latitude, reference_longitude, latitude, longitude)
        distances.append({'Location_ID_start': starting_location_id, 'Location_ID_end': location_id, 'distance': distance})

    # Create a DataFrame from the distances list
    df_distances = pd.DataFrame(distances)

    # Sort the DataFrame by distance in ascending order
    df_distances = df_distances.sort_values('distance')

    return df_distances

In [58]:
def find_distance(df1, df2):
    # Radius of the Earth in kilometers
    R = 6371

    lat1=df1['LAT']
    lon1=df1['LONG']
    lat2=df2['LAT']
    lon2=df2['LONG']
    # Convert latitude and longitude to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Difference between latitudes and longitudes
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(delta_lat/2) ** 2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon/2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = R * c

    return distance

In [59]:
predicted_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2359086 entries, 0 to 2359085
Data columns (total 38 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   TRUCK_ID                                  int64  
 1   MONTH                                     int64  
 2   HOUR                                      int64  
 3   DOW                                       int64  
 4   DAY                                       int64  
 5   PUBLIC_HOLIDAY                            int64  
 6   LAT                                       float64
 7   LONG                                      float64
 8   LOCATION_ID                               int64  
 9   SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE        float64
 10  SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE  float64
 11  WEATHERCODE                               int64  
 12  MENU_TYPE_GYROS_ENCODED                   int64  
 13  MENU_TYPE_CREPES_ENCODED                  int64  
 14  ME

In [60]:
unique_location_ids = predicted_df['LOCATION_ID'].unique()

hourly_location_df = pd.DataFrame(columns=[str(hour) for hour in range(1, 25)])

for location_id in unique_location_ids:
    row_data = {str(hour): 0 for hour in range(1, 25)}
    hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)

hourly_location_df['LOCATION_ID'] = unique_location_ids
hourly_location_df = hourly_location_df[['LOCATION_ID'] + [str(hour) for hour in range(1, 25)]]
hourly_location_df

  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignore_index=True)
  hourly_location_df = hourly_location_df.append(row_data, ignor

Unnamed: 0,LOCATION_ID,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,3304,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3156,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15428,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15295,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15296,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,3580,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1874,3329,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1875,14861,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1876,14820,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
import math

for index, row in truck_df.iterrows():
    Truck_ID = row['Truck_ID']
    Date = row['Date']
    starting_hour = row['Starting_Hour']
    ending_hour = row['Ending_Hour']
    num_of_locs = row['Num_of_locs']
    each_location_travel_distance = row['each_location_travel_distance']
    Max_Total_Travel_Distance = row['Max_Total_Travel_Distance']
    Starting_Location = row['Starting_Location']

    working_hours = ending_hour - starting_hour
    # Calculate the base shift hours (without considering the remainder)
    shift_hours = working_hours // num_of_locs
    # Calculate the remaining hours to distribute
    remaining_hours = working_hours % num_of_locs

    # Create a list to store the shift hours for each shift
    shift_hours_list = [shift_hours] * num_of_locs

    # Distribute the remaining hours evenly across shifts
    for i in range(remaining_hours):
        shift_hours_list[i] += 1

    current_hour = starting_hour
    predicted_df['HOUR'] = current_hour
    for x in range(shift_hours_list[0]):
        current_hour += 1
        predicted_df['HOUR'] = current_hour
        next_df_prediction = predicted_df.copy().drop(columns = 'Predicted')
        # predicted_df['Predicted'] += xgb.predict(next_df_prediction)
        predicted_df['Predicted'] += model.predict(next_df_prediction)

    max_index = predicted_df['Predicted'].idxmax()
    row_with_max_value = predicted_df.loc[max_index]
    row_with_max_value['HOUR'] = starting_hour
    row_with_max_value

    route_list = [row_with_max_value]
    hour = starting_hour
    
    for x in range(num_of_locs-1):
        hour += shift_hours_list[x] #add on hours to predict at later hours
        df_distances = calculate_distances(merged_df, route_list[x]['LOCATION_ID']) #calculate distance to all location IDs
        filtered_distances = df_distances[(df_distances['Location_ID_start'] == route_list[x]['LOCATION_ID']) & (df_distances['distance'] < each_location_travel_distance)]
        result = merged_df[merged_df['LOCATION_ID'].isin(filtered_distances['Location_ID_end'])]
        current_hour = hour
        result['HOUR'] = current_hour
        result = result[['TRUCK_ID', 'MONTH', 'HOUR', 'DOW', 'DAY', 'PUBLIC_HOLIDAY', 'LAT', 'LONG', 'LOCATION_ID', 'SUM_DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 'SUM_PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 
            'WEATHERCODE', 'MENU_TYPE_GYROS_ENCODED', 'MENU_TYPE_CREPES_ENCODED', 'MENU_TYPE_BBQ_ENCODED', 'MENU_TYPE_SANDWICHES_ENCODED', 'MENU_TYPE_Mac & Cheese_encoded', 'MENU_TYPE_POUTINE_ENCODED', 
            'MENU_TYPE_ETHIOPIAN_ENCODED', 'MENU_TYPE_TACOS_ENCODED', 'MENU_TYPE_Ice Cream_encoded', 'MENU_TYPE_Hot Dogs_encoded', 'MENU_TYPE_CHINESE_ENCODED', 'MENU_TYPE_Grilled Cheese_encoded', 
            'MENU_TYPE_VEGETARIAN_ENCODED', 'MENU_TYPE_INDIAN_ENCODED', 'MENU_TYPE_RAMEN_ENCODED', 'CITY_SEATTLE_ENCODED', 'CITY_DENVER_ENCODED', 'CITY_San Mateo_encoded', 'CITY_New York City_encoded', 
            'CITY_BOSTON_ENCODED', 'REGION_NY_ENCODED', 'REGION_MA_ENCODED', 'REGION_CO_ENCODED', 'REGION_WA_ENCODED', 'REGION_CA_ENCODED']]
        result['Predicted'] = model.predict(result)
        for x in range(shift_hours_list[0]):
            current_hour += 1
            result['HOUR'] = current_hour
            next_df_prediction = result.copy().drop(columns = 'Predicted')
            result['Predicted'] += model.predict(next_df_prediction)
            
        max_index = result['Predicted'].idxmax()
        max_value = result.loc[max_index]
        max_value['HOUR'] = hour
        route_list.append(max_value)

    day = row['Date'].day
    month = row['Date'].month

    # Filter the X_holdout DataFrame for the specified day, month, and truck ID
    filtered_holdout = algo_df[(algo_df['DAY'] == int(day))
        & (algo_df['MONTH'] == int(month)) 
        & (algo_df['TRUCK_ID'] == Truck_ID)]

    # Extract relevant columns for analysis
    hourly_values = filtered_holdout[['HOUR', 'Revenue']]

    hour_value_list = []
    Initial_locations = []
    hour_list = list(range(starting_hour, ending_hour))

    for i in range(len(shift_hours_list)):
        hour_value = 0.0
        shift_hours= shift_hours_list[i]
        # print(hour_list)

        for j in range(shift_hours):
            hour = hour_list[0]
            # print(type(hour))
            hour_list.remove(hour)
            # print(hour)
            filter_hour = filtered_holdout[(filtered_holdout['HOUR'] == hour)]
            if filter_hour.empty:
                # Calculate the mean revenue of the rest of the hours in the filtered_holdout DataFrame
                mean_revenue = filtered_holdout[filtered_holdout['HOUR'] != hour]['Revenue'].mean()
                hour_value += mean_revenue
            else:
                hour_value += filter_hour['Revenue'].values[0]

        hour_value_list.append(hour_value)
        filter_loc = filtered_holdout[(filtered_holdout['HOUR'] == hour)]
        if not filter_loc.empty:
                Initial_locations.append(filter_loc['LOCATION_ID'].values[0])
        else:
            # If the hour is missing, take the first row of data in filtered_holdout and use it as filter_loc
            filter_loc = filtered_holdout.iloc[0:1]
            Initial_locations.append(filter_loc['LOCATION_ID'].values[0])

    distance_travelled = 0
    revenue_earned = 0
    for i in range(len(route_list)):
        print('Truck ID:',Truck_ID)
        print('Hour:',route_list[i]['HOUR'])
        print('Shift Hours:',shift_hours_list[i])
        print('Location ID:', route_list[i]['LOCATION_ID'])
        print('Total predicted value:', route_list[i]['Predicted'])
        print('Total initial value:', hour_value_list[i])
        print('Additional Value', route_list[i]['Predicted']- hour_value_list[i])
        print('Original Location: ', Initial_locations[i])
        print()
        revenue_earned += route_list[i]['Predicted']
    for i in range(len(route_list)-1):
        distance_travelled += find_distance(route_list[i],route_list[i+1])

    print('Maximum possible distance travelled throughout all the shifts for truck id',Truck_ID,": ",max_total_travel_distance,'km')
    print('Total distance travelled: ',round(distance_travelled,2),'km')
    print('Dollars earned by km travelled: $',round(revenue_earned/distance_travelled,2),'/km')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try

Truck ID: 47
Hour: 10.0
Shift Hours: 4
Location ID: 15218.0
Total predicted value: 65387.79296875
Total initial value: 48534.0
Additional Value 16853.79296875
Original Location:  15398

Truck ID: 47
Hour: 14.0
Shift Hours: 3
Location ID: 15217.0
Total predicted value: 63777.59765625
Total initial value: 19917.21212121212
Additional Value 43860.38553503788
Original Location:  15419

Truck ID: 47
Hour: 17.0
Shift Hours: 3
Location ID: 15217.0
Total predicted value: 66954.015625
Total initial value: 38890.0
Additional Value 28064.015625
Original Location:  15419

Truck ID: 47
Hour: 20.0
Shift Hours: 3
Location ID: 3978.0
Total predicted value: 49396.32421875
Total initial value: 45553.21212121212
Additional Value 3843.11209753788
Original Location:  15419

Maximum possible distance travelled throughout all the shifts for truck id 47 :  40 km
Total distance travelled:  3.31 km
Dollars earned by km travelled: $ 74225.69 /km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a 

Truck ID: 43
Hour: 10.0
Shift Hours: 5
Location ID: 15218.0
Total predicted value: 129318.2578125
Total initial value: 34222.0
Additional Value 95096.2578125
Original Location:  15043

Truck ID: 43
Hour: 15.0
Shift Hours: 5
Location ID: 2520.0
Total predicted value: 79102.1875
Total initial value: 34266.0
Additional Value 44836.1875
Original Location:  5176

Maximum possible distance travelled throughout all the shifts for truck id 43 :  40 km
Total distance travelled:  10.29 km
Dollars earned by km travelled: $ 20246.96 /km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a 

Truck ID: 27
Hour: 10.0
Shift Hours: 4
Location ID: 15218.0
Total predicted value: 181251.484375
Total initial value: 17902.0
Additional Value 163349.484375
Original Location:  14939

Truck ID: 27
Hour: 14.0
Shift Hours: 4
Location ID: 15217.0
Total predicted value: 63777.59765625
Total initial value: 9891.0
Additional Value 53886.59765625
Original Location:  3581

Truck ID: 27
Hour: 18.0
Shift Hours: 3
Location ID: 1422.0
Total predicted value: 65403.22265625
Total initial value: 32272.0
Additional Value 33131.22265625
Original Location:  3581

Maximum possible distance travelled throughout all the shifts for truck id 27 :  40 km
Total distance travelled:  7.01 km
Dollars earned by km travelled: $ 44293.35 /km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a 

Truck ID: 46
Hour: 8.0
Shift Hours: 6
Location ID: 15218.0
Total predicted value: 260028.875
Total initial value: 25419.0
Additional Value 234609.875
Original Location:  1149

Truck ID: 46
Hour: 14.0
Shift Hours: 5
Location ID: 15217.0
Total predicted value: 90516.3828125
Total initial value: 19047.0
Additional Value 71469.3828125
Original Location:  3913

Maximum possible distance travelled throughout all the shifts for truck id 46 :  40 km
Total distance travelled:  1.85 km
Dollars earned by km travelled: $ 189652.74 /km


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['HOUR'] = current_hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['Predicted'] += model.predict(next_df_prediction)
A value is trying to be set on a copy of a 

Truck ID: 28
Hour: 9.0
Shift Hours: 5
Location ID: 15170.0
Total predicted value: 325379.90625
Total initial value: 19460.0
Additional Value 305919.90625
Original Location:  14938

Truck ID: 28
Hour: 14.0
Shift Hours: 5
Location ID: 15170.0
Total predicted value: 77251.3671875
Total initial value: 20871.73076923077
Additional Value 56379.636418269234
Original Location:  14958

Maximum possible distance travelled throughout all the shifts for truck id 28 :  40 km
Total distance travelled:  0.0 km
Dollars earned by km travelled: $ inf /km


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  max_value['HOUR'] = hour
  print('Dollars earned by km travelled: $',round(revenue_earned/distance_travelled,2),'/km')


In [62]:
truck_df

Unnamed: 0,Truck_ID,Date,Starting_Hour,Ending_Hour,Num_of_locs,each_location_travel_distance,Max_Total_Travel_Distance,Starting_Location
0,47,2021-08-28,10,23,4,10,40,15428
1,43,2021-08-25,10,20,2,11,22,4121
2,27,2021-08-23,10,21,3,11,33,3304
3,46,2021-08-27,8,19,2,8,16,15428
4,28,2021-08-24,9,19,2,9,18,3304


In [63]:
distance_travelled = 0
revenue_earned = 0
for i in range(len(route_list)):
    print('Hour:',route_list[i]['HOUR'])
    print('Shift Hours:',shift_hours_list[i])
    print('Location ID:', route_list[i]['LOCATION_ID'])
    print('Total predicted value:', route_list[i]['Predicted'])
    print('Total initial value:', hour_value_list[i])
    print('Additional Value', route_list[i]['Predicted']- hour_value_list[i])
    print('Original Location: ', Initial_locations[i])
    print()
    revenue_earned += route_list[i]['Predicted']
for i in range(len(route_list)-1):
    distance_travelled += find_distance(route_list[i],route_list[i+1])

print('Maximum possible distance travelled throughout all the shifts for truck id',Truck_ID,": ",max_total_travel_distance,'km')
print('Total distance travelled: ',round(distance_travelled,2),'km')
print('Dollars earned by km travelled: $',round(revenue_earned/distance_travelled,2),'/km')

Hour: 9.0
Shift Hours: 5
Location ID: 15170.0
Total predicted value: 325379.90625
Total initial value: 19460.0
Additional Value 305919.90625
Original Location:  14938

Hour: 14.0
Shift Hours: 5
Location ID: 15170.0
Total predicted value: 77251.3671875
Total initial value: 20871.73076923077
Additional Value 56379.636418269234
Original Location:  14958

Maximum possible distance travelled throughout all the shifts for truck id 28 :  40 km
Total distance travelled:  0.0 km
Dollars earned by km travelled: $ inf /km


  print('Dollars earned by km travelled: $',round(revenue_earned/distance_travelled,2),'/km')


## Other Stuff

In [64]:
session.use_schema("RAW_POS")

In [65]:
truck_ids = truck_df['Truck_ID'].unique()
# Create an empty DataFrame to store the results
result_df = pd.DataFrame()

truck_ids_formatted = ["'{}'".format(str(truck_id)) for truck_id in truck_ids]

# Get the truck information
query = "SELECT * FROM TRUCK WHERE TRUCK_ID IN ({})".format(",".join(truck_ids_formatted))
truck_info_rows = session.sql(query).collect()
truck_info_df = pd.DataFrame(truck_info_rows)
city = truck_info_df['PRIMARY_CITY'].iloc[0]

locations = truck_df['Starting_Location'].unique()

query = "SELECT * FROM LOCATION WHERE CITY = '{}'".format(city)
location_rows = session.sql(query).collect()
location_df = pd.DataFrame(location_rows)

# Merge location and truck data
city_locations = location_df.merge(df_unique_locations_lat_long, left_on='LOCATION_ID', right_on='Location ID', how='inner')
city_locations = city_locations[['LOCATION_ID', 'Latitude', 'Longitude']]
city_locations.rename(columns={"Latitude": "LAT"}, inplace=True)
city_locations.rename(columns={"Longitude": "LONG"}, inplace=True)

In [66]:
# loc_checker = city_locations.copy()
# loc_checker['DATE'] = date
# wdf=session.sql("Select * from ANALYTICS.WEATHER_DATA_API")
# wdf=wdf.withColumn("H",F.substring(wdf["TIME"], 12, 2).cast("integer"))
# wdf=wdf.withColumn("DATE",F.substring(wdf["TIME"], 0, 10))
# wdf=wdf.select("WEATHERCODE","LOCATION_ID","H","DATE" ).collect()
# wdf_df=pd.DataFrame(wdf)

In [67]:
# wdf_df.to_csv('wdf_data.csv', index=False)

In [68]:
wdf_df = pd.read_csv('wdf_data.csv')

In [69]:
wdf_df

Unnamed: 0,WEATHERCODE,LOCATION_ID,H,DATE
0,2,3304,0,2020-06-25
1,1,3304,1,2020-06-25
2,1,3304,2,2020-06-25
3,1,3304,3,2020-06-25
4,1,3304,4,2020-06-25
...,...,...,...,...
46629259,0,3766,19,2022-09-16
46629260,0,3766,20,2022-09-16
46629261,0,3766,21,2022-09-16
46629262,0,3766,22,2022-09-16


In [70]:
wdf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46629264 entries, 0 to 46629263
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   WEATHERCODE  int64 
 1   LOCATION_ID  int64 
 2   H            int64 
 3   DATE         object
dtypes: int64(3), object(1)
memory usage: 1.4+ GB


In [71]:
dates

[datetime.datetime(2021, 8, 23, 0, 0),
 datetime.datetime(2021, 8, 24, 0, 0),
 datetime.datetime(2021, 8, 25, 0, 0),
 datetime.datetime(2021, 8, 26, 0, 0),
 datetime.datetime(2021, 8, 27, 0, 0),
 datetime.datetime(2021, 8, 28, 0, 0)]

In [72]:
wdf_df['DATE'] = pd.to_datetime(wdf_df['DATE'])  # Convert 'DATE' column to datetime type
filtered_df = wdf_df[wdf_df['DATE'].isin(dates)]
filtered_df.head()

Unnamed: 0,WEATHERCODE,LOCATION_ID,H,DATE
10176,0,3304,0,2021-08-23
10177,0,3304,1,2021-08-23
10178,0,3304,2,2021-08-23
10179,0,3304,3,2021-08-23
10180,0,3304,4,2021-08-23


In [73]:
merge_df


Unnamed: 0,CITY_SEATTLE_ENCODED,CITY_DENVER_ENCODED,CITY_San Mateo_encoded,CITY_New York City_encoded,CITY_BOSTON_ENCODED,REGION_NY_ENCODED,REGION_MA_ENCODED,REGION_CO_ENCODED,REGION_WA_ENCODED,REGION_CA_ENCODED,...,MENU_TYPE_POUTINE_ENCODED,MENU_TYPE_ETHIOPIAN_ENCODED,MENU_TYPE_TACOS_ENCODED,MENU_TYPE_Ice Cream_encoded,MENU_TYPE_Hot Dogs_encoded,MENU_TYPE_CHINESE_ENCODED,MENU_TYPE_Grilled Cheese_encoded,MENU_TYPE_VEGETARIAN_ENCODED,MENU_TYPE_INDIAN_ENCODED,MENU_TYPE_RAMEN_ENCODED
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115145,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2115146,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2115147,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2115148,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [74]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2115150 entries, 0 to 2115149
Data columns (total 37 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   CITY_SEATTLE_ENCODED              int64         
 1   CITY_DENVER_ENCODED               int64         
 2   CITY_San Mateo_encoded            int64         
 3   CITY_New York City_encoded        int64         
 4   CITY_BOSTON_ENCODED               int64         
 5   REGION_NY_ENCODED                 int64         
 6   REGION_MA_ENCODED                 int64         
 7   REGION_CO_ENCODED                 int64         
 8   REGION_WA_ENCODED                 int64         
 9   REGION_CA_ENCODED                 int64         
 10  LAT                               float64       
 11  LONG                              float64       
 12  LOCATION_ID                       int64         
 13  HOUR                              int64         
 14  date              

In [75]:
truck_df

Unnamed: 0,Truck_ID,Date,Starting_Hour,Ending_Hour,Num_of_locs,each_location_travel_distance,Max_Total_Travel_Distance,Starting_Location
0,47,2021-08-28,10,23,4,10,40,15428
1,43,2021-08-25,10,20,2,11,22,4121
2,27,2021-08-23,10,21,3,11,33,3304
3,46,2021-08-27,8,19,2,8,16,15428
4,28,2021-08-24,9,19,2,9,18,3304
