In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window
from sklearn import preprocessing # https://github.com/Snowflake-Labs/snowpark-python-demos/tree/main/sp4py_utilities
from snowflake.snowpark.functions import col

import getpass
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#nj07294.ap-southeast-1

In [3]:
accountname = 'nj07294.ap-southeast-1'
#accountname = getpass.getpass() # ORGNAME-ACCOUNTNAME (separated by minus sign)

In [4]:
username = getpass.getpass()    # SNOWFLAKE-USERNAME

········


In [9]:
password = getpass.getpass()    # SNOWFLAKE-PASSWORD

········


In [10]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTES",

    "warehouse": "HOL_WH"
}

session = Session.builder.configs(connection_parameters).create()

# Filter

In [7]:
sdf = session.table('ANALYTICS.ORDERS_V')
df=sdf.filter(col("COUNTRY")=='United States')

In [8]:
df=sdf.filter(col("COUNTRY")=='United States')

In [9]:
df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"ORDER_ID"  |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"LINE_NUMBER"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"FRANCHISE_FLAG"  |"FRANCHISE_ID"  |"FRANCHISEE_FIRST_NAME"  |"FRANCHISEE_LAST_NAME"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"FIRST_NAME"  |"LAST_NAME"  |"E_MAIL"  |"PHONE_NUMBER"  |"CHILDREN_COUNT"  |"GENDER"  |"MARITAL_STATUS"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"   

### Extracting weather data

In [10]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [11]:
wdf.show()


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
wdf2 = wdf.to_pandas()


In [13]:
wdf2.head()

Unnamed: 0,POSTAL_CODE,COUNTRY,DATE_VALID_STD,DOY_STD,MIN_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_AIR_2M_F,MAX_TEMPERATURE_AIR_2M_F,MIN_TEMPERATURE_WETBULB_2M_F,AVG_TEMPERATURE_WETBULB_2M_F,MAX_TEMPERATURE_WETBULB_2M_F,...,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,MIN_CLOUD_COVER_TOT_PCT,AVG_CLOUD_COVER_TOT_PCT,MAX_CLOUD_COVER_TOT_PCT,MIN_RADIATION_SOLAR_TOTAL_WPM2,AVG_RADIATION_SOLAR_TOTAL_WPM2,MAX_RADIATION_SOLAR_TOTAL_WPM2,TOT_RADIATION_SOLAR_TOTAL_WPM2
0,102103,NG,2021-11-16,320,77.5,81.9,89.6,75.6,77.0,78.6,...,0.07,0.0,0.0,83,96,100,0.0,236.9,857.0,5686.0
1,110054,IN,2021-11-16,320,50.7,62.7,77.1,43.8,51.9,60.0,...,0.0,0.0,0.0,0,1,6,0.0,189.7,713.2,4552.6
2,2044,AU,2021-11-16,320,52.6,59.2,66.0,49.1,51.4,54.3,...,0.0,0.0,0.0,1,15,86,0.0,371.1,1084.2,8905.7
3,21745-690,BR,2021-11-16,320,63.7,74.8,87.9,61.1,67.1,72.3,...,0.0,0.0,0.0,2,55,100,0.0,350.7,1078.2,8415.6
4,60596,DE,2021-11-16,320,40.4,42.0,43.9,39.5,40.9,42.3,...,0.0,0.0,0.0,99,100,100,0.0,12.8,59.6,306.7


### Drop columns (like discount amount, etc)

In [14]:
df=df.drop(['ORDER_DISCOUNT_AMOUNT','ORDER_TAX_AMOUNT','ORDER_AMOUNT','PRICE','UNIT_PRICE','QUANTITY','MENU_ITEM_NAME','MENU_ITEM_ID','MARITAL_STATUS','GENDER','CHILDREN_COUNT','PHONE_NUMBER','E_MAIL','LAST_NAME','FIRST_NAME','CUSTOMER_ID','FRANCHISEE_FIRST_NAME','FRANCHISEE_LAST_NAME','FRANCHISE_ID','FRANCHISE_FLAG','LINE_NUMBER','ORDER_ID'])

### Transform order_ts to year, month, day, day of the week, hour, public holiday binary

In [15]:
df=df.withColumn("Month",F.month(df["ORDER_TS"]))
df=df.withColumn("DOW",F.dayofweek(df["ORDER_TS"]))
df=df.withColumn("Day",F.dayofmonth(df["ORDER_TS"]))
df=df.withColumn("Hour",F.hour(df["ORDER_TS"]))
df=df.withColumn("WOM", ((F.dayofmonth(F.col('ORDER_TS')) - 1) / 7 + 1).cast('integer'))

In [16]:
# Create public holiday column binary
public_holidays = [
    {'Month': 7, 'Day': 4, 'DOW': None, 'WOM': None},  # 4th of July
    {'Month': 12, 'Day': 24, 'DOW': None, 'WOM': None},  # Christmas Eve
    {'Month': 12, 'Day': 25, 'DOW': None, 'WOM': None},  # Christmas Day
    {'Month': 10, 'Day': None, 'DOW': 'Monday', 'WOM': 2},  # Columbus Day (second Monday in October)
    {'Month': 6, 'Day': 19, 'DOW': None, 'WOM': None},  # Juneteenth
    {'Month': 9, 'Day': None, 'DOW': 'Monday', 'WOM': 1},  # Labor Day (first Monday in September)
    {'Month': 1, 'Day': None, 'DOW': 'Monday', 'WOM': 3},  # Martin Luther King, Jr. Day (third Monday in January)
    {'Month': 5, 'Day': None, 'DOW': 'Monday', 'WOM': -1},  # Memorial Day (last Monday in May)
    {'Month': 1, 'Day': 1, 'DOW': None, 'WOM': None},  # New Year's Day
    {'Month': 12, 'Day': 31, 'DOW': None, 'WOM': None},  # New Year's Eve
    {'Month': 11, 'Day': None, 'DOW': 'Thursday', 'WOM': 4},  # Thanksgiving Day (fourth Thursday in November)
    {'Month': 11, 'Day': None, 'DOW': 'Wednesday', 'WOM': 4},  # Thanksgiving Eve (fourth Wednesday in November)
    {'Month': 2, 'Day': 14, 'DOW': None, 'WOM': None},  # Valentine's Day
    {'Month': 11, 'Day': 11, 'DOW': None, 'WOM': None},  # Veterans Day
    {'Month': 10, 'Day': 31, 'DOW': None, 'WOM': None},  # Halloween
    {'Month': 3, 'Day': 17, 'DOW': None, 'WOM': None},  # St. Patrick's Day
    {'Month': 11, 'Day': 25, 'DOW': 'Friday', 'WOM': None},  # Black Friday
    {'Month': 12, 'Day': 26, 'DOW': None, 'WOM': None},  # Boxing Day
]

In [17]:
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType
def is_public_holiday(month, day, dow, wom):
    for holiday in public_holidays:
        if holiday['Month'] == month and holiday['DOW'] == dow and holiday['WOM'] == wom:
            if holiday['Day'] is None:
                return True
            elif holiday['Day'] == day:
                return True
    return False

session.sql("USE SCHEMA RAW_POS").collect()
@udf(session=session, name='public_holiday', input_types=[IntegerType(), IntegerType(), IntegerType(), IntegerType()], return_type=IntegerType(), is_permanent=False, replace=True)
def public_holiday(month: int, day: int, dow: int, wom: int) -> int:
    if is_public_holiday(month, day, dow, wom):
        return 1
    else:
        return 0

df=df.withColumn('PUBLIC_HOLIDAY', public_holiday(F.month(F.col('DATE')), F.dayofmonth(F.col('DATE')), F.dayofweek(F.col('DATE')), ((F.col('Day') - 1) / 7 + 1).cast('integer')))

## Grouping

In [18]:
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"LOCATION_ID"  |"ORDER_TOTAL"  |"MONTH"  |"DOW"  |"DAY"  |"HOUR"  |"WOM"  |"PUBLIC_HOLIDAY"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-05-07  |17          |2022-05-07 19:03:18  |883147691          |Smoky BBQ           |BBQ          |Denver          |Colorado  |United States  |5112           |63.0000        |5        |6      |7      |19      |2      |0                 |
|2022-05-07  |17          |2

In [19]:
grouped_df = df.groupBy("DATE","TRUCK_ID","MONTH","HOUR","DOW","DAY","Menu_Type","LOCATION_ID","PUBLIC_HOLIDAY").agg(F.sum("ORDER_TOTAL"))

# Joining with weather data

In [20]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [21]:
wdf=wea_session.sql("select * From STANDARD_TILE.HISTORY_DAY")

In [22]:
print(type(wdf))
#print(type(semi_final_df))

<class 'snowflake.snowpark.dataframe.DataFrame'>


In [23]:
grouped_df.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"     |"LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |
---------------------------------------------------------------------------------------------------------------------------------------
|2022-05-07  |18          |5        |22      |6      |7      |Tacos           |1861           |0                 |2336.5000           |
|2022-05-07  |18          |5        |18      |6      |7      |Tacos           |1861           |0                 |10424.5000          |
|2022-05-07  |20          |5        |8       |6      |7      |Ramen           |14812          |0                 |10043.2500          |
|2022-05-07  |21          |5        |10      |6      |7      |Grilled Cheese  |14792          |0                 |1462.0000           |
|2022-05-07  |21          |5        |13      |6 

In [24]:
wdf_re=wdf.with_column_renamed(col("DATE_VALID_STD"), "DATE")

In [25]:
sdf_loc = session.table('RAW_POS.Location_New')

In [26]:
sdf_loc_dr=sdf_loc.drop("PLACEKEY","ISO_COUNTRY_CODE")

In [27]:
sdf_loc_dr.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------
|"LOCATION_ID"  |"LOCATION"                                        |"CITY"  |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"               |"ZIPCODE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------
|1030           |University Of Colorado Museum Of Natural History  |Denver  |CO        |United States  |40.00768674326263   |-105.26970066129032  |80802      |
|1031           |Denver Technological Center                       |Denver  |CO        |United States  |39.62735682905758   |-104.91269066527825  |80237      |
|1032           |Heritage Club At Denver Tech Center               |Denver  |CO        |United States  |39.62526657316754   |-104.91238594440485  |80237      |
|1033           |Porter Wound Care Cente

In [28]:
semi_final_df=grouped_df.join(sdf_loc_dr, grouped_df["LOCATION_ID"] == sdf_loc_dr["LOCATION_ID"])

In [29]:
semi_final_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"l_3epo_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_fvrn_LOCATION_ID"  |"LOCATION"               |"CITY"         |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"              |"ZIPCODE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-10-31  |68          |10       |18      |1      |31     |Crepes       |3161                  |0                 |16919.0000          |316

In [30]:
# # # # weather=wdf_re.to_pandas()
# weather=wdf_re.to_pandas()
# syntax=pd.io.sql.get_schema(weather, "weather_data")
# session.use_schema("ANALYTICS")
# session.sql(
    
#     syntax
# ).collect()
# session.write_pandas(
#     df=weather,
#     table_name="weather_data",
#     database="frostbyte_tasty_bytes",
#     schema="ANALYTICS",
#     quote_identifiers=False,
#     overwrite=True)

In [31]:
semi_final_df=semi_final_df.with_column_renamed(col("ZIPCODE"), "POSTAL_CODE")

In [32]:
type(wdf_re["DATE"])

snowflake.snowpark.column.Column

In [33]:
wdf = session.table('RAW_POS.weather_data')
wdf=wdf.select("POSTAL_CODE","DATE","COUNTRY","AVG_TEMPERATURE_AIR_2M_F","AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT")


In [34]:
final_df = semi_final_df.join(wdf, (semi_final_df["DATE"] == wdf["DATE"]) & (semi_final_df["POSTAL_CODE"] == wdf["POSTAL_CODE"])  )

In [35]:
session.use_schema("ANALYTICS")
final_df.write.save_as_table(table_name="sales_prediction", mode='overwrite')

Failed to execute query [queryID: 01ac528f-3200-bc30-0003-d8760006b0be]  CREATE  OR  REPLACE    TABLE  sales_prediction AS  SELECT  *  FROM ( SELECT  *  FROM (( SELECT "DATE" AS "l_lsl4_DATE", "TRUCK_ID" AS "TRUCK_ID", "MONTH" AS "MONTH", "HOUR" AS "HOUR", "DOW" AS "DOW", "DAY" AS "DAY", "MENU_TYPE" AS "MENU_TYPE", "l_3epo_LOCATION_ID" AS "l_3epo_LOCATION_ID", "PUBLIC_HOLIDAY" AS "PUBLIC_HOLIDAY", "SUM(ORDER_TOTAL)" AS "SUM(ORDER_TOTAL)", "r_fvrn_LOCATION_ID" AS "r_fvrn_LOCATION_ID", "LOCATION" AS "LOCATION", "CITY" AS "CITY", "REGION" AS "REGION", "COUNTRY" AS "l_lsl4_COUNTRY", "LAT" AS "LAT", "LONG" AS "LONG", "POSTAL_CODE" AS "l_lsl4_POSTAL_CODE" FROM ( SELECT "DATE", "TRUCK_ID", "MONTH", "HOUR", "DOW", "DAY", "MENU_TYPE", "l_3epo_LOCATION_ID", "PUBLIC_HOLIDAY", "SUM(ORDER_TOTAL)", "r_fvrn_LOCATION_ID", "LOCATION", "CITY", "REGION", "COUNTRY", "LAT", "LONG", "ZIPCODE" AS "POSTAL_CODE" FROM ( SELECT  *  FROM (( SELECT "DATE" AS "DATE", "TRUCK_ID" AS "TRUCK_ID", "MONTH" AS "MONTH", "H

SnowparkSQLException: (1304): 002140 (42601): SQL compilation error:
Unknown function PUBLIC_HOLIDAY

## Encoding

In [65]:
to_encode_df = session.table('ANALYTICS.SALES_PREDICTION')

In [66]:
to_encode_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"l_08qh_DATE"  |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"   |"l_jho6_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_sim4_LOCATION_ID"  |"LOCATION"       |"CITY"   |"REGION"  |"l_08qh_COUNTRY"  |"LAT"              |"LONG"               |"l_08qh_POSTAL_CODE"  |"r_c08p_POSTAL_CODE"  |"r_c08p_DATE"  |"r_c08p_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_

In [72]:
columns_to_drop = ['"r_sim4_LOCATION_ID"','"l_08qh_COUNTRY"','"l_08qh_POSTAL_CODE"','"r_c08p_DATE"','"r_c08p_COUNTRY"']
new_columns  = [c for c in to_encode_df.columns if c not in columns_to_drop]
dropped_df = to_encode_df.select(*new_columns)

In [91]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTE",

    "warehouse": "HOL_WH"
}

nt_session = Session.builder.configs(connection_parameters).create()
sales = nt_session.sql("Select * from ANALYTICS.DEMAND_FORECAST_TRAINING_Base")

In [92]:
sales.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [93]:
merge_df=sales.select( "DATE","LOCATION_ID" , "MENU_TYPE","DAY_OF_WEEK_AVG_CITY_MENU_TYPE","PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE")

In [94]:
mdf=merge_df.to_pandas()

In [95]:
syntax=pd.io.sql.get_schema(mdf, "DEMAND_FORECAST_TRAINING_Base")
session.use_schema("ANALYTICS")
session.sql(

    syntax
).collect()
session.write_pandas(
    df=mdf,
    table_name="DEMAND_FORECAST_TRAINING_Base",
    database="frostbyte_tasty_bytes",
    schema="ANALYTICS",
    quote_identifiers=False,
    overwrite=True)

<snowflake.snowpark.table.Table at 0x2b880568880>

In [96]:
mdf=session.sql("Select * from ANALYTICS.DEMAND_FORECAST_TRAINING_BASE")

In [120]:
mdf.where(col("PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE").isNull()).count()

1888021

In [124]:
mdf.count()

3403151

In [None]:
mdfcol(k).isNull()).count()

In [97]:
mdf.show()

-------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"LOCATION_ID"  |"MENU_TYPE"   |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
-------------------------------------------------------------------------------------------------------------------------
|2021-01-04  |7365           |Ethiopian     |NULL                              |NULL                                    |
|2021-03-29  |9803           |Mac & Cheese  |1971.76056                        |NULL                                    |
|2021-02-19  |9078           |Tacos         |3061.29123                        |799330.5                                |
|2021-11-26  |12513          |Indian        |6702.0201                         |1196902.0                               |
|2021-03-20  |11007          |Crepes        |2897.72693                        |417552.0                                |
|2022-06-20  |3099      

In [74]:
dropped_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"l_08qh_DATE"  |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"l_jho6_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"LOCATION"          |"CITY"         |"REGION"  |"LAT"              |"LONG"               |"r_c08p_POSTAL_CODE"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [98]:
merge_df=dropped_df.join(mdf, (dropped_df['"l_08qh_DATE"'] == mdf["DATE"])& (dropped_df['"MENU_TYPE"'] == mdf["MENU_TYPE"])& (dropped_df['"l_jho6_LOCATION_ID"'] == mdf["Location_ID"]),"inner").drop('"l_08qh_DATE"').drop('"Date"').drop('"l_jho6_LOCATION_ID"')

In [99]:
merge_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"l_itt4_MENU_TYPE"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"LOCATION"          |"CITY"   |"REGION"  |"LAT"              |"LONG"               |"r_c08p_POSTAL_CODE"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |"DATE"      |"LOCATION_ID"  |"r_khjn_MENU_TYPE"  |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"

In [102]:
merge_df = merge_df.withColumnRenamed('"r_khjn_MENU_TYPE"','Menu_Type').withColumnRenamed('"r_c08p_POSTAL_CODE"','POSTAL_CODE')

In [105]:
merge_df=merge_df.drop('"l_itt4_MENU_TYPE"')

In [106]:
merge_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"LOCATION"          |"CITY"   |"REGION"  |"LAT"              |"LONG"               |"POSTAL_CODE"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |"DATE"      |"LOCATION_ID"  |"MENU_TYPE"  |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
------------------------------------------------------------------

## Null Encoding

In [132]:
nullColumns = []
numRows = merge_df.count()
for k in merge_df.columns:
    
    nullRows = merge_df.where(col(k).isNull()).count()
    columns=[k,nullRows]
    if nullRows>0:
        columns.append(merge_df.stat.approxQuantile(k,[0.5]))
    nullColumns.append(columns)

nullColumns
# ['D']

[['TRUCK_ID', 0],
 ['MONTH', 0],
 ['HOUR', 0],
 ['DOW', 0],
 ['DAY', 0],
 ['PUBLIC_HOLIDAY', 0],
 ['"SUM(ORDER_TOTAL)"', 0],
 ['LOCATION', 0],
 ['CITY', 0],
 ['REGION', 0],
 ['LAT', 0],
 ['LONG', 0],
 ['POSTAL_CODE', 0],
 ['AVG_TEMPERATURE_AIR_2M_F', 0],
 ['AVG_TEMPERATURE_HEATINDEX_2M_F', 0],
 ['TOT_PRECIPITATION_IN', 0],
 ['TOT_SNOWFALL_IN', 0],
 ['TOT_SNOWDEPTH_IN', 0],
 ['AVG_CLOUD_COVER_TOT_PCT', 0],
 ['DATE', 0],
 ['LOCATION_ID', 0],
 ['MENU_TYPE', 0],
 ['DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 186, [1683.1472034615383]],
 ['PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 10921, [292095.2258064516]]]

In [135]:
dropped_df=merge_df.na.fill({'DAY_OF_WEEK_AVG_CITY_MENU_TYPE': 1683.1472034615383, 'PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE': 292095.2258064516})

In [138]:
nullColumns = []
numRows = dropped_df.count()
for k in dropped_df.columns:
    
    nullRows = dropped_df.where(col(k).isNull()).count()
    columns=[k,nullRows]
    if nullRows>0:
        columns.append(dropped_df.stat.approxQuantile(k,[0.5]))
    nullColumns.append(columns)

nullColumns
# ['D']

[['TRUCK_ID', 0],
 ['MONTH', 0],
 ['HOUR', 0],
 ['DOW', 0],
 ['DAY', 0],
 ['PUBLIC_HOLIDAY', 0],
 ['"SUM(ORDER_TOTAL)"', 0],
 ['LOCATION', 0],
 ['CITY', 0],
 ['REGION', 0],
 ['LAT', 0],
 ['LONG', 0],
 ['POSTAL_CODE', 0],
 ['AVG_TEMPERATURE_AIR_2M_F', 0],
 ['AVG_TEMPERATURE_HEATINDEX_2M_F', 0],
 ['TOT_PRECIPITATION_IN', 0],
 ['TOT_SNOWFALL_IN', 0],
 ['TOT_SNOWDEPTH_IN', 0],
 ['AVG_CLOUD_COVER_TOT_PCT', 0],
 ['DATE', 0],
 ['LOCATION_ID', 0],
 ['MENU_TYPE', 0],
 ['DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 0],
 ['PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 0]]

In [119]:
merge_df.count()

23566


### Encoding

In [139]:


dropped_df.show()

columns = dropped_df.columns

# Iterate over the columns and count the number of unique values
unique_counts = {}
for column in columns:
    unique_counts[column] = dropped_df.select(column).distinct().count()

# Print the unique counts for each column
for column, count in unique_counts.items():
    print(f"Column '{column}': {count}")

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"LOCATION"          |"CITY"   |"REGION"  |"LAT"              |"LONG"               |"POSTAL_CODE"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |"DATE"      |"LOCATION_ID"  |"MENU_TYPE"  |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
------------------------------------------------------------------

In [140]:
categoricalColumns = ['MENU_TYPE', 'LOCATION', 'CITY', 'REGION']

In [141]:
def one_hot_encode_columns(df, column_names):
    encoded_df = df
    
    for column_name in column_names:
        unique_values = [row[column_name] for row in df.select(column_name).distinct().collect()]
        
        # Create a new column for each unique value and perform one-hot encoding
        for value in unique_values:
            encoded_column_name = f"{column_name}_{value}_encoded"
            encoded_df = encoded_df.withColumn(encoded_column_name, F.when(F.col(column_name) == value, 1).otherwise(0))
    
    
    return encoded_df

In [142]:
encoded_df = one_hot_encode_columns(dropped_df, categoricalColumns)

In [143]:
final_df = encoded_df.drop(*categoricalColumns)

In [144]:
final_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [152]:
X_final=final_df[['TRUCK_ID',"MONTH",'HOUR','DOW','DAY','LOCATION_ID','LAT','LONG','AVG_TEMPERATURE_AIR_2M_F',"AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT","DAY_OF_WEEK_AVG_CITY_MENU_TYPE","PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE" ]]

In [153]:
X_final.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"LOCATION_ID"  |"LAT"              |"LONG"               |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [154]:
X_final=X_final.to_pandas()

In [155]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
scaler=StandardScaler()
scaler.fit(X_final)
X_final_scaled = scaler.transform(X_final)

In [156]:
X_final_scaled=pd.DataFrame(X_final_scaled,columns=X_final.columns)

In [157]:
X_final_scaled.head()

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,LOCATION_ID,LAT,LONG,AVG_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_HEATINDEX_2M_F,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,AVG_CLOUD_COVER_TOT_PCT,DAY_OF_WEEK_AVG_CITY_MENU_TYPE,PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE
0,-0.71705,0.072176,0.513137,0.452845,0.694936,-0.699511,1.234283,-1.055315,0.556419,0.54236,-0.43925,-0.111192,-0.199874,-0.876193,-0.861753,-0.355884
1,-0.71705,0.072176,0.983035,0.452845,0.694936,-0.699511,1.234283,-1.055315,0.556419,0.54236,-0.43925,-0.111192,-0.199874,-0.876193,-0.861753,-0.355884
2,-0.71705,0.072176,0.748086,0.452845,0.694936,-0.699511,1.234283,-1.055315,0.556419,0.54236,-0.43925,-0.111192,-0.199874,-0.876193,-0.861753,-0.355884
3,-0.71705,0.072176,1.687881,0.452845,0.694936,-0.699511,1.234283,-1.055315,0.556419,0.54236,-0.43925,-0.111192,-0.199874,-0.876193,-0.861753,-0.355884
4,-0.71705,0.072176,0.278188,0.452845,0.694936,-0.699511,1.234283,-1.055315,0.556419,0.54236,-0.43925,-0.111192,-0.199874,-0.876193,-0.861753,-0.355884


In [158]:
session.use_schema("ANALYTICS")
# final_df.write.save_as_table(table_name="Encoded_Data", mode='overwrite')