In [71]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window
from sklearn import preprocessing # https://github.com/Snowflake-Labs/snowpark-python-demos/tree/main/sp4py_utilities
from snowflake.snowpark.functions import col

import getpass
import pandas as pd
import matplotlib.pyplot as plt

In [72]:
#nj07294.ap-southeast-1

In [73]:
accountname = 'nj07294.ap-southeast-1'
#accountname = getpass.getpass() # ORGNAME-ACCOUNTNAME (separated by minus sign)

In [74]:
username = getpass.getpass()    # SNOWFLAKE-USERNAME

In [75]:
password = getpass.getpass()    # SNOWFLAKE-PASSWORD

In [76]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTES",

    "warehouse": "HOL_WH"
}

session = Session.builder.configs(connection_parameters).create()

# Filter

In [77]:
sdf = session.table('ANALYTICS.ORDERS_V')
df=sdf.filter(col("COUNTRY")=='United States')

In [78]:
df=sdf.filter(col("COUNTRY")=='United States')

In [79]:
df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"ORDER_ID"  |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"LINE_NUMBER"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"FRANCHISE_FLAG"  |"FRANCHISE_ID"  |"FRANCHISEE_FIRST_NAME"  |"FRANCHISEE_LAST_NAME"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"FIRST_NAME"  |"LAST_NAME"  |"E_MAIL"  |"PHONE_NUMBER"  |"CHILDREN_COUNT"  |"GENDER"  |"MARITAL_STATUS"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"   

### Extracting weather data

In [80]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [81]:
wdf.show()


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [82]:
wdf2 = wdf.to_pandas()


In [83]:
wdf2.head()

Unnamed: 0,POSTAL_CODE,COUNTRY,DATE_VALID_STD,DOY_STD,MIN_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_AIR_2M_F,MAX_TEMPERATURE_AIR_2M_F,MIN_TEMPERATURE_WETBULB_2M_F,AVG_TEMPERATURE_WETBULB_2M_F,MAX_TEMPERATURE_WETBULB_2M_F,...,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,MIN_CLOUD_COVER_TOT_PCT,AVG_CLOUD_COVER_TOT_PCT,MAX_CLOUD_COVER_TOT_PCT,MIN_RADIATION_SOLAR_TOTAL_WPM2,AVG_RADIATION_SOLAR_TOTAL_WPM2,MAX_RADIATION_SOLAR_TOTAL_WPM2,TOT_RADIATION_SOLAR_TOTAL_WPM2
0,102103,NG,2021-11-16,320,77.5,81.9,89.6,75.6,77.0,78.6,...,0.07,0.0,0.0,83,96,100,0.0,236.9,857.0,5686.0
1,110054,IN,2021-11-16,320,50.7,62.7,77.1,43.8,51.9,60.0,...,0.0,0.0,0.0,0,1,6,0.0,189.7,713.2,4552.6
2,2044,AU,2021-11-16,320,52.6,59.2,66.0,49.1,51.4,54.3,...,0.0,0.0,0.0,1,15,86,0.0,371.1,1084.2,8905.7
3,21745-690,BR,2021-11-16,320,63.7,74.8,87.9,61.1,67.1,72.3,...,0.0,0.0,0.0,2,55,100,0.0,350.7,1078.2,8415.6
4,60596,DE,2021-11-16,320,40.4,42.0,43.9,39.5,40.9,42.3,...,0.0,0.0,0.0,99,100,100,0.0,12.8,59.6,306.7


### Drop columns (like discount amount, etc)

In [84]:
df=df.drop(['ORDER_DISCOUNT_AMOUNT','ORDER_TAX_AMOUNT','ORDER_AMOUNT','PRICE','UNIT_PRICE','QUANTITY','MENU_ITEM_NAME','MENU_ITEM_ID','MARITAL_STATUS','GENDER','CHILDREN_COUNT','PHONE_NUMBER','E_MAIL','LAST_NAME','FIRST_NAME','CUSTOMER_ID','FRANCHISEE_FIRST_NAME','FRANCHISEE_LAST_NAME','FRANCHISE_ID','FRANCHISE_FLAG','LINE_NUMBER','ORDER_ID'])

### Transform order_ts to year, month, day, day of the week, hour, public holiday binary

In [85]:
df=df.withColumn("Month",F.month(df["ORDER_TS"]))
df=df.withColumn("DOW",F.dayofweek(df["ORDER_TS"]))
df=df.withColumn("Day",F.dayofmonth(df["ORDER_TS"]))
df=df.withColumn("Hour",F.hour(df["ORDER_TS"]))
df=df.withColumn("WOM", ((F.dayofmonth(F.col('ORDER_TS')) - 1) / 7 + 1).cast('integer'))

In [86]:
# Create public holiday column binary
public_holidays = [
    {'Month': 7, 'Day': 4, 'DOW': None, 'WOM': None},  # 4th of July
    {'Month': 12, 'Day': 24, 'DOW': None, 'WOM': None},  # Christmas Eve
    {'Month': 12, 'Day': 25, 'DOW': None, 'WOM': None},  # Christmas Day
    {'Month': 10, 'Day': None, 'DOW': '1', 'WOM': 2},  # Columbus Day (second Monday in October)
    {'Month': 6, 'Day': 19, 'DOW': None, 'WOM': None},  # Juneteenth
    {'Month': 9, 'Day': None, 'DOW': '1', 'WOM': 1},  # Labor Day (first Monday in September)
    {'Month': 1, 'Day': None, 'DOW': '1', 'WOM': 3},  # Martin Luther King, Jr. Day (third Monday in January)
    {'Month': 5, 'Day': None, 'DOW': '1', 'WOM': -1},  # Memorial Day (last Monday in May)
    {'Month': 1, 'Day': 1, 'DOW': None, 'WOM': None},  # New Year's Day
    {'Month': 12, 'Day': 31, 'DOW': None, 'WOM': None},  # New Year's Eve
    {'Month': 11, 'Day': None, 'DOW': '4', 'WOM': 4},  # Thanksgiving Day (fourth Thursday in November)
    {'Month': 11, 'Day': None, 'DOW': '3', 'WOM': 4},  # Thanksgiving Eve (fourth Wednesday in November)
    {'Month': 2, 'Day': 14, 'DOW': None, 'WOM': None},  # Valentine's Day
    {'Month': 11, 'Day': 11, 'DOW': None, 'WOM': None},  # Veterans Day
    {'Month': 10, 'Day': 31, 'DOW': None, 'WOM': None},  # Halloween
    {'Month': 3, 'Day': 17, 'DOW': None, 'WOM': None},  # St. Patrick's Day
    {'Month': 11, 'Day': 25, 'DOW': '5', 'WOM': None},  # Black Friday
    {'Month': 12, 'Day': 26, 'DOW': None, 'WOM': None},  # Boxing Day
]

In [87]:
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType
'''def is_public_holiday(month, day, dow, wom):
    for holiday in public_holidays:
        if holiday['Month'] == month and holiday['DOW'] == dow and holiday['WOM'] == wom:
            if holiday['Day'] is None:
                return True
            elif holiday['Day'] == day:
                return True
    return False

session.sql("USE SCHEMA RAW_POS").collect()
@udf(session=session, name='public_holiday', input_types=[IntegerType(), IntegerType(), IntegerType(), IntegerType()], return_type=IntegerType(), is_permanent=False, replace=True)
def public_holiday(month: int, day: int, dow: int, wom: int) -> int:
    if is_public_holiday(month, day, dow, wom):
        return 1
    else:
        return 0

df=df.withColumn('PUBLIC_HOLIDAY', public_holiday(F.month(F.col('DATE')), F.dayofmonth(F.col('DATE')), F.dayofweek(F.col('DATE')), ((F.col('Day') - 1) / 7 + 1).cast('integer')))'''

from snowflake.snowpark.functions import col, when

# Initialize the 'PUBLIC_HOLIDAY' column to 0
df = df.withColumn('PUBLIC_HOLIDAY', F.lit(0))

# Iterate through the public_holidays list and update the 'PUBLIC_HOLIDAY' column
for h in public_holidays:
    df = df.withColumn(
        'PUBLIC_HOLIDAY',
        when(
            (F.month(F.col('DATE')) == h['Month']) &
            (F.dayofweek(F.col('DATE')) == h['DOW'] if h['DOW'] is not None else True) &
            (((F.col('Day') - 1) / 7 + 1).cast('integer') == h['WOM'] if h['WOM'] is not None else True) &
            (F.dayofmonth(F.col('DATE')) == h['Day'] if h['Day'] is not None else True),
            1
        ).otherwise(F.col('PUBLIC_HOLIDAY')))

## Grouping

In [88]:
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"LOCATION_ID"  |"ORDER_TOTAL"  |"MONTH"  |"DOW"  |"DAY"  |"HOUR"  |"WOM"  |"PUBLIC_HOLIDAY"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-05-07  |17          |2022-05-07 19:16:49  |883147729          |Smoky BBQ           |BBQ          |Denver          |Colorado  |United States  |5112           |46.0000        |5        |6      |7      |19      |2      |0                 |
|2022-05-07  |17          |2

In [89]:
grouped_df = df.groupBy("DATE","TRUCK_ID","MONTH","HOUR","DOW","DAY","Menu_Type","LOCATION_ID","PUBLIC_HOLIDAY").agg(F.sum("ORDER_TOTAL"))

# Joining with weather data

In [90]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [91]:
wdf=wea_session.sql("select * From STANDARD_TILE.HISTORY_DAY")

In [92]:
print(type(wdf))
#print(type(semi_final_df))

<class 'snowflake.snowpark.dataframe.DataFrame'>


In [93]:
grouped_df.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"     |"LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |
---------------------------------------------------------------------------------------------------------------------------------------
|2022-05-05  |65          |5        |10      |4      |5      |Ramen           |12058          |0                 |20461.5000          |
|2022-05-05  |65          |5        |22      |4      |5      |Ramen           |3280           |0                 |12095.7500          |
|2022-05-05  |66          |5        |11      |4      |5      |Grilled Cheese  |3245           |0                 |7303.0000           |
|2022-05-05  |66          |5        |16      |4      |5      |Grilled Cheese  |2173           |0                 |8848.0000           |
|2022-05-05  |67          |5        |12      |4 

In [94]:
wdf_re=wdf.with_column_renamed(col("DATE_VALID_STD"), "DATE")

In [95]:
sdf_loc = session.table('RAW_POS.Location_New')

In [96]:
sdf_loc_dr=sdf_loc.drop("PLACEKEY","ISO_COUNTRY_CODE")

In [97]:
sdf_loc_dr.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------
|"LOCATION_ID"  |"LOCATION"                                        |"CITY"  |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"               |"ZIPCODE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------
|1030           |University Of Colorado Museum Of Natural History  |Denver  |CO        |United States  |40.00768674326263   |-105.26970066129032  |80802      |
|1031           |Denver Technological Center                       |Denver  |CO        |United States  |39.62735682905758   |-104.91269066527825  |80237      |
|1032           |Heritage Club At Denver Tech Center               |Denver  |CO        |United States  |39.62526657316754   |-104.91238594440485  |80237      |
|1033           |Porter Wound Care Cente

In [98]:
semi_final_df=grouped_df.join(sdf_loc_dr, grouped_df["LOCATION_ID"] == sdf_loc_dr["LOCATION_ID"])

In [99]:
semi_final_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"   |"l_jho6_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_sim4_LOCATION_ID"  |"LOCATION"                                          |"CITY"         |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"              |"ZIPCODE"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-02-05  |62          |2        |21      |6      |5   

In [100]:
# # # # weather=wdf_re.to_pandas()
# weather=wdf_re.to_pandas()
# syntax=pd.io.sql.get_schema(weather, "weather_data")
# session.use_schema("ANALYTICS")
# session.sql(
    
#     syntax
# ).collect()
# session.write_pandas(
#     df=weather,
#     table_name="weather_data",
#     database="frostbyte_tasty_bytes",
#     schema="ANALYTICS",
#     quote_identifiers=False,
#     overwrite=True)

In [101]:
semi_final_df=semi_final_df.with_column_renamed(col("ZIPCODE"), "POSTAL_CODE")

In [102]:
type(wdf_re["DATE"])

snowflake.snowpark.column.Column

In [103]:
wdf = session.table('RAW_POS.weather_data')
wdf=wdf.select("POSTAL_CODE","DATE","COUNTRY","AVG_TEMPERATURE_AIR_2M_F","AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT")


In [104]:
final_df = semi_final_df.join(wdf, (semi_final_df["DATE"] == wdf["DATE"]) & (semi_final_df["POSTAL_CODE"] == wdf["POSTAL_CODE"])  )

In [105]:
session.use_schema("ANALYTICS")
final_df.write.save_as_table(table_name="sales_prediction", mode='overwrite')

## Encoding

In [106]:
to_encode_df = session.table('ANALYTICS.SALES_PREDICTION')

In [107]:
columns_to_drop = ['"l_nli8_DATE"', '"l_nli8_POSTAL_CODE"', '"r_srp1_DATE"','"r_kzjz_LOCATION_ID"','"r_srp1_COUNTRY"','"l_nli8_COUNTRY"']
new_columns  = [c for c in to_encode_df.columns if c not in columns_to_drop]
dropped_df = to_encode_df.select(*new_columns)

In [108]:
dropped_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"l_08qh_DATE"  |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"l_jho6_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_sim4_LOCATION_ID"  |"LOCATION"          |"CITY"         |"REGION"  |"l_08qh_COUNTRY"  |"LAT"              |"LONG"               |"l_08qh_POSTAL_CODE"  |"r_c08p_POSTAL_CODE"  |"r_c08p_DATE"  |"r_c08p_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"

In [109]:
columns = dropped_df.columns

# Iterate over the columns and count the number of unique values
unique_counts = {}
for column in columns:
    unique_counts[column] = dropped_df.select(column).distinct().count()

# Print the unique counts for each column
for column, count in unique_counts.items():
    print(f"Column '{column}': {count}")

Column '"l_08qh_DATE"': 339
Column 'TRUCK_ID': 45
Column 'MONTH': 12
Column 'HOUR': 15
Column 'DOW': 7
Column 'DAY': 31
Column 'MENU_TYPE': 15
Column '"l_jho6_LOCATION_ID"': 16
Column 'PUBLIC_HOLIDAY': 2
Column '"SUM(ORDER_TOTAL)"': 3261
Column '"r_sim4_LOCATION_ID"': 16
Column 'LOCATION': 16
Column 'CITY': 3
Column 'REGION': 3
Column '"l_08qh_COUNTRY"': 1
Column 'LAT': 14
Column 'LONG': 14
Column '"l_08qh_POSTAL_CODE"': 4
Column '"r_c08p_POSTAL_CODE"': 4
Column '"r_c08p_DATE"': 339
Column '"r_c08p_COUNTRY"': 1
Column 'AVG_TEMPERATURE_AIR_2M_F': 312
Column 'AVG_TEMPERATURE_HEATINDEX_2M_F': 320
Column 'TOT_PRECIPITATION_IN': 67
Column 'TOT_SNOWFALL_IN': 16
Column 'TOT_SNOWDEPTH_IN': 26
Column 'AVG_CLOUD_COVER_TOT_PCT': 100


In [110]:
categoricalColumns = ['MENU_TYPE', 'LOCATION', 'CITY', 'REGION']

In [111]:
def one_hot_encode_columns(df, column_names):
    encoded_df = df
    
    for column_name in column_names:
        unique_values = [row[column_name] for row in df.select(column_name).distinct().collect()]
        
        # Create a new column for each unique value and perform one-hot encoding
        for value in unique_values:
            encoded_column_name = f"{column_name}_{value}_encoded"
            encoded_df = encoded_df.withColumn(encoded_column_name, F.when(F.col(column_name) == value, 1).otherwise(0))
    
    
    return encoded_df

In [112]:
encoded_df = one_hot_encode_columns(dropped_df, categoricalColumns)

In [113]:
final_df = encoded_df.drop(*categoricalColumns)

In [115]:
final_df = final_df.withColumnRenamed('"l_jho6_LOCATION_ID"','LOCATION_ID').withColumnRenamed('"l_08qh_POSTAL_CODE"','POSTAL_CODE')

In [116]:
final_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [117]:
X_final=final_df[['TRUCK_ID',"MONTH",'HOUR','DOW','DAY','PUBLIC_HOLIDAY','LOCATION_ID','LAT','LONG','AVG_TEMPERATURE_AIR_2M_F',"AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT"]]

In [118]:
X_final.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"PUBLIC_HOLIDAY"  |"LOCATION_ID"  |"LAT"              |"LONG"              |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|17          |6        |15      |0      |19     |1                 |5133           |39.75236618378834  |-105.0103434372534  |77.6                   

In [119]:
X_final=X_final.to_pandas()

In [120]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
scaler=StandardScaler()
scaler.fit(X_final)
X_final_scaled = scaler.transform(X_final)

In [121]:
X_final_scaled=pd.DataFrame(X_final_scaled,columns=X_final.columns)
y_final=final_df[['SUM(ORDER_TOTAL)']]
y_final=y_final.to_pandas()
y_final=pd.DataFrame(y_final,columns=y_final.columns)

In [122]:
X_final_scaled.head()

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,LOCATION_ID,LAT,LONG,AVG_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_HEATINDEX_2M_F,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,AVG_CLOUD_COVER_TOT_PCT
0,-0.795529,1.066215,0.273262,-1.071905,1.151241,-0.186399,-0.125149,1.260263,-1.067305,-0.351587,-0.360703,0.871032,-0.115043,-0.208092,0.849439
1,-0.795529,1.066215,0.742809,-1.071905,1.151241,-0.186399,-0.125149,1.260263,-1.067305,-0.351587,-0.360703,0.871032,-0.115043,-0.208092,0.849439
2,-0.795529,1.066215,1.447129,-1.071905,1.151241,-0.186399,-0.125149,1.260263,-1.067305,-0.351587,-0.360703,0.871032,-0.115043,-0.208092,0.849439
3,-0.795529,1.066215,1.681903,-1.071905,1.151241,-0.186399,-0.125149,1.260263,-1.067305,-0.351587,-0.360703,0.871032,-0.115043,-0.208092,0.849439
4,-0.795529,1.066215,0.038489,-1.071905,1.151241,-0.186399,-0.125149,1.260263,-1.067305,-0.351587,-0.360703,0.871032,-0.115043,-0.208092,0.849439


In [123]:
y_final.head()

Unnamed: 0,SUM(ORDER_TOTAL)
0,1513.0
1,3337.0
2,5933.0
3,99.0
4,819.0


In [124]:
session.use_schema("ANALYTICS")
# final_df.write.save_as_table(table_name="Encoded_Data", mode='overwrite')

In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset into features (X) and target (y)
X = X_final_scaled
y = y_final

# Split the dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [126]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

In [127]:
from sklearn.metrics import mean_squared_error

# Make predictions on the testing dataset
y_pred = lgbm.predict(X_test)

# Calculate the RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

RMSE: 4740.652387316153
