In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window
from sklearn import preprocessing # https://github.com/Snowflake-Labs/snowpark-python-demos/tree/main/sp4py_utilities
from snowflake.snowpark.functions import col

import getpass
import pandas as pd
import matplotlib.pyplot as plt

In [91]:
#nj07294.ap-southeast-1

In [2]:
accountname = 'nj07294.ap-southeast-1'
#accountname = getpass.getpass() # ORGNAME-ACCOUNTNAME (separated by minus sign)

In [3]:
username = getpass.getpass()    # SNOWFLAKE-USERNAME

In [4]:
password = getpass.getpass()    # SNOWFLAKE-PASSWORD

In [5]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTES",

    "warehouse": "HOL_WH"
}

session = Session.builder.configs(connection_parameters).create()

# Filter

In [11]:
sdf = session.table('ANALYTICS.ORDERS_V')
df=sdf.filter(col("COUNTRY")=='United States')

In [97]:
df=sdf.filter(col("COUNTRY")=='United States')

In [98]:
df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"ORDER_ID"  |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"LINE_NUMBER"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"FRANCHISE_FLAG"  |"FRANCHISE_ID"  |"FRANCHISEE_FIRST_NAME"  |"FRANCHISEE_LAST_NAME"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"FIRST_NAME"  |"LAST_NAME"  |"E_MAIL"  |"PHONE_NUMBER"  |"CHILDREN_COUNT"  |"GENDER"  |"MARITAL_STATUS"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"   

### Extracting weather data

In [99]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [100]:
wdf.show()


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [101]:
wdf2 = wdf.to_pandas()


In [102]:
wdf2.head()

Unnamed: 0,POSTAL_CODE,COUNTRY,DATE_VALID_STD,DOY_STD,MIN_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_AIR_2M_F,MAX_TEMPERATURE_AIR_2M_F,MIN_TEMPERATURE_WETBULB_2M_F,AVG_TEMPERATURE_WETBULB_2M_F,MAX_TEMPERATURE_WETBULB_2M_F,...,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,MIN_CLOUD_COVER_TOT_PCT,AVG_CLOUD_COVER_TOT_PCT,MAX_CLOUD_COVER_TOT_PCT,MIN_RADIATION_SOLAR_TOTAL_WPM2,AVG_RADIATION_SOLAR_TOTAL_WPM2,MAX_RADIATION_SOLAR_TOTAL_WPM2,TOT_RADIATION_SOLAR_TOTAL_WPM2
0,71678,US,2021-06-24,175,72.0,82.7,92.8,70.0,75.4,78.4,...,0.0,0.0,0.0,0,22,85,0.0,288.4,911.3,6922.5
1,71701,US,2021-06-24,175,72.9,83.7,95.3,71.0,76.5,79.7,...,0.0,0.0,0.0,0,32,100,0.0,237.0,865.5,5686.7
2,71858,US,2021-06-24,175,73.4,83.8,93.6,72.1,76.2,78.6,...,0.0,0.0,0.0,0,23,97,0.0,274.1,874.3,6579.9
3,72085,US,2021-06-24,175,67.6,81.0,91.4,66.0,74.4,79.2,...,0.0,0.0,0.0,0,33,100,0.0,265.8,779.8,6379.2
4,72124,US,2021-06-24,175,71.8,82.7,92.3,69.2,75.3,79.2,...,0.0,0.0,0.0,0,37,100,0.0,252.4,873.3,6056.3


### Drop columns (like discount amount, etc)

In [103]:
df=df.drop(['ORDER_DISCOUNT_AMOUNT','ORDER_TAX_AMOUNT','ORDER_AMOUNT','PRICE','UNIT_PRICE','QUANTITY','MENU_ITEM_NAME','MENU_ITEM_ID','MARITAL_STATUS','GENDER','CHILDREN_COUNT','PHONE_NUMBER','E_MAIL','LAST_NAME','FIRST_NAME','CUSTOMER_ID','FRANCHISEE_FIRST_NAME','FRANCHISEE_LAST_NAME','FRANCHISE_ID','FRANCHISE_FLAG','LINE_NUMBER','ORDER_ID'])

### Transform order_ts to year, month, day, day of the week, hour, public holiday binary

In [104]:
df=df.withColumn("Month",F.month(df["ORDER_TS"]))
df=df.withColumn("DOW",F.dayofweek(df["ORDER_TS"]))
df=df.withColumn("Day",F.dayofmonth(df["ORDER_TS"]))
df=df.withColumn("Hour",F.hour(df["ORDER_TS"]))
df=df.withColumn("WOM", ((F.dayofmonth(F.col('ORDER_TS')) - 1) / 7 + 1).cast('integer'))

In [105]:
# Create public holiday column binary
public_holidays = [
    {'Month': 7, 'Day': 4, 'DOW': None, 'WOM': None},  # 4th of July
    {'Month': 12, 'Day': 24, 'DOW': None, 'WOM': None},  # Christmas Eve
    {'Month': 12, 'Day': 25, 'DOW': None, 'WOM': None},  # Christmas Day
    {'Month': 10, 'Day': None, 'DOW': '1', 'WOM': 2},  # Columbus Day (second Monday in October)
    {'Month': 6, 'Day': 19, 'DOW': None, 'WOM': None},  # Juneteenth
    {'Month': 9, 'Day': None, 'DOW': '1', 'WOM': 1},  # Labor Day (first Monday in September)
    {'Month': 1, 'Day': None, 'DOW': '1', 'WOM': 3},  # Martin Luther King, Jr. Day (third Monday in January)
    {'Month': 5, 'Day': None, 'DOW': '1', 'WOM': -1},  # Memorial Day (last Monday in May)
    {'Month': 1, 'Day': 1, 'DOW': None, 'WOM': None},  # New Year's Day
    {'Month': 12, 'Day': 31, 'DOW': None, 'WOM': None},  # New Year's Eve
    {'Month': 11, 'Day': None, 'DOW': '4', 'WOM': 4},  # Thanksgiving Day (fourth Thursday in November)
    {'Month': 11, 'Day': None, 'DOW': '3', 'WOM': 4},  # Thanksgiving Eve (fourth Wednesday in November)
    {'Month': 2, 'Day': 14, 'DOW': None, 'WOM': None},  # Valentine's Day
    {'Month': 11, 'Day': 11, 'DOW': None, 'WOM': None},  # Veterans Day
    {'Month': 10, 'Day': 31, 'DOW': None, 'WOM': None},  # Halloween
    {'Month': 3, 'Day': 17, 'DOW': None, 'WOM': None},  # St. Patrick's Day
    {'Month': 11, 'Day': 25, 'DOW': '5', 'WOM': None},  # Black Friday
    {'Month': 12, 'Day': 26, 'DOW': None, 'WOM': None},  # Boxing Day
]

In [106]:
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType
'''def is_public_holiday(month, day, dow, wom):
    for holiday in public_holidays:
        if holiday['Month'] == month and holiday['DOW'] == dow and holiday['WOM'] == wom:
            if holiday['Day'] is None:
                return True
            elif holiday['Day'] == day:
                return True
    return False

session.sql("USE SCHEMA RAW_POS").collect()
@udf(session=session, name='public_holiday', input_types=[IntegerType(), IntegerType(), IntegerType(), IntegerType()], return_type=IntegerType(), is_permanent=False, replace=True)
def public_holiday(month: int, day: int, dow: int, wom: int) -> int:
    if is_public_holiday(month, day, dow, wom):
        return 1
    else:
        return 0

df=df.withColumn('PUBLIC_HOLIDAY', public_holiday(F.month(F.col('DATE')), F.dayofmonth(F.col('DATE')), F.dayofweek(F.col('DATE')), ((F.col('Day') - 1) / 7 + 1).cast('integer')))'''

from snowflake.snowpark.functions import col, when

# Initialize the 'PUBLIC_HOLIDAY' column to 0
df = df.withColumn('PUBLIC_HOLIDAY', F.lit(0))

# Iterate through the public_holidays list and update the 'PUBLIC_HOLIDAY' column
for h in public_holidays:
    df = df.withColumn(
        'PUBLIC_HOLIDAY',
        when(
            (F.month(F.col('DATE')) == h['Month']) &
            (F.dayofweek(F.col('DATE')) == h['DOW'] if h['DOW'] is not None else True) &
            (((F.col('Day') - 1) / 7 + 1).cast('integer') == h['WOM'] if h['WOM'] is not None else True) &
            (F.dayofmonth(F.col('DATE')) == h['Day'] if h['Day'] is not None else True),
            1
        ).otherwise(F.col('PUBLIC_HOLIDAY')))

## Grouping

In [107]:
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"ORDER_TS"           |"ORDER_DETAIL_ID"  |"TRUCK_BRAND_NAME"  |"MENU_TYPE"  |"PRIMARY_CITY"  |"REGION"  |"COUNTRY"      |"LOCATION_ID"  |"ORDER_TOTAL"  |"MONTH"  |"DOW"  |"DAY"  |"HOUR"  |"WOM"  |"PUBLIC_HOLIDAY"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-05-07  |17          |2022-05-07 19:16:49  |883147729          |Smoky BBQ           |BBQ          |Denver          |Colorado  |United States  |5112           |46.0000        |5        |6      |7      |19      |2      |0                 |
|2022-05-07  |17          |2

In [108]:
grouped_df = df.groupBy("DATE","TRUCK_ID","MONTH","HOUR","DOW","DAY","Menu_Type","LOCATION_ID","PUBLIC_HOLIDAY").agg(F.sum("ORDER_TOTAL"))

# Joining with weather data

In [109]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI",

    "warehouse": "HOL_WH"
}

wea_session = Session.builder.configs(connection_parameters).create()
wdf = wea_session.table('STANDARD_TILE.HISTORY_DAY')

In [110]:
wdf=wea_session.sql("select * From STANDARD_TILE.HISTORY_DAY")

In [111]:
print(type(wdf))
#print(type(semi_final_df))

<class 'snowflake.snowpark.dataframe.DataFrame'>


In [112]:
grouped_df.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"     |"LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |
---------------------------------------------------------------------------------------------------------------------------------------
|2022-05-05  |65          |5        |10      |4      |5      |Ramen           |12058          |0                 |20461.5000          |
|2022-05-05  |65          |5        |22      |4      |5      |Ramen           |3280           |0                 |12095.7500          |
|2022-05-05  |66          |5        |11      |4      |5      |Grilled Cheese  |3245           |0                 |7303.0000           |
|2022-05-05  |66          |5        |16      |4      |5      |Grilled Cheese  |2173           |0                 |8848.0000           |
|2022-05-05  |67          |5        |12      |4 

In [113]:
wdf_re=wdf.with_column_renamed(col("DATE_VALID_STD"), "DATE")

In [114]:
sdf_loc = session.table('RAW_POS.Location_New')

In [115]:
sdf_loc_dr=sdf_loc.drop("PLACEKEY","ISO_COUNTRY_CODE")

In [116]:
sdf_loc_dr.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------
|"LOCATION_ID"  |"LOCATION"                                        |"CITY"  |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"               |"ZIPCODE"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------
|1030           |University Of Colorado Museum Of Natural History  |Denver  |CO        |United States  |40.00768674326263   |-105.26970066129032  |80802      |
|1031           |Denver Technological Center                       |Denver  |CO        |United States  |39.62735682905758   |-104.91269066527825  |80237      |
|1032           |Heritage Club At Denver Tech Center               |Denver  |CO        |United States  |39.62526657316754   |-104.91238594440485  |80237      |
|1033           |Porter Wound Care Cente

In [117]:
semi_final_df=grouped_df.join(sdf_loc_dr, grouped_df["LOCATION_ID"] == sdf_loc_dr["LOCATION_ID"])

In [118]:
semi_final_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"   |"l_hiu2_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"                   |"CITY"         |"REGION"  |"COUNTRY"      |"LAT"               |"LONG"              |"ZIPCODE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2022-02-05  |62          |2        |20      |6      |5      |BBQ           |4111                  |0                 |20145.00

In [119]:
# # # # weather=wdf_re.to_pandas()
# weather=wdf_re.to_pandas()
# syntax=pd.io.sql.get_schema(weather, "weather_data")
# session.use_schema("ANALYTICS")
# session.sql(
    
#     syntax
# ).collect()
# session.write_pandas(
#     df=weather,
#     table_name="weather_data",
#     database="frostbyte_tasty_bytes",
#     schema="ANALYTICS",
#     quote_identifiers=False,
#     overwrite=True)

In [120]:
semi_final_df=semi_final_df.with_column_renamed(col("ZIPCODE"), "POSTAL_CODE")

In [121]:
type(wdf_re["DATE"])

snowflake.snowpark.column.Column

In [122]:
wdf = session.table('RAW_POS.weather_data')
wdf=wdf.select("POSTAL_CODE","DATE","COUNTRY","AVG_TEMPERATURE_AIR_2M_F","AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT")


In [123]:
final_df = semi_final_df.join(wdf, (semi_final_df["DATE"] == wdf["DATE"]) & (semi_final_df["POSTAL_CODE"] == wdf["POSTAL_CODE"])  )

In [124]:
session.use_schema("ANALYTICS")
final_df.write.save_as_table(table_name="sales_prediction", mode='overwrite')

## Merging

In [125]:
to_encode_df = session.table('ANALYTICS.SALES_PREDICTION')

In [126]:
to_encode_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"l_g44h_DATE"  |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"l_hiu2_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"          |"CITY"         |"REGION"  |"l_g44h_COUNTRY"  |"LAT"              |"LONG"               |"l_g44h_POSTAL_CODE"  |"r_szdb_POSTAL_CODE"  |"r_szdb_DATE"  |"r_szdb_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"

In [127]:
columns_to_drop = ['"r_sim4_LOCATION_ID"','"l_08qh_COUNTRY"','"l_08qh_POSTAL_CODE"','"r_c08p_DATE"','"r_c08p_COUNTRY"']
new_columns  = [c for c in to_encode_df.columns if c not in columns_to_drop]
dropped_df = to_encode_df.select(*new_columns)

In [128]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTE",

    "warehouse": "HOL_WH"
}

nt_session = Session.builder.configs(connection_parameters).create()
sales = nt_session.sql("Select * from ANALYTICS.DEMAND_FORECAST_TRAINING_Base")

In [129]:
sales.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [130]:
merge_df=sales.select( "DATE","LOCATION_ID" , "MENU_TYPE","DAY_OF_WEEK_AVG_CITY_MENU_TYPE","PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE")

In [131]:
mdf=merge_df.to_pandas()

In [132]:
# syntax=pd.io.sql.get_schema(mdf, "DEMAND_FORECAST_TRAINING_Base")
# session.use_schema("ANALYTICS")
# session.sql(

#     syntax
# ).collect()
# session.write_pandas(
#     df=mdf,
#     table_name="DEMAND_FORECAST_TRAINING_Base",
#     database="frostbyte_tasty_bytes",
#     schema="ANALYTICS",
#     quote_identifiers=False,
#     overwrite=True)

In [133]:
mdf=session.sql("Select * from ANALYTICS.DEMAND_FORECAST_TRAINING_BASE")

In [134]:
mdf.where(col("PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE").isNull()).count()

1888021

In [135]:
mdf.count()

3403151

In [136]:
mdf.show()

-------------------------------------------------------------------------------------------------------------------------
|"DATE"      |"LOCATION_ID"  |"MENU_TYPE"   |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
-------------------------------------------------------------------------------------------------------------------------
|2021-01-04  |7365           |Ethiopian     |NULL                              |NULL                                    |
|2021-03-29  |9803           |Mac & Cheese  |1971.76056                        |NULL                                    |
|2021-02-19  |9078           |Tacos         |3061.29123                        |799330.5                                |
|2021-11-26  |12513          |Indian        |6702.0201                         |1196902.0                               |
|2021-03-20  |11007          |Crepes        |2897.72693                        |417552.0                                |
|2022-06-20  |3099      

In [137]:
dropped_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"l_g44h_DATE"  |"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"   |"l_hiu2_LOCATION_ID"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"       |"CITY"   |"REGION"  |"l_g44h_COUNTRY"  |"LAT"              |"LONG"               |"l_g44h_POSTAL_CODE"  |"r_szdb_POSTAL_CODE"  |"r_szdb_DATE"  |"r_szdb_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_

In [139]:
merge_df=dropped_df.join(mdf, (dropped_df['"l_g44h_DATE"'] == mdf["DATE"])& (dropped_df['"MENU_TYPE"'] == mdf["MENU_TYPE"])& (dropped_df['"l_hiu2_LOCATION_ID"'] == mdf["Location_ID"]),"inner").drop('"l_g44h_DATE"').drop('"Date"').drop('"l_hiu2_LOCATION_ID"')

In [140]:
merge_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"l_xdrx_MENU_TYPE"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"          |"CITY"   |"REGION"  |"l_g44h_COUNTRY"  |"LAT"              |"LONG"               |"l_g44h_POSTAL_CODE"  |"r_szdb_POSTAL_CODE"  |"r_szdb_DATE"  |"r_szdb_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATIO

In [141]:
merge_df = merge_df.withColumnRenamed('"l_xdrx_MENU_TYPE"','Menu_Type').withColumnRenamed('"l_g44h_POSTAL_CODE"','POSTAL_CODE')

In [142]:
merge_df=merge_df.drop('"l_xdrx_MENU_TYPE"')

In [143]:
merge_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"          |"CITY"   |"REGION"  |"l_g44h_COUNTRY"  |"LAT"              |"LONG"               |"POSTAL_CODE"  |"r_szdb_POSTAL_CODE"  |"r_szdb_DATE"  |"r_szdb_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |

## Null Encoding

In [144]:
nullColumns = []
numRows = merge_df.count()
for k in merge_df.columns:
    
    nullRows = merge_df.where(col(k).isNull()).count()
    columns=[k,nullRows]
    if nullRows>0:
        columns.append(merge_df.stat.approxQuantile(k,[0.5]))
    nullColumns.append(columns)

nullColumns
# ['D']

[['TRUCK_ID', 0],
 ['MONTH', 0],
 ['HOUR', 0],
 ['DOW', 0],
 ['DAY', 0],
 ['MENU_TYPE', 0],
 ['PUBLIC_HOLIDAY', 0],
 ['"SUM(ORDER_TOTAL)"', 0],
 ['"r_kjds_LOCATION_ID"', 0],
 ['LOCATION', 0],
 ['CITY', 0],
 ['REGION', 0],
 ['"l_g44h_COUNTRY"', 0],
 ['LAT', 0],
 ['LONG', 0],
 ['POSTAL_CODE', 0],
 ['"r_szdb_POSTAL_CODE"', 0],
 ['"r_szdb_DATE"', 0],
 ['"r_szdb_COUNTRY"', 0],
 ['AVG_TEMPERATURE_AIR_2M_F', 0],
 ['AVG_TEMPERATURE_HEATINDEX_2M_F', 0],
 ['TOT_PRECIPITATION_IN', 0],
 ['TOT_SNOWFALL_IN', 0],
 ['TOT_SNOWDEPTH_IN', 0],
 ['AVG_CLOUD_COVER_TOT_PCT', 0],
 ['DATE', 0],
 ['LOCATION_ID', 0],
 ['"r_dyy7_MENU_TYPE"', 0],
 ['DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 186, [1683.1472034615383]],
 ['PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 10921, [291685.22222222225]]]

In [145]:
dropped_df=merge_df.na.fill({'DAY_OF_WEEK_AVG_CITY_MENU_TYPE': 1683.1472034615383, 'PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE': 292095.2258064516})

In [146]:
nullColumns = []
numRows = dropped_df.count()
for k in dropped_df.columns:
    
    nullRows = dropped_df.where(col(k).isNull()).count()
    columns=[k,nullRows]
    if nullRows>0:
        columns.append(dropped_df.stat.approxQuantile(k,[0.5]))
    nullColumns.append(columns)

nullColumns
# ['D']

[['TRUCK_ID', 0],
 ['MONTH', 0],
 ['HOUR', 0],
 ['DOW', 0],
 ['DAY', 0],
 ['MENU_TYPE', 0],
 ['PUBLIC_HOLIDAY', 0],
 ['"SUM(ORDER_TOTAL)"', 0],
 ['"r_kjds_LOCATION_ID"', 0],
 ['LOCATION', 0],
 ['CITY', 0],
 ['REGION', 0],
 ['"l_g44h_COUNTRY"', 0],
 ['LAT', 0],
 ['LONG', 0],
 ['POSTAL_CODE', 0],
 ['"r_szdb_POSTAL_CODE"', 0],
 ['"r_szdb_DATE"', 0],
 ['"r_szdb_COUNTRY"', 0],
 ['AVG_TEMPERATURE_AIR_2M_F', 0],
 ['AVG_TEMPERATURE_HEATINDEX_2M_F', 0],
 ['TOT_PRECIPITATION_IN', 0],
 ['TOT_SNOWFALL_IN', 0],
 ['TOT_SNOWDEPTH_IN', 0],
 ['AVG_CLOUD_COVER_TOT_PCT', 0],
 ['DATE', 0],
 ['LOCATION_ID', 0],
 ['"r_dyy7_MENU_TYPE"', 0],
 ['DAY_OF_WEEK_AVG_CITY_MENU_TYPE', 0],
 ['PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE', 0]]

In [147]:
merge_df.count()

23566


### Encoding

In [148]:


dropped_df.show()

columns = dropped_df.columns

# Iterate over the columns and count the number of unique values
unique_counts = {}
for column in columns:
    unique_counts[column] = dropped_df.select(column).distinct().count()

# Print the unique counts for each column
for column, count in unique_counts.items():
    print(f"Column '{column}': {count}")

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"MENU_TYPE"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"r_kjds_LOCATION_ID"  |"LOCATION"          |"CITY"   |"REGION"  |"l_g44h_COUNTRY"  |"LAT"              |"LONG"               |"POSTAL_CODE"  |"r_szdb_POSTAL_CODE"  |"r_szdb_DATE"  |"r_szdb_COUNTRY"  |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |

In [149]:
categoricalColumns = ['MENU_TYPE', 'LOCATION', 'CITY', 'REGION']

In [150]:
def one_hot_encode_columns(df, column_names):
    encoded_df = df
    
    for column_name in column_names:
        unique_values = [row[column_name] for row in df.select(column_name).distinct().collect()]
        
        # Create a new column for each unique value and perform one-hot encoding
        for value in unique_values:
            encoded_column_name = f"{column_name}_{value}_encoded"
            encoded_df = encoded_df.withColumn(encoded_column_name, F.when(F.col(column_name) == value, 1).otherwise(0))
    
    
    return encoded_df

In [151]:
encoded_df = one_hot_encode_columns(dropped_df, categoricalColumns)

In [152]:
final_df = encoded_df.drop(*categoricalColumns)

In [153]:
final_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [154]:
X_final=final_df[['TRUCK_ID',"MONTH",'HOUR','DOW','DAY','PUBLIC_HOLIDAY',"SUM(ORDER_TOTAL)",'LOCATION_ID','LAT','LONG','AVG_TEMPERATURE_AIR_2M_F',"AVG_TEMPERATURE_HEATINDEX_2M_F","TOT_PRECIPITATION_IN","TOT_SNOWFALL_IN","TOT_SNOWDEPTH_IN","AVG_CLOUD_COVER_TOT_PCT","DAY_OF_WEEK_AVG_CITY_MENU_TYPE","PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE" ]]

In [155]:
X_final.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MONTH"  |"HOUR"  |"DOW"  |"DAY"  |"PUBLIC_HOLIDAY"  |"SUM(ORDER_TOTAL)"  |"LOCATION_ID"  |"LAT"              |"LONG"               |"AVG_TEMPERATURE_AIR_2M_F"  |"AVG_TEMPERATURE_HEATINDEX_2M_F"  |"TOT_PRECIPITATION_IN"  |"TOT_SNOWFALL_IN"  |"TOT_SNOWDEPTH_IN"  |"AVG_CLOUD_COVER_TOT_PCT"  |"DAY_OF_WEEK_AVG_CITY_MENU_TYPE"  |"PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [156]:
session.use_schema("ANALYTICS")
X_final.write.save_as_table(table_name="Encoded_Data", mode='overwrite')

In [157]:
X_final=X_final.to_pandas()

In [158]:
columns_not_to_scale=["TRUCK_ID","MONTH","HOUR","DOW","DAY","PUBLIC_HOLIDAY","LOCATION_ID","LAT","LONG","SUM(ORDER_TOTAL)"]     

cat_col = [i for i in X_final.columns if i not in columns_not_to_scale]




In [159]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
scaler=StandardScaler()
scaler.fit(X_final)
X_final_scaled = X_final.copy()
X_final_scaled[cat_col] = StandardScaler().fit_transform(X_final_scaled[cat_col])

In [160]:

y_final=final_df[['SUM(ORDER_TOTAL)']]
y_final=y_final.to_pandas()
y_final=pd.DataFrame(y_final,columns=y_final.columns)

In [161]:
X_final_scaled.head()

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,SUM(ORDER_TOTAL),LOCATION_ID,LAT,LONG,AVG_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_HEATINDEX_2M_F,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,AVG_CLOUD_COVER_TOT_PCT,DAY_OF_WEEK_AVG_CITY_MENU_TYPE,PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE
0,33,12,15,5,24,1,1596.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
1,33,12,21,5,24,1,9365.5,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
2,33,12,19,5,24,1,4939.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
3,33,12,16,5,24,1,1017.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
4,33,12,17,5,24,1,3433.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733


In [162]:
#sum()

In [163]:
# syntax=pd.io.sql.get_schema(X_final_scaled, "Sales_Forecaster_Train_Date")
# session.use_schema("ANALYTICS")

# session.sql(

#     syntax
# )

In [164]:
import math

In [165]:
X_final_scaled["SUM(ORDER_TOTAL)"] = X_final_scaled["SUM(ORDER_TOTAL)"].apply(lambda x: round(x, 2))

In [166]:
X_final_scaled["AVG_TEMPERATURE_AIR_2M_F"].dtypes

dtype('float64')

In [167]:
X_final_scaled.rename(columns={"SUM(ORDER_TOTAL)": "Profit"},inplace=True)

In [14]:
session.write_pandas(
    df=X_final_scaled,
    table_name="Sales_Forecast_Training_Data",
    database="FROSTBYTE_TASTY_BYTES",
    schema="ANALYTICS",
    quote_identifiers=False,
    overwrite=True
)


NameError: name 'X_final_scaled' is not defined

## Machine Learning Modelling

In [6]:
session.use_schema("ANALYTICS")
X_final_scaled=session.sql("Select * from Sales_Forecast_Training_Data").to_pandas()

In [7]:
X_final_scaled

Unnamed: 0,TRUCK_ID,MONTH,HOUR,DOW,DAY,PUBLIC_HOLIDAY,PROFIT,LOCATION_ID,LAT,LONG,AVG_TEMPERATURE_AIR_2M_F,AVG_TEMPERATURE_HEATINDEX_2M_F,TOT_PRECIPITATION_IN,TOT_SNOWFALL_IN,TOT_SNOWDEPTH_IN,AVG_CLOUD_COVER_TOT_PCT,DAY_OF_WEEK_AVG_CITY_MENU_TYPE,PREV_YEAR_MONTH_SALES_CITY_MENU_TYPE
0,33,12,15,5,24,1,1596.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
1,33,12,21,5,24,1,9365.5,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
2,33,12,19,5,24,1,4939.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
3,33,12,16,5,24,1,1017.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
4,33,12,17,5,24,1,3433.0,2290,47.561638,-122.376698,-1.370498,-1.352155,1.17069,-0.111192,-0.199874,1.302848,-0.841069,-0.593733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23561,37,2,13,0,6,0,2399.0,2290,47.561638,-122.376698,-1.154850,-1.142405,-0.43925,-0.111192,-0.199874,0.675549,-0.786248,-0.384552
23562,37,2,9,0,6,0,6356.0,2290,47.561638,-122.376698,-1.154850,-1.142405,-0.43925,-0.111192,-0.199874,0.675549,-0.786248,-0.384552
23563,37,2,10,0,6,0,3073.0,2290,47.561638,-122.376698,-1.154850,-1.142405,-0.43925,-0.111192,-0.199874,0.675549,-0.786248,-0.384552
23564,37,2,12,0,6,0,6043.0,2290,47.561638,-122.376698,-1.154850,-1.142405,-0.43925,-0.111192,-0.199874,0.675549,-0.786248,-0.384552


In [171]:
X_final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23566 entries, 0 to 23565
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   TRUCK_ID                              23566 non-null  int8   
 1   MONTH                                 23566 non-null  int8   
 2   HOUR                                  23566 non-null  int8   
 3   DOW                                   23566 non-null  int8   
 4   DAY                                   23566 non-null  int8   
 5   PUBLIC_HOLIDAY                        23566 non-null  int8   
 6   PROFIT                                23566 non-null  float64
 7   LOCATION_ID                           23566 non-null  int16  
 8   LAT                                   23566 non-null  float64
 9   LONG                                  23566 non-null  float64
 10  AVG_TEMPERATURE_AIR_2M_F              23566 non-null  float64
 11  AVG_TEMPERATURE

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Split the dataset into features (X) and target (y)
X = X_final_scaled.drop("PROFIT",axis=1)
y = X_final_scaled["PROFIT"]

# Split the dataset into training and testing datasets
X_train, X_holdout_test, y_train, y_holdout_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_holdout, X_test, y_holdout, y_test = train_test_split(X_holdout_test, y_holdout_test, test_size=0.2, random_state=42)

In [173]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

In [178]:
print('Train MSE is: ', mean_squared_error(lgbm.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(lgbm.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(lgbm.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(lgbm.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(lgbm.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(lgbm.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(lgbm.predict(X_train), y_train))
print('Test R2 is: ', r2_score(lgbm.predict(X_test), y_test))

Train MSE is:  2014136.3855472703
Test MSE is:  2289623.987367086

Train RMSE is:  1419.2027288401296
Test RMSE is:  1513.150351870919

Train MAE is:  1063.968459036136
Test MAE is:  1105.815378228909

Train R2 is:  0.9527253423956941
Test R2 is:  0.9448532150166624


In [175]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [179]:
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

Train MSE is:  771526.7620708488
Test MSE is:  1236772.561729723

Train RMSE is:  878.3659613571377
Test RMSE is:  1112.102765813359

Train MAE is:  641.8070704684887
Test MAE is:  777.6534032164298

Train R2 is:  0.9827021720783029
Test R2 is:  0.9715130780191024


## Fine-tuned XGBoost

In [None]:
xgb = XGBRegressor(learning_rate= 0.04, max_depth= 10, n_estimators= 1700, subsample= 0.9)
xgb.fit(X_train, y_train)
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

In [18]:
from sklearn. ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, VotingRegressor

# Build Random Forest Model and Evaluate both training and testing accuracy
rf = RandomForestRegressor(n_estimators=40)
rf.fit(X_train,y_train)

RandomForestRegressor(n_estimators=40)

In [20]:

print('Train MSE is: ', mean_squared_error(rf.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(rf.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(rf.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(rf.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(rf.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(rf.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(rf.predict(X_train), y_train))
print('Test R2 is: ', r2_score(rf.predict(X_test), y_test))

Train MSE is:  79323.84858511087
Test MSE is:  428601.3704179439

Train RMSE is:  281.6448980278373
Test RMSE is:  654.676538771586

Train MAE is:  130.88767548978004
Test MAE is:  325.33809981442204

Train R2 is:  0.9982722052471262
Test R2 is:  0.9903412496609059


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import seaborn as sns
import scipy.stats as stats

In [24]:
def cvr2(model): #Cross Validation scores function
    results  = cross_validate(model, X_train, y_train, scoring='r2', cv=5, return_train_score = True)
    print('train_score: ', results['train_score'])
    print('test_score: ', results['test_score'], '\n')

    print('cross val training R^2 value is: ', sum(results['train_score'])/len(results['train_score']))
    print('cross val testing R^2 value is: ', sum(results['test_score'])/len(results['test_score']))

def cvrm(model): #Cross Validation scores function
    results  = cross_validate(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5, return_train_score = True)
    print('train_score: ', results['train_score'])
    print('test_score: ', results['test_score'], '\n')

    print('cross val training rmse is:', -(sum(results['train_score'])/len(results['train_score'])))
    print('cross val testing rmse is:', -(sum(results['test_score'])/len(results['test_score'])))
    
def cvm(model): #Cross Validation scores function
    results  = cross_validate(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5, return_train_score = True)
    print('train_score: ', results['train_score'])
    print('test_score: ', results['test_score'], '\n')

    print('cross val training mae is:', (-(sum(results['train_score'])/len(results['train_score']))))
    print('cross val testing mae is:', (-(sum(results['test_score'])/len(results['test_score']))))
    
def cv(model):
    print("R^2 value: ")
    cvr2(model)
    print(" \nRmse: ")
    cvrm(model)
    print(" \nMae: ")
    cvm(model)

In [25]:
rf = RandomForestRegressor()

param_grid = {"n_estimators"      : [10,30,50, 100, 200],
              'ccp_alpha': [0.0, 0.1, 0.2],
              'criterion': ['mse', 'absolute_error'],
#             "max_features"      : ["auto", "sqrt", "log2"],
#              "min_samples_leaf" : [1, 5, 10],
#             "min_samples_split" : [2, 4, 10, 12, 16], 
#              "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90],
              'max_depth' : [5, 25, 125, 525, 1225]
             }

gs = GridSearchCV(rf, param_grid=param_grid, scoring='r2', cv= 3, n_jobs=-1, verbose = 2)
# cv: number of partitions for cross validation
# n_jobs: number of jobs to run in parallel, -1 means using all processors

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


In [None]:
rf = RandomForestRegressor(ccp_alpha = 0.0, criterion='mse',max_depth = 525, n_estimators = 200)
rf.fit(X_train, y_train)

cv(rf)

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

GBR = GradientBoostingRegressor()

param_grid = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = param_grid, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.04, max_depth=8, n_estimators=1500,
                          subsample=0.9)

 The best score across ALL searched params:
 0.9749017238306279

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 8, 'n_estimators': 1500, 'subsample': 0.9}


In [16]:
gbr = GradientBoostingRegressor(learning_rate= 0.04, max_depth= 8, n_estimators= 1500, subsample= 0.9)
gbr = gbr.fit(X_train, y_train)

print('Train MSE is: ', mean_squared_error(gbr.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(gbr.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(gbr.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(gbr.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(gbr.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(gbr.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(gbr.predict(X_train), y_train))
print('Test R2 is: ', r2_score(gbr.predict(X_test), y_test))

Train MSE is:  3857.2668986809076
Test MSE is:  160037.05054998022

Train RMSE is:  62.10689896203889
Test RMSE is:  400.0463105066465

Train MAE is:  45.05993768646041
Test MAE is:  189.97315879533045

Train R2 is:  0.9999167569609541
Test R2 is:  0.9964613325270291


In [None]:
def diagnostic_plots(df, variable):
    # function takes a dataframe (df) and
    # the variable of interest as arguments

    # define figure size
    plt.figure(figsize=(16, 4))

    # histogram
    plt.subplot(1, 3, 1)
    sns.histplot(df[variable], bins=30, kde = True)
    plt.title('Histogram')

    # Q-Q plot
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('RM quantiles')

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')


In [130]:
ab=AdaBoostRegressor(n_estimators=20,learning_rate=1)
ab.fit(X_train,y_train)
train_mse = mean_squared_error(ab.predict(X_train), y_train)#training mse
print('the training mean squared error is: ', train_mse) 
train_R2 = ab.score(X_train, y_train) #training r2
print('training R^2 value is: ', train_R2)

test_mse = mean_squared_error(ab.predict(X_test), y_test) #testing mse
print('the testing mean squared error is: ',test_mse)
test_R2 = ab.score(X_test, y_test) #testing r2
print('testing R^2 value is: ', test_R2)

the training mean squared error is:  8991744.007605808
training R^2 value is:  0.8063579674313232
the testing mean squared error is:  8906276.683779892
testing R^2 value is:  0.8056561654192178


In [132]:
from sklearn.model_selection import GridSearchCV
ab=AdaBoostRegressor(n_estimators=20,learning_rate=1)
param_grid = {"n_estimators":[10,50,100,300], "loss" : ['linear', 'square','exponential'], "learning_rate" : [0.5,1,2,3]}
gs = GridSearchCV(ab, param_grid=param_grid, scoring='accuracy', cv= 10, n_jobs=-1)
# cv: number of partitions for cross validation
# n_jobs: number of jobs to run in parallel, -1 means using all processors

gs = gs.fit(X_train, y_train) # 

print(gs.best_score_)
print(gs.best_params_)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]


nan
{'learning_rate': 0.5, 'loss': 'linear', 'n_estimators': 10}
