# Import libraries

In [None]:
# "magic commands" to enable autoreload of the imported packages
# %load_ext autoreload
# %reload_ext autoreload
# %autoreload 2

import re
from pathlib import Path
import joblib

# Data Manipulation
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:.2f}'.format
# pd.options.display.precision 

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Sklearn 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler  
from sklearn.pipeline import make_union, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn import set_config; set_config(display='diagram')
from sklearn.base import clone

set_config(transform_output="pandas")

# Sktime
!pip install sktime --quiet
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.time_since import TimeSince


from google.colab import drive
drive.mount('/content/drive')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


# Utility functions

In [None]:
#### Function to reduce memory usage ####
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_memory_usage(df, verbose=True):
    """Downcast 32/64-bit numeric columns of ``df`` to 32-bit where values fit.

    Integer columns are cast to ``int32`` when their minimum and maximum lie
    strictly inside the ``int32`` range, else to ``int64`` when they lie
    strictly inside the ``int64`` range.  Float columns are cast to
    ``float32`` under the same strict test against the ``float32`` range,
    and to ``float64`` otherwise.  Columns whose dtype is not one of
    ``int32``/``int64``/``float32``/``float64`` are left untouched.

    NOTE: casting ``float64`` values to ``float32`` reduces their precision.

    Args:
        df: DataFrame to shrink.  It is modified in place.
        verbose: When true, print the final memory footprint (in Mb) and
            the percentage saved.

    Returns:
        The same ``df`` object, after the column conversions.
    """
    numerics = ['int32', 'int64', 'float32', 'float64']
    bytes_per_mb = 1024**2

    # Representable ranges used by the downcasting checks below
    int32_limits = np.iinfo(np.int32)
    int64_limits = np.iinfo(np.int64)
    float32_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / bytes_per_mb

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # only the listed numeric dtypes are candidates

        lowest = df[col].min()
        highest = df[col].max()

        if str(col_type).startswith('int'):
            if lowest > int32_limits.min and highest < int32_limits.max:
                df[col] = df[col].astype(np.int32)
            elif lowest > int64_limits.min and highest < int64_limits.max:
                df[col] = df[col].astype(np.int64)
        elif lowest > float32_limits.min and highest < float32_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb

    if verbose:
        reduction_pct = 100 * (start_mem - end_mem) / start_mem
        print('Memory usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction_pct))

    return df

# Feature engineering summary

|              |                                                              |                                        |
| ------------ | ------------------------------------------------------------ | -------------------------------------- |
| **TARGET**   | "sales"                                                      | Create lag and rolling window features |
| **FEATURES** |                                                              |                                        |
| Categorical  | "family", "store_nbr", "city", "state", "type", "cluster"          | Mean encoding with expanding window    |
| Categorical  | 'is_Local_holiday' (binary),    'is_Regional_holiday' (binary),   'is_National_holiday' (binary),    'earthquake_day' (binary),    'earthquake_impact' (binary),   'wage_day' (binary) |                                        |
| Numeric      | "onpromotion",    "oil_price_interpolated"                   |                                        |
| Numeric      | "transactions"                                               | Create rolling mean and std features (7-day window, lagged 28 days) |
| Datetime     | "date"                                                       | Create time and date features          |

**NUMERIC FEATURES:**

* "onpromotion" = the total number of items in a product family that were being promoted at a store at a given date.
* "transactions" = the total number of transactions per store at a given date. (each value in the "transactions" column is the number of transactions per store for a given date)
* "oil_price_interpolated"

**LAG FEATURES:**

Lags are past values of the TARGET/ FEATURE.


**ROLLING WINDOW FEATURES:** 

We will create rolling window features to summarize more than one past value into a single feature (where we compute statistics) using a window over the past data. 
  * How we pick the window size: we try nested window features where we use multiple window sizes on different time scales (e.g. weeks, months) to capture information at those time scales. For example, patterns in short term (weeks) vs long term (months) in the target and feature.
  * Statistics: We will keep it simple and use the mean and the standard deviation. If the accuracy is not good enough, we can consider adding more statistics.
      * Mean: moving average to smooth the data
      * Standard deviation: measures volatility
NOTE: (1) For forecasting, we need to use the values of the features that we know at predict time to avoid data leakage. So we need to shift the output of the row down by one (2) For the edge cases (NaN), we will use smaller window sizes at the edges and impute the missing data (Another option is to treat them as missing data and drop/impute)


**MEAN ENCODING USING EXPANDING WINDOW FEATURES:** 

We will manually create mean encoding features using an expanding window for 6 categorical features: "family", "store_nbr", "city", "state", "type", "cluster". This implementation helps avoid look-ahead bias.
* Mean encoding is similar to label encoding, except that here the labels are correlated directly with the target. For example, in mean target encoding the label for each category of a feature is the mean value of the target variable for that category in the training/testing data.
* The advantages of the mean target encoding are that it does not affect the volume of the data and helps in faster learning.
* NOTE: (1) To avoid data leakage, we need to shift the output of the row down by one. (2) We have multiple categorical features, the expanding window for each feature will be different from each other. 



**TIME FEATURE:**

We create a time feature (time since start) from the Datetime Index
* The "sales" series seems to have some changepoints (changepoint = abrupt change in any properties of a time series (e.g. trend, seasonality, autoregressive properties)).
* Thus, we include this time feature that would allow our tree-based model to segment over time, and isolate changepoints, outliers, and other interesting periods during training.    


**DATE FEATURES:**

We create several date features from the Datetime Index
* day_of_week
* is_weekend
* day_of_month
* week_of_month
* week_of_year
* month_of_year
* quarter_of_year
* year


**FEATURE SELECTION:**

We will use **feature selection** to reduce the number of features.

-------------
**OTHER:**
* We can use `clone` to return an unfitted version of the pipeline (from sklearn.base import clone // pipeline = clone(pipeline)) 
* We have not considered DISTRIBUTED LAGS (...): the impact of ... on day t will probably last for multiple days into future after time t. Thus, The sales on a given day is influenced by ... on previous days as well as the same day
* We have not considered WEIGHTED WINDOW FEATURES: to be more sensitive to recent observations, e.g., to quickly pick up changes in trend. We assign weights to the window (more weight to recent observations when computing a window feature)  
  * How to pick the weight: 
    * Exponential: Weight decays exponentially. We can specify parameter: rate of decay. E.g. Exponential weighted window (mean) 
    * Exponential weighted moving average (EWMA) at time t


# **PART 2** 

## 0-Load the datasets with encoded features

In [None]:
# Parquet files holding the train/test frames with the encoded features
enc_data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/enc_data")
train_enc_data_dir = enc_data_dir / "train_data.parquet"
test_enc_data_dir = enc_data_dir / "test_data.parquet"

# Read each file, index it by ("id", "date") and sort on that index
train_data = (
    pd.read_parquet(path=train_enc_data_dir, engine="pyarrow")
    .set_index(["id", "date"])
    .sort_index()
)
test_data = (
    pd.read_parquet(path=test_enc_data_dir, engine="pyarrow")
    .set_index(["id", "date"])
    .sort_index()
)

# Preview the first rows of both frames
display(train_data.head(), test_data.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,store_nbr,family,sales,onpromotion,transactions,oil_price_interpolated,is_Local_holiday,is_Regional_holiday,is_National_holiday,wage_day,earthquake_day,earthquake_impact,sales_ewma7_mean,sales_ewma7_std,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10_AUTOMOTIVE,2013-01-01,10,AUTOMOTIVE,0.0,0.0,0.0,93.14,no_holiday,no_holiday,National,yes,no,no,,,,254.53,255.14,73.6,326.74,708.23
10_AUTOMOTIVE,2013-01-02,10,AUTOMOTIVE,3.0,0.0,1293.0,93.14,no_holiday,no_holiday,no_holiday,no,no,no,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
10_AUTOMOTIVE,2013-01-03,10,AUTOMOTIVE,2.0,0.0,1157.0,92.97,no_holiday,no_holiday,no_holiday,no,no,no,0.75,2.12,1.5,1.5,1.5,1.5,1.5,1.5
10_AUTOMOTIVE,2013-01-04,10,AUTOMOTIVE,2.0,0.0,970.0,93.12,no_holiday,no_holiday,no_holiday,no,no,no,1.06,1.63,1.67,1.67,1.67,1.67,1.67,1.67
10_AUTOMOTIVE,2013-01-05,10,AUTOMOTIVE,0.0,0.0,1269.0,93.12,no_holiday,no_holiday,no_holiday,no,no,no,1.3,1.38,1.75,1.75,1.75,1.75,1.75,1.75


Unnamed: 0_level_0,Unnamed: 1_level_0,store_nbr,family,sales,onpromotion,transactions,oil_price_interpolated,is_Local_holiday,is_Regional_holiday,is_National_holiday,wage_day,earthquake_day,earthquake_impact,sales_ewma7_mean,sales_ewma7_std,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10_AUTOMOTIVE,2017-08-16,10,AUTOMOTIVE,,0.0,,46.8,no_holiday,no_holiday,no_holiday,no,no,no,1.91,1.63,3.08,3.08,3.08,3.08,3.08,3.08
10_AUTOMOTIVE,2017-08-17,10,AUTOMOTIVE,,0.0,,47.07,no_holiday,no_holiday,no_holiday,no,no,no,1.91,1.63,3.08,3.08,3.08,3.08,3.08,3.08
10_AUTOMOTIVE,2017-08-18,10,AUTOMOTIVE,,0.0,,48.59,no_holiday,no_holiday,no_holiday,no,no,no,1.91,1.63,3.08,3.08,3.08,3.08,3.08,3.08
10_AUTOMOTIVE,2017-08-19,10,AUTOMOTIVE,,0.0,,48.58,no_holiday,no_holiday,no_holiday,no,no,no,1.91,1.63,3.08,3.08,3.08,3.08,3.08,3.08
10_AUTOMOTIVE,2017-08-20,10,AUTOMOTIVE,,0.0,,47.98,no_holiday,no_holiday,no_holiday,no,no,no,1.91,1.63,3.08,3.08,3.08,3.08,3.08,3.08


## 1 - Create transformers for lag, window, time and date features

### Lag & Window transformer



In [None]:
# How to read a window spec (see the sktime WindowSummarizer docs):
#   [lag, window length] -- lag defines how far back in the past the window
#   starts, window length gives the length of the window across which the
#   function is applied.
# https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.transformations.series.summarize.WindowSummarizer.html?highlight=WindowSummarizer#sktime.transformations.series.summarize.WindowSummarizer

# Rolling windows shared by the "mean" and "std" statistics of "sales"
_sales_windows = [
    [1, 7], [1, 14], [1, 28], [1, 29], [1, 30], [1, 31], [1, 32],
    [7, 7], [7, 14], [7, 28],
    [28, 7], [28, 28],
]

# Lag & window transformer for the TARGET = "sales"
target_lag_window_transformer = WindowSummarizer(
    lag_feature={
        # plain lag features with different lags
        "lag": [1, 2, 7, 28, 29, 30, 31],
        # rolling window features; each statistic gets its own copy of the specs
        "mean": [list(spec) for spec in _sales_windows],
        "std": [list(spec) for spec in _sales_windows],
    },
    target_cols=["sales"],
    # None leaves the NaNs created by lagging/windowing in place; the
    # alternative "bfill" back-fills them by carrying the first observation
    # backwards.
    truncate=None,
)


# Window transformer for the FEATURE = "transactions"
# "transactions" = the total number of transactions per store at a given date.
# Only a 7-long window lagged by 28 is summarised (mean and std); no plain
# lag feature is created here.
feature_lag_transformer = WindowSummarizer(
    lag_feature={
        "mean": [[28, 7]],
        "std": [[28, 7]],
    },
    target_cols=["transactions"],
    truncate=None,  # same NaN handling as for the target transformer
)


In [None]:
# # Check how the lag_window_transformer works
# target_lag_window_transformer.fit(df[["sales"]])
# df_check = target_lag_window_transformer.transform(df[["sales"]])
# display(df[["sales"]].head(), df_check) 

## Check
# a = pd.to_datetime("2013-01-01") 
# b = np.timedelta64(28, "D")  
# print(a + b) 
# df_check.loc[("1_AUTOMOTIVE", '2013-01-01'):("1_AUTOMOTIVE", '2013-01-29')]

# target_lag_window_transformer = target_lag_window_transformer.clone()

### Time transformer

In [None]:
# Time transformer: builds the "time since start" feature at daily frequency
# and, with keep_original_columns=False, does not carry the input columns along.
time_transformer = TimeSince(
    freq="D",
    keep_original_columns=False,
)

In [None]:
# # Check how the time_transformer works
# time_transformer.fit(df)
# df_check = time_transformer.transform(df)
# df_check

# time_transformer = time_transformer.clone()

### Date transformer

In [None]:
# Calendar features to derive from the datetime index
# Reference:
# https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.transformations.series.date.DateTimeFeatures.html?highlight=datetimefeatures
date_features = [
    "day_of_week",
    "is_weekend",
    "day_of_month",
    "week_of_month",
    "week_of_year",
    "month_of_year",
    "quarter_of_year",
    "year",
]

# Datetime transformer restricted to the features listed above; the input
# columns are not kept (keep_original_columns=False).
date_transformer = DateTimeFeatures(
    manual_selection=date_features,
    keep_original_columns=False,
)

In [None]:
# # Check how the datetime_transformer works on a subset of df
# date_transformer.fit(df.head())
# df_check = date_transformer.transform(df.head())

# for col in df_check[date_features].columns:
#     print(df_check[col].unique(), '\n')

# df_check

## 2 - Create pipeline

### Pipeline_1

|              |                                                              |                                                        |
| ------------ | ------------------------------------------------------------ | ------------------------------------------------------ |
| **TARGET**   | "sales"                                                      | Create lag and rolling window features -> MinMaxScaler |
| **FEATURES** |                                                              |                                                        |
| Numeric      | "transactions"                                               | Create rolling mean and std features (7-day window, lagged 28 days) -> MinMaxScaler |
| Numeric      | "onpromotion",    "oil_price_interpolated"                   | MinMaxScaler                                           |
| Mean encoded | "enc_family_mean",  "enc_store_nbr_mean", "enc_city_mean",  "enc_state_mean", "enc_type_mean", "enc_cluster_mean" | MinMaxScaler                                           |
| Time         | "time_since_2013-01-01 00:00:00"                             | MinMaxScaler                                           |
| Date         | "day_of_week",  "is_weekend", "day_of_month", "week_of_month", "week_of_year", "month_of_year", "quarter_of_year", "year" | OneHotEncoding                                         |
| Categorical  | 'is_Local_holiday' (binary),    'is_Regional_holiday' (binary),   'is_National_holiday' (binary),    'earthquake_day' (binary),    'earthquake_impact' (binary),   'wage_day' (binary) | OneHotEncoding                                         |

#### Create pipeline_1

In [None]:
#---------- FEATURES ----------#
# Numeric columns already present in the input frame -> MinMaxScaler
num_columns = [
    "onpromotion", "oil_price_interpolated",
    "enc_family_mean", "enc_store_nbr_mean",
    "enc_city_mean", "enc_state_mean",
    "enc_type_mean", "enc_cluster_mean",
    "sales_ewma7_mean", "sales_ewma7_std",
]

# Categorical flag columns already present in the input frame -> OneHotEncoder
cat_columns = [
    "is_Local_holiday", "is_Regional_holiday", "is_National_holiday",
    "earthquake_day", "earthquake_impact", "wage_day",
]

# Calendar columns produced by date_transformer -> OneHotEncoder
date_columns = [
    'year', 'quarter_of_year', 'month_of_year',
    'week_of_year', 'week_of_month',
    'day_of_month', 'day_of_week', 'is_weekend',
]

#---------- TRANSFORMERS ----------#
# Defined in the cells above and reused here:
#   date_transformer, time_transformer              (date and time features)
#   target_lag_window_transformer                   (lag & window features for "sales")
#   feature_lag_transformer                         (window mean/std features for "transactions")

# minmaxscaler and onehotencoder
minmax_transformer = MinMaxScaler()
ohe_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")


#---------- PIPELINE ----------#
# pipeline 1 for target_lag_window_transformer, feature_lag_transformer, time_transformer, date_transformer
u = make_union(
    target_lag_window_transformer,
    feature_lag_transformer,
    time_transformer,
    date_transformer
)

# Regex selecting the engineered numeric columns to scale.
# feature_lag_transformer emits "transactions_mean_*" / "transactions_std_*"
# columns; they must be listed here, otherwise remainder='drop' silently
# discards them ("transactions_lag" alone never matches them).
engineered_num_pattern = (
    "sales_lag|sales_mean|sales_std|time_since"
    "|transactions_lag|transactions_mean|transactions_std"
)

ct_1 = make_column_transformer(
    (minmax_transformer, make_column_selector(pattern=engineered_num_pattern)),
    (ohe_transformer, date_columns),
    remainder='drop',
    verbose_feature_names_out=False
    )
pl_1 = make_pipeline(u, ct_1)


# pipeline 2 for the rest of numeric and categorical features
ct_2 = make_column_transformer(
    (minmax_transformer, num_columns),
    (ohe_transformer, cat_columns),
    remainder='drop',
    verbose_feature_names_out=False
    )
pl_2 = make_pipeline(ct_2)

# final pipeline: engineered features side by side with the scaled/encoded inputs
pipeline_1 = make_union(pl_1, pl_2)
pipeline_1

#### Run pipeline_1

In [None]:
# Fit pipeline_1 on the training frame only, then apply it to both frames
pipeline_1.fit(train_data)
train_data_pl1, test_data_pl1 = (
    pipeline_1.transform(frame) for frame in (train_data, test_data)
)
train_data_pl1.head()

# # Reset pipeline (to return an unfitted version of the pipeline)
# pipeline = clone(pipeline)

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_lag_1,sales_lag_31,sales_lag_30,sales_lag_28,sales_lag_29,sales_lag_7,sales_lag_2,sales_mean_1_28,sales_mean_1_7,sales_mean_7_7,sales_mean_7_28,sales_mean_1_32,sales_mean_28_28,sales_mean_1_31,sales_mean_1_14,sales_mean_1_30,sales_mean_1_29,sales_mean_28_7,sales_mean_7_14,sales_std_7_28,sales_std_28_7,sales_std_1_31,sales_std_7_7,sales_std_1_32,sales_std_1_30,sales_std_1_29,sales_std_1_28,sales_std_1_14,sales_std_1_7,sales_std_7_14,sales_std_28_28,time_since_2013-01-01 00:00:00,year_2013,year_2014,year_2015,year_2016,year_2017,quarter_of_year_1,quarter_of_year_2,quarter_of_year_3,quarter_of_year_4,month_of_year_1,month_of_year_2,month_of_year_3,month_of_year_4,month_of_year_5,month_of_year_6,month_of_year_7,month_of_year_8,month_of_year_9,...,day_of_month_11,day_of_month_12,day_of_month_13,day_of_month_14,day_of_month_15,day_of_month_16,day_of_month_17,day_of_month_18,day_of_month_19,day_of_month_20,day_of_month_21,day_of_month_22,day_of_month_23,day_of_month_24,day_of_month_25,day_of_month_26,day_of_month_27,day_of_month_28,day_of_month_29,day_of_month_30,day_of_month_31,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,is_weekend_0,is_weekend_1,onpromotion,oil_price_interpolated,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean,is_Local_holiday_Local,is_Local_holiday_no_holiday,is_Regional_holiday_Regional,is_Regional_holiday_no_holiday,is_National_holiday_National,is_National_holiday_no_holiday,earthquake_day_no,earthquake_day_yes,earthquake_impact_no,earthquake_impact_yes,wage_day_no,wage_day_yes
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 
101_level_1,Unnamed: 102_level_1
10_AUTOMOTIVE,2013-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.79,,0.13,0.24,0.09,0.16,0.35,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
10_AUTOMOTIVE,2013-01-02,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
10_AUTOMOTIVE,2013-01-03,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
10_AUTOMOTIVE,2013-01-04,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
10_AUTOMOTIVE,2013-01-05,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# List the feature names produced by pipeline_1
list(train_data_pl1.columns)

# # Check the columns and their values
# for col in train_data_pl1.columns:
#   print(col, "\n", train_data_pl1[col].nunique(), "\n", train_data_pl1[col].unique(), "\n")

['sales_lag_1',
 'sales_lag_31',
 'sales_lag_30',
 'sales_lag_28',
 'sales_lag_29',
 'sales_lag_7',
 'sales_lag_2',
 'sales_mean_1_28',
 'sales_mean_1_7',
 'sales_mean_7_7',
 'sales_mean_7_28',
 'sales_mean_1_32',
 'sales_mean_28_28',
 'sales_mean_1_31',
 'sales_mean_1_14',
 'sales_mean_1_30',
 'sales_mean_1_29',
 'sales_mean_28_7',
 'sales_mean_7_14',
 'sales_std_7_28',
 'sales_std_28_7',
 'sales_std_1_31',
 'sales_std_7_7',
 'sales_std_1_32',
 'sales_std_1_30',
 'sales_std_1_29',
 'sales_std_1_28',
 'sales_std_1_14',
 'sales_std_1_7',
 'sales_std_7_14',
 'sales_std_28_28',
 'time_since_2013-01-01 00:00:00',
 'year_2013',
 'year_2014',
 'year_2015',
 'year_2016',
 'year_2017',
 'quarter_of_year_1',
 'quarter_of_year_2',
 'quarter_of_year_3',
 'quarter_of_year_4',
 'month_of_year_1',
 'month_of_year_2',
 'month_of_year_3',
 'month_of_year_4',
 'month_of_year_5',
 'month_of_year_6',
 'month_of_year_7',
 'month_of_year_8',
 'month_of_year_9',
 'month_of_year_10',
 'month_of_year_11',
 'm

#### Add the TARGET="sales" and COLUMNS=["family", "store_nbr"] back 






In [None]:
# For pipeline_1: copy the target and the two identifier columns from the
# raw frames onto the transformed frames
for restored_col in ("sales", "family", "store_nbr"):
    train_data_pl1[restored_col] = train_data[restored_col]
    test_data_pl1[restored_col] = test_data[restored_col]



#### Save the preprocessed dataset and pipeline_1

In [None]:
# Set directories for the transformed datasets and the fitted pipelines
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/data_from_pipelines")
pipeline_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipelines")

# parents=True also creates missing intermediate folders (e.g. "for_models");
# without it mkdir raises FileNotFoundError on a fresh Drive layout.
for _dir in [data_dir, pipeline_dir]:
    _dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the transformed train/test frames as parquet files
train_out_dir = data_dir.joinpath("train_data_pl1.parquet")
test_out_dir = data_dir.joinpath("test_data_pl1.parquet")
for frame, destination in (
    (train_data_pl1, train_out_dir),
    (test_data_pl1, test_out_dir),
):
    frame.to_parquet(destination)

# Save the feature engineering pipeline
out_dir = pipeline_dir.joinpath("pipeline_1.joblib")
joblib.dump(pipeline_1, out_dir)

['/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipeline_1/pipeline_1.joblib']

### Pipeline_2

|              |                                                              |                                                        |
| ------------ | ------------------------------------------------------------ | ------------------------------------------------------ |
| **TARGET**   | "sales"                                                      | Create lag and rolling window features -> MinMaxScaler |
| **FEATURES** |                                                              |                                                        |
| Numeric      | "transactions"                                               | Create rolling mean and std features (7-day window, lagged 28 days) -> MinMaxScaler |
| Numeric      | "onpromotion",    "oil_price_interpolated"                   | MinMaxScaler                                           |
| Mean encoded | "enc_family_mean",  "enc_store_nbr_mean", "enc_city_mean",  "enc_state_mean", "enc_type_mean", "enc_cluster_mean" | MinMaxScaler                                           |
| Time         | "time_since_2013-01-01 00:00:00"                             | MinMaxScaler                                           |
| Date         | "day_of_week",  "is_weekend", "day_of_month", "week_of_month", "week_of_year", "month_of_year", "quarter_of_year", "year" | MinMaxScaler                                         |
| Categorical  | 'is_Local_holiday' (binary),    'is_Regional_holiday' (binary),   'is_National_holiday' (binary),    'earthquake_day' (binary),    'earthquake_impact' (binary),   'wage_day' (binary) | OneHotEncoding                                         |

#### Create pipeline_2

In [None]:
#---------- FEATURES ----------#
# Numeric columns already present in the input frame -> MinMaxScaler
num_columns = [
    "onpromotion", "oil_price_interpolated",
    "enc_family_mean", "enc_store_nbr_mean",
    "enc_city_mean", "enc_state_mean",
    "enc_type_mean", "enc_cluster_mean",
    "sales_ewma7_mean", "sales_ewma7_std",
]

# Categorical flag columns already present in the input frame -> OneHotEncoder
cat_columns = [
    "is_Local_holiday", "is_Regional_holiday", "is_National_holiday",
    "earthquake_day", "earthquake_impact", "wage_day",
]

# Calendar columns produced by date_transformer -> MinMaxScaler in this pipeline
date_columns = [
    'year', 'quarter_of_year', 'month_of_year',
    'week_of_year', 'week_of_month',
    'day_of_month', 'day_of_week', 'is_weekend',
]


#---------- TRANSFORMERS ----------#
# Defined in the cells above and reused here:
#   date_transformer, time_transformer              (date and time features)
#   target_lag_window_transformer                   (lag & window features for "sales")
#   feature_lag_transformer                         (window mean/std features for "transactions")

# minmaxscaler and onehotencoder
minmax_transformer = MinMaxScaler()
ohe_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")


#---------- PIPELINE ----------#
# pipeline 1 for the transformers
u = make_union(
    target_lag_window_transformer,
    feature_lag_transformer,
    time_transformer,
    date_transformer
)

# Regex selecting the engineered numeric columns to scale.
# feature_lag_transformer emits "transactions_mean_*" / "transactions_std_*"
# columns; they must be listed here, otherwise remainder='drop' silently
# discards them ("transactions_lag" alone never matches them).
engineered_num_pattern = (
    "sales_lag|sales_mean|sales_std|time_since"
    "|transactions_lag|transactions_mean|transactions_std"
)

pl_1 = make_pipeline(
    u,
    make_column_transformer(
        (minmax_transformer, make_column_selector(pattern=engineered_num_pattern)),
        (minmax_transformer, date_columns),
        remainder='drop',
        verbose_feature_names_out=False
        )
    )

# pipeline 2 for the remaining numeric and categorical features
# NOTE(review): remainder='passthrough' forwards every input column that is
# not listed in num_columns/cat_columns unchanged (e.g. the raw "sales" and
# "transactions") -- confirm this is intended before feeding the result to a model.
ct = make_column_transformer(
    (minmax_transformer, num_columns),
    (ohe_transformer, cat_columns),
    remainder='passthrough',
    verbose_feature_names_out=False
    )

pl_2 = make_pipeline(ct)


# final pipeline: engineered features side by side with the scaled/encoded inputs
pipeline_2 = make_union(pl_1, pl_2)
pipeline_2

#### Run pipeline_2

In [None]:
# Fit pipeline_2 on the training frame only, then apply it to both frames
pipeline_2.fit(train_data)
train_data_pl2, test_data_pl2 = (
    pipeline_2.transform(frame) for frame in (train_data, test_data)
)
train_data_pl2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_lag_1,sales_lag_31,sales_lag_30,sales_lag_28,sales_lag_29,sales_lag_7,sales_lag_2,sales_mean_1_28,sales_mean_1_7,sales_mean_7_7,sales_mean_7_28,sales_mean_1_32,sales_mean_28_28,sales_mean_1_31,sales_mean_1_14,sales_mean_1_30,sales_mean_1_29,sales_mean_28_7,sales_mean_7_14,sales_std_7_28,sales_std_28_7,sales_std_1_31,sales_std_7_7,sales_std_1_32,sales_std_1_30,sales_std_1_29,sales_std_1_28,sales_std_1_14,sales_std_1_7,sales_std_7_14,sales_std_28_28,time_since_2013-01-01 00:00:00,year,quarter_of_year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend,onpromotion,oil_price_interpolated,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean,is_Local_holiday_Local,is_Local_holiday_no_holiday,is_Regional_holiday_Regional,is_Regional_holiday_no_holiday,is_National_holiday_National,is_National_holiday_no_holiday,earthquake_day_no,earthquake_day_yes,earthquake_impact_no,earthquake_impact_yes,wage_day_no,wage_day_yes,store_nbr,family,sales,transactions
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
10_AUTOMOTIVE,2013-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.79,,0.13,0.24,0.09,0.16,0.35,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,10,AUTOMOTIVE,0.0,0.0
10_AUTOMOTIVE,2013-01-02,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.33,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10,AUTOMOTIVE,3.0,1293.0
10_AUTOMOTIVE,2013-01-03,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.5,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10,AUTOMOTIVE,2.0,1157.0
10_AUTOMOTIVE,2013-01-04,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.67,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10,AUTOMOTIVE,2.0,970.0
10_AUTOMOTIVE,2013-01-05,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.83,1.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10,AUTOMOTIVE,0.0,1269.0


In [None]:
# Check the columns
# The bare expression is the last statement of the cell, so the notebook
# renders the column Index of the pipeline_2 training output as the cell result.
train_data_pl2.columns

# Optional deeper inspection (disabled): for every column, print its name,
# the number of distinct values and the distinct values themselves.
# for col in train_data_pl2.columns:
#     print(col, "\n", train_data_pl2[col].nunique(), "\n", train_data_pl2[col].unique(), "\n")

Index(['sales_lag_1', 'sales_lag_31', 'sales_lag_30', 'sales_lag_28',
       'sales_lag_29', 'sales_lag_7', 'sales_lag_2', 'sales_mean_1_28',
       'sales_mean_1_7', 'sales_mean_7_7', 'sales_mean_7_28',
       'sales_mean_1_32', 'sales_mean_28_28', 'sales_mean_1_31',
       'sales_mean_1_14', 'sales_mean_1_30', 'sales_mean_1_29',
       'sales_mean_28_7', 'sales_mean_7_14', 'sales_std_7_28',
       'sales_std_28_7', 'sales_std_1_31', 'sales_std_7_7', 'sales_std_1_32',
       'sales_std_1_30', 'sales_std_1_29', 'sales_std_1_28', 'sales_std_1_14',
       'sales_std_1_7', 'sales_std_7_14', 'sales_std_28_28',
       'time_since_2013-01-01 00:00:00', 'year', 'quarter_of_year',
       'month_of_year', 'week_of_year', 'week_of_month', 'day_of_month',
       'day_of_week', 'is_weekend', 'onpromotion', 'oil_price_interpolated',
       'enc_family_mean', 'enc_store_nbr_mean', 'enc_city_mean',
       'enc_state_mean', 'enc_type_mean', 'enc_cluster_mean',
       'is_Local_holiday_Local', 'is_Loca

#### Add the TARGET="sales" and COLUMNS=["family", "store_nbr"] back

In [None]:
# For pipeline_2
# Copy the target and the two identifier columns from each raw frame onto the
# matching transformed frame (train first, then test), in the order
# "sales" -> "family" -> "store_nbr".
for _transformed, _raw in ((train_data_pl2, train_data), (test_data_pl2, test_data)):
    for _column in ("sales", "family", "store_nbr"):
        _transformed[_column] = _raw[_column]

#### Save the preprocessed dataset and pipeline_2

In [None]:
# Set directories
# Output locations under the Google Drive mount: one folder for the transformed
# datasets and one for the fitted feature-engineering pipelines.
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/data_from_pipelines")
pipeline_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipelines")

# parents=True also creates missing intermediate folders (e.g. "for_models");
# without it mkdir raises FileNotFoundError when the parent does not exist yet.
# exist_ok=True keeps the cell re-runnable.
for _dir in [data_dir, pipeline_dir]:
    _dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist both transformed splits as parquet files inside data_dir
train_out_dir = data_dir.joinpath("train_data_pl2.parquet")
test_out_dir = data_dir.joinpath("test_data_pl2.parquet")
for _frame, _target in ((train_data_pl2, train_out_dir), (test_data_pl2, test_out_dir)):
    _frame.to_parquet(_target)

# Persist the fitted feature engineering pipeline next to the other pipelines;
# joblib.dump is the last expression, so its return value is shown by the cell
out_dir = pipeline_dir.joinpath("pipeline_2.joblib")
joblib.dump(pipeline_2, out_dir)

['/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipeline_2/pipeline_2.joblib']

### Pipeline_lgbm_scaled

|              |                                                              |                                                        |
| ------------ | ------------------------------------------------------------ | ------------------------------------------------------ |
| **TARGET**   | "sales"                                                      | Create lag and rolling window features -> MinMaxScaler |
| **FEATURES** |                                                              |                                                        |
| Numeric      | "transactions"                                               | Create a one-day lag feature -> MinMaxScaler |
| Numeric      | "onpromotion",    "oil_price_interpolated"                   | MinMaxScaler                                           |
| Mean encoded | "enc_family_mean",  "enc_store_nbr_mean", "enc_city_mean",  "enc_state_mean", "enc_type_mean", "enc_cluster_mean" | MinMaxScaler                                           |
| Time         | "time_since_2013-01-01 00:00:00"                             | MinMaxScaler                                            |
| Date         | "day_of_week",  "is_weekend", "day_of_month", "week_of_month", "week_of_year", "month_of_year", "quarter_of_year", "year" | MinMaxScaler                                   |
| Categorical  | 'is_Local_holiday' (binary),    'is_Regional_holiday' (binary),   'is_National_holiday' (binary),    'earthquake_day' (binary),    'earthquake_impact' (binary) ,   'wage_day' (binary) |                                    |

#### Create pipeline_lgbm_scaled

In [None]:
#---------- FEATURES ----------#
# Numeric inputs that pl_2 rescales with MinMaxScaler.
num_columns = [
    "onpromotion",
    "oil_price_interpolated",
    "enc_family_mean",
    "enc_store_nbr_mean",
    "enc_city_mean",
    "enc_state_mean",
    "enc_type_mean",
    "enc_cluster_mean",
    "sales_ewma7_mean",
    "sales_ewma7_std",
]

# Categorical flags. No transformer in this cell references this list; the
# columns reach the output only through remainder='passthrough' in `ct`.
cat_columns = [
    "is_Local_holiday",
    "is_Regional_holiday",
    "is_National_holiday",
    "earthquake_day",
    "earthquake_impact",
    "wage_day",
]

# Calendar feature names selected (and scaled) after the feature union in pl_1.
date_columns = [
    "year",
    "quarter_of_year",
    "month_of_year",
    "week_of_year",
    "week_of_month",
    "day_of_month",
    "day_of_week",
    "is_weekend",
]


#---------- TRANSFORMERS ----------#
# The four names below are not built in this cell. Referencing them bare has no
# effect other than raising NameError early when an earlier cell was skipped.
# date and time
date_transformer
time_transformer

# lag and window (descriptions as noted by the author; definitions not shown here)
target_lag_window_transformer  # lag & window features for "sales"
feature_lag_transformer  # lag feature for "transactions"

# scaler shared by every column transformer in this cell
minmax_transformer = MinMaxScaler()


#---------- PIPELINE ----------#
# Step 1a: run all feature-generating transformers side by side.
u = make_union(
    target_lag_window_transformer,
    feature_lag_transformer,
    time_transformer,
    date_transformer,
)

# Step 1b: scale the generated columns; anything not matched is dropped.
_generated_selector = make_column_selector(
    pattern="sales_lag|sales_mean|sales_std|time_since|transactions_mean|transactions_std"
)
_scale_generated = make_column_transformer(
    (minmax_transformer, _generated_selector),
    (minmax_transformer, date_columns),
    remainder="drop",
    verbose_feature_names_out=False,
)
pl_1 = make_pipeline(u, _scale_generated)

# Step 2: scale the raw numeric columns and pass every other column through.
ct = make_column_transformer(
    (minmax_transformer, num_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)
pl_2 = make_pipeline(ct)


# Final pipeline: column-wise concatenation of both branches.
pipeline_lgbm = make_union(pl_1, pl_2)
pipeline_lgbm

#### Run pipeline_lgbm_scaled

In [None]:
# Fit pipeline_lgbm on the training split only, then apply the fitted
# pipeline to both splits (fit returns the pipeline itself, so it can be chained)
train_data_lgbm = pipeline_lgbm.fit(train_data).transform(train_data)
test_data_lgbm = pipeline_lgbm.transform(test_data)

# Preview the first rows of the transformed training data
train_data_lgbm.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_lag_1,sales_lag_31,sales_lag_30,sales_lag_28,sales_lag_29,sales_lag_7,sales_lag_2,sales_mean_1_28,sales_mean_1_7,sales_mean_7_7,sales_mean_7_28,sales_mean_1_32,sales_mean_28_28,sales_mean_1_31,sales_mean_1_14,sales_mean_1_30,sales_mean_1_29,sales_mean_28_7,sales_mean_7_14,sales_std_7_28,sales_std_28_7,sales_std_1_31,sales_std_7_7,sales_std_1_32,sales_std_1_30,sales_std_1_29,sales_std_1_28,sales_std_1_14,sales_std_1_7,sales_std_7_14,sales_std_28_28,transactions_mean_28_7,transactions_std_28_7,time_since_2013-01-01 00:00:00,year,quarter_of_year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend,onpromotion,oil_price_interpolated,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean,sales_ewma7_mean,sales_ewma7_std,store_nbr,family,sales,transactions,is_Local_holiday,is_Regional_holiday,is_National_holiday,wage_day,earthquake_day,earthquake_impact
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1
10_AUTOMOTIVE,2013-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.79,,0.13,0.24,0.09,0.16,0.35,,,10,AUTOMOTIVE,0.0,0.0,no_holiday,no_holiday,National,yes,no,no
10_AUTOMOTIVE,2013-01-02,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.33,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,10,AUTOMOTIVE,3.0,1293.0,no_holiday,no_holiday,no_holiday,no,no,no
10_AUTOMOTIVE,2013-01-03,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.5,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,AUTOMOTIVE,2.0,1157.0,no_holiday,no_holiday,no_holiday,no,no,no
10_AUTOMOTIVE,2013-01-04,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.67,0.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,AUTOMOTIVE,2.0,970.0,no_holiday,no_holiday,no_holiday,no,no,no
10_AUTOMOTIVE,2013-01-05,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.83,1.0,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,AUTOMOTIVE,0.0,1269.0,no_holiday,no_holiday,no_holiday,no,no,no


In [None]:
# Check the columns
# Print the column Index of the pipeline_lgbm training output.
print(train_data_lgbm.columns)

# Optional deeper inspection (disabled): for every column, print its name,
# the number of distinct values and the distinct values themselves.
# for col in train_data_lgbm.columns:
#     print(col, "\n", train_data_lgbm[col].nunique(), "\n", train_data_lgbm[col].unique(), "\n")

Index(['sales_lag_1', 'sales_lag_31', 'sales_lag_30', 'sales_lag_28',
       'sales_lag_29', 'sales_lag_7', 'sales_lag_2', 'sales_mean_1_28',
       'sales_mean_1_7', 'sales_mean_7_7', 'sales_mean_7_28',
       'sales_mean_1_32', 'sales_mean_28_28', 'sales_mean_1_31',
       'sales_mean_1_14', 'sales_mean_1_30', 'sales_mean_1_29',
       'sales_mean_28_7', 'sales_mean_7_14', 'sales_std_7_28',
       'sales_std_28_7', 'sales_std_1_31', 'sales_std_7_7', 'sales_std_1_32',
       'sales_std_1_30', 'sales_std_1_29', 'sales_std_1_28', 'sales_std_1_14',
       'sales_std_1_7', 'sales_std_7_14', 'sales_std_28_28',
       'transactions_mean_28_7', 'transactions_std_28_7',
       'time_since_2013-01-01 00:00:00', 'year', 'quarter_of_year',
       'month_of_year', 'week_of_year', 'week_of_month', 'day_of_month',
       'day_of_week', 'is_weekend', 'onpromotion', 'oil_price_interpolated',
       'enc_family_mean', 'enc_store_nbr_mean', 'enc_city_mean',
       'enc_state_mean', 'enc_type_mean', 'en

In [None]:
# Show the dtype of every column produced by pipeline_lgbm
train_data_lgbm.dtypes

sales_lag_1             float64
sales_lag_31            float64
sales_lag_30            float64
sales_lag_28            float64
sales_lag_29            float64
                         ...   
is_Regional_holiday    category
is_National_holiday    category
wage_day               category
earthquake_day         category
earthquake_impact      category
Length: 62, dtype: object

In [None]:
# DO NOT NEED

#### Add the TARGET="sales" and COLUMNS=["family", "store_nbr"] back
# # For pipeline_gbm
# train_data_lgbm["sales"] = train_data["sales"]
# train_data_lgbm["family"] = train_data["family"]
# train_data_lgbm["store_nbr"] = train_data["store_nbr"]

# test_data_lgbm["sales"] = test_data["sales"]
# test_data_lgbm["family"] = test_data["family"]
# test_data_lgbm["store_nbr"] = test_data["store_nbr"]

#### Save the preprocessed dataset and pipeline_lgbm_scaled

In [None]:
# Reduce size
# reduce_memory_usage is the helper from the "Utility functions" section; with
# its default verbose=True each call reports the resulting memory footprint.
# NOTE(review): presumably it downcasts numeric dtypes — confirm that the
# reduced precision is acceptable for the saved features.
# Each frame is rebound before the next call, so only one frame is being
# processed at a time.
train_data_lgbm = reduce_memory_usage(df=train_data_lgbm)
test_data_lgbm = reduce_memory_usage(df=test_data_lgbm)

Memory usage decreased to 652.54 Mb (42.4% reduction)
Memory usage decreased to  6.57 Mb (41.0% reduction)


In [None]:
# Set directories
# Output locations under the Google Drive mount: one folder for the transformed
# datasets and one for the fitted feature-engineering pipelines.
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/data_from_pipelines")
pipeline_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipelines")

# parents=True also creates missing intermediate folders (e.g. "for_models");
# without it mkdir raises FileNotFoundError when the parent does not exist yet.
# exist_ok=True keeps the cell re-runnable.
for _dir in [data_dir, pipeline_dir]:
    _dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the dataset
train_out_dir = data_dir / "data_from_pipeline_lgbm_scaled/train_data_from_pipeline_lgbm_scaled.parquet"
test_out_dir = data_dir / "data_from_pipeline_lgbm_scaled/test_data_from_pipeline_lgbm_scaled.parquet"

# Both files live in a sub-folder of data_dir. Creating data_dir alone does not
# create that sub-folder, and to_parquet does not create missing directories,
# so make sure the parent folder exists before writing.
for _path in (train_out_dir, test_out_dir):
    _path.parent.mkdir(parents=True, exist_ok=True)

train_data_lgbm.to_parquet(train_out_dir)
test_data_lgbm.to_parquet(test_out_dir)

# Save the feature engineering pipeline
# (last expression of the cell: the list of written file names is displayed)
out_dir = pipeline_dir / "pipeline_lgbm_scaled.joblib"
joblib.dump(pipeline_lgbm, out_dir)

['/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipelines/pipeline_lgbm_scaled.joblib']

### Pipeline_lgbm_unscaled

|              |                                                              |                                                        |
| ------------ | ------------------------------------------------------------ | ------------------------------------------------------ |
| **TARGET**   | "sales"                                                      | Create lag and rolling window features -> passthrough (no scaling) |
| **FEATURES** |                                                              |                                                        |
| Numeric      | "transactions"                                               | Create a one-day lag feature -> passthrough (no scaling) |
| Numeric      | "onpromotion",    "oil_price_interpolated"                   | passthrough (no scaling)                               |
| Mean encoded | "enc_family_mean",  "enc_store_nbr_mean", "enc_city_mean",  "enc_state_mean", "enc_type_mean", "enc_cluster_mean" | passthrough (no scaling)                               |
| Time         | "time_since_2013-01-01 00:00:00"                             | passthrough (no scaling)                               |
| Date         | "day_of_week",  "is_weekend", "day_of_month", "week_of_month", "week_of_year", "month_of_year", "quarter_of_year", "year" | passthrough (no scaling)                       |
| Categorical  | 'is_Local_holiday' (binary),    'is_Regional_holiday' (binary),   'is_National_holiday' (binary),    'earthquake_day' (binary),    'earthquake_impact' (binary) ,   'wage_day' (binary) |                                    |

#### Create pipeline_lgbm_unscaled

In [None]:
# ---- Column groups that are forwarded unchanged by pl_2 ----
num_columns = [
    "onpromotion",
    "oil_price_interpolated",
    "enc_family_mean",
    "enc_store_nbr_mean",
    "enc_city_mean",
    "enc_state_mean",
    "enc_type_mean",
    "enc_cluster_mean",
]
cat_columns = [
    "is_Local_holiday",
    "is_Regional_holiday",
    "is_National_holiday",
    "earthquake_day",
    "earthquake_impact",
    "wage_day",
]
other_columns = ["family", "store_nbr", "sales", "transactions"]
# Order matters: it fixes the column order of the pl_2 output.
passthrough_columns = [*num_columns, *cat_columns, *other_columns]

# Calendar feature names selected after the feature union in pl_1.
date_columns = [
    "year",
    "quarter_of_year",
    "month_of_year",
    "week_of_year",
    "week_of_month",
    "day_of_month",
    "day_of_week",
    "is_weekend",
]

# PIPELINE FOR THE TRANSFORMERS
# Run all feature-generating transformers (created in earlier cells) side by side.
u = make_union(
    target_lag_window_transformer,
    feature_lag_transformer,
    time_transformer,
    date_transformer,
)

# Keep the generated lag/window/time and date columns as they are (no scaling);
# every other column coming out of the union is dropped.
_generated_selector = make_column_selector(
    pattern="sales_lag|sales_mean|sales_std|time_since|transactions_mean|transactions_std"
)
_keep_generated = make_column_transformer(
    ("passthrough", _generated_selector),
    ("passthrough", date_columns),
    remainder="drop",
    verbose_feature_names_out=False,
)
pl_1 = make_pipeline(u, _keep_generated)

# Forward the raw numeric, categorical and identifier/target columns untouched.
_keep_raw = make_column_transformer(
    ("passthrough", passthrough_columns),
    remainder="drop",
    verbose_feature_names_out=False,
)
pl_2 = make_pipeline(_keep_raw)

# FINAL PIPELINE
# Column-wise concatenation of both branches; displayed as the cell result.
pipeline_lgbm_unscaled = make_union(pl_1, pl_2)
pipeline_lgbm_unscaled

#### Run pipeline_lgbm_unscaled

In [None]:
# Locate the parquet files that hold the datasets with encoded features
enc_data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/enc_data")
train_enc_data_dir = enc_data_dir / "train_data.parquet"
test_enc_data_dir = enc_data_dir / "test_data.parquet"

# Read each split with pyarrow, then index it by (id, date) and sort the index
train_data = (
    pd.read_parquet(path=train_enc_data_dir, engine="pyarrow")
    .set_index(["id", "date"])
    .sort_index()
)
test_data = (
    pd.read_parquet(path=test_enc_data_dir, engine="pyarrow")
    .set_index(["id", "date"])
    .sort_index()
)

In [None]:
# Fit pipeline_lgbm_unscaled on the training split only, then apply the fitted
# pipeline to both splits (fit returns the pipeline itself, so it can be chained)
train_data_lgbm_unscaled = pipeline_lgbm_unscaled.fit(train_data).transform(train_data)
test_data_lgbm_unscaled = pipeline_lgbm_unscaled.transform(test_data)

# Preview the first rows of the transformed training data
train_data_lgbm_unscaled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sales_lag_1,sales_lag_31,sales_lag_30,sales_lag_28,sales_lag_29,sales_lag_7,sales_lag_2,sales_mean_1_28,sales_mean_1_7,sales_mean_7_7,sales_mean_7_28,sales_mean_1_32,sales_mean_28_28,sales_mean_1_31,sales_mean_1_14,sales_mean_1_30,sales_mean_1_29,sales_mean_28_7,sales_mean_7_14,sales_std_7_28,sales_std_28_7,sales_std_1_31,sales_std_7_7,sales_std_1_32,sales_std_1_30,sales_std_1_29,sales_std_1_28,sales_std_1_14,sales_std_1_7,sales_std_7_14,sales_std_28_28,transactions_mean_28_7,transactions_std_28_7,time_since_2013-01-01 00:00:00,year,quarter_of_year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend,onpromotion,oil_price_interpolated,enc_family_mean,enc_store_nbr_mean,enc_city_mean,enc_state_mean,enc_type_mean,enc_cluster_mean,is_Local_holiday,is_Regional_holiday,is_National_holiday,earthquake_day,earthquake_impact,wage_day,family,store_nbr,sales,transactions
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
10_AUTOMOTIVE,2013-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,2013,1,1,1,1,1,1,0,0.0,93.14,,254.53,255.14,73.6,326.74,708.23,no_holiday,no_holiday,National,no,no,yes,AUTOMOTIVE,10,0.0,0.0
10_AUTOMOTIVE,2013-01-02,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,2013,1,1,1,1,2,2,0,0.0,93.14,0.0,0.0,0.0,0.0,0.0,0.0,no_holiday,no_holiday,no_holiday,no,no,no,AUTOMOTIVE,10,3.0,1293.0
10_AUTOMOTIVE,2013-01-03,3.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2013,1,1,1,1,3,3,0,0.0,92.97,1.5,1.5,1.5,1.5,1.5,1.5,no_holiday,no_holiday,no_holiday,no,no,no,AUTOMOTIVE,10,2.0,1157.0
10_AUTOMOTIVE,2013-01-04,2.0,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,3,2013,1,1,1,1,4,4,0,0.0,93.12,1.67,1.67,1.67,1.67,1.67,1.67,no_holiday,no_holiday,no_holiday,no,no,no,AUTOMOTIVE,10,2.0,970.0
10_AUTOMOTIVE,2013-01-05,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,4,2013,1,1,1,1,5,5,1,0.0,93.12,1.75,1.75,1.75,1.75,1.75,1.75,no_holiday,no_holiday,no_holiday,no,no,no,AUTOMOTIVE,10,0.0,1269.0


In [None]:
# List every column produced by pipeline_lgbm_unscaled
print(train_data_lgbm_unscaled.columns)

# For each column: its name, the number of distinct values, and the values themselves
for col in train_data_lgbm_unscaled.columns:
    _values = train_data_lgbm_unscaled[col]
    print(col, "\n", _values.nunique(), "\n", _values.unique(), "\n")

Index(['sales_lag_1', 'sales_lag_31', 'sales_lag_30', 'sales_lag_28',
       'sales_lag_29', 'sales_lag_7', 'sales_lag_2', 'sales_mean_1_28',
       'sales_mean_1_7', 'sales_mean_7_7', 'sales_mean_7_28',
       'sales_mean_1_32', 'sales_mean_28_28', 'sales_mean_1_31',
       'sales_mean_1_14', 'sales_mean_1_30', 'sales_mean_1_29',
       'sales_mean_28_7', 'sales_mean_7_14', 'sales_std_7_28',
       'sales_std_28_7', 'sales_std_1_31', 'sales_std_7_7', 'sales_std_1_32',
       'sales_std_1_30', 'sales_std_1_29', 'sales_std_1_28', 'sales_std_1_14',
       'sales_std_1_7', 'sales_std_7_14', 'sales_std_28_28',
       'transactions_mean_28_7', 'transactions_std_28_7',
       'time_since_2013-01-01 00:00:00', 'year', 'quarter_of_year',
       'month_of_year', 'week_of_year', 'week_of_month', 'day_of_month',
       'day_of_week', 'is_weekend', 'onpromotion', 'oil_price_interpolated',
       'enc_family_mean', 'enc_store_nbr_mean', 'enc_city_mean',
       'enc_state_mean', 'enc_type_mean', 'en

In [None]:
# Show the dtype of every column produced by pipeline_lgbm_unscaled
train_data_lgbm_unscaled.dtypes

sales_lag_1                        float32
sales_lag_31                       float32
sales_lag_30                       float32
sales_lag_28                       float32
sales_lag_29                       float32
sales_lag_7                        float32
sales_lag_2                        float32
sales_mean_1_28                    float64
sales_mean_1_7                     float64
sales_mean_7_7                     float64
sales_mean_7_28                    float64
sales_mean_1_32                    float64
sales_mean_28_28                   float64
sales_mean_1_31                    float64
sales_mean_1_14                    float64
sales_mean_1_30                    float64
sales_mean_1_29                    float64
sales_mean_28_7                    float64
sales_mean_7_14                    float64
sales_std_7_28                     float64
sales_std_28_7                     float64
sales_std_1_31                     float64
sales_std_7_7                      float64
sales_std_1

#### Save the preprocessed dataset and pipeline_lgbm_unscaled

In [None]:
# Set directories
# Output locations under the Google Drive mount: one folder for the transformed
# datasets and one for the fitted feature-engineering pipelines.
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/data_from_pipelines")
pipeline_dir = Path("/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipelines")

# parents=True also creates missing intermediate folders (e.g. "for_models");
# without it mkdir raises FileNotFoundError when the parent does not exist yet.
# exist_ok=True keeps the cell re-runnable.
for _dir in [data_dir, pipeline_dir]:
    _dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the dataset
train_out_dir = data_dir / "train_data_from_pipeline_lgbm_unscaled.parquet"
test_out_dir = data_dir / "test_data_from_pipeline_lgbm_unscaled.parquet"
train_data_lgbm_unscaled.to_parquet(train_out_dir)
test_data_lgbm_unscaled.to_parquet(test_out_dir)

# Save the feature engineering pipeline
# The file name must identify the *unscaled* pipeline: pipeline_dir is the same
# folder the scaled pipeline is written to, so reusing
# "pipeline_lgbm_scaled.joblib" here would silently overwrite that artifact.
# (last expression of the cell: the list of written file names is displayed)
out_dir = pipeline_dir / "pipeline_lgbm_unscaled.joblib"
joblib.dump(pipeline_lgbm_unscaled, out_dir)

['/content/drive/MyDrive/Colab Notebooks/Store-Sales/for_models/pipeline_lgbm_unscaled/pipeline_lgbm_scaled.joblib']