In [1]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder

import pickle
import pandas as pd
import numpy as np
import seaborn as sns
# VIEWING OPTIONS IN THE NOTEBOOK
from sklearn import set_config; set_config(display='diagram')

In [2]:
# Load the prepared dataset and renumber its rows 0..n-1 in the same step.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
df = pd.read_pickle('../models/all_df.pickle').reset_index(drop=True)

In [3]:
# Column groups; each list is routed to its own preprocessing step later on.
numerical_features = [
    'onpromotion',
    'oil_price',
    'oil_price_interpolated',
    'transactions',
    'cluster',
]
ordinal_features = ['city', 'state']
categorical_features = [
    'type_stores',
    'type_holiday',
    'locale',
    'locale_name',
]
# Name of the column to predict.
target = 'sales'

# Columns removed before preprocessing.
columns_drop = ['date', 'store_nbr']
df_to_preproc = df.drop(columns=columns_drop)

In [4]:
# Preview the frame left after dropping the columns listed in `columns_drop`.
df_to_preproc

Unnamed: 0,sales,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated,transactions
0,7417.148000,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.14,93.14,2111.0
1,5873.244001,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,92.97,92.97,1833.0
2,5919.879001,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,93.12,93.12,1863.0
3,6318.785010,0,Quito,Pichincha,D,13,Work Day,National,Ecuador,0.00,0.00,1509.0
4,2199.087000,0,Quito,Pichincha,D,13,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,520.0
...,...,...,...,...,...,...,...,...,...,...,...,...
83483,8513.834000,166,El Carmen,Manabi,C,3,Transfer,National,Ecuador,48.81,48.81,768.0
83484,9139.678002,138,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,903.0
83485,14246.827996,173,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,0.00,0.00,1054.0
83486,11882.994000,126,El Carmen,Manabi,C,3,Not Holiday,Not Holiday,Not Holiday,47.59,47.59,818.0


In [5]:
# ordinal-encode a categorical feature by ranking its categories on their mean target value
def encode(frame, feature, target_col=None):
    '''
    Order the categories of ``feature`` by their mean target value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding both ``feature`` and the target column.
    feature : str
        Name of the categorical column whose values are to be ordered.
    target_col : str, optional
        Name of the target column. When omitted, the notebook-level
        ``target`` variable is used, which is what the function always
        did before this parameter existed.

    Returns
    -------
    dict_keys
        The unique values of ``feature`` in ascending order of mean target.
        Values without a computable mean (e.g. a missing category) come
        last. Wrap in ``list(...)`` to obtain a list.
    '''
    if target_col is None:
        # Backward-compatible default: fall back to the notebook global.
        target_col = target

    # Unique categories in order of first appearance; this is the tie-break
    # order for categories whose mean target is identical.
    categories = pd.Index(frame[feature].unique())

    # Mean target per category, re-ordered to first-appearance order so that
    # every unique value (including a missing one) has a row.
    mean_by_category = (
        frame.groupby(feature)[target_col]
        .mean()
        .reindex(categories)
    )

    # A stable sort keeps the result deterministic when means are tied.
    ordered = mean_by_category.sort_values(kind='mergesort').index

    # Map category -> rank (1 = lowest mean target); callers only consume
    # the keys, which preserve the ranked order.
    ranks = {category: rank for rank, category in enumerate(ordered, start=1)}
    return ranks.keys()

In [6]:
# Build the preprocessing ColumnTransformer.

# For every ordinal feature, the list of its categories ranked by `encode`.
# NOTE(review): the ranking is computed from `df_to_preproc` including the
# target column, i.e. on all rows - confirm this is intended if a
# train/test split happens later, since it uses target information.
feat_ordinal_dict = {}
for feature in ordinal_features:
    feat_ordinal_dict[feature] = list(encode(df_to_preproc, feature))

numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# OrdinalEncoder pairs categories[i] with the i-th column it receives, and the
# ColumnTransformer below hands it the columns in `ordinal_features` order.
# The category lists therefore have to follow that same order; sorting the
# feature names alphabetically only works while the list happens to be sorted.
feat_ordinal = list(ordinal_features)
feat_ordinal_values_sorted = [feat_ordinal_dict[name] for name in feat_ordinal]

ordinal_transformer = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
)

# One named step per column group: scaling, one-hot and ordinal encoding.
transformer = ColumnTransformer([
    ('num_tr', numerical_transformer, numerical_features),
    ('cat_tr', categorical_transformer, categorical_features),
    ('ord_tr', ordinal_transformer, feat_ordinal),
])


In [7]:
# Display the assembled ColumnTransformer (rendered as a diagram because of
# set_config(display='diagram') in the import cell).
transformer

In [8]:
# Remove the target so that only predictor columns reach the transformer.
# errors='ignore' keeps this cell re-runnable: on a second run the target is
# already gone and a plain drop would raise KeyError.
df_to_preproc = df_to_preproc.drop(columns=[target], errors='ignore')

# Fit the preprocessing steps and transform the data in a single pass
# (ColumnTransformer.fit already transforms internally, so calling fit and
# then transform would do the work twice).
transformed_df = transformer.fit_transform(df_to_preproc)
transformed_df

<83488x60 sparse matrix of type '<class 'numpy.float64'>'
	with 835474 stored elements in Compressed Sparse Row format>

In [9]:
# Turn the transformer output into a DataFrame with named columns.
# ColumnTransformer may return either a SciPy sparse matrix or a dense array
# depending on the density of the result, so densify only when needed. The
# same guard lets this cell be re-run after `transformed_df` has already
# become a DataFrame.
if hasattr(transformed_df, 'toarray'):
    dense_values = transformed_df.toarray()
else:
    dense_values = transformed_df

# Column names come from the fitted transformer ("<step>__<column>" form).
# The resulting frame keeps only those columns, so a re-run starts clean.
column_names = transformer.get_feature_names_out()
transformed_df = pd.DataFrame(dense_values, columns=column_names)

# View the transformed DataFrame
print(transformed_df.head())


   num_tr__onpromotion  num_tr__oil_price  num_tr__oil_price_interpolated  \
0                  0.0           0.841982                        0.841982   
1                  0.0           0.840445                        0.840445   
2                  0.0           0.841801                        0.841801   
3                  0.0           0.000000                        0.000000   
4                  0.0           0.000000                        0.000000   

   num_tr__transactions  num_tr__cluster  cat_tr__type_stores_A  \
0              0.252095             0.75                    0.0   
1              0.218817             0.75                    0.0   
2              0.222408             0.75                    0.0   
3              0.180034             0.75                    0.0   
4              0.061647             0.75                    0.0   

   cat_tr__type_stores_B  cat_tr__type_stores_C  cat_tr__type_stores_D  \
0                    0.0                    0.0             

In [10]:
# Put the two columns that were dropped before preprocessing back onto the
# transformed features, unscaled. Values are aligned on the row index.
transformed_df = transformed_df.assign(
    store_nbr=df['store_nbr'],
    date=df['date'],
)

In [11]:
# Feature matrix (same object as `transformed_df`, not a copy) and target.
X = transformed_df
y = df['sales']

In [12]:
# Display the feature matrix and the target together as a tuple.
X,y

(       num_tr__onpromotion  num_tr__oil_price  num_tr__oil_price_interpolated  \
 0                 0.000000           0.841982                        0.841982   
 1                 0.000000           0.840445                        0.840445   
 2                 0.000000           0.841801                        0.841801   
 3                 0.000000           0.000000                        0.000000   
 4                 0.000000           0.000000                        0.000000   
 ...                    ...                ...                             ...   
 83483             0.038056           0.441240                        0.441240   
 83484             0.031637           0.000000                        0.000000   
 83485             0.039661           0.000000                        0.000000   
 83486             0.028886           0.430212                        0.430212   
 83487             0.046768           0.430031                        0.430031   
 
        num_tr

In [13]:
# Count missing values in every column of the feature matrix.

X.isna().sum()

num_tr__onpromotion                                   0
num_tr__oil_price                                     0
num_tr__oil_price_interpolated                        0
num_tr__transactions                                  0
num_tr__cluster                                       0
                                                     ..
cat_tr__locale_name_Santo Domingo de los Tsachilas    0
ord_tr__city                                          0
ord_tr__state                                         0
store_nbr                                             0
date                                                  0
Length: 62, dtype: int64