In [1]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

import pickle
import pandas as pd
import numpy as np
import seaborn as sns
# VIEWING OPTIONS IN THE NOTEBOOK
from sklearn import set_config; set_config(display='diagram')

In [3]:
# Columns handed to the numerical transformer of the preprocessing pipeline.
numerical_features = ['date', 'onpromotion', 'oil_price', 'oil_price_interpolated']

# No column is one-hot encoded at the moment: 'type', 'locale' and
# 'locale_name' are ordinal-encoded instead (see `ordinal_features`).
# The name is still defined, as an empty list, because the ColumnTransformer
# cell passes it to its 'cat_tr' step; leaving it undefined makes that cell
# raise NameError on a fresh kernel. An empty column list simply disables
# the step.
categorical_features = []

# Columns encoded as integers ranked by their mean target value.
ordinal_features = ['type', 'locale', 'locale_name']

# Name of the column to predict.
target = 'sales'

In [None]:
def encode(frame, feature, target_col=None):
    '''
    Order the categories of `feature` by their mean target value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding both the `feature` column and the target column.
    feature : str
        Name of the categorical column whose categories are ranked.
    target_col : str, optional
        Name of the target column. When omitted, the notebook-level
        `target` variable is used, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    dict_keys
        The distinct values of `feature`, sorted from the lowest to the
        highest mean target. Values for which no mean can be computed
        (such as missing values) are placed last.
    '''
    if target_col is None:
        # Backward-compatible default: rely on the notebook global.
        target_col = target

    # Mean of the target for every category of the selected feature.
    category_means = frame.groupby(feature)[target_col].mean()

    # Re-align on the order of first appearance before sorting so that
    # missing categories are kept and ties resolve as they did when the
    # ranking was built from `unique()`.
    ranked = category_means.reindex(frame[feature].unique()).sort_values()

    # Only the ordered category labels are needed by callers; a dict keeps
    # the insertion order and yields the same `dict_keys` return type.
    return dict.fromkeys(ranked.index).keys()

In [None]:
# Load the cleaned dataset produced earlier in the project.
# NOTE: unpickling can execute arbitrary code; only load files created by
# this project, never pickles from an untrusted source.
df = pd.read_pickle('../models/data_cleaning.pickle')

# For every ordinal feature, the list of its categories ordered by mean target.
feat_ordinal_dict = {}
for feature in ordinal_features:
    feat_ordinal_dict[feature] = list(encode(df, feature))

numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# OrdinalEncoder matches `categories[i]` to the i-th column it receives, so
# the column list and the category lists must share one order. Both are
# derived from `feat_ordinal` (alphabetical), and that same list is passed
# to the 'ord_tr' step below; passing `ordinal_features` there would pair
# each column with another column's categories.
feat_ordinal = sorted(feat_ordinal_dict.keys())
feat_ordinal_values_sorted = [feat_ordinal_dict[name] for name in feat_ordinal]

ordinal_transformer = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
)

# NOTE: `categorical_features` must be defined by the feature-list cell
# before this cell runs; an empty list disables the 'cat_tr' step.
transformer = ColumnTransformer([
    ('num_tr', numerical_transformer, numerical_features),
    ('cat_tr', categorical_transformer, categorical_features),
    ('ord_tr', ordinal_transformer, feat_ordinal),
])

# NOTE(review): the category ranking and the scalers are fitted on the whole
# frame, i.e. before any train/test split, which leaks target information
# into held-out rows if a split happens later -- confirm this is intended.
transformed_df = transformer.fit_transform(df)

X, y = transformed_df, df[target]

