In [1]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

import pickle
import pandas as pd
import numpy as np
import seaborn as sns
# VIEWING OPTIONS IN THE NOTEBOOK
from sklearn import set_config; set_config(display='diagram')

In [2]:
#load data
df = pd.read_pickle(f'../models/all_df.pickle')

#reset row index
df.reset_index(drop=True, inplace=True)

In [3]:
columns_drop = ['sales', 'date', 'store_nbr']
df_to_preproc = df.drop(columns = columns_drop, axis=1)

In [4]:

numerical_features = ['onpromotion', 'oil_price', 'oil_price_interpolated', 'transactions', 'cluster']
ordinal_features = ['city', 'state', 'locale', 'type_stores', 'locale_name', 'type_holiday']
target = 'sales'

# Create transformers
numerical_transformer = MinMaxScaler()

#categorical_transformer = OneHotEncoder(handle_unknown='ignore')
ordinal_transformer = OrdinalEncoder()

# Create the column transformer
transformer = ColumnTransformer([
    ('num_tr', numerical_transformer, numerical_features),
    ('ord_tr', ordinal_transformer, ordinal_features)
])


In [5]:
transformer

In [6]:
# Fit and transform the DataFrame
transformer.fit(df_to_preproc)
transformed_df = transformer.transform(df_to_preproc)

# Extract the target variable
X = transformed_df
y = df[target]

# Include all column names in X and y
X_columns = df_to_preproc.columns.tolist()
y_column = target

# Create a new DataFrame for X with column names
X_df = pd.DataFrame(X, columns=X_columns)

# # Create a new DataFrame for y with the target variable name
y_df = pd.DataFrame(y, columns=[y_column])

# # Concatenate X_df and y_df horizontally to create a new DataFrame with X, y, and all column names
df_new = pd.concat([X_df, y_df], axis=1)


In [7]:
df_new['store_nbr'] = df['store_nbr']
df_new['date'] = df['date']
df_new['sales'] = df['sales']
df_new[columns_drop] = df[columns_drop]

In [8]:
df_new

Unnamed: 0,onpromotion,city,state,type_stores,cluster,type_holiday,locale,locale_name,oil_price,oil_price_interpolated,transactions,sales,store_nbr,date
0,0.000000,0.841982,0.841982,0.252095,0.750,18.0,12.0,5.0,3.0,21.0,10.0,7417.148000,1,2013-01-02
1,0.000000,0.840445,0.840445,0.218817,0.750,18.0,12.0,5.0,3.0,21.0,10.0,5873.244001,1,2013-01-03
2,0.000000,0.841801,0.841801,0.222408,0.750,18.0,12.0,5.0,3.0,21.0,10.0,5919.879001,1,2013-01-04
3,0.000000,0.000000,0.000000,0.180034,0.750,18.0,12.0,4.0,3.0,5.0,12.0,6318.785010,1,2013-01-05
4,0.000000,0.000000,0.000000,0.061647,0.750,18.0,12.0,5.0,3.0,21.0,10.0,2199.087000,1,2013-01-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83483,0.038056,0.441240,0.441240,0.091333,0.125,5.0,10.0,4.0,2.0,5.0,11.0,8513.834000,54,2017-08-11
83484,0.031637,0.000000,0.000000,0.107493,0.125,5.0,10.0,5.0,2.0,21.0,10.0,9139.678002,54,2017-08-12
83485,0.039661,0.000000,0.000000,0.125569,0.125,5.0,10.0,5.0,2.0,21.0,10.0,14246.827996,54,2017-08-13
83486,0.028886,0.430212,0.430212,0.097319,0.125,5.0,10.0,5.0,2.0,21.0,10.0,11882.994000,54,2017-08-14
