In [None]:

# Basic
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

# Viz
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.preprocessing import FunctionTransformer # custom transformer helper function
from sklearn.base import BaseEstimator, TransformerMixin # custom transformer


sklearn.set_config(enable_metadata_routing=True)
# Save and Load Model
import joblib


# Modeling
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet



# Metrics (Evaluasi Model)
from sklearn.metrics import r2_score, mean_absolute_error


# Model selection
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Warning
import warnings
warnings.filterwarnings('ignore')



This notebook demonstrates a more advanced preprocessing workflow, intended to serve as a comparison with the simpler approach presented in the other notebook.


In [2]:
# Location of the BigMart CSV. Set the BIGMART_CSV environment variable to run the
# notebook on another machine; the default is the path this notebook was written with.
import os

DATA_PATH = os.environ.get(
    "BIGMART_CSV",
    r'C:\Users\Hewlett Packard\Desktop\SAYA\project-3-ml-bigmart-sales-demand\data\bigmart.csv',
)

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


>## Data Preprocessing 

Data preprocessing dibuat menyesuaikan pola data yang ada pada dataset; oleh karena itu, penanganan dibuat berdasarkan masalah yang ditemukan pada dataset tersebut.

### Custom data prep
* Visibility zero handling
* Fat standardizer
* Rare handler
* Outlet size imputer

### Column transformer data prep  
**Numerical** (Continuous)
* `Item_Weight`  standard scaling  
* `Item_Visibility`  standard scaling
* `Item_MRP`  standard scaling  
None of them contains extreme outliers.

**Categorical nominal**  
* `Item_Type`  binary encoding  (too many unique values)
* `Item_Fat_Content`  one hot encoding  
* `Outlet_Type`  one hot encoding
* `Outlet_Establishment_Year` ← (its role here is categorical) one hot encoding

**Ordinal**
* `Outlet_Location_Type` ordinal encoding  
* `Outlet_Size`  ordinal encoding  
datanya berjenjang

## Data Splitting

In [3]:
# Every column except the two identifier columns and the label is used as a predictor.
target = ["Item_Outlet_Sales"]
features = df.columns.drop(
    ["Outlet_Identifier", "Item_Identifier", "Item_Outlet_Sales"]
).to_list()

# `target` is a list, so y is a one-column DataFrame rather than a Series.
X, y = df[features], df[target]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2026
)

## Custom Data Preprocessing



### **Convert Year from Int to String**

In [4]:
def est_year_to_string(X):
    """Return a copy of ``X`` whose ``Outlet_Establishment_Year`` column is cast to pandas "string" dtype.

    The input frame is left untouched and the column keeps its position.
    """
    year_as_text = X["Outlet_Establishment_Year"].astype("string")
    return X.assign(Outlet_Establishment_Year=year_as_text)

In [5]:
# Wrap est_year_to_string in a FunctionTransformer so it can be used as a pipeline step
# (FunctionTransformer itself is imported in the first cell of the notebook).
est_year_to_string_transformer = FunctionTransformer(
    est_year_to_string,
    validate=False
)

**Function testing**

In [6]:
X_train_temp = X_train.copy()

In [7]:
X_train_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6818 entries, 6057 to 2305
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5648 non-null   float64
 1   Item_Fat_Content           6818 non-null   object 
 2   Item_Visibility            6818 non-null   float64
 3   Item_Type                  6818 non-null   object 
 4   Item_MRP                   6818 non-null   float64
 5   Outlet_Establishment_Year  6818 non-null   int64  
 6   Outlet_Size                4918 non-null   object 
 7   Outlet_Location_Type       6818 non-null   object 
 8   Outlet_Type                6818 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 532.7+ KB


In [8]:
X_train_temp = est_year_to_string_transformer.fit_transform(X_train_temp)

In [9]:
X_train_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6818 entries, 6057 to 2305
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5648 non-null   float64
 1   Item_Fat_Content           6818 non-null   object 
 2   Item_Visibility            6818 non-null   float64
 3   Item_Type                  6818 non-null   object 
 4   Item_MRP                   6818 non-null   float64
 5   Outlet_Establishment_Year  6818 non-null   string 
 6   Outlet_Size                4918 non-null   object 
 7   Outlet_Location_Type       6818 non-null   object 
 8   Outlet_Type                6818 non-null   object 
dtypes: float64(3), object(5), string(1)
memory usage: 532.7+ KB


### **Visibility Zero Handling**

In [10]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ZeroToNaNTransformer(BaseEstimator, TransformerMixin):
    """Transformer that marks zeros in a chosen column as missing values.

    Parameters
    ----------
    column
        Key used to select the data to rewrite, i.e. whatever ``X[column]`` accepts.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every 0 under ``self.column`` is NaN."""
        selected = self.column
        result = X.copy()
        result[selected] = result[selected].replace(to_replace=0, value=np.nan)
        return result


In [11]:
visibility_zero_handler = ZeroToNaNTransformer(column="Item_Visibility")

**Function testing**

In [12]:
X_train_temp = X_train.copy()

In [13]:
X_train_temp.isna().sum()

Item_Weight                  1170
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  1900
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [14]:
# Apply the ZeroToNaNTransformer to 'Item_Visibility' column in X_train_temp(testing df)
X_train_temp = visibility_zero_handler.fit_transform(X_train_temp)

In [15]:
X_train_temp.isna().sum()

Item_Weight                  1170
Item_Fat_Content                0
Item_Visibility               427
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  1900
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

### **Fat Standardizer**

In [16]:
# Function to standardize 'Item_Fat_Content' that apply to entire dataframe
def fat_standardizer(X):
    """Return a copy of ``X`` with ``Item_Fat_Content`` normalised to canonical labels.

    Each value is lower-cased and stripped of surrounding whitespace, then the
    abbreviations "lf" and "reg" are expanded to "low fat" and "regular".
    Values not listed in the alias table keep their lower-cased, stripped form.
    """
    aliases = {
        "lf": "low fat",
        "low fat": "low fat",
        "reg": "regular",
        "regular": "regular",
    }

    cleaned = X["Item_Fat_Content"].str.lower().str.strip()

    result = X.copy()
    result["Item_Fat_Content"] = cleaned.replace(aliases)
    return result


In [17]:
# Wrap fat_standardizer in a FunctionTransformer so it can be used as a pipeline step
# (FunctionTransformer itself is imported in the first cell of the notebook).
fat_standardizer_transformer = FunctionTransformer(
    fat_standardizer,
    validate=False
)

**Function testing**

In [18]:
X_train_temp = X_train.copy()

In [19]:
X_train_temp = fat_standardizer_transformer.transform(X_train_temp)

In [20]:
X_train_temp["Item_Fat_Content"].unique()

array(['regular', 'low fat'], dtype=object)

### **Rare Handler**

Secara umum, berdasarkan section EDA, tidak ada rare type pada `Item_Type`. Namun, penanganan ini tetap dilakukan untuk mengantisipasi potensi salah input.

In [21]:
from sklearn.preprocessing import FunctionTransformer

def replace_rare_categories(X, threshold=0.01, columns=None):
    """Collapse infrequent values of the chosen columns into the label "Other".

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is copied, never modified in place.
    threshold : float, default=0.01
        A value is rare when its relative frequency within its column is
        strictly below this number.
    columns : str, iterable of str, or None, default=None
        Column(s) to process. A single column may be given as a plain string.
        ``None`` processes every column that pandas reports as string-like
        (previously ``None`` raised ``TypeError``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with rare values replaced by "Other".

    Notes
    -----
    Frequencies are computed from the ``X`` passed to this call and nothing is
    remembered between calls, so the set of rare values depends on the batch
    being transformed.
    """
    X = X.copy()

    if columns is None:
        # Only string-like columns can sensibly receive the "Other" label.
        columns = [c for c in X.columns if pd.api.types.is_string_dtype(X[c])]
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    for col in columns:
        freq = X[col].value_counts(normalize=True)
        rare_categories = freq.index[freq < threshold]
        if len(rare_categories):
            X[col] = X[col].replace(list(rare_categories), "Other")
    return X


In [22]:
rare_transformer = FunctionTransformer(
    replace_rare_categories,
    kw_args={"columns": ["Item_Type"]},
    validate=False
)


### **Outlet Size Imputer**

Sebelumnya kita mengetahui bahwa terdapat missing value pada `Outlet_Size`. Pada dasarnya, `Outlet_Size` bisa diisi berdasarkan `Outlet_Type`: nilai yang kosong diisi dengan modus `Outlet_Size` pada `Outlet_Type` yang sama.

In [23]:
def outlet_size_imputer(X):
    """Fill missing ``Outlet_Size`` values using the most common size per ``Outlet_Type``.

    For every row whose ``Outlet_Size`` is missing, the most frequent size among
    rows of the same ``Outlet_Type`` is used. Rows whose outlet type has no
    known size at all fall back to the most frequent size in the whole frame.
    If the frame contains no known size, the remaining values are left missing
    instead of raising ``IndexError``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``Outlet_Type`` and ``Outlet_Size``. It is copied,
        never modified in place.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``Outlet_Size`` imputed.

    Notes
    -----
    The per-type and overall modes are computed from the ``X`` passed to this
    call; nothing is remembered between calls.
    """
    X = X.copy()

    # Build the per-type lookup from rows with a known size only, so that a type
    # whose sizes are all missing simply has no entry (and maps to NaN below).
    known = X.dropna(subset=["Outlet_Size"])
    outlet_size_map = {
        outlet_type: sizes.mode().iloc[0]
        for outlet_type, sizes in known.groupby("Outlet_Type")["Outlet_Size"]
        if len(sizes)
    }

    filled = X["Outlet_Size"].fillna(X["Outlet_Type"].map(outlet_size_map))

    # Overall fallback; mode() is empty when every size in the frame is missing.
    overall_modes = X["Outlet_Size"].mode()
    if not overall_modes.empty:
        filled = filled.fillna(overall_modes.iloc[0])

    X["Outlet_Size"] = filled
    return X


In [24]:
outlet_size_transformer = FunctionTransformer(
    outlet_size_imputer,
    validate=False
)


**Function testing**

In [25]:
X_train_temp = X_train.copy()

In [26]:
X_train_temp = outlet_size_transformer.transform(X_train_temp)

In [27]:
X_train_temp.isna().sum()

Item_Weight                  1170
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

## Column Transformer Preprocessing

In [28]:
# Columns handed to the ordinal branch of the ColumnTransformer.
ordinal_cols = ["Outlet_Size", "Outlet_Location_Type"]

# One category list per entry of ordinal_cols, in the same order as ordinal_cols.
# Within each list the position is the order given to the encoder.
ordinal_categories = [
    ['Small', 'Medium', 'High'],                # Outlet_Size
    ['Tier 3', 'Tier 2', 'Tier 1']               # Outlet_Location_Type
]

### **Column Transformer Pipeline**

In [29]:
ct_preprocess = ColumnTransformer(
    transformers=[
        # Continuous features: median-impute, then standardise.
        (
            "num",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]),
            ["Item_Weight", "Item_Visibility", "Item_MRP"],
        ),
        # Nominal categoricals: fill with the most frequent value, then one-hot encode
        # (the first level of each column is dropped).
        (
            "nom",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore")),
            ]),
            ["Item_Fat_Content", "Outlet_Type", "Outlet_Establishment_Year"],
        ),
        # Ordered categoricals: fill with the most frequent value, then encode by rank;
        # values outside ordinal_categories are encoded as -1.
        (
            "ord",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                (
                    "ordinal",
                    OrdinalEncoder(
                        categories=ordinal_categories,
                        handle_unknown="use_encoded_value",
                        unknown_value=-1,
                    ),
                ),
            ]),
            ordinal_cols,
        ),
        # Item_Type goes through category_encoders' binary encoder.
        ("bin", ce.BinaryEncoder(), ["Item_Type"]),
    ],
    remainder="passthrough",
)


### **Main Pipeline**

In [30]:
# Full preprocessing chain: the frame-level custom steps run first, in the order listed,
# and the ColumnTransformer (imputation, scaling, encoding) runs last.
pipe = Pipeline(
    steps=[
        ("est_year_to_string", est_year_to_string_transformer),
        ("visibility_zero_handler", visibility_zero_handler),
        ("fat_standardizer", fat_standardizer_transformer),
        ("rare_handler", rare_transformer),
        ("outlet_size_imputer", outlet_size_transformer),
        ("preprocess", ct_preprocess),
    ]
)


### **Pipeline Testing**

In [31]:
# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted pipeline to the test split.
X_train_prep = pipe.fit_transform(X_train)
X_test_prep  = pipe.transform(X_test)

In [32]:
X_train_prep.shape


(6818, 22)

In [33]:
X_test_prep.shape


(1705, 22)

In [34]:
display(np.isnan(X_train_prep).any())
display(np.isinf(X_train_prep).any())

np.False_

np.False_

In [35]:
X_train_prep[:5]


array([[-0.05961067,  1.18875611,  0.7406247 ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [-0.8633839 , -0.29529132, -0.41067608,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         2.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 1.91436181, -0.15412297, -0.84297916,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  2.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ],
       [-0.32083697

In [36]:
pipe.named_steps["preprocess"].get_feature_names_out()


array(['num__Item_Weight', 'num__Item_Visibility', 'num__Item_MRP',
       'nom__Item_Fat_Content_regular',
       'nom__Outlet_Type_Supermarket Type1',
       'nom__Outlet_Type_Supermarket Type2',
       'nom__Outlet_Type_Supermarket Type3',
       'nom__Outlet_Establishment_Year_1987',
       'nom__Outlet_Establishment_Year_1997',
       'nom__Outlet_Establishment_Year_1998',
       'nom__Outlet_Establishment_Year_1999',
       'nom__Outlet_Establishment_Year_2002',
       'nom__Outlet_Establishment_Year_2004',
       'nom__Outlet_Establishment_Year_2007',
       'nom__Outlet_Establishment_Year_2009', 'ord__Outlet_Size',
       'ord__Outlet_Location_Type', 'bin__Item_Type_0',
       'bin__Item_Type_1', 'bin__Item_Type_2', 'bin__Item_Type_3',
       'bin__Item_Type_4'], dtype=object)

### **Save Pipeline**

In [37]:
import joblib

# Column groups stored next to the pipeline so a consumer of the artifact knows which
# raw columns are numeric and which are categorical.
NUM_COLS = ["Item_Weight", "Item_Visibility", "Item_MRP"]
CAT_COLS = ["Item_Fat_Content", "Item_Type", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Outlet_Establishment_Year"]

# NOTE(review): `pipe` wraps functions and a class defined in this notebook; loading the
# artifact from another process presumably requires those definitions to be importable
# there - confirm before relying on the file outside this notebook.
joblib.dump(
    {
        "preprocessor": pipe,
        "num_cols": NUM_COLS,
        "cat_cols": CAT_COLS,
        # Must match the label column of the dataset exactly (it was stored lower-cased
        # before, which does not exist as a column name).
        "target": "Item_Outlet_Sales"
    },
    "preprocess_artifacts.joblib"
)


['preprocess_artifacts.joblib']

In [38]:
artifacts = joblib.load("preprocess_artifacts.joblib")

In [39]:
artifacts["num_cols"]

['Item_Weight', 'Item_Visibility', 'Item_MRP']

In [40]:
artifacts["cat_cols"]

['Item_Fat_Content',
 'Item_Type',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type',
 'Outlet_Establishment_Year']

In [41]:
artifacts["preprocessor"]

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Baseline

## OLS

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Baseline: ordinary least squares fitted on the preprocessed training matrix.
model_ols = LinearRegression().fit(X_train_prep, y_train)

# Score on the held-out split.
y_pred = model_ols.predict(X_test_prep)
print("R2 :", r2_score(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))


R2 : 0.5677831055699456
MAE: 839.7573242848028


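MAE sebesar ±840 berarti prediksi rata-rata meleset sekitar 840 dari nilai penjualan sebenarnya, sedangkan rata-rata `Item_Outlet_Sales` adalah 2181 (selisihnya sekitar 38% dari rata-rata). Angka ini masih cukup tinggi.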

In [44]:
y_pred[:5]

array([[2082.75002625],
       [3055.04692064],
       [1992.16387087],
       [3550.40686412],
       [2685.11753626]])

In [45]:
df["Item_Outlet_Sales"].describe()

count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64

In [46]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline on the preprocessed matrices.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# y_train is a one-column DataFrame; flatten it to 1-D, the target shape the forest
# expects (a column vector triggers scikit-learn's DataConversionWarning, which is
# hidden here only because warnings are globally silenced).
rf.fit(X_train_prep, np.ravel(y_train))

y_pred = rf.predict(X_test_prep)


In [47]:
from sklearn.metrics import mean_absolute_error, r2_score

# Held-out error of the predictions in y_pred (the random-forest predictions when the
# notebook is run in order), displayed as a (MAE, R2) tuple.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2


(803.5468719243403, 0.563917913943141)

In [48]:
import statsmodels.api as sm

# Make sure X is numeric and contains no NaN before handing it to statsmodels.
# add_constant is used to give the model an intercept term.
X_train_sm = sm.add_constant(X_train_prep)

# NOTE(review): the recorded summary shows an intercept of about -4e15, which looks
# like a symptom of collinear columns in the design matrix - confirm before
# interpreting individual coefficients.
ols_sm = sm.OLS(y_train, X_train_sm).fit()
print(ols_sm.summary())


                            OLS Regression Results                            
Dep. Variable:      Item_Outlet_Sales   R-squared:                       0.562
Model:                            OLS   Adj. R-squared:                  0.561
Method:                 Least Squares   F-statistic:                     484.4
Date:                Thu, 22 Jan 2026   Prob (F-statistic):               0.00
Time:                        08:01:30   Log-Likelihood:                -57580.
No. Observations:                6818   AIC:                         1.152e+05
Df Residuals:                    6799   BIC:                         1.153e+05
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.131e+15   1.77e+15     -2.331      0.0

In [49]:
from sklearn.linear_model import Lasso

# L1-regularised linear model with a very small penalty.
lasso = Lasso(alpha=0.001, max_iter=1000).fit(X_train_prep, y_train)

# Score on the held-out split.
y_pred = lasso.predict(X_test_prep)
print("R2  :", r2_score(y_true=y_test, y_pred=y_pred))
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=y_pred))


R2  : 0.5677835370345722
MAE : 839.7561137462764


In [50]:
preprocess = pipe.named_steps['preprocess']

feature_names = preprocess.get_feature_names_out()
feature_names

array(['num__Item_Weight', 'num__Item_Visibility', 'num__Item_MRP',
       'nom__Item_Fat_Content_regular',
       'nom__Outlet_Type_Supermarket Type1',
       'nom__Outlet_Type_Supermarket Type2',
       'nom__Outlet_Type_Supermarket Type3',
       'nom__Outlet_Establishment_Year_1987',
       'nom__Outlet_Establishment_Year_1997',
       'nom__Outlet_Establishment_Year_1998',
       'nom__Outlet_Establishment_Year_1999',
       'nom__Outlet_Establishment_Year_2002',
       'nom__Outlet_Establishment_Year_2004',
       'nom__Outlet_Establishment_Year_2007',
       'nom__Outlet_Establishment_Year_2009', 'ord__Outlet_Size',
       'ord__Outlet_Location_Type', 'bin__Item_Type_0',
       'bin__Item_Type_1', 'bin__Item_Type_2', 'bin__Item_Type_3',
       'bin__Item_Type_4'], dtype=object)

In [51]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# Raw features go in; the pipeline applies the SAME preprocessing object (`pipe`)
# that was built above before fitting the model.
gbr_pipeline = Pipeline([
    ("preprocess", pipe),
    ("model", gbr)
])

# y_train is a one-column DataFrame; the regressor expects a 1-D target, so flatten it
# instead of relying on scikit-learn's implicit conversion (and its warning).
gbr_pipeline.fit(X_train, np.ravel(y_train))

y_pred = gbr_pipeline.predict(X_test)
mae_gbr = mean_absolute_error(y_test, y_pred)

mae_gbr


769.0881309393444

In [52]:
from sklearn.ensemble import ExtraTreesRegressor

etr = ExtraTreesRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

# Raw features go in; preprocessing (`pipe`) and the model are fitted together.
etr_pipeline = Pipeline([
    ("preprocess", pipe),
    ("model", etr)
])

# y_train is a one-column DataFrame; the regressor expects a 1-D target, so flatten it
# instead of relying on scikit-learn's implicit conversion (and its warning).
etr_pipeline.fit(X_train, np.ravel(y_train))

y_pred = etr_pipeline.predict(X_test)
mae_etr = mean_absolute_error(y_test, y_pred)

mae_etr


820.2424246930204

In [53]:
from xgboost import XGBRegressor

# Named xgb_model rather than xgb: the first cell does `import xgboost as xgb`, and
# rebinding that name to an estimator would hide the module for every later cell.
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Raw features go in; preprocessing (`pipe`) and the model are fitted together.
xgb_pipeline = Pipeline([
    ("preprocess", pipe),
    ("model", xgb_model)
])

xgb_pipeline.fit(X_train, y_train)

y_pred = xgb_pipeline.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred)

mae_xgb


803.4862670898438

In [54]:
from sklearn.linear_model import LinearRegression

# Linear regression fed through the shared preprocessing pipeline; fit() returns the
# fitted pipeline, so construction and fitting are chained.
lr_pipeline = Pipeline(
    steps=[
        ("preprocess", pipe),
        ("model", LinearRegression()),
    ]
).fit(X_train, y_train)

# Held-out mean absolute error, displayed as the cell output.
y_pred = lr_pipeline.predict(X_test)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred)

mae_lr


839.7573242848028

In [55]:
# Side-by-side test-set MAE of the four pipelines fitted above (lower is better).
print("GBR :", mae_gbr)
print("ETR :", mae_etr)
print("XGB :", mae_xgb)
print("LR  :", mae_lr)


GBR : 769.0881309393444
ETR : 820.2424246930204
XGB : 803.4862670898438
LR  : 839.7573242848028
