##### STAGE 04 - PREPROCESSING PIPELINE FOR NUMERICAL AND CATEGORICAL FEATURES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import os
import pickle
import joblib

# Lift pandas' display truncation so wide frames render every column and
# displayed frames are never row-truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

###### Load the engineered_training_data and engineered_evaluation_data from `../data/processed_data`

In [2]:
# Load the engineered training and evaluation splits from the processed-data folder.
train_data = pd.read_csv("../data/processed_data/engineered_training_data.csv")
eval_data = pd.read_csv("../data/processed_data/engineered_evaluation_data.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_data.info()
print(f"Train data shape: {train_data.shape}")
print(f"Eval data shape: {eval_data.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576815 entries, 0 to 576814
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   median_ppsf              576815 non-null  float64
 1   median_dom               576815 non-null  float64
 2   sold_above_list          576815 non-null  float64
 3   off_market_in_two_weeks  576815 non-null  float64
 4   bus                      576815 non-null  float64
 5   hospital                 576815 non-null  float64
 6   mall                     576815 non-null  float64
 7   restaurant               576815 non-null  float64
 8   station                  576815 non-null  float64
 9   price                    576815 non-null  float64
dtypes: float64(10)
memory usage: 44.0 MB
None
Train data shape: (576815, 10)
Eval data shape: (148448, 10)


In [3]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station,price
0,31.813674,59.5,0.142857,0.043478,2.0,4.0,1.0,45.0,4.0,200773.999557
1,104.931794,290.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105863.681174
2,122.807018,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126572.277873
3,171.817343,126.0,0.153846,0.277778,0.0,0.0,0.0,0.0,0.0,352711.838012
4,68.638393,111.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,70415.266483


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every training column.
train_summary = train_data.describe()
train_summary

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station,price
count,576815.0,576815.0,576815.0,576815.0,576815.0,576815.0,576815.0,576815.0,576815.0,576815.0
mean,195.464364,69.474221,0.213779,0.239997,0.531968,4.072398,1.170703,48.516169,5.803733,340113.3
std,845.359653,84.940345,0.164107,0.203563,1.332819,8.578513,2.617532,148.507724,16.877063,300515.8
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10464.32
25%,97.319411,32.0,0.103448,0.037037,0.0,0.0,0.0,1.0,0.0,166556.4
50%,141.396761,52.5,0.186992,0.228412,0.0,1.0,0.0,9.0,0.0,263456.8
75%,223.38741,85.0,0.293333,0.380734,0.0,4.0,1.0,36.0,4.0,415627.4
max,366700.0,7777.0,1.0,1.0,26.0,96.0,45.0,2803.0,192.0,6342460.0


In [5]:
# Preview the first five evaluation rows.
eval_data.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station,price
0,123.333333,50.0,0.171429,0.3,0.0,2.0,0.0,30.0,5.0,164839.7
1,94.691957,10.0,0.098361,0.469136,5.0,4.0,3.0,259.0,1.0,132164.3
2,1386.111111,188.0,0.037313,0.006623,11.0,26.0,8.0,2054.0,114.0,1494755.0
3,233.574442,97.0,0.095238,0.0625,0.0,1.0,0.0,0.0,0.0,488232.2
4,183.644102,42.0,0.285714,0.388889,0.0,1.0,4.0,26.0,4.0,194468.1


###### Since the engineered dataset contains only numerical columns, we only need to apply normalization and standardization to prepare the data for ML training.

In [6]:
# The model predicts `price`; every other column is used as an input feature.
target_column = "price"
feature_columns = [name for name in train_data.columns if name != target_column]

# Split both datasets with the same feature list so their column order matches.
X_train, y_train = train_data[feature_columns], train_data[target_column]
X_eval, y_eval = eval_data[feature_columns], eval_data[target_column]

In [7]:
# Preview the first five rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station
0,31.813674,59.5,0.142857,0.043478,2.0,4.0,1.0,45.0,4.0
1,104.931794,290.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,122.807018,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,171.817343,126.0,0.153846,0.277778,0.0,0.0,0.0,0.0,0.0
4,68.638393,111.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0


###### `We are going to apply the MinMax Scaler (normalization) followed by the Standard Scaler (standardization)` =====>> `X_train`

In [8]:
# Min-max scaler with the default (0, 1) target range spelled out explicitly.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [9]:
# Learn each feature's min/max from the training split only, then rescale that split.
X_train_min_max = min_max_scaler.fit(X_train).transform(X_train)
X_train_min_max[0:5]

array([[8.67566778e-05, 7.65076508e-03, 1.42857143e-01, 4.34782609e-02,
        7.69230769e-02, 4.16666667e-02, 2.22222222e-02, 1.60542276e-02,
        2.08333333e-02],
       [2.86151607e-04, 3.72894432e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.34897784e-04, 6.30063006e-03, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [4.68550159e-04, 1.62016202e-02, 1.53846154e-01, 2.77777778e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [1.87178601e-04, 1.42728559e-02, 1.42857143e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [10]:
# Standard scaler with its default centering and unit-variance scaling spelled out.
std_scaler = StandardScaler(with_mean=True, with_std=True)
std_scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [11]:
# Standardize the min-max output: the mean/std are estimated on the already
# normalized training features, so this scaler must always be fed min-max output.
# NOTE(review): both steps are per-feature affine maps, so the combined result
# should match StandardScaler applied to the raw features alone — confirm whether
# the min-max step is still needed.
X_train_scaled = std_scaler.fit(X_train_min_max).transform(X_train_min_max)
X_train_scaled[0:5]

array([[-0.19358723, -0.1174263 , -0.43216748, -0.96539795,  1.10144992,
        -0.00843941, -0.06521527, -0.02367669, -0.10687488],
       [-0.10709365,  2.59624528, -1.30267825, -1.17898476, -0.39913012,
        -0.47472107, -0.44725487, -0.3266915 , -0.34388314],
       [-0.08594852, -0.24104257, -1.30267825, -1.17898476, -0.39913012,
        -0.47472107, -0.44725487, -0.3266915 , -0.34388314],
       [-0.02797276,  0.66547678, -0.36520511,  0.18559759, -0.39913012,
        -0.47472107, -0.44725487, -0.3266915 , -0.34388314],
       [-0.15002618,  0.4888821 , -0.43216748, -1.17898476, -0.39913012,
        -0.47472107, -0.44725487, -0.3266915 , -0.34388314]])

###### `Convert the transformed array to a DataFrame`

In [12]:
# Wrap the scaled values in a DataFrame, restoring the original feature names
# that are not carried by the transformed array.
X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    columns=X_train.columns,
)
X_train_scaled.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station
0,-0.193587,-0.117426,-0.432167,-0.965398,1.10145,-0.008439,-0.065215,-0.023677,-0.106875
1,-0.107094,2.596245,-1.302678,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883
2,-0.085949,-0.241043,-1.302678,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883
3,-0.027973,0.665477,-0.365205,0.185598,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883
4,-0.150026,0.488882,-0.432167,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883


###### `Concatenate X_train_scaled with its target ====>>> price`

In [13]:
# Reset the target's index so it aligns row-for-row with the freshly built
# (0..n-1 indexed) scaled feature frame before joining them side by side.
train_target = y_train.reset_index(drop=True)
train_data_preprocessed = pd.concat([X_train_scaled, train_target], axis="columns")
train_data_preprocessed.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station,price
0,-0.193587,-0.117426,-0.432167,-0.965398,1.10145,-0.008439,-0.065215,-0.023677,-0.106875,200773.999557
1,-0.107094,2.596245,-1.302678,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883,105863.681174
2,-0.085949,-0.241043,-1.302678,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883,126572.277873
3,-0.027973,0.665477,-0.365205,0.185598,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883,352711.838012
4,-0.150026,0.488882,-0.432167,-1.178985,-0.39913,-0.474721,-0.447255,-0.326692,-0.343883,70415.266483


##### `Preprocessors for X_train`

In [14]:
# Show the two fitted preprocessors, one per line.
print(std_scaler, min_max_scaler, sep="\n")

StandardScaler()
MinMaxScaler()


##### `Apply the preprocessors to =====>>>> X_eval`

In [15]:
# Reuse the scalers fitted on the training split: only transform() is called here,
# so no statistics are re-estimated from the evaluation data.
X_eval_min_max = min_max_scaler.transform(X_eval)
X_eval_scaled = pd.DataFrame(
    std_scaler.transform(X_eval_min_max),
    columns=X_eval.columns,
)

# Align the target's index with the rebuilt feature frame, then join column-wise.
eval_target = y_eval.reset_index(drop=True)
eval_data_preprocessed = pd.concat([X_eval_scaled, eval_target], axis="columns")
eval_data_preprocessed.head(n=5)

Unnamed: 0,median_ppsf,median_dom,sold_above_list,off_market_in_two_weeks,bus,hospital,mall,restaurant,station,price
0,-0.085326,-0.22927,-0.258065,0.294764,-0.39913,-0.24158,-0.447255,-0.124682,-0.047623,164839.7
1,-0.119207,-0.700189,-0.70331,1.125643,3.35232,-0.008439,0.698864,1.417327,-0.284631,132164.3
2,1.408451,1.395401,-1.075306,-1.146452,7.85406,2.55611,2.609062,13.504251,6.410852,1494755.0
3,0.045082,0.32406,-0.722338,-0.871954,-0.39913,-0.358151,-0.447255,-0.326692,-0.343883,488232.2
4,-0.013983,-0.323453,0.438343,0.731431,-0.39913,-0.358151,1.080904,-0.151616,-0.106875,194468.1


##### SAVING THE PREPROCESSED DATASETS

In [16]:
# Map each destination CSV to the frame it should contain, then write them in order.
scaled_outputs = {
    "../data/processed_data/scaled_engineered_training_data.csv": train_data_preprocessed,
    "../data/processed_data/scaled_engineered_evaluation_data.csv": eval_data_preprocessed,
}
for csv_path, frame in scaled_outputs.items():
    # index=False keeps the row index out of the saved file.
    frame.to_csv(csv_path, index=False)
print("✅ Standardization on the Engineered data for training and evaluation datasets have been successfully saved.")

✅ Standardization on the Engineered data for training and evaluation datasets have been successfully saved.


##### SAVING THE PREPROCESSORS

In [17]:
# Bundle both fitted scalers so downstream code can reproduce this preprocessing.
# They must be applied in the order they were fitted in this notebook:
# MIN_MAX_SCALER first, then STANDARD_SCALER on its output.
PREPROCESSORS = {
    "MIN_MAX_SCALER": min_max_scaler,
    "STANDARD_SCALER": std_scaler
}

preprocessors_path = "../model/preprocessors.pkl"
# Create the target directory when it is missing (e.g. on a fresh checkout);
# otherwise open() raises FileNotFoundError before anything is written.
os.makedirs(os.path.dirname(preprocessors_path), exist_ok=True)

with open(preprocessors_path, "wb") as file:
    pickle.dump(PREPROCESSORS, file)
print("✅ Preprocessors have been saved successfully.")

✅ Preprocessors have been saved successfully.


In [18]:
# Disabled alternative: would persist the same PREPROCESSORS dictionary with joblib
# to a .joblib file instead of the pickle file. Left commented out.
#joblib.dump(PREPROCESSORS, "../model/preprocessors.joblib")