# FEATURE ENGINEERING - MLE2

In [78]:
import pandas as pd
import numpy as np
import uuid

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from loguru import logger

In [142]:
raw_data = pd.read_csv('../data/raw/hotel_bookings.csv')
raw_data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


## Train Split

In [143]:
TRAIN_SIZE = round(raw_data.shape[0]*0.8)
TEST_SIZE = raw_data.shape[0] - TRAIN_SIZE

print(f"Train size: {TRAIN_SIZE}")
print(f"Test size: {TEST_SIZE}")

Train size: 95512
Test size: 23878


In [144]:
train_indices = raw_data.sample(TRAIN_SIZE).index
test_indices = raw_data.drop(train_indices).index

In [145]:
train_raw_data = raw_data.iloc[train_indices]
test_raw_data = raw_data.iloc[test_indices]

In [69]:
print(f"Train indices: {len(train_indices)}")
print(f"Test indices: {len(test_indices)}")

Train indices: 95512
Test indices: 23878


In [146]:
train_raw_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 95512 entries, 41814 to 60558
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           95512 non-null  object 
 1   is_canceled                     95512 non-null  int64  
 2   lead_time                       95512 non-null  int64  
 3   arrival_date_year               95512 non-null  int64  
 4   arrival_date_month              95512 non-null  object 
 5   arrival_date_week_number        95512 non-null  int64  
 6   arrival_date_day_of_month       95512 non-null  int64  
 7   stays_in_weekend_nights         95512 non-null  int64  
 8   stays_in_week_nights            95512 non-null  int64  
 9   adults                          95512 non-null  int64  
 10  children                        95508 non-null  float64
 11  babies                          95512 non-null  int64  
 12  meal                            9

In [147]:
train_raw_data.describe(include='all')

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
count,95512,95512.0,95512.0,95512.0,95512,95512.0,95512.0,95512.0,95512.0,95512.0,95508.0,95512.0,95512,95124,95512,95512,95512.0,95512.0,95512.0,95512,95512,95512.0,95512,82455.0,5420.0,95512.0,95512,95512.0,95512.0,95512.0,95512,95512
unique,2,,,,12,,,,,,,,5,168,8,5,,,,10,11,,3,,,,4,,,,3,922
top,City Hotel,,,,August,,,,,,,,BB,PRT,Online TA,TA/TO,,,,A,A,,No Deposit,,,,Transient,,,,Check-Out,2015-10-21
freq,63383,,,,11059,,,,,,,,73902,38852,45182,78245,,,,68880,59251,,83717,,,,71677,,,,60234,1163
mean,,0.369357,103.983876,2016.156399,,27.149458,15.810254,0.927684,2.49955,1.856625,0.104693,0.0078,,,,,0.031891,0.087015,0.136287,,,0.220841,,87.011825,190.155535,2.314641,,101.804885,0.062715,0.572284,,
std,,0.482633,106.712382,0.707582,,13.606873,8.788102,0.998305,1.909222,0.584985,0.400171,0.094625,,,,,0.175712,0.838498,1.491815,,,0.651524,,110.903491,132.015556,17.668902,,51.049399,0.244985,0.792637,,
min,,0.0,0.0,2015.0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,,,0.0,,1.0,6.0,0.0,,-6.38,0.0,0.0,,
25%,,0.0,18.0,2016.0,,16.0,8.0,0.0,1.0,2.0,0.0,0.0,,,,,0.0,0.0,0.0,,,0.0,,9.0,67.0,0.0,,69.02,0.0,0.0,,
50%,,0.0,69.0,2016.0,,27.0,16.0,1.0,2.0,2.0,0.0,0.0,,,,,0.0,0.0,0.0,,,0.0,,14.0,180.0,0.0,,94.5,0.0,0.0,,
75%,,1.0,160.0,2017.0,,38.0,23.0,2.0,3.0,2.0,0.0,0.0,,,,,0.0,0.0,0.0,,,0.0,,229.0,270.0,0.0,,126.0,0.0,1.0,,


In [148]:
train_raw_data["market_segment"].value_counts()

market_segment
Online TA        45182
Offline TA/TO    19366
Groups           15860
Direct           10108
Corporate         4228
Complementary      580
Aviation           186
Undefined            2
Name: count, dtype: int64

In [149]:
one = OneHotEncoder()
testi = one.fit_transform(train_raw_data[["hotel"]]).toarray()
testi

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], shape=(95512, 2))

In [150]:
inputtito = SimpleImputer(strategy='mean')
testi2 = inputtito.fit_transform(train_raw_data[["lead_time"]])
testi2

array([[ 14.],
       [103.],
       [239.],
       ...,
       [315.],
       [ 29.],
       [ 34.]], shape=(95512, 1))

In [151]:
class FeatureEngineeringProcessor:
    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        self.raw_data = raw_data
        self.pipeline_name = pipeline_name
    
    def impute_scale(self, n_components: int =2) -> pd.DataFrame:
        """
        Imputa los valores faltantes y escala las variables numéricas.
        Luego aplica PCA para reducir la dimensionalidad.
        """

        numeric_columns = [ "lead_time",
                            "adults",
                            "children",
                            "babies",
                            "adr"]
        logger.info(f"Iniciando nuestro pipeline de impute_scale...")
        pipe = Pipeline(
            steps=[
                ('imputer_mean' , SimpleImputer(strategy='mean')),
                ('std_scaling', StandardScaler()),
                ('PCA', PCA(n_components=n_components))
            ]
        )
        return pd.DataFrame(pipe.fit_transform(self.raw_data[numeric_columns])
                            , columns=["pca_1", "pca_2"])

    def encode_categorical(self) -> pd.DataFrame:
        encoded_vars = []
        for var in ["hotel", "market_segment", "reserved_room_type"]:
            logger.info(f"Codificando con ONE {var}...")
            encoder = OneHotEncoder()
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            cols = [f"{var}_{cat}" for cat in encoder.categories_[0]]
            _dataframe = pd.DataFrame(
                data=encoded,
                columns=cols
            )
            encoded_vars.append(_dataframe)

        return pd.concat(encoded_vars, axis=1)

    def run(self) -> pd.DataFrame:

        categorical = self.encode_categorical()
        numerics = self.impute_scale()
        # Dataset previo al pipeline
        modeling_dataset = pd.concat([categorical, numerics], axis=1)

        logger.info(f"Iniciando nuestro pipeline {self.pipeline_name}...")
        pipe = Pipeline(
            steps=[
                ('feature_seleccion', VarianceThreshold()),
                ('scaling_robust', RobustScaler())
            ]
        )
        return pd.DataFrame(
            pipe.fit_transform(modeling_dataset),
            columns= modeling_dataset.columns
        )
        

In [152]:
train_processor = FeatureEngineeringProcessor(raw_data = train_raw_data, 
                                        pipeline_name = "Feature Engineering TRAIN")
train_processor.run()

[32m2025-10-01 04:05:30.208[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE hotel...[0m
[32m2025-10-01 04:05:30.270[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE market_segment...[0m
[32m2025-10-01 04:05:30.319[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE reserved_room_type...[0m
[32m2025-10-01 04:05:30.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_scale[0m:[36m17[0m - [1mIniciando nuestro pipeline de impute_scale...[0m
[32m2025-10-01 04:05:30.479[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m50[0m - [1mIniciando nuestro pipeline Feature Engineering TRAIN...[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,pca_1,pca_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047341,-0.241115
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.498357,1.190548
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.771633,0.287823
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.166764,0.806919
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.140997,-0.126305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.979355,0.577455
95508,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.438929,-0.214626
95509,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.578154,1.643388
95510,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.542211,-0.246459


In [154]:
test_processor = FeatureEngineeringProcessor(raw_data = test_raw_data, 
                                        pipeline_name = "Feature Engineering - TEST")
test_processor.run()

[32m2025-10-01 04:05:48.954[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE hotel...[0m
[32m2025-10-01 04:05:48.980[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE market_segment...[0m
[32m2025-10-01 04:05:49.000[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m31[0m - [1mCodificando con ONE reserved_room_type...[0m
[32m2025-10-01 04:05:49.040[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_scale[0m:[36m17[0m - [1mIniciando nuestro pipeline de impute_scale...[0m
[32m2025-10-01 04:05:49.098[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m50[0m - [1mIniciando nuestro pipeline Feature Engineering - TEST...[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,pca_1,pca_2
0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267512,-0.354517
1,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113346,0.092099
2,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.926760,-0.369194
3,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151050,0.114169
4,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.030738,0.302334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.313080,0.853230
23874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.313080,0.853230
23875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.901064,-0.126371
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379690,0.914610
