# FEATURE ENGINEERING - MLE2

In [100]:
import pandas as pd
import numpy as np
import uuid

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from loguru import logger
from datetime import datetime, timezone

In [101]:
# Load the raw hotel-bookings CSV and preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer='../data/raw/hotel_bookings.csv')
raw_data.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


## Train / Test Split

In [102]:
# 80% of the rows (rounded) go to training; whatever is left over is the test set.
TRAIN_SIZE = round(0.8 * len(raw_data))
TEST_SIZE = len(raw_data) - TRAIN_SIZE

print(f"Train size: {TRAIN_SIZE}", f"Test size: {TEST_SIZE}", sep="\n")

Train size: 95512
Test size: 23878


In [103]:
# Fixed seed so the train/test partition is identical on every
# Restart Kernel -> Run All (an unseeded sample changes the split each run).
SPLIT_RANDOM_STATE = 42

# Index labels of the randomly sampled training rows.
train_indices = raw_data.sample(n=TRAIN_SIZE, random_state=SPLIT_RANDOM_STATE).index
# Every label that was not sampled for training belongs to the test set.
test_indices = raw_data.drop(train_indices).index

In [104]:
# train_indices / test_indices hold index *labels* (they come from `.index`),
# so select with the label-based `.loc`. Positional `.iloc` only returns the
# same rows while raw_data keeps its default 0..n-1 RangeIndex.
train_raw_data = raw_data.loc[train_indices]
test_raw_data = raw_data.loc[test_indices]

In [105]:
# Sanity check: the two index sets should add up to the full dataset size.
print(f"Train indices: {len(train_indices)}", f"Test indices: {len(test_indices)}", sep="\n")

Train indices: 95512
Test indices: 23878


In [106]:
# Column dtypes and non-null counts of the training split (verbose=True lists every column).
train_raw_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 95512 entries, 108279 to 28582
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           95512 non-null  object 
 1   is_canceled                     95512 non-null  int64  
 2   lead_time                       95512 non-null  int64  
 3   arrival_date_year               95512 non-null  int64  
 4   arrival_date_month              95512 non-null  object 
 5   arrival_date_week_number        95512 non-null  int64  
 6   arrival_date_day_of_month       95512 non-null  int64  
 7   stays_in_weekend_nights         95512 non-null  int64  
 8   stays_in_week_nights            95512 non-null  int64  
 9   adults                          95512 non-null  int64  
 10  children                        95509 non-null  float64
 11  babies                          95512 non-null  int64  
 12  meal                            

In [107]:
# Summary statistics for every column of the training split;
# include='all' also reports the non-numeric columns (unique / top / freq).
train_raw_data.describe(include='all')

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
count,95512,95512.0,95512.0,95512.0,95512,95512.0,95512.0,95512.0,95512.0,95512.0,...,95512,82407.0,5434.0,95512.0,95512,95512.0,95512.0,95512.0,95512,95512
unique,2,,,,12,,,,,,...,3,,,,4,,,,3,921
top,City Hotel,,,,August,,,,,,...,No Deposit,,,,Transient,,,,Check-Out,2015-10-21
freq,63522,,,,11195,,,,,,...,83690,,,,71734,,,,59981,1160
mean,,0.372006,104.05436,2016.156221,,27.168712,15.781828,0.92892,2.500639,1.854291,...,,86.576189,189.266838,2.313845,,101.799541,0.06239,0.571761,,
std,,0.483342,106.898912,0.707037,,13.603159,8.778479,1.001504,1.919101,0.568009,...,,110.598488,131.810292,17.619642,,51.112473,0.245601,0.79275,,
min,,0.0,0.0,2015.0,,1.0,1.0,0.0,0.0,0.0,...,,1.0,6.0,0.0,,-6.38,0.0,0.0,,
25%,,0.0,18.0,2016.0,,16.0,8.0,0.0,1.0,2.0,...,,9.0,62.0,0.0,,69.0,0.0,0.0,,
50%,,0.0,69.0,2016.0,,28.0,16.0,1.0,2.0,2.0,...,,14.0,179.0,0.0,,94.5,0.0,0.0,,
75%,,1.0,161.0,2017.0,,38.0,23.0,2.0,3.0,2.0,...,,229.0,270.0,0.0,,126.0,0.0,1.0,,


In [108]:
class FeatureEngineeringProcessor:
    """Turn a raw bookings DataFrame into a feature table and persist it.

    ``run()`` one-hot encodes three categorical columns, compresses five
    numeric columns into PCA components, applies variance-based feature
    selection plus robust scaling, and finally appends the ``booking_id``,
    ``event_timestamp`` and ``created`` columns.

    NOTE(review): every transformer is fitted on ``raw_data`` itself, so two
    instances built from different data (e.g. a train and a test split) learn
    independent encodings/scalings and may produce different column sets.
    """

    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        """Store the input data and a display name used in log messages.

        Args:
            raw_data: Source rows; must contain the columns read by
                ``impute_scale`` and ``encode_categorical``.
            pipeline_name: Label interpolated into the log line of ``run()``.
        """
        self.raw_data = raw_data
        self.pipeline_name = pipeline_name
        # Populated by run(); stays None until then.
        self.feature_table = None

    @staticmethod
    def _component_names(n_columns: int) -> list:
        """Return ``["great_feature1", ..., "great_feature<n_columns>"]``."""
        return [f"great_feature{position}" for position in range(1, n_columns + 1)]

    def impute_scale(self, n_components: int =2) -> pd.DataFrame:
        """Impute missing values, standardise and reduce the numeric columns.

        Missing values are replaced by the column mean, the columns are
        standard-scaled, and PCA reduces them to ``n_components`` columns.

        Args:
            n_components: Forwarded to ``PCA(n_components=...)``.

        Returns:
            A DataFrame (fresh default index) with one column per PCA
            component, named ``great_feature1``, ``great_feature2``, ...
        """
        numeric_columns = ["lead_time", "adults", "children", "babies", "adr"]
        logger.info("Iniciando nuestro pipeline de impute_scale...")
        pipe = Pipeline(
            steps=[
                ('imputer_mean', SimpleImputer(strategy='mean')),
                ('std_scaling', StandardScaler()),
                ('PCA', PCA(n_components=n_components)),
            ]
        )
        components = pipe.fit_transform(self.raw_data[numeric_columns])
        # Name the columns after the number of components actually produced,
        # so values of n_components other than 2 no longer fail with a
        # column-count mismatch.
        return pd.DataFrame(
            components,
            columns=self._component_names(components.shape[1]),
        )

    def encode_categorical(self) -> pd.DataFrame:
        """One-hot encode ``hotel``, ``market_segment`` and ``reserved_room_type``.

        Each variable gets its own ``OneHotEncoder`` fitted on ``raw_data``.

        Returns:
            A DataFrame (fresh default index) whose columns are named
            ``<variable>_<category>`` for every category seen in ``raw_data``.
        """
        encoded_vars = []
        for var in ["hotel", "market_segment", "reserved_room_type"]:
            logger.info(f"Codificando con ONE {var}...")
            encoder = OneHotEncoder()
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            cols = [f"{var}_{cat}" for cat in encoder.categories_[0]]
            encoded_vars.append(pd.DataFrame(data=encoded, columns=cols))

        return pd.concat(encoded_vars, axis=1)

    def run(self) -> pd.DataFrame:
        """Build the feature table, store it in ``self.feature_table`` and return it.

        Returns:
            The selected and scaled features plus three bookkeeping columns:
            ``booking_id`` (a random uuid4 string per row),
            ``event_timestamp`` and ``created`` (UTC datetimes; ``created``
            is taken about one second after ``event_timestamp``).
        """
        import time

        categorical = self.encode_categorical()
        numerics = self.impute_scale()
        # Dataset before the selection/scaling pipeline. Both frames carry a
        # fresh default index, so they line up row by row.
        modeling_dataset = pd.concat([categorical, numerics], axis=1)

        logger.info(f"Iniciando nuestro pipeline {self.pipeline_name}...")
        pipe = Pipeline(
            steps=[
                ('feature_seleccion', VarianceThreshold()),
                ('scaling_robust', RobustScaler()),
            ]
        )
        transformed = pipe.fit_transform(modeling_dataset)
        # VarianceThreshold may drop columns; keep only the names of the
        # surviving ones so the labels always match the transformed matrix.
        kept_mask = pipe.named_steps['feature_seleccion'].get_support()
        # NOTE(review): RobustScaler is also applied to the one-hot columns,
        # so they are no longer strictly 0/1 afterwards — confirm this is intended.
        self.feature_table = pd.DataFrame(
            transformed,
            columns=modeling_dataset.columns[kept_mask],
        )

        n_rows = self.feature_table.shape[0]
        self.feature_table['booking_id'] = [str(uuid.uuid4()) for _ in range(n_rows)]
        self.feature_table["event_timestamp"] = [datetime.now(timezone.utc) for _ in range(n_rows)]
        # Pause so that `created` is strictly later than `event_timestamp`.
        time.sleep(1)
        self.feature_table["created"] = [datetime.now(timezone.utc) for _ in range(n_rows)]
        return self.feature_table

    def write_feature_table(self, filepath: str) -> None:
        """Save the feature table to a parquet file (without the index).

        Args:
            filepath: Destination path of the parquet file.

        Raises:
            RuntimeError: If ``run()`` has not produced a feature table yet.
        """
        if self.feature_table is None:
            raise RuntimeError("No hay feature table para guardar. Ejecuta el método run() primero.")

        self.feature_table.to_parquet(filepath, index=False)
        logger.info(f"Feature table guardada en {filepath}")

        

In [109]:
# Build the training feature table and persist it as the parquet file
# located under the Feast feature repository's data directory.
train_processor = FeatureEngineeringProcessor(
    raw_data=train_raw_data,
    pipeline_name="Feature Engineering TRAIN",
)
train_processor.run()
train_processor.write_feature_table(
    filepath='../feast_service/fs_ml2/feature_repo/data/booking_features.parquet'
)

[32m2025-10-14 20:52:14.391[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE hotel...[0m
[32m2025-10-14 20:52:14.406[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE market_segment...[0m
[32m2025-10-14 20:52:14.421[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE reserved_room_type...[0m
[32m2025-10-14 20:52:14.459[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_scale[0m:[36m18[0m - [1mIniciando nuestro pipeline de impute_scale...[0m
[32m2025-10-14 20:52:14.500[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m51[0m - [1mIniciando nuestro pipeline Feature Engineering TRAIN...[0m
[32m2025-10-14 20:52:15.971[0m | [1mINFO    [0m | [36m__main__[0m:[36mwrite_feature_table[0m:[36m73[0m - [1mFeature table guardada en ../feast_service/fs_ml2/feature_repo/data/booking_featu

In [110]:
# Read the parquet file back to check what was actually persisted.
pd.read_parquet('../feast_service/fs_ml2/feature_repo/data/booking_features.parquet').head()

Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,...,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,great_feature1,great_feature2,booking_id,event_timestamp,created
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.073414,-1.21478,0017ea8e-8292-4b8c-b67a-6bac03adf2b2,2025-10-15 01:52:14.767173+00:00,2025-10-15 01:52:15.852557+00:00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.206829,1.666868,f24c05d7-ac6d-47a6-8879-b284d94e5fb1,2025-10-15 01:52:14.767181+00:00,2025-10-15 01:52:15.852573+00:00
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3.617122,-1.425459,d5664c5c-e8a4-4f3d-93c6-bccb9816c61f,2025-10-15 01:52:14.767181+00:00,2025-10-15 01:52:15.852574+00:00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.661425,1.229981,564056f2-e41e-455f-a914-4bef83b2d004,2025-10-15 01:52:14.767182+00:00,2025-10-15 01:52:15.852575+00:00
4,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.230793,-1.139403,e1260e45-649c-40c5-af11-a94acf06abec,2025-10-15 01:52:14.767182+00:00,2025-10-15 01:52:15.852575+00:00


In [111]:
# Build the feature table for the held-out rows (displayed, not written to disk).
# NOTE(review): run() calls fit_transform, so the encoders/scalers are re-fitted
# on the test rows instead of reusing what was learned from the training rows.
# In the recorded outputs the test table lacks market_segment_Undefined and
# reserved_room_type_L, which the training table has.
test_processor = FeatureEngineeringProcessor(
    raw_data=test_raw_data,
    pipeline_name="Feature Engineering - TEST",
)
test_processor.run()

[32m2025-10-14 20:52:27.166[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE hotel...[0m
[32m2025-10-14 20:52:27.175[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE market_segment...[0m
[32m2025-10-14 20:52:27.187[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categorical[0m:[36m32[0m - [1mCodificando con ONE reserved_room_type...[0m
[32m2025-10-14 20:52:27.208[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_scale[0m:[36m18[0m - [1mIniciando nuestro pipeline de impute_scale...[0m
[32m2025-10-14 20:52:27.244[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m51[0m - [1mIniciando nuestro pipeline Feature Engineering - TEST...[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,...,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_P,great_feature1,great_feature2,booking_id,event_timestamp,created
0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.152016,-0.268956,546250d2-d041-430c-b66c-5fd0421e0309,2025-10-15 01:52:27.324119+00:00,2025-10-15 01:52:28.435868+00:00
1,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.287168,-0.367767,b715b80d-062f-4641-b698-9085505eb4c1,2025-10-15 01:52:27.324126+00:00,2025-10-15 01:52:28.435881+00:00
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.112392,0.207466,f7b1e080-8cb4-4b69-8c24-75212ea91d64,2025-10-15 01:52:27.324126+00:00,2025-10-15 01:52:28.435882+00:00
3,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013876,-0.273836,1edde8ce-11d6-48d0-94eb-4373ad6f3035,2025-10-15 01:52:27.324126+00:00,2025-10-15 01:52:28.435883+00:00
4,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.067666,0.120450,e4143283-6040-4ce0-9093-f159e4dde659,2025-10-15 01:52:27.324127+00:00,2025-10-15 01:52:28.435884+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.076465,0.293763,8e7f7fc2-b2e8-4be0-a4c3-3df84ff6a781,2025-10-15 01:52:27.329686+00:00,2025-10-15 01:52:28.441435+00:00
23874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.126832,1.019776,64f2c8d3-7f1d-46d2-b9c4-5428a3cc4aff,2025-10-15 01:52:27.329687+00:00,2025-10-15 01:52:28.441435+00:00
23875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.421983,0.062414,88dd3e8c-ed66-46a7-8290-5a7bbc1cb850,2025-10-15 01:52:27.329687+00:00,2025-10-15 01:52:28.441435+00:00
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.090617,0.790442,270ed891-0445-49f8-a055-0231b11679ad,2025-10-15 01:52:27.329687+00:00,2025-10-15 01:52:28.441435+00:00


## Trabajando con Feast

In [112]:
# Reload the persisted feature table for use with Feast.
# Despite the variable name, this file was written by train_processor (training rows).
_test_fs_df = pd.read_parquet('../feast_service/fs_ml2/feature_repo/data/booking_features.parquet')
_test_fs_df.head()

Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,...,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,great_feature1,great_feature2,booking_id,event_timestamp,created
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.073414,-1.21478,0017ea8e-8292-4b8c-b67a-6bac03adf2b2,2025-10-15 01:52:14.767173+00:00,2025-10-15 01:52:15.852557+00:00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.206829,1.666868,f24c05d7-ac6d-47a6-8879-b284d94e5fb1,2025-10-15 01:52:14.767181+00:00,2025-10-15 01:52:15.852573+00:00
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3.617122,-1.425459,d5664c5c-e8a4-4f3d-93c6-bccb9816c61f,2025-10-15 01:52:14.767181+00:00,2025-10-15 01:52:15.852574+00:00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.661425,1.229981,564056f2-e41e-455f-a914-4bef83b2d004,2025-10-15 01:52:14.767182+00:00,2025-10-15 01:52:15.852575+00:00
4,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.230793,-1.139403,e1260e45-649c-40c5-af11-a94acf06abec,2025-10-15 01:52:14.767182+00:00,2025-10-15 01:52:15.852575+00:00


In [113]:
from feast import FeatureStore

In [114]:
# 1. Connect to the Feature Store
fs = FeatureStore(repo_path="../feast_service/fs_ml2/feature_repo")

## Online

In [115]:
# 2. Get the right Feature Service (the one that includes the OnDemandFeatureView).
#    The service name has been corrected here.
feature_service = fs.get_feature_service("dsrp_feature_service")

In [116]:
# 3. Define the entities and the input data for the prediction.
#    get_online_features expects a list of dictionaries.
# NOTE(review): kpi1 / kpi2 are presumably request-time inputs of the on-demand
# feature view (the result columns are named great_feature1_kpi1 and
# great_feature2_kpi2) — confirm against the feature repo definitions.
# NOTE(review): booking_id values are random uuid4 strings generated by
# FeatureEngineeringProcessor.run(), so these literals are not regenerated
# when the feature table is rebuilt.
entity_rows = [
    {
        "booking_id": "2f7561b2-c7e1-4874-a9d3-02215a994180",
        "kpi1": 2.0,
        "kpi2": 10.0,
    },
    {
        "booking_id": "d521b151-6c1d-4caf-9405-a24c2cb9f8a8",
        "kpi1": 5.0,
        "kpi2": 10.0,
    },
]

In [117]:
# 4. Query the online store with get_online_features.
#    Passing the feature service object directly keeps the call concise.
online_features = fs.get_online_features(
    entity_rows=entity_rows,
    features=feature_service,
).to_dict()

# Tabulate the returned dictionary (one column per key).
pd.DataFrame(online_features)


Unnamed: 0,booking_id,great_feature1_kpi1,great_feature2_kpi2
0,2f7561b2-c7e1-4874-a9d3-02215a994180,-2.24571,-11.927738
1,d521b151-6c1d-4caf-9405-a24c2cb9f8a8,-3.168793,6.090684


## Historical

In [118]:
# 2. Build the "past events" (ground truth) DataFrame.
#    This is the frame Feast joins the features onto; typically it would also
#    carry the target variable. It must contain an 'event_timestamp' column.
entity_df = pd.DataFrame(
    {
        "booking_id": [
            "0017ea8e-8292-4b8c-b67a-6bac03adf2b2",
            "f24c05d7-ac6d-47a6-8879-b284d94e5fb1",
        ]
    }
)
# Every row gets the same timestamp: the current instant in UTC.
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)
entity_df

Unnamed: 0,booking_id,event_timestamp
0,0017ea8e-8292-4b8c-b67a-6bac03adf2b2,2025-10-15 01:54:37.537500+00:00
1,f24c05d7-ac6d-47a6-8879-b284d94e5fb1,2025-10-15 01:54:37.537500+00:00


In [119]:
# 3. Define which features to retrieve.
#    The FeatureService that groups the base features can be used here.
# NOTE: this rebinds `feature_service`, which the Online section bound to
# "dsrp_feature_service"; re-running the online query after this cell would
# use "fs_service_pc" instead.
feature_service = fs.get_feature_service("fs_service_pc")

In [120]:
# 4. Call get_historical_features to join the features onto entity_df
print("Generando dataset de entrenamiento...")
training_job = fs.get_historical_features(
    entity_df=entity_df,
    features=feature_service
)

Generando dataset de entrenamiento...


In [123]:
# 5. Convert the result to a pandas DataFrame
training_df = training_job.to_df()
training_df

Unnamed: 0,booking_id,event_timestamp,great_feature1,great_feature2
0,0017ea8e-8292-4b8c-b67a-6bac03adf2b2,2025-10-15 01:54:37.537500+00:00,-1.073414,-1.21478
1,f24c05d7-ac6d-47a6-8879-b284d94e5fb1,2025-10-15 01:54:37.537500+00:00,0.206829,1.666868
