In [1]:
import io

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class BookingDataEncoder(BaseEstimator, TransformerMixin):
    """Join raw booking rows against the mapping lookup tables.

    Parameters
    ----------
    hotels : pandas.DataFrame
        Hotel lookup table. ``transform`` left-joins it to the bookings,
        matching the bookings' ``hotel_id`` column against this frame's index.
    rooms, meals, operators : object
        Stored unchanged on the instance. The code in this class does not
        read them yet.
    """

    def __init__(self, hotels, rooms, meals, operators):
        # Plain attribute assignment only; no validation or copying happens here.
        self.hotels = hotels
        self.rooms = rooms
        self.meals = meals
        self.operators = operators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    @staticmethod
    def _get_room_id(row):
        """Return the first non-null value among the candidate room-id columns.

        Looks at ``id``, ``id_x``, ``id_y`` and ``id_z`` in that order and
        returns ``-1`` when all of them are null. Not called by ``transform``
        at present.
        """
        for col in ["id", "id_x", "id_y", "id_z"]:
            if pd.notna(row[col]):
                return row[col]
        return -1

    def transform(self, X):
        """Return an encoded copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame or None
            Booking rows. Must contain ``operator_code`` and ``hotel_id``.

        Returns
        -------
        pandas.DataFrame or None
            ``None`` when ``X`` is ``None``; otherwise a new frame in which

            * ``operator_code`` has nulls replaced by the row's ``hotel_id``
              and every value converted with ``str``;
            * the columns of ``self.hotels`` are left-joined on
              ``hotel_id`` == hotels index, with clashing booking columns
              renamed to ``<name>_external``;
            * ``hotel_id`` has nulls replaced by ``-1`` and is cast to ``int``.
        """
        if X is None:
            return None

        # Operator code: ``assign`` builds a new frame, so the caller's
        # DataFrame is not modified (a plain ``X[...] = ...`` would write
        # through to the object that was passed in).
        encoded = X.assign(
            operator_code=X["operator_code"].fillna(X["hotel_id"]).apply(str)
        )

        # Hotel ID, charter flag: on a column-name clash the booking side gets
        # the "_external" suffix and the lookup side keeps the bare name.
        encoded = encoded.merge(
            self.hotels,
            how="left",
            left_on="hotel_id",
            right_index=True,
            suffixes=("_external", ""),
        )

        # Rows without a match in the lookup get the sentinel -1 so the
        # column can be cast to a plain integer dtype.
        encoded["hotel_id"] = encoded["hotel_id"].fillna(-1).astype(int)

        return encoded

In [2]:
from db import *

# Load every lookup table over a single connection. The ``with`` block closes
# the connection when the queries are done, instead of opening a fresh
# connection per query and leaving all of them open.
with engine.connect() as conn:
    # Hotel mapping, indexed by the mapping table's own ``id`` column.
    hotels = pd.read_sql(
        sql=select(
            mapping_hotel.c.id,
            mapping_hotel.c.hotel_id,
            mapping_hotel.c.is_charter,
        ),
        con=conn,
    ).set_index("id")

    # Room mapping joined to the accommodation room table; keeps the default
    # integer index.
    rooms = pd.read_sql(
        sql=select(
            accommodation_hotel_room.c.id,
            accommodation_hotel_room.c.hotel_id,
            accommodation_hotel_room.c.name,
            mapping_hotel_room.c.room_code,
            mapping_hotel_room.c.room_type,
        ).join_from(mapping_hotel_room, accommodation_hotel_room),
        con=conn,
    )

    # Meal plans, indexed by ``code``; the ``id`` column is exposed as ``meal_id``.
    meals = pd.read_sql(
        sql=select(
            definitions_meal_plan.c.id.label("meal_id"),
            definitions_meal_plan.c.code,
        ),
        con=conn,
    ).set_index("code")

    # Operator mapping, indexed by ``external_id``.
    operators = pd.read_sql(
        sql=select(
            mapping_operator.c.operator_id,
            mapping_operator.c.external_id,
        ),
        con=conn,
    ).set_index("external_id")

  mapping_hotel = Table(


In [3]:
from pathlib import Path

from sklearn.pipeline import Pipeline
from transformers import BookingDataReadCsv

# Only the CSV reader is active. The encoder step is kept below as a comment
# so it can be switched back on by uncommenting it.
pipeline = Pipeline(
    steps=[
        ("csv_reader", BookingDataReadCsv()),
        # ("data_encoder", BookingDataEncoder(hotels, rooms, meals, operators)),
    ]
)

In [4]:
# Absolute path of the directory the notebook is executed from.
base_dir = Path(".").resolve()

# Run the pipeline on <base_dir>/data/bookings.csv and keep the result.
df = pipeline.fit_transform(base_dir.joinpath("data", "bookings.csv"))

In [5]:
# Preview the first ten rows of the pipeline output.
df.head(n=10)

Unnamed: 0,ref_id,res_id,hotel_id,operator_id,operator_code,bkg_ref,guest_name,sales_date,in_date,out_date,...,status5,purchase_contract_id,purchase_spo_id,sales_contract_id,sales_spo_id,sales_spo_name,sales_spo_code,purchase_spo_name,purchase_spo_code,main_season
0,1535416,778062,218736,6,AUH277,RDUMMY0208,RDUMMY0208 RDUMMY0208,2017-01-01,2023-03-09,2023-03-15,...,Man,291586,859993,0,0,,,RAMADA WYNDHAM EBD,RAMADA WYNDHAM EBD,
1,1771832,1505568,127536,1015,127536,500730517,Michael1 Fisher1,2021-09-10,2023-09-07,2023-09-16,...,XML,229335,712099,229346,0,,,Better with Jumeirah Campaign 2021 on BB / HB ...,WBJ2BB1 / WBJ2HB1,All 21-22
2,1797019,1527231,208929,27,DXB2HS,J6G1D,BOBAN MAKSIMOVIC,2021-10-21,2023-10-17,2023-10-25,...,Txt,0,0,0,0,,,,,
3,1841027,1527251,207460,27,DXB2FM,JFGLU,BOBAN MAKSIMOVIC,2021-10-22,2023-10-25,2023-10-29,...,Txt,267750,0,268978,0,,,,,ALL 22-23
4,1834088,1561581,195703,69,DXBR32,4331426,Daniela Engels,2021-11-25,2023-03-04,2023-03-07,...,Txt,236631,0,236653,0,,,,,DM 22-23
5,1835563,1562847,195703,69,DXBR32,4332197,MANFRED VITT,2021-11-29,2023-03-11,2023-03-14,...,Txt,236631,0,236653,0,,,,,DM 22-23
6,1835566,1562849,195703,69,DXBR32,4332177,JENS FISCHER,2021-11-29,2023-03-11,2023-03-14,...,Txt,236631,726638,236653,728124,BigXtra 2021/22/23 New Year Campaign,AGBNY,BigXtra 2021/22/23 New Year Campaign,AGBNY,DM 22-23
7,1835991,1563225,195703,69,DXBR32,4332578,REINERRONALD SCHWARZ,2021-11-30,2023-03-25,2023-03-28,...,Txt,236631,732378,236653,732381,BigXtra Cruise 2021-2022 Program,AGBCC,BigXtra Cruise 2021-2022 Program,AGBCC,DM 22-23
8,1839307,1566018,191641,69,DXBN03,4334229,EVELYN PRUDZIC,2021-12-09,2023-04-17,2023-04-24,...,Txt,234598,722755,234679,722828,BigXtra – Sonnenklar 2022-2023,MPBXSHB,BigXtra – Sonnenklar 2022-2023,MPBXSHB,DM 22-23
9,1842081,1568376,191641,69,DXBN03,4336378,PER NICOLAI,2021-12-16,2023-04-17,2023-04-24,...,Txt,234598,722755,234679,722828,BigXtra – Sonnenklar 2022-2023,MPBXSHB,BigXtra – Sonnenklar 2022-2023,MPBXSHB,DM 22-23


In [6]:
# Check whether ``ref_id`` could serve as a unique key. The recorded output is
# False, i.e. the column contains repeated values.
df.loc[:, "ref_id"].is_unique

False