# Feature Engineering v1 (Booking-Focused)

### FE v1 — Step 1 (Notebook setup)

In [1]:
import pandas as pd
import numpy as np

# Read the shared sample CSV that this notebook works from.
df = pd.read_csv("../data_sample/train_100k.csv")

# Keep only the rows flagged as bookings; the copy keeps later edits
# to `df_b` from ever touching `df`.
is_booking_row = df["is_booking"].eq(1)
df_b = df.loc[is_booking_row].copy()

df_b.shape

(8058, 24)

### FE v1 — Step 2 (build feature function)

Now we’ll create a single function, `make_features()`, that:

- filters/derives the v1 features we agreed on

- returns:

1. `X` (features)

2. `y` (target)

In [2]:
def make_features(
    df_in: pd.DataFrame,
    short_stay_max_nights: int = 3,
) -> tuple[pd.DataFrame, pd.Series]:
    """Build the v1 feature matrix and target from raw search/booking rows.

    The input frame is never modified; all work happens on a copy.

    Derived features
    ----------------
    checkin_month : month number of ``srch_ci`` (NaN when the date cannot be
        parsed).
    length_of_stay : whole days between ``srch_ci`` and ``srch_co``; negative
        values (check-out before check-in) are clipped to 0, and the value is
        NaN when either date cannot be parsed.
    stay_type : ``"short"`` when ``length_of_stay <= short_stay_max_nights``,
        ``"long"`` when it is larger, ``"unknown"`` when it is missing.
    distance_missing : True where ``orig_destination_distance`` is NaN.
    distance_bucket : ``"near"`` / ``"mid"`` / ``"far"`` / ``"very_far"`` for
        distances in (-inf, 300], (300, 1000], (1000, 3000], (3000, inf), and
        ``"unknown"`` where the distance is missing.

    Parameters
    ----------
    df_in : pd.DataFrame
        Raw rows. Must contain ``hotel_cluster``, ``srch_ci``, ``srch_co``,
        ``orig_destination_distance`` and every pass-through column listed in
        the feature set below.
    short_stay_max_nights : int, default 3
        Largest ``length_of_stay`` that is still labelled ``"short"``.

    Returns
    -------
    (X, y) : tuple[pd.DataFrame, pd.Series]
        ``X`` holds the 17 v1 feature columns (an independent copy), ``y`` is
        the ``hotel_cluster`` column. Both keep the index of ``df_in``.

    Raises
    ------
    KeyError
        If any required input column is absent; the message lists all of them.
    """
    # ---- Feature set (v1) ----
    feature_cols = [
        # geo + context
        "site_name", "posa_continent",
        "user_location_country", "user_location_region",
        "srch_destination_id", "srch_destination_type_id",

        # search intent
        "srch_adults_cnt", "srch_children_cnt", "srch_rm_cnt",

        # time
        "checkin_month", "length_of_stay", "stay_type",

        # device / commercial context
        "is_mobile", "is_package", "channel",

        # distance engineered
        "distance_missing", "distance_bucket",
    ]
    derived_cols = {
        "checkin_month", "length_of_stay", "stay_type",
        "distance_missing", "distance_bucket",
    }
    source_cols = ["hotel_cluster", "srch_ci", "srch_co", "orig_destination_distance"]

    # Fail early with the full list of absent columns instead of stopping at
    # whichever lookup happens to come first.
    required_cols = source_cols + [c for c in feature_cols if c not in derived_cols]
    absent_cols = [c for c in required_cols if c not in df_in.columns]
    if absent_cols:
        raise KeyError(f"make_features: missing required columns: {absent_cols}")

    df = df_in.copy()

    # ---- Target ----
    y = df["hotel_cluster"]

    # ---- Dates → month + length_of_stay ----
    # errors="coerce" turns unparseable dates into NaT instead of raising.
    checkin = pd.to_datetime(df["srch_ci"], errors="coerce")
    checkout = pd.to_datetime(df["srch_co"], errors="coerce")

    df["checkin_month"] = checkin.dt.month

    # Negative stays (check-out before check-in) are clipped to 0 as a safety net.
    length_of_stay = (checkout - checkin).dt.days.clip(lower=0)
    df["length_of_stay"] = length_of_stay

    # A missing length must not fall through to "long": NaN <= n is False, so
    # it is labelled explicitly, mirroring the "unknown" distance bucket.
    df["stay_type"] = np.where(
        length_of_stay.isna(),
        "unknown",
        np.where(length_of_stay <= short_stay_max_nights, "short", "long"),
    )

    # ---- Distance ----
    distance = df["orig_destination_distance"]
    df["distance_missing"] = distance.isna()

    # Coarse distance categories; intervals are closed on the right (pd.cut default).
    # NOTE(review): the thresholds were originally described as km, but the unit
    # of orig_destination_distance is not established here — confirm.
    distance_bucket = pd.cut(
        distance,
        bins=[-np.inf, 300, 1000, 3000, np.inf],
        labels=["near", "mid", "far", "very_far"],
    ).astype("object")

    # Explicitly label missing distances as their own category.
    df["distance_bucket"] = distance_bucket.where(~df["distance_missing"], "unknown")

    # Return an independent copy so callers can modify X freely.
    X = df[feature_cols].copy()
    return X, y


### FE v1 — Step 3 (apply the feature function + sanity checks)

In [3]:
# Build the feature matrix and target from the booking-only rows.
X, y = make_features(df_b)

# Shapes: X and y should report the same number of rows.
print("X shape:", X.shape)
print("y shape:", y.shape)

# Per-column missing counts, largest first, limited to ten columns.
missing_per_column = X.isna().sum()
print("\nMissing values in X (top 10):")
print(missing_per_column.sort_values(ascending=False).head(10))

# Column dtypes, to see which features still need encoding.
print("\nDtypes in X:")
print(X.dtypes)


X shape: (8058, 17)
y shape: (8058,)

Missing values in X (top 10):
site_name           0
checkin_month       0
distance_missing    0
channel             0
is_package          0
is_mobile           0
stay_type           0
length_of_stay      0
srch_rm_cnt         0
posa_continent      0
dtype: int64

Dtypes in X:
site_name                    int64
posa_continent               int64
user_location_country        int64
user_location_region         int64
srch_destination_id          int64
srch_destination_type_id     int64
srch_adults_cnt              int64
srch_children_cnt            int64
srch_rm_cnt                  int64
checkin_month                int32
length_of_stay               int64
stay_type                   object
is_mobile                    int64
is_package                   int64
channel                      int64
distance_missing              bool
distance_bucket             object
dtype: object


**Code explained:**

X, y = make_features(df_b)

- builds your feature matrix X from booking-only rows

- builds your target vector y = hotel_cluster

The prints checked:

- shapes (so we know the pipeline is consistent)

- missing values (so models won’t crash)

- data types (so we know what needs encoding)

Key takeaway:

- Most features are integer IDs (categorical in meaning, even though pandas stores them as `int64`)

- We have two string-valued categorical features: `stay_type`, `distance_bucket`

- We have one boolean feature: `distance_missing`

### FE v1 — Step 4 (prepare model-ready features)

In [4]:
# String-valued columns that need one-hot expansion before modelling.
categorical_cols = ["stay_type", "distance_bucket"]

# One indicator column per category level; no level is dropped.
X_enc = pd.get_dummies(
    data=X,
    columns=categorical_cols,
    drop_first=False,
)
X_enc.shape

(8058, 22)