# Shared Model Dataset

This notebook creates a cached dataset (`df_model.parquet`) that serves as the
single source of truth for all downstream ML models.

All models should load this file instead of reprocessing the raw CSVs.


In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from tqdm.notebook import tqdm


In [2]:
# Read the complete raw training CSV; the relative path is resolved against
# the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Keep only the rows flagged as bookings (is_booking == 1).
df_allbookings = df.loc[df["is_booking"].eq(1)]

In [4]:
# All data locations are derived from the parent of the working directory.
PROJECT_ROOT = Path("..")
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Destination table, one CSV under DATA_DIR.
df_dest = pd.read_csv(DATA_DIR.joinpath("destinations.csv"))

In [5]:
# Index the destination table by its id so it can be joined via right_index.
# The guard keeps this cell idempotent: once the column has been moved into
# the index, re-running an unguarded set_index would raise KeyError.
if df_dest.index.name != "srch_destination_id":
    df_dest = df_dest.set_index("srch_destination_id")


In [6]:
# Names of every non-index column of the destination table; later cells use
# this list to select the destination columns in the merged frame.
dest_cols = list(df_dest.columns)


## Merge of allbookings & destinations

In [7]:
# Left-join the destination columns onto every booking row.
# validate="many_to_one" makes pandas raise a MergeError when a destination id
# appears more than once in df_dest's index, instead of silently multiplying
# booking rows and only noticing afterwards.
df_merged = df_allbookings.merge(
    df_dest,
    how="left",
    left_on="srch_destination_id",
    right_index=True,
    validate="many_to_one",
)


In [8]:
# A left join must neither drop nor duplicate booking rows.
assert df_merged.shape[0] == df_allbookings.shape[0]


In [9]:
# Share of merged rows where at least one destination column is NaN.
rows_missing_dest = df_merged[dest_cols].isna().any(axis="columns")
missing_rate = rows_missing_dest.mean()
missing_rate


0.004171036490570678

In [10]:
# (rows, columns) of the merged frame, displayed as a quick size check.
df_merged.shape

(3000693, 173)

In [11]:
# Per-column count of missing values in the merged frame.
df_merged.isna().sum()

date_time                    0
site_name                    0
posa_continent               0
user_location_country        0
user_location_region         0
                         ...  
d145                     12516
d146                     12516
d147                     12516
d148                     12516
d149                     12516
Length: 173, dtype: int64

## Dropping 12516 bookings with missing destination data

In [12]:
# Discard every row that lacks a value in any destination column.
df_model = df_merged.dropna(axis="index", how="any", subset=dest_cols)


In [13]:
# After the drop, no destination column may contain a NaN.
assert not df_model[dest_cols].isna().to_numpy().any()


In [14]:
# Row counts before and after dropping rows with missing destination data.
df_merged.shape[0], df_model.shape[0]


(3000693, 2988177)

## Reduce memory footprint by casting destination features to float32

In [15]:
# Rebuild df_model from df_merged (so the cell can be re-run safely) and
# downcast only the destination columns to float32 in a single astype call;
# astype returns a new frame, so df_merged itself is left unchanged.
float32_by_col = dict.fromkeys(dest_cols, "float32")
df_model = df_merged.dropna(subset=dest_cols).astype(float32_by_col)



In [16]:
# Tally the dtypes of the destination columns to confirm the cast took effect.
df_model.dtypes.loc[dest_cols].value_counts()


float32    149
Name: count, dtype: int64

In [17]:
# Column labels of the frame that is about to be written to disk.
df_model.columns

Index(['date_time', 'site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       ...
       'd140', 'd141', 'd142', 'd143', 'd144', 'd145', 'd146', 'd147', 'd148',
       'd149'],
      dtype='object', length=173)

## Save as Parquet

In [19]:
# Persist the model dataset next to the raw CSVs. The path is built from
# DATA_DIR (defined alongside the destinations load) rather than a second
# hard-coded string, so the data location is configured in one place.
# index=False: the row index is not written to the file.
df_model.to_parquet(DATA_DIR / "df_model.parquet", index=False)


In [None]:
# Cache-or-build guard: reuse the parquet file when it exists, otherwise
# rebuild df_model from the raw CSVs and write the cache.
# `Path` and `pd` come from the import cell at the top of the notebook.
CACHE_PATH = Path("../data/df_model.parquet")

if CACHE_PATH.exists():
    df_model = pd.read_parquet(CACHE_PATH)
    print("Loaded cached df_model")
else:
    df = pd.read_csv("../data/train.csv")
    df_allbookings = df[df["is_booking"] == 1]

    df_dest = pd.read_csv("../data/destinations.csv").set_index("srch_destination_id")
    dest_cols = df_dest.columns.tolist()

    df_merged = df_allbookings.merge(
        df_dest,
        how="left",
        left_on="srch_destination_id",
        right_index=True
    )

    # Drop bookings without destination data, then downcast the destination
    # columns to float32. Without the cast, a cache rebuilt here would hold
    # different dtypes than the file written by the step-by-step cells above.
    df_model = (
        df_merged
        .dropna(subset=dest_cols)
        .astype(dict.fromkeys(dest_cols, "float32"))
    )

    df_model.to_parquet(CACHE_PATH, index=False)
    print("Preprocessing done and cached")


### If we want to load it in the future, we use:


``df = pd.read_parquet("../data/df_model.parquet")``