# Tree Model

## Setup

In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
sys.path.append(str(PROJECT_ROOT))


In [2]:
import pandas as pd
import numpy as np

from pathlib import Path

from src.config import RANDOM_SEED, TEST_SIZE, TOP_K, N_JOBS, STRATIFY_SPLIT
from src.fe_v1 import make_features
from src.metrics import mapk, hit_rate_at_k


In [3]:
from pathlib import Path
import pandas as pd

DATA_PATH = Path("../data/df_model.parquet")

print("Loading full df_model.parquet ...")
df = pd.read_parquet(DATA_PATH)

print("Done.")
print("Shape:", df.shape)
df.head()


Loading full df_model.parquet ...
Done.
Shape: (2988177, 173)


Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,-2.384553,-2.345528,-2.396591,-2.399953,-2.388116,-2.394294,-2.400667,-2.398716,-2.386585,-2.39037
1,2014-02-27 18:01:32,2,3,66,318,52078,,756,0,1,...,-2.298266,-2.145362,-2.289405,-2.299516,-2.293402,-2.298682,-2.299516,-2.293223,-2.299516,-2.217007
2,2013-06-15 15:38:05,30,4,195,548,56440,,1048,0,1,...,-2.269617,-2.158832,-2.273201,-2.137717,-2.237712,-2.235306,-2.273201,-2.273201,-2.273201,-2.273201
3,2014-11-23 18:02:20,30,4,195,991,47725,,1048,0,0,...,-2.264659,-2.233329,-2.188037,-2.265921,-2.265921,-2.265921,-2.265921,-2.264935,-2.265921,-2.265921
4,2014-01-03 16:30:17,2,3,66,462,41898,2454.8588,1482,0,1,...,-2.213802,-2.235346,-2.242869,-2.240471,-2.242869,-2.220603,-2.242869,-2.242869,-2.241668,-2.225471


In [4]:
d_cols_all = [c for c in df.columns if c.startswith("d")]
len(d_cols_all), d_cols_all[:10]


(150, ['date_time', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9'])

In [5]:
# Welche davon sind NICHT numerisch?
bad_d = [
    c for c in d_cols_all
    if not pd.api.types.is_numeric_dtype(df[c])
]

bad_d[:10], len(bad_d)


(['date_time'], 1)

## Using FE_v1

In [11]:
from src.fe_v1 import make_features as make_features_v1

X_v1, y = make_features_v1(df)

X_v1.shape


print("X shape:", X_v1.shape)
print("y shape:", y.shape)
print("Target classes:", y.nunique())
X_v1.head()


X shape: (2988177, 17)
y shape: (2988177,)
Target classes: 100


Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,srch_destination_id,srch_destination_type_id,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,checkin_month,length_of_stay,stay_type,is_mobile,is_package,channel,distance_missing,distance_bucket
0,2,3,66,348,8250,1,2,0,1,8,4,long,0,1,9,False,far
1,2,3,66,318,8291,1,2,0,1,4,2,short,0,1,4,True,unknown
2,30,4,195,548,1385,1,2,0,1,9,8,long,0,1,9,True,unknown
3,30,4,195,991,8803,1,2,0,1,6,2,short,0,0,9,True,unknown
4,2,3,66,462,12009,1,3,0,2,2,5,long,0,1,1,False,far


In [12]:
from src.fe_v2 import make_features as make_features_v2

X_v2, y_v2 = make_features_v2(df)

# sanity
assert y.equals(y_v2)
assert X_v2.shape[0] == X_v1.shape[0]
assert X_v2.shape[1] > X_v1.shape[1]

X_v2.shape


(2988177, 166)

## Splitting into Train & Test

In [13]:
from sklearn.model_selection import train_test_split

stratify_y = y if STRATIFY_SPLIT else None

X1_train, X1_test, y_train, y_test = train_test_split(
    X_v1, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=stratify_y
)

# gleiche Indizes für v2
X2_train = X_v2.loc[X1_train.index]
X2_test  = X_v2.loc[X1_test.index]


## Preprocessing + Model Pipeline (OneHot + Decision Tree)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

def build_tree_pipeline(X_sample):
    cat_cols = X_sample.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = [c for c in X_sample.columns if c not in cat_cols]

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        n_jobs=-1
    )

    tree = DecisionTreeClassifier(
        random_state=RANDOM_SEED,
        max_depth=12,
        min_samples_leaf=50,
    )

    return Pipeline([
        ("prep", preprocess),
        ("tree", tree),
    ])


## Train & Evaluate Model 1

In [16]:
model_v1 = build_tree_pipeline(X1_train)
model_v1.fit(X1_train, y_train)

proba_v1 = model_v1.predict_proba(X1_test)
classes_v1 = model_v1.named_steps["tree"].classes_

topk_v1 = np.argsort(proba_v1, axis=1)[:, ::-1][:, :TOP_K]
preds_v1 = [[classes_v1[i] for i in row] for row in topk_v1]

map5_v1 = mapk(y_test.to_numpy(), preds_v1, k=TOP_K)
hit5_v1 = hit_rate_at_k(y_test.to_numpy(), preds_v1, k=TOP_K)

map5_v1, hit5_v1



(0.19552731093843068, 0.350879799744326)

In [17]:
model_v2 = build_tree_pipeline(X2_train)
model_v2.fit(X2_train, y_train)

proba_v2 = model_v2.predict_proba(X2_test)
classes_v2 = model_v2.named_steps["tree"].classes_

topk_v2 = np.argsort(proba_v2, axis=1)[:, ::-1][:, :TOP_K]
preds_v2 = [[classes_v2[i] for i in row] for row in topk_v2]

map5_v2 = mapk(y_test.to_numpy(), preds_v2, k=TOP_K)
hit5_v2 = hit_rate_at_k(y_test.to_numpy(), preds_v2, k=TOP_K)

map5_v2, hit5_v2


(0.23214743868620136, 0.40816015099492)

## Comparison 

In [18]:
comparison = pd.DataFrame({
    "features": ["fe_v1", "fe_v2"],
    "n_features": [X_v1.shape[1], X_v2.shape[1]],
    "MAP@5": [map5_v1, map5_v2],
    "HIT@5": [hit5_v1, hit5_v2],
})

comparison



Unnamed: 0,features,n_features,MAP@5,HIT@5
0,fe_v1,17,0.195527,0.35088
1,fe_v2,166,0.232147,0.40816


Save baseline (Model + Encoder)

In [22]:
import joblib
from pathlib import Path

MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(exist_ok=True)

joblib.dump(model_v1, MODEL_DIR / "tree_fe_v1.joblib")
joblib.dump(model_v2, MODEL_DIR / "tree_fe_v2.joblib")

print("saved:")
print(" - tree_fe_v1.joblib")
print(" - tree_fe_v2.joblib")



saved:
 - tree_fe_v1.joblib
 - tree_fe_v2.joblib
