In [1]:
import sys
sys.path.insert(0, "..")

from src.fe_v2 import make_features
from src.config import RANDOM_SEED, TEST_SIZE, TOP_K
from src.metrics import mapk, hit_rate_at_k
from src.model_utils import topk_from_proba


In [4]:
import pandas as pd  # required by read_parquet below; no earlier cell imports it

DATA_PATH = "../data/processed/df_model.parquet"

# Load the processed parquet file; the trailing expression displays
# (n_rows, n_columns) as the cell output.
df = pd.read_parquet(DATA_PATH)
df.shape


(2988177, 173)

In [5]:
# Number of rows to keep for fast iteration; set to None to use the full dataset.
SAMPLE_SIZE = 500_000

# Only subsample when the frame is actually larger than the requested size:
#  - avoids the ValueError pandas raises when n exceeds the number of rows, and
#  - makes the cell idempotent (re-running it does not reshuffle an already
#    sampled frame, which would silently change the later train/validation split).
if SAMPLE_SIZE is not None and SAMPLE_SIZE < len(df):
    df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)


In [6]:
# Run the project's feature pipeline on the frame; the returned pair is
# unpacked into the model inputs (X) and the target (y).
features_and_target = make_features(df)
X, y = features_and_target


In [7]:
from sklearn.model_selection import train_test_split

# Hold-out settings: validation fraction and seed come from src.config, and
# stratifying on y keeps the class proportions the same in both splits.
split_options = dict(
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [8]:
# Columns that should be handled as categorical rather than numeric.
categorical_features = ["stay_type", "distance_bucket"]

# Cast through DataFrame.astype so each name is rebound to a new frame instead
# of assigning column-by-column into the objects returned by the split (which
# can trigger pandas' chained-assignment warning).
X_train = X_train.astype({col: "category" for col in categorical_features})

# Reuse the training dtypes for validation so both frames share one set of
# categories (and therefore identical integer codes). Validation values that
# never occur in training become NaN.
X_val = X_val.astype({col: X_train[col].dtype for col in categorical_features})


In [9]:
import lightgbm as lgb

# Training set. free_raw_data=False keeps the raw pandas data attached to the
# Dataset after LightGBM has constructed its internal representation.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# Validation set. reference=train_data tells LightGBM to build it against the
# training Dataset, as the library documents for validation data, instead of
# relying on lgb.train to attach the reference later.
val_data = lgb.Dataset(
    X_val,
    label=y_val,
    reference=train_data,
    categorical_feature=categorical_features,
    free_raw_data=False,
)


In [10]:
# LightGBM training configuration, grouped by purpose.
params = dict(
    # Task: multiclass classification, one output per distinct target value,
    # evaluated with multiclass log-loss.
    objective="multiclass",
    num_class=y.nunique(),
    metric="multi_logloss",
    # Tree shape and step size (max_depth=-1 leaves depth unconstrained).
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    # Regularisation: minimum leaf size plus per-iteration feature/row subsampling.
    min_data_in_leaf=100,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    # Quiet logging and a fixed seed for reproducibility.
    verbosity=-1,
    seed=RANDOM_SEED,
)


In [11]:
# Stop once the validation score has gone 50 rounds without improving.
early_stop = lgb.early_stopping(stopping_rounds=50)

# Train for at most 1000 boosting rounds, monitoring the validation set
# (reported under the name "valid").
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    valid_names=["valid"],
    callbacks=[early_stop],
)


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[159]	valid's multi_logloss: 2.97887


In [13]:
# Tell LightGBM which features are categorical.
categorical_features = ["stay_type", "distance_bucket"]

# Cast through DataFrame.astype so each name is rebound to a new frame rather
# than being assigned into column-by-column; the cast is a no-op for columns
# that are already categorical, so re-running this cell is safe.
X_train = X_train.astype({col: "category" for col in categorical_features})

# Reuse the training dtypes for validation so both frames share one set of
# categories (and therefore identical integer codes). Validation values that
# never occur in training become NaN.
X_val = X_val.astype({col: X_train[col].dtype for col in categorical_features})

In [15]:
# Training set. free_raw_data=False keeps the raw pandas data attached to the
# Dataset after LightGBM has constructed its internal representation.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# Validation set. reference=train_data tells LightGBM to build it against the
# training Dataset, as the library documents for validation data.
val_data = lgb.Dataset(
    X_val,
    label=y_val,
    reference=train_data,
    categorical_feature=categorical_features,
    free_raw_data=False,
)


In [17]:
import numpy as np

# Sorted class labels, one per probability column produced by the model.
# Derived from the full target `y` (the same source as params["num_class"])
# rather than from y_train, so the label array cannot end up shorter than the
# number of predicted columns if a class is missing from the training split.
classes = np.sort(y.unique())


In [18]:
# Class probabilities for the validation rows, using the boosting iteration
# selected by early stopping.
proba = model.predict(X_val, num_iteration=model.best_iteration)

# Top-K candidate labels per row (helper from src.model_utils).
top5 = topk_from_proba(proba, classes, k=TOP_K)

# Score the ranked predictions: MAP@K first, then hit rate@K.
map5, hit5 = (metric(y_val, top5, k=TOP_K) for metric in (mapk, hit_rate_at_k))

map5, hit5


(0.31117719999999993, 0.534824)

We intentionally optimized the model on a reduced but representative random sample of 500k rows (out of roughly 3M) to enable rapid iteration and hyperparameter tuning. Once the modeling choices have stabilized, the same pipeline can be scaled to the full dataset for final training.