In [1]:
# Import packages

import csv
import sklearn
import pandas as pd
import numpy as np
import matplotlib

## Load the datasets

In [None]:
# Load training and test datasets
training_set = 'dmt-2025-2nd-assignment/training_set_VU_DM.csv'
test_set = 'dmt-2025-2nd-assignment/test_set_VU_DM.csv'

training_set = pd.read_csv(training_set)
test_set = pd.read_csv(test_set)

In [4]:
training_set.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


The datafields can be interpreted according to the following scheme: https://www.kaggle.com/c/expedia-personalized-sort/data

## Explorotary Data Analysis (EDA)

In [5]:
# Check the shape of the training and test datasets
print("Training set shape:", training_set.shape)
print("Test set shape:", test_set.shape)

# Check the first few rows of the training set
print("First few rows of the training set:")
print(training_set.head())

# Check the columns of the training set
print("Columns in the training set:")
print(training_set.columns)
# Check the columns of the test set
print("Columns in the test set:")
print(test_set.columns)

# Check the data types of the columns in the training set
print("Data types of the columns in the training set:")
print(training_set.dtypes)

# Check the data types of the columns in the test set
print("Data types of the columns in the test set:")
print(test_set.dtypes)

# Check for missing values in the training set
print("Missing values in the training set:")
print(training_set.isnull().sum())
# Check for missing values in the test set
print("Missing values in the test set:")
print(test_set.isnull().sum())

# Check the distribution of the target variable in the training set
print("Distribution of the target variable in the training set:")
print(training_set['prop_id'].value_counts())
# Check the distribution of the target variable in the test set
print("Distribution of the target variable in the test set:")
print(test_set['prop_id'].value_counts())

# Check the unique values in the target variable in the training set
print("Unique values in the target variable in the training set:")
print(training_set['prop_id'].unique())
# Check the unique values in the target variable in the test set
print("Unique values in the target variable in the test set:")
print(test_set['prop_id'].unique())

    

Training set shape: (4958347, 54)
Test set shape: (4959183, 50)
First few rows of the training set:
   srch_id            date_time  site_id  visitor_location_country_id  \
0        1  2013-04-04 08:32:15       12                          187   
1        1  2013-04-04 08:32:15       12                          187   
2        1  2013-04-04 08:32:15       12                          187   
3        1  2013-04-04 08:32:15       12                          187   
4        1  2013-04-04 08:32:15       12                          187   

   visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  prop_id  \
0                      NaN                   NaN              219      893   
1                      NaN                   NaN              219    10404   
2                      NaN                   NaN              219    21315   
3                      NaN                   NaN              219    27348   
4                      NaN                   NaN              219    29

## Feature engineering

### Target value

The target value is one or multiple property IDs for a given search ID.

### Features

Values not present in the test data (and thus not to be used as features) include: 
- `position`
- `click_bool`
- `gross_bookings_usd`
- `booking_bool`

### Derive date features

In [3]:
# Convert the `date_time` column to:
# Month, Day, Year, Hour, Season, Day of the week, Weekend
def extract_date_features(df):
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['year'] = df['date_time'].dt.year
    df['hour'] = df['date_time'].dt.hour
    df['season'] = (df['month'] % 12 + 3) // 3
    df['day_of_week'] = df['date_time'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    return df

# Apply the function to the training and test sets
training_set = extract_date_features(training_set)
test_set = extract_date_features(test_set)

# Check the first few rows of the training set after feature extraction
print("First few rows of the training set after feature extraction:")
print(training_set.head())

First few rows of the training set after feature extraction:
   srch_id           date_time  site_id  visitor_location_country_id  \
0        1 2013-04-04 08:32:15       12                          187   
1        1 2013-04-04 08:32:15       12                          187   
2        1 2013-04-04 08:32:15       12                          187   
3        1 2013-04-04 08:32:15       12                          187   
4        1 2013-04-04 08:32:15       12                          187   

   visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  prop_id  \
0                      NaN                   NaN              219      893   
1                      NaN                   NaN              219    10404   
2                      NaN                   NaN              219    21315   
3                      NaN                   NaN              219    27348   
4                      NaN                   NaN              219    29604   

   prop_starrating  prop_review_score

## Train baseline model

Let's start with a LGBMRanker model using all feature (including the engineered features derived from the booking date).

In [13]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRanker
import lightgbm as lgb

# Parameters
training_fraction = 1
use_validation = False

# Define the features and target variable
features = ['month', 'day', 'year', 'hour', 'season', 'day_of_week', 'is_weekend', 'site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag','srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff','comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff']
target = 'booking_bool'

# Separate numeric and categorical features
numeric_features = ['month', 'day', 'year', 'hour', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_query_affinity_score', 'orig_destination_distance']
categorical_features = ['season', 'day_of_week', 'is_weekend', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_brand_bool', 'promotion_flag', 'srch_destination_id', 'srch_saturday_night_bool', 'random_bool', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff','comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff']

# Sample the training data
training_sample = training_set.sample(frac=training_fraction, random_state=42)
X = training_sample[features + ['srch_id']]
y = training_sample[target]

group = X.groupby('srch_id').size().to_list()
X = X.drop(columns=['srch_id'])

# Preprocessing
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

categorical_transformer = SimpleImputer(strategy='most_frequent')

preprocessing = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# LightGBM Ranker
lgbm_model = LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    learning_rate=0.1,
    num_leaves=31,
    max_depth=10,
    n_jobs=-1,
    random_state=42
)

# Define the full pipeline with named steps
model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('lgbm', lgbm_model)
])

model.fit(X, y, lgbm__group=group)

# Test prediction
X_test = test_set[features]
srch_ids = test_set['srch_id']
prop_ids = test_set['prop_id']

scores = model.predict(X_test)

output_df = pd.DataFrame({
    'srch_id': srch_ids,
    'prop_id': prop_ids,
    'score': scores
})

output_df = output_df.sort_values(by=['srch_id', 'score'], ascending=[True, False])
output_df[['srch_id', 'prop_id']].to_csv('ranked_predictions.csv', index=False)
print("Predictions saved to ranked_predictions.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.263207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3981
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 54




Predictions saved to ranked_predictions.csv


The baseline [scores 0.35831](https://www.kaggle.com/competitions/dmt-2025-2nd-assignment/leaderboard).

The following is a first attempt at hyperparameter tuning.

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRanker
import numpy as np
import pandas as pd

# Split data
if use_validation:
    unique_ids = training_sample['srch_id'].unique()
    val_ids = set(np.random.choice(unique_ids, size=int(0.2 * len(unique_ids)), replace=False))

    train_idx = training_sample['srch_id'].isin(val_ids) == False
    val_idx = training_sample['srch_id'].isin(val_ids)

    train_set = training_sample[train_idx]
    val_set = training_sample[val_idx]

    X_train = train_set[features]
    y_train = train_set[target]
    group_train = train_set.groupby('srch_id').size().to_list()

    X_val = val_set[features]
    y_val = val_set[target]
    group_val = val_set.groupby('srch_id').size().to_list()
else:
    X_train = training_sample[features]
    y_train = training_sample[target]
    group_train = training_sample.groupby('srch_id').size().to_list()

# Preprocessing pipeline
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

categorical_transformer = SimpleImputer(strategy='most_frequent')

preprocessing = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Fit and transform training data
X_train_preprocessed = preprocessing.fit_transform(X_train)

if use_validation:
    X_val_preprocessed = preprocessing.transform(X_val)

# LightGBM model
lgbm_model = LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    learning_rate=0.05,
    num_leaves=63,
    max_depth=7,
    min_data_in_leaf=50,
    n_estimators=500,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    random_state=42
)


if use_validation:
    lgbm_model.fit(
        X_train_preprocessed, y_train,
        group=group_train,
        eval_set=[(X_val_preprocessed, y_val)],
        eval_group=[group_val],
        eval_at=[1, 5, 10],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
        verbose=True
    )
else:
    lgbm_model.fit(X_train_preprocessed, y_train, group=group_train)

X_test = test_set[features]
srch_ids = test_set['srch_id']
prop_ids = test_set['prop_id']

X_test_preprocessed = preprocessing.transform(X_test)
scores = lgbm_model.predict(X_test_preprocessed, num_iteration=lgbm_model.best_iteration_)

output_df = pd.DataFrame({
    'srch_id': srch_ids,
    'prop_id': prop_ids,
    'score': scores
})

output_df = output_df.sort_values(by=['srch_id', 'score'], ascending=[True, False])
output_df[['srch_id', 'prop_id']].to_csv('ranked_predictions.csv', index=False)
print("Predictions saved to ranked_predictions.csv")
print(f"Best iteration: {lgbm_model.best_iteration_}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.266400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3981
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 54




Predictions saved to ranked_predictions.csv
Best iteration: 0


Kaggle allows for only 4 submissions per day. If we evaluate locally we don't have to submit as often.

In [20]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRanker
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import ndcg_score

# Parameters
training_fraction = 1
use_validation = True  # Enable validation split

# Define features and target
features = ['month', 'day', 'year', 'hour', 'season', 'day_of_week', 'is_weekend', 'site_id', 'visitor_location_country_id',
            'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_starrating', 'prop_review_score',
            'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd',
            'promotion_flag','srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
            'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score',
            'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff',
            'comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff',
            'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff',
            'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff',
            'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff']
target = 'booking_bool'

numeric_features = ['month', 'day', 'year', 'hour', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_starrating',
                    'prop_review_score', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
                    'price_usd', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count',
                    'srch_room_count', 'srch_query_affinity_score', 'orig_destination_distance']

categorical_features = ['season', 'day_of_week', 'is_weekend', 'site_id', 'visitor_location_country_id', 'prop_country_id',
                        'prop_brand_bool', 'promotion_flag', 'srch_destination_id', 'srch_saturday_night_bool', 'random_bool',
                        'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff',
                        'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff',
                        'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff',
                        'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff']

# Sample data
training_sample = training_set.sample(frac=training_fraction, random_state=42)

# Create validation split grouped by srch_id
if use_validation:
    splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
    train_idx, val_idx = next(splitter.split(training_sample, groups=training_sample['srch_id']))
    train_set = training_sample.iloc[train_idx]
    val_set = training_sample.iloc[val_idx]
else:
    train_set = training_sample
    val_set = None

# Prepare training data
X_train = train_set[features + ['srch_id']]
y_train = train_set[target]
group_train = X_train.groupby('srch_id').size().to_list()
X_train = X_train.drop(columns=['srch_id'])

# Prepare validation data if needed
if use_validation:
    X_val = val_set[features + ['srch_id']]
    y_val = val_set[target]
    group_val = X_val.groupby('srch_id').size().to_list()
    X_val = X_val.drop(columns=['srch_id'])

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = SimpleImputer(strategy='most_frequent')

preprocessing = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Model
lgbm_model = LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    learning_rate=0.1,
    num_leaves=31,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
    n_estimators=500
)

# Pipeline combining preprocessing and model
model = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('lgbm', lgbm_model)
])

# Fit model
model.fit(X_train, y_train, lgbm__group=group_train)

# Evaluate on validation set if present
if use_validation:
    from sklearn.metrics import ndcg_score

    X_val_preprocessed = model.named_steps['preprocessing'].transform(X_val)
    val_scores = model.named_steps['lgbm'].predict(X_val_preprocessed, num_iteration=model.named_steps['lgbm'].best_iteration_)

    val_set_with_scores = val_set.copy()
    val_set_with_scores['score'] = val_scores

    # Sort within each group by score descending
    val_set_with_scores = val_set_with_scores.sort_values(by=['srch_id', 'score'], ascending=[True, False])
    grouped_val = val_set_with_scores.groupby('srch_id')

    ndcg_scores = []
    mrr_scores = []

    for _, group in grouped_val:
        true_relevance = group[target].values
        if np.sum(true_relevance) == 0:
            # No relevant items, skip this group
            continue

        # Compute NDCG@5 (sklearn expects arrays of shape (1, n_items))
        ndcg = ndcg_score([true_relevance], [group['score'].values], k=5)
        ndcg_scores.append(ndcg)

        # Compute MRR: reciprocal rank of first relevant item
        ranks = np.where(true_relevance > 0)[0]
        if len(ranks) > 0:
            mrr_scores.append(1.0 / (ranks[0] + 1))

    print(f"Validation NDCG@5: {np.mean(ndcg_scores):.4f}")
    print(f"Validation MRR: {np.mean(mrr_scores):.4f}")

# Predict on test set (no target in test set assumed)
X_test = test_set[features]
srch_ids = test_set['srch_id']
prop_ids = test_set['prop_id']

test_scores = model.predict(X_test)

output_df = pd.DataFrame({
    'srch_id': srch_ids,
    'prop_id': prop_ids,
    'score': test_scores
})

output_df = output_df.sort_values(by=['srch_id', 'score'], ascending=[True, False])
output_df[['srch_id', 'prop_id']].to_csv('ranked_predictions.csv', index=False)
print("Predictions saved to ranked_predictions.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.212474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4069
[LightGBM] [Info] Number of data points in the train set: 3966682, number of used features: 54




Validation NDCG@5: 0.3847
Validation MRR: 0.3680




Predictions saved to ranked_predictions.csv
