In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from common.helpers import logger, has_nvidia_gpu, display_feature_importances
from common.feature_engineering import feature_engineering
from common.imputation import get_imputation_values, apply_imputation

In [18]:
# --- Configuration ---
HAS_GPU = False # has_nvidia_gpu() # False #HAS_GPU = cp.cuda.runtime.getDeviceCount() > 0
FOLD_AMOUNT = 3
TESTSPLIT_RATIO = 10 # Percentage of data to be used for testing
OPTUNA_TRIALS = 2 #20 # Number of trials for hyperparameter optimization
ENSEMBLE_N_ESTIMATORS = 2 #50 # Number of estimators for the final stacking model
TRAIN_WITHOUT_EVALUATION = False # If we should train without evaluation, gives more training data but can't output evaluation metrics
TRAIN_DATA_PERCENTAGE = 1 # Percentage of train data to use for the training, 1 for everything (100%).
TEST_DATA_PERCENTAGE = 1 # Percentage of test data to use for the training, 1 for everything (100%).
categorical_feature = ['site_id', 'visitor_location_country_id', 'prop_country_id']

In [19]:
# --- Load the data ---
training_data_path = r"../input/training_set_VU_DM.csv"
test_data_path = r"../input/test_set_VU_DM.csv"
# df_ori = pd.read_csv(training_data_path)
# df_test_ori = pd.read_csv(test_data_path)
df = pd.read_csv(training_data_path).sample(frac=TRAIN_DATA_PERCENTAGE, random_state=42)
df_test = pd.read_csv(test_data_path).sample(frac=TEST_DATA_PERCENTAGE, random_state=42)
# df = df_ori.sample(frac=TRAIN_DATA_PERCENTAGE, random_state=42)
# df_test = df_test_ori.sample(frac=TEST_DATA_PERCENTAGE, random_state=42)

In [20]:
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    mask = (data[column].between(lower_bound, upper_bound)) | (data[column].isna())
    df_cleaned = df[mask].copy()
    return df_cleaned

def remove_outliers_percentile(data, column, lower_pct=0.01, upper_pct=0.99):
    lower = data[column].quantile(lower_pct)
    upper = data[column].quantile(upper_pct)
    df_cleaned = data[(data[column] >= lower) & (data[column] <= upper)].copy()
    return df_cleaned

In [21]:
def calculate_diff_from_group_mean(data, group_col='srch_id',value_col='prop_starrating'):
    data_tmp = data.copy()
    mean_col_name = f'{value_col}_mean_by_{group_col}'
    data_tmp[mean_col_name] = data_tmp.groupby(group_col)[value_col].transform('mean')

    diff_col_name = f'{value_col}_diff_from_mean'
    data_tmp[diff_col_name] = data_tmp[value_col] - data_tmp[mean_col_name]
    return data_tmp

In [46]:
def get_imputation_values(train_data):

    # Impute missing values with median or specific values
    impute_values = {
        "visitor_hist_starrating": train_data["visitor_hist_starrating"].median(),
        "visitor_hist_adr_usd": train_data["visitor_hist_adr_usd"].median(),
        "prop_review_score": 0,
        "prop_location_score2": 0,
        "srch_query_affinity_score": train_data["srch_query_affinity_score"].min(),
        "orig_destination_distance": train_data["orig_destination_distance"].median()
    }

    return impute_values

def apply_imputation(data, impute_values):
    data_impute = data.copy()

    data_impute["visitor_hist_starrating"] = data_impute["visitor_hist_starrating"].fillna(impute_values["visitor_hist_starrating"])
    data_impute["visitor_hist_adr_usd"] = data_impute["visitor_hist_adr_usd"].fillna(impute_values["visitor_hist_adr_usd"])
    data_impute["prop_review_score"] = data_impute["prop_review_score"].fillna(impute_values["prop_review_score"])
    data_impute["prop_location_score2"] = data_impute["prop_location_score2"].fillna(impute_values["prop_location_score2"])
    data_impute["srch_query_affinity_score"] = data_impute["srch_query_affinity_score"].fillna(impute_values["srch_query_affinity_score"])
    data_impute["orig_destination_distance"] = data_impute["orig_destination_distance"].fillna(impute_values["orig_destination_distance"])

    logger.debug("Imputation finished.")
    return data_impute

In [22]:
MAX_PRICE_NIGHT = 150000

# Generate additional features that may be useful for the model
def feature_engineering(data, type='train'):
    logger.debug("Running feature engineering")

    # Delete rows gross_bookings_usd is significantly different from price_usd
    if type=='train':
        data['booking_price_diff'] = data['gross_bookings_usd'] - data['price_usd']
        data = remove_outliers_iqr(data=data, column='booking_price_diff')
        data.drop(columns=["booking_price_diff"], inplace=True)

    # transfer date to month and day of week
    data['date_time'] = pd.to_datetime(data['date_time'], errors='coerce')
    data['month'] = data['date_time'].dt.month
    data['dayofweek'] = data['date_time'].dt.dayofweek
    data.drop(columns=["date_time"], inplace=True)

    # Feature for total number of adults and children
    data["total_people"] = data["srch_adults_count"] + data["srch_children_count"]

    # History differences
    data["history_starrating_diff"] = data["visitor_hist_starrating"] - data["prop_starrating"]

    # Total price per night per room
    data['price_1room_1night'] = (data['price_usd'] / data['srch_room_count']) / data['srch_length_of_stay']
    data["history_adr_diff"] = data["visitor_hist_adr_usd"] - data["price_1room_1night"]

    # data.drop(columns=['price_usd'], inplace=True)

    # Filter out high prices only for train data
    if type=='train':
        data = data[data['price_1room_1night'] < MAX_PRICE_NIGHT].copy()

    # log transform
    data['price_1room_1night_log'] = np.log1p(data['price_1room_1night'])
    data["price_history_difference"] = data["prop_log_historical_price"] - data["price_1room_1night_log"]
    data['price_1person_1night'] = (data['price_usd'] / data['total_people']) / data['srch_length_of_stay']
    data.drop(columns=['price_1room_1night'], inplace=True)

    # log transform price, min price is 0, so we use log(x+1)
    data['visitor_hist_adr_usd_log'] = np.log1p(data['visitor_hist_adr_usd'])
    data.drop(columns=['visitor_hist_adr_usd'], inplace=True)

    # Transformations of competitor rates
    data["avg_comp_rate"] = data[["comp1_rate", "comp2_rate", "comp3_rate", "comp4_rate", "comp5_rate", "comp6_rate", "comp7_rate", "comp8_rate"]].sum(axis=1)
    data["avg_comp_inv"] = data[["comp1_inv", "comp2_inv", "comp3_inv", "comp4_inv", "comp5_inv", "comp6_inv", "comp7_inv", "comp8_inv"]].sum(axis=1)
    data["avg_comp_rate_percent_diff"] = data[["comp1_rate_percent_diff", "comp2_rate_percent_diff", "comp3_rate_percent_diff", "comp4_rate_percent_diff", "comp5_rate_percent_diff", "comp6_rate_percent_diff", "comp7_rate_percent_diff", "comp8_rate_percent_diff"]].dropna().median(axis=1)

    # Locational features
    data["domestic_travel_bool"] = data["prop_country_id"] == data["visitor_location_country_id"]

    data = calculate_diff_from_group_mean(data=data,group_col='srch_id',value_col='prop_starrating')
    data = calculate_diff_from_group_mean(data=data,group_col='prop_id',value_col='price_1room_1night_log')
    data = calculate_diff_from_group_mean(data=data,group_col='srch_id',value_col='prop_location_score1')
    data = calculate_diff_from_group_mean(data=data,group_col='srch_id',value_col='prop_location_score2')
    data = calculate_diff_from_group_mean(data=data,group_col='srch_destination_id',value_col='price_usd')
    data = calculate_diff_from_group_mean(data=data,group_col='srch_id',value_col='prop_review_score')
    data = calculate_diff_from_group_mean(data=data,group_col='srch_id',value_col='promotion_flag')

    # # drop original competitor columns
    # for x in range(1, 9):
    #     data.drop(columns=[f"comp{x}_rate", f"comp{x}_inv", f"comp{x}_rate_percent_diff"], inplace=True, errors='ignore')

    data.drop(columns=['srch_destination_id'], inplace=True, errors='ignore')

    for c in categorical_feature:
        data[c] = data[c].astype('category')

    logger.debug("Feature engineering completed")
    return data

In [23]:
# Determine imputation values on training set
# impute_values = get_imputation_values(df)

# Apply the same imputation on training and test
# df = apply_imputation(df, impute_values)
df = feature_engineering(data=df,type='train')

# NOTE ASSIGNMENT PROVIDED TEST SET CONTAINS NO CLICK_BOOL THUS USELESS FOR TESTING
# df_test = apply_imputation(df_test, impute_values)
df_test = feature_engineering(data=df_test,type='test')


2025-05-16 23:35:17,597 - DEBUG - Running feature engineering
2025-05-16 23:39:25,503 - DEBUG - Feature engineering completed
2025-05-16 23:39:26,214 - DEBUG - Running feature engineering
2025-05-16 23:42:46,354 - DEBUG - Feature engineering completed


In [25]:
# --- Split df into training and validation sets based on srch_id ---
all_srch_ids = df['srch_id'].unique()
train_srch_ids, val_srch_ids = train_test_split(all_srch_ids, test_size=0.1, random_state=42)

train_split_df = df[df['srch_id'].isin(train_srch_ids)].copy()
val_df = df[df['srch_id'].isin(val_srch_ids)].copy()

print(f"\nShape of train_split_df: {train_split_df.shape}")
print(f"Shape of val_df: {val_df.shape}")
print(f"Number of unique srch_id in train_split_df: {train_split_df['srch_id'].nunique()}")
print(f"Number of unique srch_id in val_df: {val_df['srch_id'].nunique()}")



Shape of train_split_df: (4449723, 78)
Shape of val_df: (495871, 78)
Number of unique srch_id in train_split_df: 179808
Number of unique srch_id in val_df: 19979


In [26]:
feature_cols = [col for col in df.columns if col not in ['srch_id', 'prop_id', 'booking_bool','click_bool', 'position','gross_bookings_usd']]

# --- Prepare Training Data ---
train_split_df = train_split_df.sort_values('srch_id') # Sort by srch_id
X_train = train_split_df[feature_cols]
y_train = train_split_df['booking_bool']
group_train = train_split_df.groupby('srch_id', sort=False).size().to_list()

print(f"\nNumber of training groups (searches) for actual training: {len(group_train)}")
print(f"Total training samples for actual training: {sum(group_train)}")

# --- Prepare Validation Data ---
val_df = val_df.sort_values('srch_id') # Sort by srch_id
X_val = val_df[feature_cols]
y_val = val_df['booking_bool']
group_val = val_df.groupby('srch_id', sort=False).size().to_list()

print(f"\nNumber of validation groups (searches): {len(group_val)}")
print(f"Total validation samples: {sum(group_val)}")

# --- Prepare Test Data  ---
X_test = df_test[feature_cols]
# We need srch_id and prop_id from test_df for final output generation
test_ids = df_test[['srch_id', 'prop_id']].copy()
len(X_test)


Number of training groups (searches) for actual training: 179808
Total training samples for actual training: 4449723

Number of validation groups (searches): 19979
Total validation samples: 495871


4959183

In [35]:
# 3. Train LGBMRanker Model
print("\nTraining LGBMRanker model...")

ranker = lgb.LGBMRanker(
    objective="lambdarank",  # Core objective for learning to rank
    metric="ndcg",           # Evaluation metric (Normalized Discounted Cumulative Gain)
    n_estimators=1000,        # Number of boosting rounds
    learning_rate=0.1,
    max_depth=-1,           # No limit on depth
    num_leaves=62,          # Number of leaves in each tree
    importance_type='gain',
    label_gain=[0, 1],      # The gain for unbooked (0) is 0, and the gain for booked (1) is 1
    random_state=42,
    n_jobs=-1,        # Use all available cores
    categorical_feature=categorical_feature
    # Add other parameters as needed, e.g., num_leaves, max_depth, reg_alpha, reg_lambda
)

ranker.fit(
    X_train,
    y_train,
    group=group_train,
    eval_set=[(X_train, y_train), (X_val, y_val)], # Use validation set here
    eval_group=[group_train, group_val],          # Group info for validation set
    eval_names=['train', 'valid'],                # Names for the eval sets
    eval_at=[5, 10],                              # Evaluate NDCG@k
    callbacks=[lgb.early_stopping(200, verbose=True, min_delta=0.0001)] # Adjusted early stopping
)

print("Model training complete.")
print("\nFeature Importances:")
feature_importances = pd.Series(ranker.feature_importances_, index=feature_cols)
print(feature_importances.sort_values(ascending=False))


Training LGBMRanker model...


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.043026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8635
[LightGBM] [Info] Number of data points in the train set: 4449723, number of used features: 72


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
Using 0.0001 as min_delta for all metrics.
Early stopping, best iteration is:
[345]	train's ndcg@5: 0.708683	train's ndcg@10: 0.740337	valid's ndcg@5: 0.634967	valid's ndcg@10: 0.674243
Model training complete.

Feature Importances:
prop_location_score2                     186086.571016
prop_country_id                          109976.917312
price_usd                                 96827.574709
visitor_location_country_id               84605.188828
price_1room_1night_log_diff_from_mean     55515.023365
                                             ...      
comp7_inv                                   193.197928
comp6_rate                                  147.126760
random_bool                                 144.237141
comp1_inv                                    47.658801
avg_comp_rate_percent_diff                    0.000000
Length: 72, dtype: float64


In [36]:
# 4. Make Predictions on Test Data
print("\nPredicting on test data...")
# a NumPy array of scores. Higher scores indicate a higher predicted likelihood of relevance (booking)
test_predictions = ranker.predict(X_test)
# test_ids = ['srch_id', 'prop_id', 'predicted_score']
test_ids['predicted_score'] = test_predictions


Predicting on test data...


In [37]:
len(test_predictions)
feature_importances

site_id                                5682.025463
visitor_location_country_id           84605.188828
visitor_hist_starrating                1608.363454
prop_country_id                      109976.917312
prop_starrating                       21206.869468
                                         ...      
price_usd_diff_from_mean              48044.876659
prop_review_score_mean_by_srch_id      5931.118256
prop_review_score_diff_from_mean      18142.572376
promotion_flag_mean_by_srch_id         4231.576625
promotion_flag_diff_from_mean         28505.691234
Length: 72, dtype: float64

In [38]:
# 5. Rank Properties and Generate Output File
print("Ranking properties and generating output file...")

# Sort properties within each search_id group by the predicted score
test_ids_sorted = test_ids.sort_values(['srch_id', 'predicted_score'], ascending=[True, False])

# Select only srch_id and prop_id for the final output
output_df = test_ids_sorted[['srch_id', 'prop_id']]

output_filename = "LGBMRanker_predict.csv"
output_df.to_csv(output_filename, index=False)

print(f"\nOutput file '{output_filename}' generated successfully.")
print("Sample of the output file:")
print(output_df.head(10))

Ranking properties and generating output file...

Output file 'LGBMRanker_predict.csv' generated successfully.
Sample of the output file:
    srch_id  prop_id
23        1    99484
9         1    54937
12        1    61934
5         1    28181
4         1    24194
6         1    34263
22        1    95031
20        1    90385
8         1    50162
18        1    82231


In [90]:
# --- Load the result ---
result_sample = r"submission_sample.csv"
result = r"ranked_hotel_bookings.csv"
result_sample = pd.read_csv(result_sample)
result = pd.read_csv(result)

Ranking properties and generating output file...

Output file 'ranked_hotel_bookings.csv' generated successfully.
Sample of the output file:
    srch_id  prop_id
9         1    54937
23        1    99484
12        1    61934
5         1    28181
4         1    24194
6         1    34263
18        1    82231
13        1    63894
8         1    50162
20        1    90385


In [91]:
print(len(result))
print(len(result_sample))