In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from common.helpers import logger, has_nvidia_gpu, display_feature_importances
from common.feature_engineering import feature_engineering
from common.imputation import get_imputation_values, apply_imputation

In [None]:
# --- Configuration ---
# NOTE(review): in the cells visible here only `categorical_feature` is read by active code.
# TRAIN_DATA_PERCENTAGE / TEST_DATA_PERCENTAGE appear only in commented-out sampling lines and
# the remaining constants are not referenced at all — confirm whether they are still needed.
HAS_GPU = False # has_nvidia_gpu() # False #HAS_GPU = cp.cuda.runtime.getDeviceCount() > 0
FOLD_AMOUNT = 3
TESTSPLIT_RATIO = 10 # Percentage of data to be used for testing. NOTE(review): the train/validation split cell hard-codes test_size=0.05 and does not read this value
OPTUNA_TRIALS = 2 #20 # Number of trials for hyperparameter optimization
ENSEMBLE_N_ESTIMATORS = 2 #50 # Number of estimators for the final stacking model
TRAIN_WITHOUT_EVALUATION = False # If we should train without evaluation, gives more training data but can't output evaluation metrics
TRAIN_DATA_PERCENTAGE = 0.01 # Fraction (not percent) of train data to sample, written for DataFrame.sample(frac=...); 1 keeps everything (100%).
TEST_DATA_PERCENTAGE = 0.01 # Fraction (not percent) of test data to sample, written for DataFrame.sample(frac=...); 1 keeps everything (100%).
# Columns cast to pandas 'category' dtype at the end of feature_engineering
# ('month' and 'dayofweek' are derived there from date_time).
categorical_feature = ['site_id', 'visitor_location_country_id', 'prop_country_id','month','dayofweek']

In [None]:
# --- Load the data ---
# Paths are relative to the notebook's working directory.
training_data_path = r"../input/training_set_VU_DM.csv"
test_data_path = r"../input/test_set_VU_DM.csv"
# Optional down-sampling for quick experiments (currently disabled).
# NOTE(review): sample(frac=...) draws individual rows, so re-enabling these lines would keep
# only part of each srch_id group — confirm that is acceptable for a per-search ranking task.
# df = pd.read_csv(training_data_path).sample(frac=TRAIN_DATA_PERCENTAGE, random_state=42)
# df_test = pd.read_csv(test_data_path).sample(frac=TEST_DATA_PERCENTAGE, random_state=42)
# Active path: read both files in full. `df` is the training frame, `df_test` the test frame.
df = pd.read_csv(training_data_path)
df_test = pd.read_csv(test_data_path)

In [None]:
def remove_outliers_iqr(data, column):
    """Return a copy of ``data`` without the IQR outliers of ``column``.

    A row is kept when its value lies inside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    (both ends inclusive) or when the value is missing. The input frame is
    not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        Independent copy containing only the retained rows (original index kept).
    """
    values = data[column]
    q_low, q_high = (values.quantile(q) for q in (0.25, 0.75))
    spread = q_high - q_low
    fence_low = q_low - 1.5 * spread
    fence_high = q_high + 1.5 * spread

    # Missing values are never treated as outliers.
    keep = values.isna() | values.between(fence_low, fence_high)
    return data.loc[keep].copy()

In [None]:
def calculate_diff_from_group_mean(data, group_col='srch_id',value_col='prop_starrating'):
    """Add the per-group mean of ``value_col`` and each row's deviation from it.

    Two columns are appended to a copy of ``data``:

    * ``{value_col}_mean_by_{group_col}`` – mean of ``value_col`` within the row's group
    * ``{value_col}_diff_from_mean_by_{group_col}`` – ``value_col`` minus that mean

    Both names carry ``group_col`` so that calling the function several times
    for the same ``value_col`` with different groupings keeps every deviation
    column instead of overwriting the previous one.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str, default 'srch_id'
        Column whose values define the groups.
    value_col : str, default 'prop_starrating'
        Numeric column to average.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two extra columns.
    """
    result = data.copy()

    mean_col_name = f'{value_col}_mean_by_{group_col}'
    result[mean_col_name] = result.groupby(group_col)[value_col].transform('mean')

    # The grouping column is part of the name; otherwise e.g. prop_starrating grouped by
    # srch_id and by prop_id would both write to the same "..._diff_from_mean" column.
    diff_col_name = f'{value_col}_diff_from_mean_by_{group_col}'
    result[diff_col_name] = result[value_col] - result[mean_col_name]
    return result

In [None]:
def create_book_feature(data):
    """Collapse ``booking_bool`` / ``click_bool`` into one graded label column.

    ``book_feature`` is 2 where ``booking_bool == 1``, otherwise 1 where
    ``click_bool == 1``, otherwise 0. The two source columns are removed.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``booking_bool`` and ``click_bool``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``book_feature`` added and the source columns dropped.
    """
    was_booked = data['booking_bool'] == 1
    was_clicked = data['click_bool'] == 1

    # A booking outranks a click, so it is tested first.
    data['book_feature'] = np.where(was_booked, 2, np.where(was_clicked, 1, 0))
    data.drop(columns=['booking_bool', 'click_bool'], inplace=True, errors='ignore')
    return data

In [None]:
MAX_PRICE_NIGHT = 150000

# Generate additional features that may be useful for the model
def feature_engineering(data, type='train'):
    """Build the model's feature columns from a raw search-result DataFrame.

    This notebook-level definition takes precedence over the
    ``feature_engineering`` imported from ``common.feature_engineering`` in the
    import cell. It relies on the notebook globals ``logger``,
    ``categorical_feature`` and ``MAX_PRICE_NIGHT`` and on the helper functions
    ``remove_outliers_iqr``, ``calculate_diff_from_group_mean`` and
    ``create_book_feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows as read from the CSV files. The frame may be modified in place
        before the first internal copy is taken, so always continue with the
        returned frame.
    type : str, default 'train'
        With ``'train'`` three extra steps run: rows that are IQR outliers of
        ``gross_bookings_usd - price_usd`` are removed, rows whose price per
        room per night is not below ``MAX_PRICE_NIGHT`` are removed, and
        ``booking_bool`` / ``click_bool`` are replaced by the ``book_feature``
        label. Any other value skips these steps.

    Returns
    -------
    pandas.DataFrame
        Frame with the engineered columns; ``date_time``,
        ``visitor_hist_adr_usd`` and ``srch_destination_id`` are dropped.
    """
    logger.debug("Running feature engineering")
    is_train = type == 'train'

    # Delete rows where gross_bookings_usd is significantly different from price_usd
    if is_train:
        data['booking_price_diff'] = data['gross_bookings_usd'] - data['price_usd']
        data = remove_outliers_iqr(data=data, column='booking_price_diff')
        data.drop(columns=["booking_price_diff"], inplace=True)

    # Replace the timestamp by month and day of week (unparseable values become NaT/NaN)
    data['date_time'] = pd.to_datetime(data['date_time'], errors='coerce')
    data['month'] = data['date_time'].dt.month
    data['dayofweek'] = data['date_time'].dt.dayofweek
    data.drop(columns=["date_time"], inplace=True)

    # Feature for total number of adults and children
    data["total_people"] = data["srch_adults_count"] + data["srch_children_count"]

    # History differences
    data["history_starrating_diff"] = data["visitor_hist_starrating"] - data["prop_starrating"]

    # Total price per night per room
    data['price_1room_1night'] = (data['price_usd'] / data['srch_room_count']) / data['srch_length_of_stay']
    data["history_adr_diff"] = data["visitor_hist_adr_usd"] - data["price_1room_1night"]

    # Filter out high prices only for train data
    if is_train:
        data = data[data['price_1room_1night'] < MAX_PRICE_NIGHT].copy()

    # Log-transformed prices; the untransformed helper columns are dropped afterwards
    data['price_1room_1night_log'] = np.log1p(data['price_1room_1night'])
    data["price_history_difference"] = data["prop_log_historical_price"] - data["price_1room_1night_log"]
    data['price_1person_1night'] = (data['price_usd'] / data['total_people']) / data['srch_length_of_stay']
    data['price_1person_1night_log'] = np.log1p(data['price_1person_1night'])
    data.drop(columns=['price_1room_1night', 'price_1person_1night'], inplace=True)

    # log transform price, min price is 0, so we use log(x+1)
    data['visitor_hist_adr_usd_log'] = np.log1p(data['visitor_hist_adr_usd'])
    data.drop(columns=['visitor_hist_adr_usd'], inplace=True)

    # Transformations of competitor rates (competitors are numbered 1..8)
    comp_ids = range(1, 9)
    rate_cols = [f"comp{i}_rate" for i in comp_ids]
    inv_cols = [f"comp{i}_inv" for i in comp_ids]
    percent_diff_cols = [f"comp{i}_rate_percent_diff" for i in comp_ids]
    data["sum_comp_rate"] = data[rate_cols].sum(axis=1)
    data["sum_comp_inv"] = data[inv_cols].sum(axis=1)
    # Row-wise median over the competitors that are present. median() skips NaN by default;
    # an earlier .dropna() here discarded every row with any missing competitor value, which
    # left the feature empty unless all eight competitors were filled in.
    data["median_comp_rate_percent_diff"] = data[percent_diff_cols].median(axis=1)

    # Locational features
    data["domestic_travel_bool"] = data["prop_country_id"] == data["visitor_location_country_id"]

    # Group-mean features, applied in this order as (group_col, value_col)
    group_mean_specs = [
        ('srch_id', 'prop_starrating'),
        ('prop_id', 'prop_starrating'),
        ('prop_id', 'price_1room_1night_log'),
        ('srch_id', 'prop_location_score1'),
        ('srch_id', 'prop_location_score2'),
        ('srch_destination_id', 'price_usd'),
        ('srch_destination_id', 'prop_starrating'),
        ('srch_id', 'prop_review_score'),
        ('srch_id', 'promotion_flag'),
    ]
    for group_col, value_col in group_mean_specs:
        data = calculate_diff_from_group_mean(data=data, group_col=group_col, value_col=value_col)

    # # drop original competitor columns
    # for x in range(1, 9):
    #     data.drop(columns=[f"comp{x}_rate", f"comp{x}_inv", f"comp{x}_rate_percent_diff"], inplace=True, errors='ignore')

    data.drop(columns=['srch_destination_id'], inplace=True, errors='ignore')

    for c in categorical_feature:
        data[c] = data[c].astype('category')

    # The graded label only exists for training data
    if is_train:
        data = create_book_feature(data)

    logger.debug("Feature engineering completed")
    return data

In [None]:
# Build features for both datasets with the feature_engineering function defined in this
# notebook (that definition shadows the one imported from common.feature_engineering).
# NOTE: df / df_test are rebound to the engineered frames, so this cell is not idempotent —
# re-run the load cell before running it again.
df = feature_engineering(data=df,type='train')
df_test = feature_engineering(data=df_test,type='test')


In [None]:
# --- Split df into training and validation sets based on srch_id ---
# The split is drawn over unique search ids, so all rows of one search end up on the same side.
all_srch_ids = df['srch_id'].unique()
train_srch_ids, val_srch_ids = train_test_split(all_srch_ids, test_size=0.05, random_state=42)

in_train = df['srch_id'].isin(train_srch_ids)
in_val = df['srch_id'].isin(val_srch_ids)
train_split_df = df.loc[in_train].copy()
val_df = df.loc[in_val].copy()

# Quick sanity report of both partitions
n_train_searches = train_split_df['srch_id'].nunique()
n_val_searches = val_df['srch_id'].nunique()
print(f"\nShape of train_split_df: {train_split_df.shape}")
print(f"Shape of val_df: {val_df.shape}")
print(f"Number of unique srch_id in train_split_df: {n_train_searches}")
print(f"Number of unique srch_id in val_df: {n_val_searches}")


In [None]:
# Identifier, label and outcome columns that are kept out of the model inputs
excluded_cols = {'srch_id', 'prop_id', 'booking_bool', 'click_bool', 'position', 'gross_bookings_usd', 'book_feature'}
feature_cols = [col for col in df.columns if col not in excluded_cols]

label = 'book_feature'

# --- Prepare Training Data ---
# Rows are ordered by srch_id so that the per-search sizes in group_train line up with X/y.
train_split_df = train_split_df.sort_values('srch_id')
X_train, y_train = train_split_df[feature_cols], train_split_df[label]
group_train = train_split_df.groupby('srch_id', sort=False).size().to_list()

print(f"\nNumber of training groups (searches) for actual training: {len(group_train)}")
print(f"Total training samples for actual training: {sum(group_train)}")

# --- Prepare Validation Data ---
# Same layout as the training data: sorted by srch_id, one size entry per search.
val_df = val_df.sort_values('srch_id')
X_val, y_val = val_df[feature_cols], val_df[label]
group_val = val_df.groupby('srch_id', sort=False).size().to_list()

print(f"\nNumber of validation groups (searches): {len(group_val)}")
print(f"Total validation samples: {sum(group_val)}")

# --- Prepare Test Data  ---
X_test = df_test[feature_cols]
# srch_id and prop_id are kept separately because the final output file is built from them
test_ids = df_test.loc[:, ['srch_id', 'prop_id']].copy()

In [None]:
### resampling
# Builds an oversampled copy of the training split (train_df_resampled).
# NOTE(review): the only ranker.fit call that consumes the resampled data is commented out in
# the training cell, so this frame does not affect the model that is actually trained.
# 1. Identify the samples to be oversampled
# Unprivileged group = independent hotels (prop_brand_bool == 0)
# Positive outcomes = clicked or booked (book_feature > 0)
indep_hotel_pos = train_split_df[
    (train_split_df['prop_brand_bool'] == 0) & (train_split_df[label] > 0)
].copy()

print(f"Number of independent hotels clicked/booked (book_feature > 0): {len(indep_hotel_pos)}")

# 2. Define oversampling factor
# Number of EXTRA rows to add, expressed as a fraction of the rows selected above
# (0.5 adds half as many bootstrap rows as there are positive independent-hotel rows).
oversample_factor = 0.5

# Create the oversampled data
n_samples_to_add = int(len(indep_hotel_pos) * oversample_factor)
oversampled_data = resample(
    indep_hotel_pos,               # DataFrame to sample from
    replace=True,                  # Sample with replacement (bootstrap)
    n_samples=n_samples_to_add,    # Number of samples to generate
    random_state=42                # For reproducibility
)

print(f"Number of samples to add via oversampling: {n_samples_to_add}")

# 3. Create the new resampled training DataFrame
# The duplicated rows keep their srch_id, so they are appended to existing searches;
# ignore_index=True renumbers the combined frame from 0.
train_df_resampled = pd.concat([train_split_df, oversampled_data], ignore_index=True)
print(f"Resampled training data size: {len(train_df_resampled)}")

In [None]:
# 2. Sort the resampled data again
# The appended duplicates sit at the end of the frame; sorting brings every srch_id back into
# one contiguous run so that the per-search sizes below match the row order.
train_df_resampled = train_df_resampled.sort_values('srch_id', kind='mergesort') # Use 'mergesort' for stable sort

# Feature matrix, labels and per-search group sizes for the resampled training data
X_train_resampled = train_df_resampled[feature_cols]
y_train_resampled = train_df_resampled[label]
group_train_resampled = train_df_resampled.groupby('srch_id', sort=False).size().to_list()


print(f"Resampled X_train shape: {X_train_resampled.shape}")
print(f"Resampled y_train length: {len(y_train_resampled)}")

In [None]:
# 3. Train LGBMRanker Model
# The active fit call uses the original (non-resampled) training split; the oversampled
# variant is kept further down as a commented-out alternative.
# NOTE(review): the model file and bias plot saved later are named "biasmitigation" / "after",
# which suggests the resampled fit may have been intended — confirm which call should be active.
print("\nTraining LGBMRanker model...")

ranker = lgb.LGBMRanker(
    objective="lambdarank",  # Core objective for learning to rank
    metric="ndcg",           # Evaluation metric (Normalized Discounted Cumulative Gain)
    n_estimators=1000,        # Upper bound on boosting rounds (early stopping may end sooner)
    learning_rate=0.1,
    max_depth=-1,           # No limit on depth
    num_leaves=62,          # Number of leaves in each tree
    importance_type='gain',
    label_gain=[0, 1, 5],      # The gain for unbooked (0) is 0, click (1) is 1,book(2) is 5
    random_state=42,
    n_jobs=-1#,     # Use all available cores
    #boosting='dart'
    # Add other parameters as needed, e.g., num_leaves, max_depth, reg_alpha, reg_lambda
)

ranker.fit(
    X_train,
    y_train,
    group=group_train,
    eval_set=[(X_train, y_train), (X_val, y_val)], # Training and validation sets are both evaluated
    eval_group=[group_train, group_val],          # Group sizes for each eval set, in the same order
    eval_names=['train', 'valid'],                # Names for the eval sets
    eval_at=[5, 10],                              # Evaluate NDCG@k
    callbacks=[lgb.early_stopping(200)] # Early stopping with stopping_rounds=200
)

# Alternative: train on the oversampled data while evaluating on the untouched splits.
# ranker.fit(
#     X_train_resampled,
#     y_train_resampled,
#     group=group_train_resampled,
#     eval_set=[(X_train, y_train), (X_val, y_val)], # Use validation set here
#     eval_group=[group_train, group_val],          # Group info for validation set
#     eval_names=['train', 'valid'],                # Names for the eval sets
#     eval_at=[5, 10],                              # Evaluate NDCG@k
#     callbacks=[lgb.early_stopping(200)] # Adjusted early stopping
# )

print("Model training complete.")
print("\nFeature Importances:")
# importance_type='gain' was set on the ranker, so these values are gain-based importances
feature_importances = pd.Series(ranker.feature_importances_, index=feature_cols)
print(feature_importances.sort_values(ascending=False))

In [None]:
# Rank the features by importance and keep only the strongest `top_n` for a readable chart
top_n = 20
sorted_importances = feature_importances.sort_values(ascending=True)
top_importances = sorted_importances.tail(top_n)

# Horizontal bar chart drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x=top_importances.values, y=top_importances.index, palette="viridis", ax=ax)

ax.set_title(f'Top {top_n} Feature Importances (Importance Type: Gain)', fontsize=16)
ax.set_xlabel('Total Gain', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
fig.tight_layout()  # keep the long feature names inside the canvas

fig.savefig("../image/lgbm_feature_importance.png")
print("Feature importance plot saved as lgbm_feature_importance.png")

# The figure is written to disk only; close it so it is not rendered or kept in memory
plt.close(fig)

In [None]:
# 4. Make Predictions on Test Data
print("\nPredicting on test data...")
# ranker.predict returns one score per row of X_test; a higher score means the row is ranked
# as more relevant within its search.
test_predictions = ranker.predict(X_test)
# X_test and test_ids were both taken from df_test without reordering, so the scores line up
# row by row. test_ids now has the columns srch_id, prop_id, predicted_score.
test_ids['predicted_score'] = test_predictions

In [None]:
# 5. Rank Properties and Generate Output File
print("Ranking properties and generating output file...")

# Searches in ascending srch_id order; inside each search the best-scored property comes first
test_ids_sorted = test_ids.sort_values(by=['srch_id', 'predicted_score'], ascending=[True, False])

# The output file only carries the identifiers; the row order encodes the ranking
output_df = test_ids_sorted.loc[:, ['srch_id', 'prop_id']]

output_filename = "LGBMRanker_predict.csv"
output_df.to_csv(output_filename, index=False)

print(f"\nOutput file '{output_filename}' generated successfully.")
print("Sample of the output file:")
print(output_df.head(10))

In [None]:
# save model
# The underlying Booster is written in LightGBM's text format, relative to the working directory.
# NOTE(review): the "model/" directory must already exist — confirm, or create it beforehand.
model_filename_lgb = r'model/lgbm_ranker_model_biasmitigation.txt'
ranker.booster_.save_model(model_filename_lgb)
print(f"save model as LightGBM format: {model_filename_lgb}")

# load model (disabled sketch)
# NOTE(review): the file name below differs from model_filename_lgb saved above — adjust before use.
# loaded_ranker_lgb = lgb.Booster(model_file='lgbm_ranker_model.txt')
# ranker_sklearn_loaded = lgb.LGBMRanker() # new ranker instance
# ranker_sklearn_loaded.booster_ = loaded_ranker_lgb #



In [None]:
# --- 1. Get Predictions for the Validation Set ---
# val_df itself is left untouched; the scored rows live in the separate val_df_with_preds frame.
if 'predicted_score' in val_df.columns:
    # Scores are already attached, so just take a working copy
    val_df_with_preds = val_df.copy()
else:
    val_predictions = ranker.predict(X_val)
    val_df_with_preds = val_df.assign(predicted_score=val_predictions)

In [None]:
# --- 2. Demographic Parity on Predicted Scores ---
# Compares the mean predicted score (and the mean true label) of independent hotels
# against chain hotels on the validation set.
# Separate into groups based on prop_brand_bool
independent_hotels = val_df_with_preds[val_df_with_preds['prop_brand_bool'] == 0]
chain_hotels = val_df_with_preds[val_df_with_preds['prop_brand_bool'] == 1]

avg_score_independent = independent_hotels['predicted_score'].mean()
avg_score_chain = chain_hotels['predicted_score'].mean()

print(f"Average predicted score for Independent Hotels (prop_brand_bool=0): {avg_score_independent:.4f}")
print(f"Average predicted score for Major Chain Hotels (prop_brand_bool=1): {avg_score_chain:.4f}")

# NOTE(review): ranker scores are not guaranteed to be positive, so the printed ratio can be
# negative or very large when a group mean is near zero — read the difference first.
if avg_score_chain > avg_score_independent:
    print(f"Chain hotels receive, on average, higher scores by {avg_score_chain - avg_score_independent:.4f}.")
    print(f"Ratio (Chain/Independent): {avg_score_chain / avg_score_independent:.4f}")
else:
    print(f"Independent hotels receive, on average, higher scores by {avg_score_independent - avg_score_chain:.4f}.")
    print(f"Ratio (Independent/Chain): {avg_score_independent / avg_score_chain:.4f}")

print("Consider also comparing this to the average true relevance ('book_feature') for these groups.")
avg_true_relevance_independent = independent_hotels['book_feature'].mean()
avg_true_relevance_chain = chain_hotels['book_feature'].mean()
print(f"Average true relevance for Independent Hotels: {avg_true_relevance_independent:.4f}")
print(f"Average true relevance for Major Chain Hotels: {avg_true_relevance_chain:.4f}\n")

# Share of independent-hotel rows with book_feature == 1 (clicked but not booked).
# The mean of the boolean mask equals count / total and evaluates to NaN for an empty group,
# whereas dividing by len(independent_hotels) raised ZeroDivisionError in that case.
pro_click_independent = (independent_hotels['book_feature'] == 1).mean()


In [None]:
# --- 3. Visualizations ---

# Distribution of predicted scores by brand_bool
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=val_df_with_preds, x='predicted_score', hue='prop_brand_bool', kde=True, palette={0: 'blue', 1: 'red'})
plt.title('Distribution of Predicted Scores by Hotel Brand Type')
plt.xlabel('Predicted Score')
# histplot draws counts unless a different `stat` is requested, so the axis is labelled as such
plt.ylabel('Count')

# Relabel the legend that seaborn already built for the hue variable instead of replacing it
# with plt.legend(labels=...): a replacement legend pairs the given labels with whichever
# artists matplotlib finds first, so the label-to-colour match is not guaranteed.
hotel_type_labels = {0: 'Independent (0)', 1: 'Chain (1)'}
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Hotel Type')
    for legend_text in hue_legend.get_texts():
        try:
            hue_level = int(float(legend_text.get_text()))
        except ValueError:
            # Unexpected hue label: leave seaborn's own text in place
            continue
        legend_text.set_text(hotel_type_labels.get(hue_level, legend_text.get_text()))
# plt.show()
# Save the plot
plt.savefig("../image/ass2_bias2_after.png")
plt.close()

In [None]:
# # Ensure 'prop_brand_bool' and 'test_predictions' are aligned and ready
# # It's often easiest to work if they are in the same DataFrame for filtering
# # but we can also work with them as separate aligned arrays/series.
#
# # Assuming df_test['prop_brand_bool'] is aligned with test_predictions
# prop_brand_status = df_test['prop_brand_bool'].values # Get as numpy array for direct boolean indexing
#
# # --- Calculate Mean Predicted Score for prop_brand_bool == 0 (Non-Brand) ---
# scores_non_brand = test_predictions[prop_brand_status == 0]
# mean_score_non_brand = np.mean(scores_non_brand) if scores_non_brand.size > 0 else np.nan
#
# # --- Calculate Mean Predicted Score for prop_brand_bool == 1 (Brand) ---
# scores_brand = test_predictions[prop_brand_status == 1]
# mean_score_brand = np.mean(scores_brand) if scores_brand.size > 0 else np.nan
#
# # --- Print Results ---
# print(f"Mean predicted score for items with prop_brand_bool = 0 (Non-Brand): {mean_score_non_brand:.4f}")
# print(f"Mean predicted score for items with prop_brand_bool = 1 (Brand):     {mean_score_brand:.4f}")