### Learning to rank with XGB - basic example

- based on small subset
- amenities not merged
- only left integer/float variables, no feature pre-processing


code folllows this example https://www.kaggle.com/code/azimuthal01/learning-to-rank

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/expedia/exploded_df.csv")

In [22]:
df.head()

Unnamed: 0,user_id,search_id,search_timestamp,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,...,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,num_clicks,is_trans,qid,relevance
0,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,250.0,2.0,1,0,3.0,0,0,1,0
1,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,1000.0,2.0,1,0,4.0,0,0,1,0
2,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,5.0,700.0,3.0,1,0,3.0,0,0,1,0
3,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,250.0,2.0,1,0,3.0,0,0,1,0
4,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,625.0,2.0,0,0,1.0,0,0,1,0


In [4]:
# create integer query ID

group_map = {}
counter = 1
for group in df['search_id'].unique():
    group_map[group] = counter
    counter += 1

# Apply the mapping to create a new column with integer identifiers
df['qid'] = df['search_id'].map(group_map)

df.head()

Unnamed: 0,user_id,search_id,search_timestamp,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,...,is_travel_ad,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,num_clicks,is_trans,qid
0,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,1,4.0,250.0,2.0,1,0,3.0,0,0,1
1,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,1,4.0,1000.0,2.0,1,0,4.0,0,0,1
2,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,0,5.0,700.0,3.0,1,0,3.0,0,0,1
3,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,0,4.0,250.0,2.0,1,0,3.0,0,0,1
4,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,0,4.0,625.0,2.0,0,0,1.0,0,0,1


In [5]:
### create relevance label
# 1 for clicked on
# 2 for transaction

df['relevance'] = 0  # Initialize with default value
df.loc[df['num_clicks'] > 0, 'relevance'] = 1  # Set value to 1 if first_column > 0
df.loc[df['is_trans'] > 0, 'relevance'] += 2  # Add 2 if second_column > 0

df.head(10)

Unnamed: 0,user_id,search_id,search_timestamp,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,...,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,num_clicks,is_trans,qid,relevance
0,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,250.0,2.0,1,0,3.0,0,0,1,0
1,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,1000.0,2.0,1,0,4.0,0,0,1,0
2,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,5.0,700.0,3.0,1,0,3.0,0,0,1,0
3,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,250.0,2.0,1,0,3.0,0,0,1,0
4,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,625.0,2.0,0,0,1.0,0,0,1,0
5,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,5.0,525.0,3.0,1,0,5.0,0,0,1,0
6,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,75.0,3.0,1,1,2.0,1,0,1,1
7,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,1000.0,2.0,1,0,4.0,0,0,1,0
8,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,250.0,3.0,1,0,2.0,1,0,1,1
9,001bbff3030c95740d4c55bb72e823e9,541933a931216d270f53277c2de068d0,2021-07-02T19:38:00.000Z,2,1,0,898,2021-07-04,2021-07-05,2,...,4.0,575.0,2.0,1,1,4.0,0,0,1,0


In [6]:
df.relevance.mean()

0.010812402709411419

In [7]:
## keep only integer/float columns for XGB

df_model = df.drop(labels = ["user_id", "search_id", "search_timestamp",
                             "checkin_date", "checkout_date", "num_clicks",
                             "is_trans", "sort_type", "applied_filters", "rank"],
                    axis=1)

# if i exclude the initial rank from the data - do i ignore the potential effect
#  on relevance (clicks) the initial ranking has on the relevance proxies?

df_model.head()

Unnamed: 0,point_of_sale,geo_location_country,is_mobile,destination_id,adult_count,child_count,infant_count,room_count,prop_id,is_travel_ad,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,qid,relevance
0,2,1,0,898,2,0,0,1,5695234,1,4.0,250.0,2.0,1,0,3.0,1,0
1,2,1,0,898,2,0,0,1,1734546,1,4.0,1000.0,2.0,1,0,4.0,1,0
2,2,1,0,898,2,0,0,1,5675938,0,5.0,700.0,3.0,1,0,3.0,1,0
3,2,1,0,898,2,0,0,1,5695234,0,4.0,250.0,2.0,1,0,3.0,1,0
4,2,1,0,898,2,0,0,1,2012023,0,4.0,625.0,2.0,0,0,1.0,1,0


In [8]:
df_model.shape

(71307, 18)

In [9]:
# separate train and test by qid, so same queries stay together

train_df = df_model[df_model["qid"] >= 200]
train_df.shape

(55480, 18)

In [10]:
test_df = train_df = df_model[df_model["qid"] < 200]
test_df.shape

(15827, 18)

In [11]:
X = train_df[train_df.columns[0:16]]
y = train_df["relevance"]

In [12]:
X.head()

Unnamed: 0,point_of_sale,geo_location_country,is_mobile,destination_id,adult_count,child_count,infant_count,room_count,prop_id,is_travel_ad,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket
0,2,1,0,898,2,0,0,1,5695234,1,4.0,250.0,2.0,1,0,3.0
1,2,1,0,898,2,0,0,1,1734546,1,4.0,1000.0,2.0,1,0,4.0
2,2,1,0,898,2,0,0,1,5675938,0,5.0,700.0,3.0,1,0,3.0
3,2,1,0,898,2,0,0,1,5695234,0,4.0,250.0,2.0,1,0,3.0
4,2,1,0,898,2,0,0,1,2012023,0,4.0,625.0,2.0,0,0,1.0


In [13]:
X_test = test_df[train_df.columns[0:16]]
y_test = test_df["relevance"]

In [14]:
g = train_df.groupby(by="qid")
size = g.size()
group_train = size.to_list()

g = test_df.groupby(by="qid")
size = g.size()
group_valid = size.to_list()

In [15]:
import xgboost as xgb

In [16]:
ranker = xgb.XGBRanker(
        n_estimators=100,
        learning_rate=0.1,
        objective='rank:ndcg',
        reg_lambda=0.05,
        # verbose = True,
        tree_method = 'gpu_hist'
    )

ranker.fit(
    X.astype(float),
    y.astype(int),
    group=group_train,
    eval_group=[group_valid],
    eval_set=[(X_test.astype(float), y_test.astype(int))],
    early_stopping_rounds=100,
    verbose = True
)


    E.g. tree_method = "hist", device = "cuda"



[0]	validation_0-ndcg@32:0.91034
[1]	validation_0-ndcg@32:0.92574
[2]	validation_0-ndcg@32:0.93421
[3]	validation_0-ndcg@32:0.94623
[4]	validation_0-ndcg@32:0.95040
[5]	validation_0-ndcg@32:0.95596
[6]	validation_0-ndcg@32:0.95542
[7]	validation_0-ndcg@32:0.95431
[8]	validation_0-ndcg@32:0.95903
[9]	validation_0-ndcg@32:0.96669
[10]	validation_0-ndcg@32:0.96828
[11]	validation_0-ndcg@32:0.96894
[12]	validation_0-ndcg@32:0.97006
[13]	validation_0-ndcg@32:0.97122
[14]	validation_0-ndcg@32:0.97146
[15]	validation_0-ndcg@32:0.97183
[16]	validation_0-ndcg@32:0.97162
[17]	validation_0-ndcg@32:0.97265
[18]	validation_0-ndcg@32:0.97465
[19]	validation_0-ndcg@32:0.97480
[20]	validation_0-ndcg@32:0.97638
[21]	validation_0-ndcg@32:0.97636
[22]	validation_0-ndcg@32:0.97766
[23]	validation_0-ndcg@32:0.97752
[24]	validation_0-ndcg@32:0.97800
[25]	validation_0-ndcg@32:0.97989
[26]	validation_0-ndcg@32:0.97998
[27]	validation_0-ndcg@32:0.98004
[28]	validation_0-ndcg@32:0.98022
[29]	validation_0-ndcg@3

In [17]:
test_pred = ranker.predict(X_test.astype(float))
y_test_df = pd.DataFrame({"relevance_score": y_test, "predicted_ranking": test_pred})


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [18]:
true_relevance = y_test.sort_values(ascending=False)
relevance_score = y_test_df.sort_values("predicted_ranking", ascending=False)

In [19]:
import numpy as np
from sklearn.metrics import ndcg_score

In [20]:
print(
        "nDCG score: ",
        ndcg_score(
            [true_relevance.to_numpy()], [relevance_score["relevance_score"].to_numpy()]
        ),
    )

print(
        "nDCG score @ 5: ",
        ndcg_score(
            y_true = [true_relevance.to_numpy()], y_score = [relevance_score["relevance_score"].to_numpy()], k=5
        ),
    )

print(
        "nDCG score @ 10: ",
        ndcg_score(
            y_true = [true_relevance.to_numpy()], y_score = [relevance_score["relevance_score"].to_numpy()], k=10
        ),
    )

print(
        "nDCG score @ 50: ",
        ndcg_score(
            y_true = [true_relevance.to_numpy()], y_score = [relevance_score["relevance_score"].to_numpy()], k=50
        ),
    )

nDCG score:  0.6098304701119245
nDCG score @ 5:  0.3485784195172989
nDCG score @ 10:  0.375954803951898
nDCG score @ 50:  0.44183947328436
