### Learning to rank with XGB and LightGBM

in case of questions on this notebook: contact Lorenz Schmid, schmid.lorenz@web.de
- Settings in Google Colab: T4 GPU, RAM: 50.99 GB

- 1 Million rows - Top 500 destinations

- loss is NDCG-informed

- initial rank and rank_noad are not used for ranking

- Feature and row selection as in https://docs.google.com/document/d/1R9itgngZUNzkBqWzXstLiBXXoYJa8-ahS66JrqC3kDk/edit

- after feature selection: check for queries with one row/no click/transaction
- preprocessed: no queries with only one result, numerical variables scaled
- click and transaction in one relevance label (click weight: 1, transaction weight: 2)



XGB code based on this example: https://xgboost.readthedocs.io/en/stable/tutorials/learning_to_rank.html

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the prepared data subset; the CSV has to be part of the shared data for this notebook to run.

import pandas as pd
# Colab alternative: "/content/drive/MyDrive/expedia/millionrows_Top500Destinations.csv"
DATA_PATH = "path/millionrows_Top500Destinations.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

Unnamed: 0,user_id,search_id,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,child_count,...,HighSpeedInternet,HotTub,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi
0,2,2,1,1,0,49,2021-07-07,2021-07-12,2,0,...,True,False,True,False,False,False,False,False,False,False
1,2,2,1,1,0,49,2021-07-07,2021-07-12,2,0,...,True,False,True,True,False,False,False,True,False,False
2,2,2,1,1,0,49,2021-07-07,2021-07-12,2,0,...,True,False,True,True,True,False,False,True,False,True
3,2,2,1,1,0,49,2021-07-07,2021-07-12,2,0,...,False,False,True,True,True,False,False,False,False,True
4,2,2,1,1,0,49,2021-07-07,2021-07-12,2,0,...,False,False,True,True,True,False,False,False,False,False


In [None]:
df.shape

(2049518, 49)

In [None]:
df.dtypes

user_id                        int64
search_id                      int64
point_of_sale                  int64
geo_location_country           int64
is_mobile                      int64
destination_id                 int64
checkin_date                  object
checkout_date                 object
adult_count                    int64
child_count                    int64
infant_count                   int64
room_count                     int64
sort_type                     object
length_of_stay               float64
booking_window               float64
search_day_of_week             int64
search_hour_of_day             int64
checkin_day                    int64
checkout_day                   int64
rank                           int64
prop_id                        int64
is_travel_ad                   int64
review_rating                float64
review_count                 float64
star_rating                   object
is_free_cancellation           int64
is_drr                         int64
p

In [None]:
# number of distinct queries (search_id) in the loaded data
df["search_id"].nunique(dropna=False)

75642

In [None]:
len(df["user_id"].unique())

48716

#### drop columns not needed - first step

In [None]:
# columns that are not used for modelling (first pass)
unused_columns = [
    "search_day_of_week",
    "search_hour_of_day",
    "checkout_date",
    "infant_count",
    "room_count",
    "rank",
    "user_id",
    "rank_noad",
]
df = df.drop(unused_columns, axis=1)

df.shape

(2049518, 41)

In [None]:
## create relevance label
# 0 = no interaction, 1 = clicked, 2 = transaction (a transaction overrides a click)

was_clicked = df["num_clicks"] > 0
was_booked = df["is_trans"] > 0

df["relevance"] = 0                   # default: not relevant
df.loc[was_clicked, "relevance"] = 1  # clicked results
df.loc[was_booked, "relevance"] = 2   # booked results, written last so they win over clicks


In [None]:
### drop num clicks and is_trans
df = df.drop(columns = ["is_trans", "num_clicks"])
df.shape

(2049518, 40)

In [None]:
#### barplot of relevance labels
import matplotlib.pyplot as plt
frequency_counts = df["relevance"].value_counts()

In [None]:
frequency_counts

relevance
0    1908184
1     131053
2      10281
Name: count, dtype: int64

#### Drop rows based on conditions - new data set object is created

In [None]:
## what are values in sort_type
df["sort_type"].value_counts()

sort_type
RECOMMENDED                      1881566
PRICE ASCENDING                   107997
STAR RATING DESCENDING             15118
GUEST RATINGS DESCENDING           14661
SPECIAL - DIRECT HOTEL SEARCH      11913
DEALS                              10316
VACATION RENTAL                     7947
Name: count, dtype: int64

In [None]:
# restrict the modelling data to rows whose sort_type is "RECOMMENDED"
is_recommended = df["sort_type"].eq("RECOMMENDED")
df_model = df.loc[is_recommended]
df_model.shape

(1881566, 40)

In [None]:
#### are there sort_type other than recommended
df_model["sort_type"].value_counts()

sort_type
RECOMMENDED    1881566
Name: count, dtype: int64

In [None]:
df_model = df_model.drop(columns = "sort_type")

In [None]:
df_model["star_rating"].value_counts()

star_rating
4.0              715088
3.0              627137
5.0              283367
2.0              133366
0                 85191
Not Available     36587
1.0                 793
0.0                  37
Name: count, dtype: int64

In [None]:
### remove rows whose star rating is "Not Available" (the remaining values are made numeric in a later cell)
# .copy() turns the filtered result into an independent frame, so assigning to
# its columns afterwards does not write to a slice of the previous frame
# (which pandas reports as SettingWithCopyWarning).
df_model = df_model[df_model["star_rating"] != "Not Available"].copy()
df_model["star_rating"].value_counts()

star_rating
4.0    715088
3.0    627137
5.0    283367
2.0    133366
0       85191
1.0       793
0.0        37
Name: count, dtype: int64

In [None]:
# transfer star rating to a numeric value
df_model["star_rating"] = pd.to_numeric(df_model["star_rating"])

In [None]:
df_model["star_rating"].value_counts()

star_rating
4.0    715088
3.0    627137
5.0    283367
2.0    133366
0.0     85228
1.0       793
Name: count, dtype: int64

In [None]:
df_model["review_rating"].value_counts()

review_rating
4.0    1206031
5.0     493645
3.0     107968
0.0      27600
2.0       9032
1.0        703
Name: count, dtype: int64

In [None]:
## a review rating of 0 indicates that no rating could be given -> drop those rows
has_review_rating = df_model["review_rating"].ne(0.0)
df_model = df_model.loc[has_review_rating]
df_model["review_rating"].value_counts()

review_rating
4.0    1206031
5.0     493645
3.0     107968
2.0       9032
1.0        703
Name: count, dtype: int64

In [None]:
## rows without valid values are already dropped with review_rating
df_model['review_count'].value_counts()

review_count
1025.0     358711
1000.0     156005
25.0       116550
50.0        46534
75.0        35742
            ...  
22100.0         3
27775.0         3
27750.0         2
14325.0         2
14675.0         1
Name: count, Length: 747, dtype: int64

In [None]:
df_model['price_bucket'].value_counts()

price_bucket
4.0    389951
3.0    383366
2.0    369453
5.0    354746
1.0    319863
Name: count, dtype: int64

In [None]:
import numpy as np

In [None]:
df_model["price_bucket"].mean()

3.049667130521482

#### Exclude ads

In [None]:
df_model['is_travel_ad'].value_counts()

is_travel_ad
0    1633412
1     183967
Name: count, dtype: int64

In [None]:
# keep only the rows that are not travel ads
is_organic = df_model['is_travel_ad'].eq(0)
df_model = df_model.loc[is_organic]
df_model.shape

(1633412, 39)

In [None]:
df_model = df_model.drop(columns = "is_travel_ad")

In [None]:
#df_model['rank_noad'].value_counts()

#### Extract month from checkin_date

In [None]:
df_model["in_date"] = pd.to_datetime(df_model["checkin_date"])
df_model["in_date"]

0         2021-07-07
3         2021-07-07
4         2021-07-07
5         2021-07-07
6         2021-07-07
             ...    
2049513   2021-06-21
2049514   2021-06-21
2049515   2021-06-21
2049516   2021-06-21
2049517   2021-06-21
Name: in_date, Length: 1633412, dtype: datetime64[ns]

In [None]:
# Extract the month
df_model["checkin_month"] = df_model["in_date"].dt.month

In [None]:
# transform month to category
df_model["checkin_month"] = df_model["checkin_month"].astype("category")

In [None]:
# drop string and date time of checkin
df_model = df_model.drop(columns = ["checkin_date", "in_date"])

#### Transform amenities from strings into 0/1 integer indicators

In [None]:
### replace True False strings with 1 and 0
# NOTE(review): replace() runs on the whole frame, so a cell equal to the string
# "True" or "False" in ANY column is rewritten, not only in the amenity columns.

df_model = df_model.replace({"True": 1, "False": 0})

In [None]:
df_model.head()

Unnamed: 0,search_id,point_of_sale,geo_location_country,is_mobile,destination_id,adult_count,child_count,length_of_stay,booking_window,checkin_day,...,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,relevance,checkin_month
0,2,1,1,0,49,2,0,5.0,26.0,2,...,1,0,0,0,0,0,0,0,1,7
3,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,1,0,0,0,0,1,0,7
4,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,1,0,0,0,0,0,0,7
5,2,1,1,0,49,2,0,5.0,26.0,2,...,1,0,1,0,0,0,0,0,0,7
6,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,0,0,0,0,0,0,0,7


In [None]:
# amenity indicator columns
amenities = [
    'AirConditioning',
    'AirportTransfer',
    'Bar',
    'FreeAirportTransportation',
    'FreeBreakfast',
    'FreeParking',
    'FreeWiFi',
    'Gym',
    'HighSpeedInternet',
    'HotTub',
    'LaundryFacility',
    'Parking',
    'PetsAllowed',
    'PrivatePool',
    'SpaServices',
    'SwimmingPool',
    'WasherDryer',
    'WiFi',
]

# cast every amenity column to an integer dtype in one go
df_model = df_model.astype({amenity: 'int' for amenity in amenities})

In [None]:
# check-in / check-out day are treated as categorical features
# (is_mobile, is_free_cancellation and is_drr are intentionally left as they are)
for day_column in ('checkin_day', 'checkout_day'):
    df_model[day_column] = df_model[day_column].astype('category')

In [None]:
df_model.dtypes

search_id                       int64
point_of_sale                   int64
geo_location_country            int64
is_mobile                       int64
destination_id                  int64
adult_count                     int64
child_count                     int64
length_of_stay                float64
booking_window                float64
checkin_day                  category
checkout_day                 category
prop_id                         int64
review_rating                 float64
review_count                  float64
star_rating                   float64
is_free_cancellation            int64
is_drr                          int64
price_bucket                  float64
AirConditioning                 int64
AirportTransfer                 int64
Bar                             int64
FreeAirportTransportation       int64
FreeBreakfast                   int64
FreeParking                     int64
FreeWiFi                        int64
Gym                             int64
HighSpeedInt

In [None]:
## keep only integer/float columns for XGB
df_model.head()

Unnamed: 0,search_id,point_of_sale,geo_location_country,is_mobile,destination_id,adult_count,child_count,length_of_stay,booking_window,checkin_day,...,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,relevance,checkin_month
0,2,1,1,0,49,2,0,5.0,26.0,2,...,1,0,0,0,0,0,0,0,1,7
3,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,1,0,0,0,0,1,0,7
4,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,1,0,0,0,0,0,0,7
5,2,1,1,0,49,2,0,5.0,26.0,2,...,1,0,1,0,0,0,0,0,0,7
6,2,1,1,0,49,2,0,5.0,26.0,2,...,1,1,0,0,0,0,0,0,0,7


In [None]:
categorical_features = df_model.select_dtypes(include=['category']).columns.tolist()
# numeric_features = df_model.select_dtypes(exclude=['category']).columns.tolist()


categorical_features

['checkin_day', 'checkout_day', 'checkin_month']

In [None]:
df_model.shape

(1633412, 38)

In [None]:
# drop rows with NaN values
# there should be no more rows dropped
df_model = df_model.dropna()

df_model.shape

(1633412, 38)

In [None]:
#### check data before test - train split

# Queries (search_id) that consist of a single row are removed:
# NDCG cannot be calculated for them.
group_counts = df_model["search_id"].value_counts()       # rows per query
unique_groups = group_counts.index[group_counts.eq(1)]    # ids of one-row queries

is_single_row_query = df_model["search_id"].isin(unique_groups)
filtered_df = df_model.loc[~is_single_row_query]
filtered_df.shape

(1633131, 38)

In [None]:
df_model = filtered_df

In [None]:
### drop property id, should not be used as feature

df_model = df_model.drop(columns = "prop_id")

#### Train test split

- check here for qids with only one row/ queries with only zeros in relevance

In [None]:
from sklearn.model_selection import GroupShuffleSplit

def split_by_qid(df, test_size):
    """Split df into a train and a test frame without splitting any query.

    The distinct values of the 'search_id' column are shuffled and divided
    (fixed random_state=42); every row follows its search_id into one side.

    Parameters: df - frame with a 'search_id' column; test_size - forwarded
    to GroupShuffleSplit.
    Returns: tuple (train_frame, test_frame).
    """
    query_ids = df['search_id'].unique()
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=42)
    # each id is its own group, so the split happens on the level of whole queries
    train_pos, test_pos = next(splitter.split(query_ids, groups=query_ids))

    in_train = df['search_id'].isin(query_ids[train_pos])
    in_test = df['search_id'].isin(query_ids[test_pos])
    return df[in_train], df[in_test]

#_, temp_df = split_by_qid(df2, test_size=0.2)
train_df, test_df = split_by_qid(df_model, test_size=0.3)

train_df.shape, test_df.shape

((1142054, 37), (491077, 37))

In [None]:
# the training rows have to be ordered by query id for XGBRanker
train_df = train_df.sort_values("search_id")

# query id per row and relevance label (the outcome variable)
qid_train = train_df["search_id"]
y_train = train_df["relevance"]

# everything else is a feature
X_train = train_df.drop(["search_id", "relevance"], axis=1)

In [None]:
X_train.dtypes

point_of_sale                   int64
geo_location_country            int64
is_mobile                       int64
destination_id                  int64
adult_count                     int64
child_count                     int64
length_of_stay                float64
booking_window                float64
checkin_day                  category
checkout_day                 category
review_rating                 float64
review_count                  float64
star_rating                   float64
is_free_cancellation            int64
is_drr                          int64
price_bucket                  float64
AirConditioning                 int64
AirportTransfer                 int64
Bar                             int64
FreeAirportTransportation       int64
FreeBreakfast                   int64
FreeParking                     int64
FreeWiFi                        int64
Gym                             int64
HighSpeedInternet               int64
HotTub                          int64
LaundryFacil

In [None]:
# bring the test data into the same layout as the training data
test_df = test_df.sort_values("search_id")

qid_test = test_df["search_id"]
y_test = test_df["relevance"]
# label together with its query id, used for the per-query evaluation
y_test_pred = test_df[["relevance", "search_id"]]

X_test = test_df.drop(["search_id", "relevance"], axis=1)

In [None]:
feature_dtypes = X_train.dtypes
categorical_features = X_train.select_dtypes(include=['category']).columns.tolist()
numeric_features = X_train.select_dtypes(exclude=['category']).columns.tolist()

# these id-like columns are encoded separately, so they are taken out of the numeric list
for encoded_separately in ("destination_id", "geo_location_country", "point_of_sale"):
    numeric_features.remove(encoded_separately)

del feature_dtypes
categorical_features, numeric_features

(['checkin_day', 'checkout_day', 'checkin_month'],
 ['is_mobile',
  'adult_count',
  'child_count',
  'length_of_stay',
  'booking_window',
  'review_rating',
  'review_count',
  'star_rating',
  'is_free_cancellation',
  'is_drr',
  'price_bucket',
  'AirConditioning',
  'AirportTransfer',
  'Bar',
  'FreeAirportTransportation',
  'FreeBreakfast',
  'FreeParking',
  'FreeWiFi',
  'Gym',
  'HighSpeedInternet',
  'HotTub',
  'LaundryFacility',
  'Parking',
  'PetsAllowed',
  'PrivatePool',
  'SpaServices',
  'SwimmingPool',
  'WasherDryer',
  'WiFi'])

#### Feature pre-processing

In [None]:
!pip install category_encoders



In [None]:
import sklearn
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
### encode categorical with high cardinality
sklearn.set_config(transform_output="pandas")
from category_encoders.target_encoder import TargetEncoder



In [None]:
### target-encode destination_id, geo_location_country and point_of_sale
# the encoder is fitted on the training data only and then applied to both splits
id_columns = ["destination_id", "geo_location_country", "point_of_sale"]
enc = TargetEncoder(cols=id_columns)
enc.fit(X_train, y_train)

X_train, X_test = enc.transform(X_train), enc.transform(X_test)

In [None]:
X_train.dtypes

point_of_sale                 float64
geo_location_country          float64
is_mobile                       int64
destination_id                float64
adult_count                     int64
child_count                     int64
length_of_stay                float64
booking_window                float64
checkin_day                  category
checkout_day                 category
review_rating                 float64
review_count                  float64
star_rating                   float64
is_free_cancellation            int64
is_drr                          int64
price_bucket                  float64
AirConditioning                 int64
AirportTransfer                 int64
Bar                             int64
FreeAirportTransportation       int64
FreeBreakfast                   int64
FreeParking                     int64
FreeWiFi                        int64
Gym                             int64
HighSpeedInternet               int64
HotTub                          int64
LaundryFacil

In [None]:
# remove ordinal features before scaling of numeric
# most binary features are already 0, 1 encoded

numeric_features = ['adult_count',
 'child_count',
 'length_of_stay',
 'booking_window',
 'review_count']

numeric_features, categorical_features

(['adult_count',
  'child_count',
  'length_of_stay',
  'booking_window',
  'review_count'],
 ['checkin_day', 'checkout_day', 'checkin_month'])

In [None]:
# standardise the numeric features; statistics come from the training data only
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])

X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [None]:
# create dummies for categorical

X_train = pd.get_dummies(X_train, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features)

# Train and test are encoded independently, so make sure the test matrix has
# exactly the training columns in the same order: a dummy column missing in
# the test split is added as all-False, a column unknown to training is dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=False)

In [None]:
X_train.head()

Unnamed: 0,point_of_sale,geo_location_country,is_mobile,destination_id,adult_count,child_count,length_of_stay,booking_window,review_rating,review_count,...,checkin_month_3,checkin_month_4,checkin_month_5,checkin_month_6,checkin_month_7,checkin_month_8,checkin_month_9,checkin_month_10,checkin_month_11,checkin_month_12
0,0.072653,0.073411,0,0.060241,-0.213174,-0.346087,0.856117,-0.242449,4.0,-0.043666,...,False,False,False,False,True,False,False,False,False,False
27,0.072653,0.073411,0,0.060241,-0.213174,-0.346087,0.856117,-0.242449,5.0,-0.248428,...,False,False,False,False,True,False,False,False,False,False
25,0.072653,0.073411,0,0.060241,-0.213174,-0.346087,0.856117,-0.242449,4.0,-0.092418,...,False,False,False,False,True,False,False,False,False,False
24,0.072653,0.073411,0,0.060241,-0.213174,-0.346087,0.856117,-0.242449,5.0,0.005087,...,False,False,False,False,True,False,False,False,False,False
23,0.072653,0.073411,0,0.060241,-0.213174,-0.346087,0.856117,-0.242449,4.0,-0.131421,...,False,False,False,False,True,False,False,False,False,False


In [None]:
X_train.dtypes

point_of_sale                float64
geo_location_country         float64
is_mobile                      int64
destination_id               float64
adult_count                  float64
child_count                  float64
length_of_stay               float64
booking_window               float64
review_rating                float64
review_count                 float64
star_rating                  float64
is_free_cancellation           int64
is_drr                         int64
price_bucket                 float64
AirConditioning                int64
AirportTransfer                int64
Bar                            int64
FreeAirportTransportation      int64
FreeBreakfast                  int64
FreeParking                    int64
FreeWiFi                       int64
Gym                            int64
HighSpeedInternet              int64
HotTub                         int64
LaundryFacility                int64
Parking                        int64
PetsAllowed                    int64
P

In [None]:
#len(group_train)

In [None]:
import xgboost as xgb

In [None]:
# Starting point is the example code from the xgboost documentation
# (learning to rank, specifically for click data):
# https://xgboost.readthedocs.io/en/stable/python/examples/learning_to_rank.html#sphx-glr-python-examples-learning-to-rank-py

ranker_params = {
    # general boosting settings
    "n_estimators": 100,  # 512
    "tree_method": "hist",
    "device": "cuda",
    "learning_rate": 0.01,
    "reg_lambda": 0.05,
    "subsample": 0.8,
    "sampling_method": "gradient_based",
    # LTR specific parameters
    "objective": "rank:ndcg",
    # - enable bias estimation
    "lambdarank_unbiased": True,
    # - normalization (1 / (norm + 1))
    "lambdarank_bias_norm": 1,
    # - focus on the top 12 documents
    "lambdarank_num_pair_per_sample": 12,
    "lambdarank_pair_method": "topk",
    "ndcg_exp_gain": True,
    "eval_metric": ["ndcg@1", "ndcg@5", "ndcg@10", "ndcg@20", "ndcg@30"],
    # reported as "not used" in the recorded training output
    "lambdarank_normalization": False,
}

ranker = xgb.XGBRanker(**ranker_params)

In [None]:
# ranker = xgb.XGBRanker(
#        n_estimators=1000,
#        learning_rate=0.1,
#        objective='rank:pairwise', # this is RankNet loss or the pairwise logistic loss
#        lambdarank_num_pair_per_sample=25,
#        lambdarank_pair_method="topk",
#        reg_lambda=0.05,
#        # verbose = True,
#        tree_method = 'hist',
#        eval_metric = 'ndcg',
#        eta = 0.3
#    )



In [None]:
# was used to check if there are heavily correlated features

# corr = X_train[["point_of_sale", "geo_location_country", "is_mobile",
#                "destination_id", "adult_count", "child_count", "length_of_stay",
#                "booking_window", "review_rating", "review_count", "star_rating",
#                "is_free_cancellation", "is_drr", "price_bucket"]].corr(numeric_only=True)

In [None]:
import seaborn as sns
# sns.heatmap(corr)

In [None]:
# fit on training data

# Early stopping is configured on the estimator itself: passing
# `early_stopping_rounds` to fit() is deprecated since xgboost 1.6 and was
# removed from fit() in later releases.
ranker.set_params(early_stopping_rounds=15)

# NOTE(review): the test split doubles as the early-stopping set, so the NDCG
# values reported on it are optimistic; a separate validation split would be cleaner.
ranker.fit(
    X_train,
    y_train,
    qid=qid_train,
    eval_set=[(X_test, y_test)],
    eval_qid=[qid_test],
    verbose = True,
)

Parameters: { "lambdarank_normalization" } are not used.



[0]	validation_0-ndcg@1:0.16074	validation_0-ndcg@5:0.27037	validation_0-ndcg@10:0.34229	validation_0-ndcg@20:0.42182	validation_0-ndcg@30:0.45360
[1]	validation_0-ndcg@1:0.16649	validation_0-ndcg@5:0.27450	validation_0-ndcg@10:0.34817	validation_0-ndcg@20:0.42848	validation_0-ndcg@30:0.45800
[2]	validation_0-ndcg@1:0.18196	validation_0-ndcg@5:0.29216	validation_0-ndcg@10:0.36545	validation_0-ndcg@20:0.44180	validation_0-ndcg@30:0.47013
[3]	validation_0-ndcg@1:0.18813	validation_0-ndcg@5:0.30151	validation_0-ndcg@10:0.37311	validation_0-ndcg@20:0.44898	validation_0-ndcg@30:0.47600
[4]	validation_0-ndcg@1:0.19208	validation_0-ndcg@5:0.30472	validation_0-ndcg@10:0.37727	validation_0-ndcg@20:0.45280	validation_0-ndcg@30:0.47914
[5]	validation_0-ndcg@1:0.19389	validation_0-ndcg@5:0.30677	validation_0-ndcg@10:0.38095	validation_0-ndcg@20:0.45556	validation_0-ndcg@30:0.48131
[6]	validation_0-ndcg@1:0.19660	validation_0-ndcg@5:0.31116	validation_0-ndcg@10:0.38439	validation_0-ndcg@20:0.45798	

In [None]:
#ranker.evals_result_

In [None]:
# from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
# Works with cv in scikit-learn, along with HPO utilities like GridSearchCV
# kfold = StratifiedGroupKFold(n_splits = 5, shuffle=False)

# what is the default scoring for XGBRanker -> is it eval_metric from the model above

# cross_val_score(ranker, X, y, cv=kfold, groups=X.qid)

Prediction: needs to be done per query (`search_id`) in `qid_test`

In [None]:
import numpy as np

### NDCG function as used by us

def ndcg(y_score, y_true, k):
    """Discounted cumulative gain of the k best-scored items.

    The items are ordered by descending y_score. The relevance labels of the
    first k items contribute an exponential gain 2**label - 1, discounted by
    log2(position + 1) with positions counted from 1. The result is NOT
    normalised; dividing by ndcg(y_true, y_true, k) yields NDCG@k.

    Parameters: y_score - scores used for ranking; y_true - relevance labels
    in the same order; k - cut-off.
    Returns: the summed discounted gain.
    """
    ranking = np.argsort(y_score)[::-1]
    top_labels = np.take(y_true, ranking[:k])

    positions = np.arange(len(top_labels))
    return np.sum((2 ** top_labels - 1) / np.log2(positions + 2))

In [None]:
# attach the query id to the test features so that rows can be selected per query
X_test = X_test.assign(search_id=qid_test)

# sorted array of the distinct test query ids
qids = np.unique(qid_test)


In [None]:
qids = np.unique(qid_test)

# Mean NDCG@10 of the XGBoost ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test_pred["relevance"].to_numpy(),
    "score": ranker.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=10)
    ndcg_.append(ndcg(p, y, k=10) / idcg)

np.mean(ndcg_)

0.3757188698715466

In [None]:

# Mean NDCG@5 of the XGBoost ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test_pred["relevance"].to_numpy(),
    "score": ranker.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=5)
    ndcg_.append(ndcg(p, y, k=5) / idcg)

np.mean(ndcg_)

0.3029246237874578

In [None]:

# Mean NDCG@20 of the XGBoost ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test_pred["relevance"].to_numpy(),
    "score": ranker.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=20)
    ndcg_.append(ndcg(p, y, k=20) / idcg)

np.mean(ndcg_)

0.4501900026041219

In [None]:

# Mean NDCG@30 of the XGBoost ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test_pred["relevance"].to_numpy(),
    "score": ranker.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=30)
    ndcg_.append(ndcg(p, y, k=30) / idcg)

np.mean(ndcg_)

0.475356249777268

#### Light GBM

In [None]:
X_train.shape, y_train.shape, qid_train.shape

((1142054, 58), (1142054,), (1142054,))

In [None]:
X_test.shape, y_test.shape, qid_test.shape

((491077, 59), (491077,), (491077,))

In [None]:
import lightgbm as lgb

# feature matrix and labels as plain arrays
X_train_arr = X_train.values

y_train_array = y_train.values.ravel()

# LightGBM's `group` must list the number of rows of each query in the order
# in which the queries appear in the training rows. value_counts() orders the
# sizes by frequency, which pairs them with the wrong rows; counting per
# search_id in order of first appearance keeps sizes and rows aligned.
query_group_sizes_train = train_df.groupby('search_id', sort=False).size().tolist()

query_group_sizes_train_arr = np.array(query_group_sizes_train)

# every training row has to belong to exactly one query group
assert query_group_sizes_train_arr.sum() == X_train_arr.shape[0]

train_data = lgb.Dataset(X_train_arr, label=y_train_array, group=query_group_sizes_train_arr)

In [None]:
# LightGBM settings for the ranking model
params = dict(
    objective='lambdarank',  # ranking objective
    metric='ndcg',           # evaluation metric
    learning_rate=0.01,
    verbose=0,
)

In [None]:
# number of boosting rounds for the LightGBM ranker
num_rounds = 1000
ranker_model = lgb.train(params, train_data, num_boost_round=num_rounds)

In [None]:
# Mean NDCG@5 of the LightGBM ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test.to_numpy(),
    "score": ranker_model.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=5)
    ndcg_.append(ndcg(p, y, k=5) / idcg)

np.mean(ndcg_)

0.31290167210137176

In [None]:
# Mean NDCG@10 of the LightGBM ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test.to_numpy(),
    "score": ranker_model.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=10)
    ndcg_.append(ndcg(p, y, k=10) / idcg)

np.mean(ndcg_)

0.3866071194108189

In [None]:
# Mean NDCG@20 of the LightGBM ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test.to_numpy(),
    "score": ranker_model.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=20)
    ndcg_.append(ndcg(p, y, k=20) / idcg)

np.mean(ndcg_)

0.4596371441497926

In [None]:
# Mean NDCG@30 of the LightGBM ranker over all test queries that contain at
# least one relevant result. The model scores the whole test set once and the
# scores are then grouped by query, instead of scanning X_test and calling
# predict() separately for every query.
scored = pd.DataFrame({
    "search_id": X_test["search_id"].to_numpy(),
    "relevance": y_test.to_numpy(),
    "score": ranker_model.predict(X_test.drop(columns=["search_id"])),
})

ndcg_ = list()

for qid, rows in scored.groupby("search_id", sort=True):
    y = rows["relevance"]

    # without any click/transaction the ideal DCG is 0 -> NDCG undefined, skip
    if np.sum(y) == 0:
        continue

    p = rows["score"].to_numpy()

    idcg = ndcg(y, y, k=30)
    ndcg_.append(ndcg(p, y, k=30) / idcg)

np.mean(ndcg_)

0.4832222449785688