# Assignment 2: Expedia Hotel Recommendations

#### Import relevant packages

In [1]:
from lightgbm import LGBMRanker
import pandas as pd
import os
import numpy as np
import json
from sklearn.model_selection import GroupShuffleSplit
import sweetviz as sv
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from tqdm import tqdm
import numba
from project_modules.preprocessing import *
from datetime import datetime

# Show every column when a DataFrame is rendered (None removes the column cap)
pd.options.display.max_columns = None

  from .autonotebook import tqdm as notebook_tqdm


### Load train and test data

In [2]:
# Load both splits through the project helper. Only the training frame is passed
# through add_target; raw_test is kept exactly as load_data returns it.
# NOTE(review): presumably add_target derives the `target` label column from the
# click/booking columns -- confirm in project_modules.preprocessing.
raw_train = add_target(load_data(train=True))
raw_test = load_data(train=False)

In [3]:
# raw_train.info()

In [4]:
# raw_test.info()

In [15]:
# Peek at the first five training rows to sanity-check the loaded columns
raw_train.head(n=5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.2,0.0149,5.03,26,170.74,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0,,0,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.2,0.0245,4.92,21,179.8,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,0,23246,1,0,4,0,1,1,,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0,,0,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,0


### Preprocess data

In [5]:
# Hold out 10% of the searches for validation. Splitting on srch_id keeps every
# row of a search on the same side, so no query is shared between train and val.
# random_state is pinned so that Restart & Run All reproduces the same partition
# (and therefore the same validation scores); without it each run draws a
# different split.
df = raw_train

splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, val_idx = next(splitter.split(df, groups=df["srch_id"]))

train, val = df.iloc[train_idx, :], df.iloc[val_idx, :]

### Feature engineering

In [6]:
# df = featurizing(raw_train)

## Fitting models

#### LightGBM Ranker (LambdaRank)

First define the features and target

In [6]:
# Label column the ranker is trained on
target = "target"

# Keys identifying a (search, property) pair
id_cols = ["srch_id", "prop_id"]

# Model features, grouped by what they describe. The flattened order is the
# column order of the feature matrix, so the groups must stay in this sequence.
cols = [
    feature
    for feature_group in (
        # visitor history
        ("visitor_hist_starrating", "visitor_hist_adr_usd"),
        # property attributes and price
        (
            "prop_starrating",
            "prop_review_score",
            "prop_brand_bool",
            "prop_location_score1",
            "prop_location_score2",
            "prop_log_historical_price",
            "price_usd",
        ),
        # search parameters
        (
            "srch_length_of_stay",
            "srch_booking_window",
            "srch_adults_count",
            "srch_children_count",
            "srch_room_count",
            "srch_query_affinity_score",
            "orig_destination_distance",
        ),
    )
    for feature in feature_group
]

Define the group sizes (number of rows per `srch_id`) for the training and validation sets, which are needed by the `LGBMRanker`

In [7]:
# LGBMRanker takes the query sizes as a flat array: entry i is the number of
# consecutive rows of the feature matrix that belong to the i-th query.
# sort=False returns the sizes in the order the searches appear in the frame
# instead of ascending srch_id order, so they stay aligned with the rows even if
# the frame is not sorted by srch_id.
# NOTE(review): this still assumes all rows of one search are contiguous -- confirm.
train_groups = train.groupby("srch_id", sort=False).size().to_numpy()
val_groups = val.groupby("srch_id", sort=False).size().to_numpy()

In [8]:
# LambdaRank hyperparameters, forwarded unchanged to the LGBMRanker constructor
params = dict(
    boosting_type="gbdt",
    num_leaves=31,
    max_depth=5,
    learning_rate=0.05,
    n_estimators=300,
    objective="lambdarank",
)

model = LGBMRanker(**params)

# Fit on the training searches; NDCG@5 on the held-out searches is reported
# after every boosting round through the eval_* arguments.
model.fit(
    X=train[cols],
    y=train[target],
    group=train_groups,
    eval_set=[(val[cols], val[target])],
    eval_group=[val_groups],
    eval_at=[5],
)

[1]	valid_0's ndcg@5: 0.290534
[2]	valid_0's ndcg@5: 0.317424
[3]	valid_0's ndcg@5: 0.327746
[4]	valid_0's ndcg@5: 0.335312
[5]	valid_0's ndcg@5: 0.339814
[6]	valid_0's ndcg@5: 0.34151
[7]	valid_0's ndcg@5: 0.342974
[8]	valid_0's ndcg@5: 0.345917
[9]	valid_0's ndcg@5: 0.347501
[10]	valid_0's ndcg@5: 0.347764
[11]	valid_0's ndcg@5: 0.35034
[12]	valid_0's ndcg@5: 0.349851
[13]	valid_0's ndcg@5: 0.350779
[14]	valid_0's ndcg@5: 0.351189
[15]	valid_0's ndcg@5: 0.351594
[16]	valid_0's ndcg@5: 0.352694
[17]	valid_0's ndcg@5: 0.353005
[18]	valid_0's ndcg@5: 0.35403
[19]	valid_0's ndcg@5: 0.354866
[20]	valid_0's ndcg@5: 0.355471
[21]	valid_0's ndcg@5: 0.355687
[22]	valid_0's ndcg@5: 0.355995
[23]	valid_0's ndcg@5: 0.356167
[24]	valid_0's ndcg@5: 0.356085
[25]	valid_0's ndcg@5: 0.356984
[26]	valid_0's ndcg@5: 0.356941
[27]	valid_0's ndcg@5: 0.358037
[28]	valid_0's ndcg@5: 0.358358
[29]	valid_0's ndcg@5: 0.358042
[30]	valid_0's ndcg@5: 0.357834
[31]	valid_0's ndcg@5: 0.358084
[32]	valid_0's ndcg@

LGBMRanker(learning_rate=0.05, max_depth=5, n_estimators=300,
           objective='lambdarank')

#### Evaluation of LightGBMRanker using NDCG@5

In [18]:
# Score the held-out searches with the project helpers and show NDCG@5.
# NOTE(review): the value shown below differs noticeably from the valid_0 ndcg@5
# logged by LightGBM during fitting -- confirm both use the same gain/labels.
preds_val = predict_in_batches(model, val, cols, id_cols)

val_ndcg_at_5 = calc_ndcg_submission(preds_val, val, k=5)
val_ndcg_at_5

25it [00:08,  2.86it/s]


0.5015169106593081

In [12]:
preds_test = predict_in_batches(model, raw_test, cols, id_cols)

# Save predictions for the test set to a timestamped CSV.
# The time part is written as HH-MM: a ":" is not a legal filename character on
# Windows, so the previous "%H:%M" pattern made this cell fail there.
timestamp = datetime.now().strftime("%m-%d_%H-%M")
preds_test.to_csv(f"predictions_{timestamp}.csv", index=False)

248it [01:45,  2.35it/s]
