# Assignment 2: Expedia Hotel Recommendations

#### Import relevant packages

In [1]:
from lightgbm import LGBMRanker
import pandas as pd
import os
import numpy as np
import json
from sklearn.model_selection import GroupShuffleSplit
#import sweetviz as sv
#from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from tqdm import tqdm
import numba
from project_modules.preprocessing import *

# Do not truncate columns when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

### Load train and test data

In [2]:
# Training data: read it first, then attach the target variable.
raw_train = load_data(train=True)
raw_train = add_target(raw_train)

# Test data is loaded without a target.
raw_test = load_data(train=False)

In [3]:
# Summarise the training frame: index range, column names and dtypes.
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 55 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      object 
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  position                     int64  
 15  price_usd                    float64
 16  promotion_flag               int64  
 17  srch_destination_id          int64  
 18  srch_length_of_stay          int64  
 19  

In [4]:
# Summarise the test frame: index range, column names and dtypes.
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4959183 entries, 0 to 4959182
Data columns (total 50 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      object 
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  price_usd                    float64
 15  promotion_flag               int64  
 16  srch_destination_id          int64  
 17  srch_length_of_stay          int64  
 18  srch_booking_window          int64  
 19  

### Preprocess data

In [3]:
# NOTE(review): this is an alias of `raw_train`, not a copy; if the helpers below
# modify their input in place, `raw_train` changes too — confirm that is intended.
df = raw_train

# Impute star ratings that are 0 with a prediction from a linear regression
# (ModelperCountry presumably fits one model per country — confirm).
df = ImputeStarrating0(df, ModelperCountry(df))
# Add the price features normalised by other columns, plus the month feature.
df = add_norm_features(df)

# A recorded run of this cell ended in "'NoneType' object is not subscriptable".
# Fail with an explicit message if a preprocessing helper did not return a frame.
if df is None:
    raise TypeError(
        "Preprocessing returned None instead of a DataFrame; check that "
        "ImputeStarrating0 and add_norm_features return the processed frame."
    )

# Hold out 10% of the searches for validation. Splitting on srch_id keeps all
# rows of one search on the same side; the fixed seed makes the split (and the
# validation scores that depend on it) reproducible across runs.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, val_idx = next(splitter.split(df, groups=df["srch_id"]))

train, val = df.iloc[train_idx, :], df.iloc[val_idx, :]

TypeError: 'NoneType' object is not subscriptable

### Feature engineering

In [4]:
# NOTE(review): the feature-engineering step is currently disabled; `featurizing`
# is presumably provided by project_modules.preprocessing — confirm before re-enabling.
#df = featurizing(df)

## Fitting models

#### LightGBM Ranker (LambdaRank)

First define the features and target

In [14]:
# Name of the label column the ranker is trained on.
target = "target"

# Model inputs, grouped by origin. The groups are concatenated below in exactly
# this order, which fixes the feature order seen by the model.
visitor_cols = ["visitor_hist_starrating", "visitor_hist_adr_usd"]

property_cols = [
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_log_historical_price",
    "price_usd",
]

search_cols = [
    "srch_length_of_stay",
    "srch_booking_window",
    "srch_adults_count",
    "srch_children_count",
    "srch_room_count",
    "srch_query_affinity_score",
    "orig_destination_distance",
]

# Columns that are not part of the raw data: normalised prices and the month.
engineered_cols = [
    "norm_price_wrt_srch_id",
    "norm_price_wrt_prop_id",
    "norm_price_wrt_srch_destination_id",
    "month",
    "norm_price_wrt_month",
    "norm_price_wrt_srch_booking_window",
    "norm_price_wrt_prop_country_id",
]

cols = [*visitor_cols, *property_cols, *search_cols, *engineered_cols]

# Identifier columns, passed separately from the features to the prediction helper.
id_cols = ["srch_id", "prop_id"]

Define the group sizes (number of rows per search) for the training and validation sets, which the LightGBM ranker needs.

In [15]:
# Number of rows per search (srch_id) for the ranker's `group` / `eval_group`.
# The sizes must follow the order in which the searches appear in the rows, so
# sort=False is used: groups are reported in order of first appearance rather
# than in sorted key order. The result is identical when the frame is already
# sorted by srch_id.
# Assumes all rows of one srch_id are contiguous in `train` / `val` — TODO confirm.
train_groups = train.groupby("srch_id", sort=False).size().to_numpy()
val_groups = val.groupby("srch_id", sort=False).size().to_numpy()

In [16]:
# Hyper-parameters of the gradient-boosted LambdaRank model.
params = dict(
    boosting_type="gbdt",
    num_leaves=31,
    max_depth=5,
    learning_rate=0.05,
    n_estimators=300,
    objective="lambdarank",
)

model = LGBMRanker(**params)

# Fit on the training searches; NDCG@5 on the validation searches is reported
# during training. Group sizes tell the ranker which rows belong to one search.
model.fit(
    X=train[cols],
    y=train[target],
    group=train_groups,
    eval_set=[(val[cols], val[target])],
    eval_group=[val_groups],
    eval_at=[5],
)

[1]	valid_0's ndcg@5: 0.30299
[2]	valid_0's ndcg@5: 0.32633
[3]	valid_0's ndcg@5: 0.336864
[4]	valid_0's ndcg@5: 0.343532
[5]	valid_0's ndcg@5: 0.349664
[6]	valid_0's ndcg@5: 0.3509
[7]	valid_0's ndcg@5: 0.352823
[8]	valid_0's ndcg@5: 0.35462
[9]	valid_0's ndcg@5: 0.356616
[10]	valid_0's ndcg@5: 0.35848
[11]	valid_0's ndcg@5: 0.358977
[12]	valid_0's ndcg@5: 0.359742
[13]	valid_0's ndcg@5: 0.360269
[14]	valid_0's ndcg@5: 0.360861
[15]	valid_0's ndcg@5: 0.360477
[16]	valid_0's ndcg@5: 0.361369
[17]	valid_0's ndcg@5: 0.361503
[18]	valid_0's ndcg@5: 0.362487
[19]	valid_0's ndcg@5: 0.363041
[20]	valid_0's ndcg@5: 0.363466
[21]	valid_0's ndcg@5: 0.363107
[22]	valid_0's ndcg@5: 0.364031
[23]	valid_0's ndcg@5: 0.363382
[24]	valid_0's ndcg@5: 0.364335
[25]	valid_0's ndcg@5: 0.36469
[26]	valid_0's ndcg@5: 0.365742
[27]	valid_0's ndcg@5: 0.366318
[28]	valid_0's ndcg@5: 0.366573
[29]	valid_0's ndcg@5: 0.366795
[30]	valid_0's ndcg@5: 0.366888
[31]	valid_0's ndcg@5: 0.367619
[32]	valid_0's ndcg@5: 0

LGBMRanker(learning_rate=0.05, max_depth=5, n_estimators=300,
           objective='lambdarank')

#### Evaluation of LightGBMRanker using NDCG@5

In [17]:
# Score the validation rows with the fitted ranker (presumably batch-wise, going
# by the helper's name and the progress bar it prints).
preds_val = predict_in_batches(model, val, cols, id_cols)

# Evaluate the validation predictions against the validation frame; the section
# heading states the metric is NDCG@5. The value is shown as the cell output.
calc_ndcg_submission(preds_val, val)

25it [00:23,  1.08it/s]


0.3820363595976368

In [None]:
# NOTE(review): none of the visible cells pass `raw_test` through
# ImputeStarrating0 / add_norm_features, yet `cols` lists engineered features
# such as "norm_price_wrt_srch_id" and "month" — confirm the test frame has
# these columns on a fresh kernel before relying on this cell.
preds_test = predict_in_batches(model, raw_test, cols, id_cols)

# Save the test-set predictions to CSV, without the index column.
preds_test.to_csv("predictions_try1.csv", index=False)

248it [01:04,  3.84it/s]
