In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#import xgboost as xgb
#from sklearn.model_selection import GroupShuffleSplit

### Feature engineering

In [2]:
### reading and sampling the data

def read_file(path):
    """
    Load a CSV file into a pandas DataFrame.

    The 'date_time' column is converted to pandas datetime values; every
    other column keeps whatever dtype pd.read_csv inferred.

    path - location of the CSV file (anything pd.read_csv accepts)
    """
    frame = pd.read_csv(path)
    parsed_stamps = pd.to_datetime(frame['date_time'])
    frame['date_time'] = parsed_stamps
    return frame

def sample_on_srch_id(df, frac = 0.1, seed = None):
    """
    Keep all rows that belong to a random subset of the searches.

    df   - dataframe with a "srch_id" column
    frac - fraction (0..1) of the distinct srch_id values to keep; the number
           of ids is int(n_unique * frac), i.e. rounded down
    seed - optional seed for a private random generator so that the same
           sample can be drawn again; with the default (None) the global
           state of the `random` module is used

    Returns the rows of df whose srch_id was drawn (original row order kept).
    Raises ValueError when frac lies outside [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError("frac must be between 0 and 1, got %r" % (frac,))
    # get unique srch_ids
    unique_ids = np.unique(df.srch_id)
    # calculate how many ids to return
    chosen_k = int(len(unique_ids) * frac)
    # a dedicated generator keeps a seeded draw independent of the global random state
    sampler = random if seed is None else random.Random(seed)
    chosen_ids = sampler.sample(list(unique_ids), k = chosen_k)
    # filter the df to only have sampled ids
    return df[df['srch_id'].isin(chosen_ids)]

### Feature Engineering --------------------------

## missing data ----------------------------------

def remove_missing_values(df):
    """
    Drops, in place, every column in which more than 50 percent of the values are missing.

    The missing share is rounded to four decimals before it is turned into a
    percentage and compared, so a column sitting exactly at 50 % is kept.
    Nothing is returned; df itself is modified.
    """
    pct_missing = df.isna().mean().round(4) * 100
    # column labels whose missing percentage is above the threshold
    sparse_columns = pct_missing[pct_missing > 50].sort_values().index
    df.drop(sparse_columns, axis=1, inplace=True)

def replace_missing_values(df):
    """
    Fills every missing value in df with the sentinel -1 (in place, returns nothing).
    """
    sentinel = -1
    df.fillna(sentinel, inplace=True)

## new features ----------------------------------

def extract_time(df):
    """
    Replaces the "date_time" column with month, week, day of the week and hour of search.

    Adds (in place) the columns
      month - calendar month, 1-12
      week  - ISO 8601 week number
      day   - day of the week, 1 (Monday) to 7 (Sunday)
      hour  - hour of the day, 0-23
    and removes "date_time". Nothing is returned.
    """
    stamps = pd.to_datetime(df["date_time"])
    df["month"] = stamps.dt.month
    # DatetimeIndex.week is deprecated and gone in pandas 2; isocalendar() is the
    # replacement. It yields a nullable UInt32 column, cast back to a plain integer.
    df["week"] = stamps.dt.isocalendar().week.astype("int64")
    df["day"] = stamps.dt.dayofweek + 1
    df["hour"] = stamps.dt.hour
    del df['date_time']

def new_historical_price(df):
    """
    Converts "prop_log_historical_price" back from log scale.

    The result is stored in a new "prop_historical_price" column, values equal
    to exactly 1.0 (i.e. a log price of 0) are replaced by 0, and the original
    log column is removed. Works in place and returns nothing.
    """
    log_price = df["prop_log_historical_price"]
    unlogged = np.e ** log_price
    df["prop_historical_price"] = unlogged.replace(1.0, 0)
    df.drop(columns="prop_log_historical_price", inplace=True)

def add_price_position(df, rank_type = "dense"):
    """
    Ranks every hotel by "price_usd" within its own search ("srch_id").

    The rank (1 = cheapest) is written to a new "price_position" column of df;
    rank_type is passed on as the tie-breaking `method` of pandas' rank.
    """
    prices_per_search = df.groupby('srch_id')['price_usd']
    df["price_position"] = prices_per_search.rank(method = rank_type, ascending=True)


def average_numerical_features(df, group_by = ["prop_id"], columns = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_location_score2"]):
    """
    adds mean, median and standard deviation per prop_id (default)
    for columns that are related to property (default)

    df       - input dataframe (not modified)
    group_by - column name(s) that define the groups
    columns  - list of numerical columns to aggregate

    Returns a new dataframe with "<col>_mean", "<col>_median" and "<col>_std"
    appended for every entry of `columns` (all means first, then medians, then
    standard deviations). The std of a group with a single row is NaN.
    """
    per_group = df.groupby(group_by)[columns]
    aggregates = (
        ("_mean", per_group.mean()),
        ("_median", per_group.median()),
        ("_std", per_group.std()),
    )
    for suffix, table in aggregates:
        # suffix only the value columns *before* the group keys leave the index,
        # so any number of group_by keys keeps its original names for the merge
        df = pd.merge(df, table.add_suffix(suffix).reset_index(), on=group_by)
    return df

def add_historical_booking_click(df):
    """
    Adds the overall click and booking rate of every property.

    For each prop_id the mean of "click_bool" and "booking_bool" is computed and
    attached as "click_bool_rate" / "booking_bool_rate". Returns a new dataframe
    sorted by "srch_id".
    """
    # there are more prop_id in the test data than in train.
    # Maybe we could still use this but would need to impute
    # with the most common value (or something else)
    rates = (
        df.groupby("prop_id")[["click_bool", "booking_bool"]]
        .mean()
        .add_suffix("_rate")
        .reset_index()
    )
    merged = pd.merge(df, rates, on="prop_id")
    return merged.sort_values("srch_id")

def join_historical_data(df, path = "hist_click_book.csv"):
    """
    Attaches pre-computed historical data to df.

    path - location of the historical data csv file; it must contain a
           "prop_id" column, which is used as the join key

    Returns the joined dataframe sorted by "srch_id". Rows whose prop_id is
    absent from the csv are dropped, because the merge is an inner join.
    """
    historical = pd.read_csv(path)
    return pd.merge(df, historical, on="prop_id").sort_values("srch_id")

def create_comp_rate_mode(df, fillna_ = -100):
    """
    creates a "comp_rate_mode" column with the row-wise mode of the comp_rate columns

    Every column whose name ends in "_rate" takes part. Missing values are
    ignored when the mode is taken; on ties the smallest value wins. Rows with
    no value at all get `fillna_` (-100 by default). Works in place.

    NOTE(review): the suffix match also picks up any other "*_rate" column that
    exists when this runs (e.g. ones added by add_historical_booking_click).
    """
    rate_cols = [col for col in df.columns if col.endswith("_rate")]
    # without a single non-missing value there is no mode to take
    has_values = bool(rate_cols) and df[rate_cols].notna().to_numpy().any()
    if has_values:
        row_modes = df[rate_cols].mode(axis = 1, dropna = True)[0]
        # assign the filled result directly: an inplace fillna on df["..."] acts on
        # a temporary object and is not guaranteed to reach df
        df["comp_rate_mode"] = row_modes.fillna(fillna_)
    else:
        df["comp_rate_mode"] = fillna_

def create_comp_inv_mode(df, fillna_ = -100):
    """
    creates a "comp_inv_mode" column with the row-wise mode of the comp_inv columns

    Every column whose name ends in "_inv" takes part. Missing values are
    ignored when the mode is taken; on ties the smallest value wins. Rows with
    no value at all get `fillna_` (-100 by default). Works in place.
    """
    inv_cols = [col for col in df.columns if col.endswith("_inv")]
    # without a single non-missing value there is no mode to take
    has_values = bool(inv_cols) and df[inv_cols].notna().to_numpy().any()
    if has_values:
        row_modes = df[inv_cols].mode(axis = 1, dropna = True)[0]
        # assign the filled result directly: an inplace fillna on df["..."] acts on
        # a temporary object and is not guaranteed to reach df
        df["comp_inv_mode"] = row_modes.fillna(fillna_)
    else:
        df["comp_inv_mode"] = fillna_

def normalize_features(df_mod, normalizing_var, column):
    """
    Standardises `column` within every group of `normalizing_var`.

    df_mod          - dataframe (left untouched)
    normalizing_var - name of the column whose values define the groups
    column          - name of the column that will be normalized

    Returns a new dataframe with a fresh 0..n-1 index and one extra column
    "<column>_norm_by_<normalizing_var>" = (value - group mean) / group std.
    Groups with a single row or with identical values have no usable standard
    deviation, so their normalised value is NaN.
    """
    # rows without a group key cannot be normalised; they are dropped
    result = df_mod[df_mod[normalizing_var].notna()].reset_index(drop=True)
    grouped = result.groupby(normalizing_var)[column]
    # transform() broadcasts the group statistics back onto the rows, so no
    # temporary "<column>_mean"/"<column>_std" columns are needed and columns
    # that already carry those names are neither clobbered nor cause a KeyError
    result[column + "_norm_by_" + normalizing_var] = (
        result[column] - grouped.transform("mean")
    ) / grouped.transform("std")

    return result

def add_normalisation(df, target_list = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_location_score2"]):
    """
    Normalises every column of target_list within its search ("srch_id").

    Each column is passed through normalize_features in turn; the dataframe
    produced by the last call is returned.
    """
    normalised = df
    for feature in target_list:
        normalised = normalize_features(normalised, normalizing_var="srch_id", column=feature)
    return normalised
    
## other ----------------------------------

def remove_cols(df, cols = ["position", "prop_id"]):
    """
    Drops the given columns from df in place ("position" and "prop_id" by default).
    """
    df.drop(columns=cols, inplace=True)

def remove_positions(df, positions = [5, 11, 17, 23]):
    """
    removes hotels with specified positions
    (based on the fact that hotels in those positions were not as booked)

    df        - dataframe with a "position" column (left untouched)
    positions - position values whose rows are removed

    Returns the filtered dataframe. The caller must use the return value:
    df itself is not changed.
    """
    # the filtered frame has to be handed back; rebinding the local name `df`
    # would leave the caller's dataframe exactly as it was
    return df[~df["position"].isin(positions)]

def add_score(df):
    """
    adds 'score' column to the df: 5 for booked, 1 for clicked, 0 otherwise

    A booking takes precedence over a click. The "booking_bool" and
    "click_bool" columns are removed afterwards. Works in place.
    """
    booked = df["booking_bool"] == 1
    clicked = df["click_bool"] == 1
    # vectorised replacement for a per-row Python loop; the inner where() is
    # only consulted for rows that were not booked
    df["score"] = np.where(booked, 5, np.where(clicked, 1, 0)).astype("int64")
    df.drop(columns=["booking_bool", "click_bool"], inplace=True)

def onehot(df, cols):
    """
    One-hot encodes the columns listed in cols.

    Returns a new dataframe in which each of those columns is replaced by its
    indicator columns (pandas.get_dummies); all other columns are kept.
    """
    encoded = pd.get_dummies(data=df, columns=cols)
    return encoded


### Feature engineering function -----------

def feature_engineering_train(df):
    """
    Runs the full feature-engineering pipeline on the training data.

    The first group of steps modifies df in place (time features, competitor
    modes, missing-value handling, un-logged price, price position). The
    normalisation step then builds a new dataframe, to which the "score" target
    is added; that dataframe is returned.
    """
    in_place_steps = (
        extract_time,
        create_comp_rate_mode,
        create_comp_inv_mode,
        remove_missing_values,
        replace_missing_values,
        new_historical_price,
        add_price_position,
    )
    for step in in_place_steps:
        step(df)
    # currently disabled: average_numerical_features, add_historical_booking_click
    engineered = add_normalisation(df)
    add_score(engineered)
    # remove_cols is deliberately not applied here
    return engineered

def feature_engineering_train_2(df):
    """
    Variant of the training pipeline without the competitor mode columns.

    The first group of steps modifies df in place (time features, missing-value
    handling, un-logged price, price position). The normalisation step then
    builds a new dataframe, to which the "score" target is added; that
    dataframe is returned.
    """
    in_place_steps = (
        extract_time,
        remove_missing_values,
        replace_missing_values,
        new_historical_price,
        add_price_position,
    )
    for step in in_place_steps:
        step(df)
    # currently disabled: average_numerical_features, add_historical_booking_click
    engineered = add_normalisation(df)
    add_score(engineered)
    # remove_cols is deliberately not applied here
    return engineered

def feature_engineering_test(df):
    """
    Runs the feature-engineering pipeline on the test data.

    Same steps as the training pipeline except that no "score" column is
    created. The first group of steps modifies df in place; the dataframe
    produced by the normalisation step is returned.
    """
    in_place_steps = (
        extract_time,
        create_comp_rate_mode,
        create_comp_inv_mode,
        remove_missing_values,
        replace_missing_values,
        new_historical_price,
        add_price_position,
    )
    for step in in_place_steps:
        step(df)
    # currently disabled: average_numerical_features
    return add_normalisation(df)
    


In [3]:
def NDCG(predictions, df, path_idcg = "idcg.csv", k = 5):
    """
    takes predicted positions and calculates the average NDCG@k.

    predictions - dataframe with "srch_id" and "prop_id", ordered by relevance
                  inside each "srch_id" (best first). It is not modified.
    df          - training dataset (must contain "srch_id", "prop_id", "score")
    path_idcg   - path to the ideal DCG per search; the csv must provide the
                  columns "scrh_id" (spelled like that) and "iDCG"
    k           - number of top positions per search that are scored (default 5)

    DCG uses a discount of 1 for position 1 and log2(position) from position 2
    on. Searches whose NDCG is undefined (NaN, e.g. 0/0) are ignored by the mean.
    """
    # work on a copy with a clean index so the caller's frame stays untouched
    ranked = predictions.reset_index(drop = True)
    # 1-based rank of every row inside its search
    ranked["position"] = ranked.groupby(by = ['srch_id']).cumcount() + 1
    ranked = ranked[ranked["position"] <= k]
    # attach scores to predictions
    ranked = pd.merge(ranked, df[["srch_id", "prop_id", "score"]], on = ["srch_id", "prop_id"])
    # log2(1) is 0, so the first position gets a discount of 1 instead
    discount = np.log2(ranked["position"]).where(ranked["position"] > 1, 1.0)
    ranked["intermediate_dcg"] = ranked["score"] / discount
    dcg = ranked.groupby("srch_id")["intermediate_dcg"].sum().reset_index()
    # the key is renamed to match the column name used in the idcg file
    dcg.columns = ["scrh_id", "DCG"]
    # read idcg and attach it to dcg
    idcg = pd.read_csv(path_idcg)
    joined = pd.merge(dcg, idcg, on = "scrh_id")
    # mean NDCG over all searches
    return (joined["DCG"] / joined["iDCG"]).mean()

In [4]:
def predict(model, df):
    """
    Scores the rows of df with model, ignoring the "srch_id" column.

    Every column except "srch_id" is handed to model.predict; whatever that
    call returns is passed straight back.
    """
    feature_mask = ~df.columns.isin(['srch_id'])
    features = df.loc[:, feature_mask]
    return model.predict(features)

### Data

In [None]:
import xgboost as xgb
from xgboost import DMatrix
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import GroupShuffleSplit

In [None]:
# Show the current working directory (bare `pwd` relies on IPython's automagic);
# the relative paths used further down ("idcg.csv", "tuning_outputs/", ...) resolve against it.
pwd

In [None]:
# Load the raw test set.
# NOTE(review): absolute, machine-specific path - this cell only runs on the author's machine.
df_test = read_file("/Users/IggyMac/OneDrive - UvA/2020-2021/Data mining/Assignment2/data/test_set_VU_DM.csv")

In [5]:
# Load the raw training set.
# NOTE(review): absolute, machine-specific path - this cell only runs on the author's machine.
df_train = read_file("/Users/IggyMac/OneDrive - UvA/2020-2021/Data mining/Assignment2/data/training_set_VU_DM.csv")

In [None]:
# Alternative to the cell above: load a previously engineered training set instead of the raw one.
# NOTE(review): this overwrites df_train; presumably only one of the two loading cells is meant
# to be run - the feature-engineering call further down expects the raw "date_time" column.
df_train = pd.read_csv('/Users/IggyMac/OneDrive - UvA/2020-2021/Data mining/Github/assignment2/Model/engineered_training_data.csv')

In [6]:
# Inspect which columns the loaded training data provides.
df_train.columns

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [None]:
# Keep an untouched copy of the test set: the feature-engineering helpers modify their input in place.
d1_test = df_test.copy()

In [None]:
# Keep an untouched copy of the training set: the feature-engineering helpers modify their input in place.
d1_train = df_train.copy()

In [None]:
# Build the engineered training frame. df_train itself is modified by the in-place steps
# of the pipeline, so re-running this cell requires reloading df_train first.
# NOTE(review): the two assignments below are commented out, so `df_2` and `testset`
# are never defined on a fresh kernel although later cells use them.
df = feature_engineering_train(df_train)
#df_2 = feature_engineering_train(d1_train)
#testset = feature_engineering_test(d1_test) 

  df["week"] = df_datetime.week


In [None]:
# Cache the engineered training data in the working directory so the pipeline need not be re-run.
df.to_csv("engineered_training_data.csv", index = False)

In [None]:
# Columns of the engineered training data.
df.columns

In [None]:
# NOTE(review): `df_2` is only assigned in a commented-out line of the feature-engineering
# cell, so this raises NameError on a fresh kernel.
df_2.columns

In [None]:
#properties = testset['prop_id']

In [None]:
#del df['prop_id']
#del df['position']
#del testset['prop_id']

In [None]:
# Dtypes and non-null counts of the engineered training data.
df.info()

In [None]:
# NOTE(review): `testset` is only assigned in a commented-out line of the feature-engineering
# cell, so this raises NameError on a fresh kernel.
testset.info()

### Parameter tuning

In [None]:
def tuning_model(df, learning_rate, max_depth, n_estimators, objective, colsample_bytree = 0.9, subsample = 0.75):
    """
    Trains an XGBRanker on 70 % of the searches and scores it on the other 30 %.

    df               - engineered training data; must contain "srch_id",
                       "prop_id" and "score"
    learning_rate, max_depth, n_estimators, objective
                     - passed to xgb.XGBRanker
    colsample_bytree - passed to xgb.XGBRanker (default 0.9)
    subsample        - passed to xgb.XGBRanker (default 0.75)

    The split is done per search (GroupShuffleSplit, fixed random_state) so
    that no search is shared between the two parts. Returns the mean NDCG of
    the held-out searches, computed by NDCG() with "idcg.csv" from the working
    directory.

    NOTE(review): every remaining column of df is used as a feature, including
    "position" if it is still present - confirm this is intended, since it
    describes how the hotel was displayed for the logged search.
    """
    splitter = GroupShuffleSplit(test_size=.3, n_splits=1, random_state = 7)
    train_idx, test_idx = next(splitter.split(df, groups=df['srch_id']))

    # The group sizes below are listed in ascending srch_id order; the training
    # rows must follow that same order, so sort them (stable keeps the order
    # of the rows inside a search).
    train_data = df.iloc[train_idx].sort_values('srch_id', kind='stable')
    test_data = df.iloc[test_idx]

    X_train = train_data.drop(columns=['srch_id', 'prop_id', 'score'])
    y_train = train_data[['score']]
    groups = train_data.groupby('srch_id').size().to_numpy()

    model = xgb.XGBRanker(
        tree_method='hist',
        booster='gbtree',
        objective=objective,
        random_state=42,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        n_estimators=n_estimators,
        subsample=subsample,
    )

    model.fit(X_train, y_train, group=groups, verbose=True)

    # srch_id stays in X_test because predict() drops it itself
    X_test = test_data.drop(columns=['prop_id', 'score'])
    output = test_data[['srch_id', 'prop_id']].copy()
    # score all rows in one call: the result lines up with the rows of
    # test_data whatever order the searches are in
    output['pred_scores'] = predict(model, X_test)

    # best hotel first inside every search
    out = output.sort_values(['srch_id', 'pred_scores'], ascending=[True, False])
    out = out.drop(columns='pred_scores')

    return NDCG(out, df, path_idcg = "idcg.csv")
    
    

In [None]:
df_reduct = sample_on_srch_id(df, frac = 0.1)

learning_rate=0.025
colsample_bytree=0.9
max_depth=6
n_estimators=800
subsample=0.75
objective = "rank:pairwise"
# tuning_model(df, learning_rate, max_depth, n_estimators, objective) has no
# colsample_bytree argument in its required signature; passing one raised a TypeError.
# NOTE(review): the run uses the full `df`, although a 10 % sample (`df_reduct`) is
# drawn above - confirm which one is intended.
tuning_model(df = df, learning_rate = learning_rate, max_depth = max_depth, n_estimators = n_estimators, objective = objective)

In [None]:
# attempt 1: full grid over learning rate, tree depth, number of trees and objective

learning_rate_ = [0.4, 0.3, 0.1, 0.05, 0.025, 0.01]
max_depth_=[6, 8, 10, 12, 15, 20, 30] # Maximum depth of a tree
n_estimators_ =[110, 150, 200, 300, 500]
# NOTE(review): the training labels are 0/1/5 - confirm that "rank:map" accepts
# graded labels in the installed XGBoost version.
objective_ = ["rank:pairwise", "rank:ndcg", "rank:map"]

# one entry per finished run, in run order
l_r = []
m_d = []
n_e = []
obj = []
ndcg = []
i = 1

df_reduct = sample_on_srch_id(df, frac = 0.1)

# every run is written to its own csv; make sure the folder exists before
# the first (long) training run finishes
os.makedirs("tuning_outputs", exist_ok=True)

for learning_rate in learning_rate_:
    for max_depth in max_depth_:
        for n_estimators in n_estimators_:
            for objective in objective_:
                # no colsample_bytree keyword: it is not part of tuning_model's required signature
                ndcg_ = tuning_model(df = df_reduct, learning_rate = learning_rate, max_depth = max_depth, n_estimators = n_estimators, objective = objective)
                ndcg.append(ndcg_)
                l_r.append(learning_rate)
                m_d.append(max_depth)
                n_e.append(n_estimators)
                obj.append(objective)
                #print(learning_rate, max_depth, n_estimators, objective, ndcg_, i)
                data_frame = pd.DataFrame({"learning_rate": [learning_rate],
                                            "max_depth": [max_depth],
                                            "n_estimators": [n_estimators],
                                            "objective": [objective],
                                            "NDCG": [ndcg_]})
                data_frame.to_csv("tuning_outputs/"+ str(i) + ".csv")
                i += 1


In [None]:
# attempt 2: narrower grid with more trees, pairwise objective only

learning_rate_ = [0.1, 0.05, 0.025, 0.01, 0.075]
max_depth_=[6, 8, 10, 12, 15, 20] # Maximum depth of a tree
n_estimators_ =[750, 800, 1000]
objective = "rank:pairwise"

# one entry per finished run, in run order
l_r = []
m_d = []
n_e = []
obj = []
ndcg = []
i = 1

df_reduct = sample_on_srch_id(df, frac = 0.1)

# every run is written to its own csv; make sure the folder exists before
# the first (long) training run finishes
os.makedirs("tuning_outputs_02", exist_ok=True)

for learning_rate in learning_rate_:
    for max_depth in max_depth_:
        for n_estimators in n_estimators_:
            # no colsample_bytree keyword: it is not part of tuning_model's required signature
            ndcg_ = tuning_model(df = df_reduct, learning_rate = learning_rate, max_depth = max_depth, n_estimators = n_estimators, objective = objective)
            ndcg.append(ndcg_)
            l_r.append(learning_rate)
            m_d.append(max_depth)
            n_e.append(n_estimators)
            obj.append(objective)
            #print(learning_rate, max_depth, n_estimators, objective, ndcg_, i)
            data_frame = pd.DataFrame({"learning_rate": [learning_rate],
                                        "max_depth": [max_depth],
                                        "n_estimators": [n_estimators],
                                        "objective": [objective],
                                        "NDCG": [ndcg_]})
            data_frame.to_csv("tuning_outputs_02/"+ str(i) + ".csv")
            i += 1


In [None]:
# Summary table of the most recent tuning loop. The loops collect their results
# in l_r (learning rate), m_d (max depth), n_e (number of trees) and ndcg;
# the names e, d and est used here before are not defined anywhere.
tuning = pd.DataFrame(list(zip(l_r, m_d, n_e, ndcg)),
               columns =['eta', 'max_depth', 'n_estimators', 'NDCG'])
print(tuning)

### Model

In [None]:
def modeling(df, testset, learning_rate, colsample_bytree, eta, max_depth, n_estimators, subsample = 0.75):
    """
    Trains the final XGBRanker on df and writes the submission file for testset.

    df       - engineered training data; must contain "srch_id" and "score"
    testset  - engineered test data; must contain "srch_id" and "prop_id".
               It is not modified.
    learning_rate, colsample_bytree, eta, max_depth, n_estimators
             - passed to xgb.XGBRanker
    subsample - passed to xgb.XGBRanker (default 0.75; used to be read from a
               global variable of the same name)

    Only columns present in both frames are used as features, so training-only
    columns never reach the model and the feature sets of fit and predict
    always match. The ranking ("srch_id", "prop_id", best hotel first inside
    every search) is written to '../data/submission_cate.csv' and also returned.

    NOTE(review): `eta` and `learning_rate` are both forwarded, as before;
    XGBoost documents eta as an alias of learning_rate, so passing two
    different values is ambiguous - confirm which one should be kept.
    """
    id_cols = ['srch_id', 'prop_id']
    # features = test columns that also exist in the training data, minus the ids
    feature_cols = [c for c in testset.columns if c not in id_cols and c in df.columns]

    # The group sizes are listed in ascending srch_id order; the training rows
    # must follow that same order (stable keeps the order inside a search).
    train_data = df.sort_values('srch_id', kind='stable')
    X_train = train_data[feature_cols]
    y_train = train_data[['score']]
    groups = train_data.groupby('srch_id').size().to_numpy()

    model = xgb.XGBRanker(
        tree_method='hist',
        booster='gbtree',
        objective='rank:pairwise',
        #eval_metric = ["ndcg", "map"],
        random_state=42,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        eta=eta,
        max_depth=max_depth,
        n_estimators=n_estimators,
        subsample=subsample,
    )

    model.fit(X_train, y_train, group=groups, verbose=True)

    output = testset[id_cols].copy()
    # score all rows in one call: the result lines up with the rows of testset
    output["pred_scores"] = model.predict(testset[feature_cols])

    # best hotel first inside every search
    out = output.sort_values(['srch_id', 'pred_scores'], ascending=[True, False])
    out = out.drop(columns='pred_scores')
    out.to_csv('../data/submission_cate.csv', index=False)
    return out
    

In [None]:
# Hyper-parameters of the final model.
# NOTE(review): learning_rate and eta are set to different values although XGBoost
# documents eta as an alias of learning_rate - confirm which one is intended.
learning_rate=0.1
colsample_bytree=0.9
eta=0.05
max_depth=6
n_estimators=150
subsample=0.75

In [None]:
# Train the final model and write the submission file.
# NOTE(review): requires `testset`, whose assignment is commented out in the feature-engineering cell.
modeling(df, testset, learning_rate, colsample_bytree, eta, max_depth, n_estimators)

In [None]:
# Manual, step-by-step version of modeling(): the cells from here on repeat its body at top level.
# Features are all columns except the search id and the target; the target is kept as a one-column frame.
X_train = df.loc[:, ~df.columns.isin(['srch_id','score'])]
y_train = df.loc[:, df.columns.isin(['score'])]

# NOTE(review): `testset` is only assigned in a commented-out line of the feature-engineering cell.
X_test = testset

In [None]:
# Number of rows per search, listed in ascending srch_id order.
# NOTE(review): this only lines up with X_train if df's rows are ordered by srch_id - confirm.
groups = df.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [None]:
# Pairwise ranking model with fixed hyper-parameters (n_estimators is 110 here,
# whereas the parameter cell above sets 150).
# NOTE(review): learning_rate and eta are both set, to different values - confirm which one is intended.
model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:pairwise',
    eval_metric = ["ndcg", "map"],
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

# `group` tells the ranker how many consecutive rows belong to each search
model.fit(X_train, y_train, group=groups, verbose=True)

In [None]:
# Score the test set search by search; the result holds one array of scores per srch_id,
# in ascending srch_id order.
predictions = (testset.groupby('srch_id')
               .apply(lambda x: predict(model, x)))

In [None]:
# Display the per-search score arrays.
predictions

In [None]:
# Assemble the submission frame: one row per (search, property) plus the predicted score.
# NOTE(review): `properties` is only assigned in a commented-out cell, and the flattened
# scores follow ascending srch_id order - they only line up with testset's rows if
# testset is ordered the same way.
output = pd.DataFrame()
output["srch_id"] = testset["srch_id"]
output["prop_id"] = properties

# flatten the per-search score arrays into one long list
pred_scores_list = [score for group_scores in predictions for score in group_scores]

output["pred_scores"] = pred_scores_list

In [None]:
# Order the hotels of every search by predicted score, best first.
out = output.groupby('srch_id').apply(pd.DataFrame.sort_values, 'pred_scores', ascending=False)

In [None]:
# The score column is no longer needed once the rows are ordered.
# Re-running this cell on its own fails because the column is already gone.
del out["pred_scores"]

In [None]:
# Quick look at the first rows of the ranking.
out.head()

In [None]:
# Write the ranking to the submission file (path is relative to the working directory).
out.to_csv('../data/submission_cate.csv', index=False)

In [None]:
# Plot the feature importances of the fitted model.
xgb.plot_importance(model)