In [1]:
# Imports

import pandas as pd
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
import time

In [2]:
### reading and sampling the data

def read_file(path):
    """
    Load a CSV file into a DataFrame and parse its 'date_time' column.

    path: file path or buffer accepted by pd.read_csv
    returns: DataFrame whose 'date_time' column has been converted with pd.to_datetime
    """
    frame = pd.read_csv(path)
    parsed_dates = pd.to_datetime(frame['date_time'])
    frame['date_time'] = parsed_dates
    return frame

def sample_on_srch_id(df, frac = 0.1):
    """
    Keep every row that belongs to a random subset of the search ids.

    df: DataFrame with a 'srch_id' column
    frac: fraction of the distinct srch_id values to keep (rounded down)
    returns: the rows of df whose srch_id was drawn
    """
    # distinct ids, in sorted order
    unique_ids = list(np.unique(df.srch_id))
    # number of ids to draw
    n_keep = int(len(unique_ids) * frac)
    # draw without replacement using the global `random` state
    kept_ids = random.sample(unique_ids, k = n_keep)
    # keep whole searches only
    row_mask = df['srch_id'].isin(kept_ids)
    return df[row_mask]

### Feature Engineering --------------------------

## missing data ----------------------------------

def remove_missing_values(df):
    """
    Drop, in place, every column in which more than 50 percent of the values are missing.

    df: DataFrame to prune (modified in place; nothing is returned)
    """
    # share of missing values per column, as a percentage rounded to 2 decimals
    pct_missing = df.isna().mean().round(4) * 100
    # strictly more than 50 % missing -> column is removed
    too_sparse = pct_missing[pct_missing > 50].sort_values()
    df.drop(list(too_sparse.index), axis=1, inplace=True)

def replace_missing_values(df):
    """
    Fill every remaining missing value with the sentinel -1 (in place).

    df: DataFrame to impute (modified in place; nothing is returned)
    """
    sentinel = -1
    df.fillna(sentinel, inplace=True)

## new features ----------------------------------

def extract_time(df):
    """
    Derive calendar features from 'date_time' and drop the original column (in place).

    Adds the columns:
        month - month number (1-12)
        week  - ISO 8601 week number
        day   - day of the week, 1 (Monday) to 7 (Sunday)
        hour  - hour of the day (0-23)

    df: DataFrame with a 'date_time' column (datetime values or parseable strings).
        Modified in place; nothing is returned.
    """
    stamps = pd.DatetimeIndex(df.date_time)
    df["month"] = stamps.month
    # DatetimeIndex.week was deprecated in pandas 1.1 and removed in pandas 2.0;
    # isocalendar().week is its replacement. It is indexed by the timestamps
    # themselves, so it is converted to a plain array and assigned by position
    # instead of being aligned on (non-matching) index labels.
    iso_week = stamps.isocalendar().week
    if iso_week.isna().any():
        # unparseable / missing timestamps: keep them as NaN
        df["week"] = iso_week.to_numpy(dtype="float64", na_value=np.nan)
    else:
        df["week"] = iso_week.to_numpy(dtype="int64")
    df["day"] = stamps.dayofweek + 1
    df["hour"] = stamps.hour
    del df['date_time']

def new_historical_price(df):
    """
    Replace 'prop_log_historical_price' with its exponentiated value (in place).

    The new column 'prop_historical_price' holds e ** log_price; results that are
    exactly 1.0 (i.e. a log price of 0) are stored as 0. The log column is dropped.

    df: DataFrame with a 'prop_log_historical_price' column (modified in place)
    """
    log_price = df.prop_log_historical_price
    unlogged = np.e ** log_price
    df["prop_historical_price"] = unlogged.replace(1.0, 0)
    df.drop(columns=["prop_log_historical_price"], inplace=True)

def add_price_position(df, rank_type = "dense"):
    """
    Add 'price_position': the ascending rank of 'price_usd' within each 'srch_id' (in place).

    df: DataFrame with 'srch_id' and 'price_usd' columns (modified in place)
    rank_type: tie-handling method passed to pandas' rank ("dense" by default)
    """
    prices_per_search = df.groupby('srch_id')['price_usd']
    df["price_position"] = prices_per_search.rank(method = rank_type, ascending=True)


def average_numerical_features(df, group_by = ["prop_id"], columns = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_location_score2"]):
    """
    Attach per-group mean, median and standard deviation of the given columns.

    df: input DataFrame (not modified)
    group_by: column name or list of column names to group on (default: per prop_id)
    columns: column name or list of column names to aggregate
             (default: the property-related numeric columns)

    returns: a new DataFrame equal to df plus, for every aggregated column c,
             the columns c_mean, c_median and c_std (all means first, then all
             medians, then all standard deviations).
    """
    keys = [group_by] if isinstance(group_by, str) else list(group_by)
    value_cols = [columns] if isinstance(columns, str) else list(columns)

    grouped = df.groupby(keys)[value_cols]
    for stat in ("mean", "median", "std"):
        # Suffix only the aggregated columns; the grouping keys live in the
        # index at this point, so they keep their names however many there are.
        aggregated = getattr(grouped, stat)().add_suffix("_" + stat).reset_index()
        df = pd.merge(df, aggregated, on=keys)
    return df

def add_historical_booking_click(df):
    """
    Attach each property's overall click and booking rate.

    Adds 'click_bool_rate' and 'booking_bool_rate': the mean of 'click_bool' and
    'booking_bool' over all rows sharing the same 'prop_id'.

    returns: a new DataFrame sorted by 'srch_id'
    """
    # Note from the original author: the test data contains more prop_ids than
    # the training data, so using these rates there would require imputing
    # (e.g. with the most common value).
    rates = df.groupby("prop_id")[["click_bool", "booking_bool"]].mean()
    rates = rates.add_suffix("_rate").reset_index()
    merged = pd.merge(df, rates, on="prop_id")
    merged.sort_values("srch_id", inplace = True)
    return merged

def join_historical_data(df, path = "hist_click_book.csv"):
    """
    Join pre-computed historical data onto df by 'prop_id'.

    df: DataFrame with 'prop_id' and 'srch_id' columns
    path: location of the historical data csv file (must contain 'prop_id')

    returns: the merged DataFrame sorted by 'srch_id'. The merge uses pandas'
             default (inner) join, so rows whose prop_id is absent from the
             file are not part of the result.
    """
    historical = pd.read_csv(path)
    merged = df.merge(historical, on="prop_id")
    merged = merged.sort_values("srch_id")
    return merged
    
    
## other ----------------------------------

def remove_cols(df, cols = ["position", "prop_id"]):
    """
    Drop the given columns from df in place.

    df: DataFrame to modify (nothing is returned)
    cols: column names to remove (default: 'position' and 'prop_id')
    """
    df.drop(columns=cols, inplace=True)

def remove_positions(df, positions = [5, 11, 17, 23]):
    """
    Remove, in place, the rows whose 'position' is one of `positions`.

    (Motivation given by the original author: hotels shown in those positions
    were booked less often.)

    df: DataFrame with a 'position' column; modified in place, like the other
        helpers in this notebook
    positions: position values to remove

    returns: df itself, so the call can also be used in an assignment
    """
    unwanted = df["position"].isin(positions)
    # Rebinding the local name (df = df[...]) never reaches the caller, so the
    # rows are dropped by label instead. Note that label-based dropping removes
    # every row sharing a dropped label if the index is not unique.
    df.drop(index=df.index[unwanted], inplace=True)
    return df

def add_score(df):
    """
    Add the relevance target 'score' and drop the raw indicator columns (in place).

    score is 5 where booking_bool == 1, otherwise 1 where click_bool == 1,
    otherwise 0. 'booking_bool' and 'click_bool' are removed afterwards.

    df: DataFrame with 'booking_bool' and 'click_bool' columns (modified in
        place; nothing is returned)
    """
    booked = (df["booking_bool"] == 1).to_numpy()
    clicked = (df["click_bool"] == 1).to_numpy()
    # np.select takes the first matching condition, so a booking wins over a click
    df["score"] = np.select([booked, clicked], [5, 1], default=0).astype("int64")
    df.drop(columns=["booking_bool", "click_bool"], inplace=True)

def onehot(df, cols):
    """
    One-hot encode the columns listed in cols.

    returns: a new DataFrame in which each column of cols is replaced by its
             indicator columns (via pd.get_dummies)
    """
    encoded = pd.get_dummies(data=df, columns=cols)
    return encoded

def sampling(df, target, method="undersampling", frac=0.3, random_state=None):
    """
    Rebalance df by undersampling the majority class within each search.

    df: input dataframe; must contain 'srch_id' and the target column
    target: name of the target column. Rows with target > 0 form the minority
            class, rows with target == 0 the majority class (any other rows,
            e.g. missing targets, are not part of the result).
    method: 'undersampling', or 'combination' of undersampling and
            oversampling (the latter is not implemented yet)
    frac: desired final fraction of the minority class, in (0, 1] (default 0.3)
    random_state: optional seed forwarded to pandas' sampling for reproducibility

    returns: dataframe with all minority rows plus the sampled majority rows,
             sorted by 'srch_id' with a fresh index; None for 'combination'.
             Sampling is done per srch_id, so the final fraction is approximate.

    raises: ValueError for an unknown method or a frac outside (0, 1]

    note that this only has to be performed on the training data!
    """
    if not 0 < frac <= 1:
        raise ValueError("'frac' must be in the interval (0, 1]")

    # Split df in minority and majority
    minority = df.loc[df[target] > 0]
    majority = df.loc[df[target] == 0]
    lenmin = len(minority)
    lenmaj = len(majority)
    total = lenmin + lenmaj

    # Calculate current fraction (guarding against an empty input)
    frac_min = lenmin / total if total else 0.0
    frac_maj = 1-frac_min
    print(f"Current fraction:\nMinority class: {frac_min}, Majority class: {frac_maj}")

    if method == "undersampling":

        if lenmaj:
            # Share of majority rows to keep so that minority/(minority+kept) == frac.
            # Capped at 1: when the minority is already above `frac` there is
            # nothing to remove, and pandas rejects frac > 1 without replacement.
            sampling_frac = min(1.0, ((1-frac)/frac*lenmin)/lenmaj)
            # Sample from the majority rows only; sampling the whole frame would
            # re-draw minority rows that are concatenated again below.
            sampled_df = majority.groupby('srch_id').sample(frac=sampling_frac, random_state=random_state)
        else:
            sampled_df = majority
        kept = len(sampled_df) + lenmin
        fin_frac = lenmin / kept if kept else 0.0

    elif method == "combination":

        # This still needs to be implemented
        import warnings
        warnings.warn("sampling(method='combination') is not implemented; returning None")
        return None
    else:
        raise ValueError("Invalid argument for 'method'")

    print(f"Final fraction:\nMinority class: {fin_frac}, Majority class: {1-fin_frac}")

    finaldf = pd.concat([minority, sampled_df])
    finaldf.sort_values("srch_id", inplace = True)
    finaldf = finaldf.reset_index(drop=True)

    print("Done")
    return finaldf


### Feature engineering function -----------

def feature_engineering_train(df):
    """
    Run the in-place feature-engineering steps for the training set.

    df: raw training DataFrame; it is modified in place and also returned
    """
    in_place_steps = (
        extract_time,
        remove_missing_values,
        replace_missing_values,
        new_historical_price,
        add_price_position,
        # average_numerical_features and add_historical_booking_click are
        # currently disabled (they return new frames instead of mutating df)
        add_score,
        # remove_cols is currently disabled as well
    )
    for step in in_place_steps:
        step(df)
    return df

def feature_engineering_test(df):
    """
    Run the in-place feature-engineering steps for the test set.

    Same steps as for the training set, except that no 'score' target is built.

    df: raw test DataFrame; it is modified in place and also returned
    """
    in_place_steps = (
        extract_time,
        remove_missing_values,
        replace_missing_values,
        new_historical_price,
        add_price_position,
        # average_numerical_features and
        # join_historical_data(df, path="data/hist_click_book.csv") are
        # currently disabled (they return new frames instead of mutating df)
    )
    for step in in_place_steps:
        step(df)
    return df
    
def create_df_queries_freq(df, path="../df_queries.csv"):
    """
    Count the rows per 'srch_id', store the counts as CSV and return them.

    df: DataFrame with a 'srch_id' column
    path: where the frequency table is written (default "../df_queries.csv")

    returns: the table as read back from the CSV, i.e. a DataFrame with the
             columns 'srch_id' and 'count'
    """
    df_queries = pd.crosstab(index=df['srch_id'], columns='count', colnames=['srch_id'])
    df_queries.to_csv(path)
    # Reading the file back flattens the crosstab into plain columns
    return pd.read_csv(path)

In [3]:
# Load the raw training and test sets. 'date_time' is left as read here;
# extract_time() converts it later via pd.DatetimeIndex.
# NOTE(review): the saved run of this cell ended in a pandas ParserError
# ("Calling read(nbytes) on source failed") - verify both files are complete
# and readable before running the rest of the notebook.
df = pd.read_csv("data/training_set_VU_DM.csv")
testset = pd.read_csv("data/test_set_VU_DM.csv")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Run the feature-engineering pipelines on the training and test sets.
# Note: the historical booking/click features are currently commented out
# inside both pipeline functions, so they are NOT added here.

df = feature_engineering_train(df)
testset = feature_engineering_test(testset)

In [None]:
# Sanity check: preview the first rows of the engineered training set
df.head()

In [None]:
# Rebalance the training set and report how long it took.
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
df = sampling(df, "score", method="undersampling")
end = time.perf_counter()

print(f"Execution took {end-start:.2f} seconds")

In [None]:
# Second check: preview the training set after the class rebalancing
df.head()

In [None]:
# Keep the test-set property ids: they are needed again for the submission file
properties = testset['prop_id']

# Remove columns that are not used as model features.
# 'position' is only deleted from the training set here.

del df['prop_id']
del df['position']
del testset['prop_id']

In [None]:
# The training set (df) should have exactly one column more than the test set:
# the target column 'score'. The second line prints the columns that exist
# only in df.

print(len(testset.columns), len(df.columns))
print(set(df.columns)-set(testset.columns))

### NDCG

In [None]:
def NDCG(predictions, df, path_idcg = "idcg.csv", k = 5):
    """
    Mean NDCG@k of a ranked prediction frame.

    predictions - DataFrame with "srch_id" and "prop_id", rows ordered by
                  predicted relevance inside each "srch_id". Not modified.
    df - training dataset (must contain "srch_id", "prop_id", "score")
    path_idcg - path to a CSV with the ideal DCG per query; it is joined on a
                column named "scrh_id" (spelled this way) and must provide "iDCG"
    k - number of top positions per query that are evaluated (default 5)

    returns: the mean of DCG/iDCG over the queries found in both the
             predictions and the iDCG file
    """
    # Work on a re-indexed copy so the caller's frame is left untouched
    ranked = predictions.reset_index(drop = True)
    # 1-based position inside each query, following the given row order
    ranked["position"] = ranked.groupby(by = ['srch_id']).cumcount()+1
    # only the top-k positions contribute
    ranked = ranked[ranked.position <= k]
    # attach the true relevance scores
    ranked = pd.merge(ranked, df[["srch_id", "prop_id", "score"]], on = ["srch_id", "prop_id"])
    # Discount: 1 for position 1, log2(position) from position 2 on.
    # NOTE(review): this differs from the common log2(position + 1) discount;
    # the iDCG file must have been computed with the same formula - confirm.
    discount = np.log2(ranked["position"]).where(ranked["position"] != 1, 1.0)
    ranked["intermediate_dcg"] = ranked["score"]/discount
    dcg = ranked.groupby("srch_id")["intermediate_dcg"].sum().reset_index()
    dcg.columns = ["scrh_id", "DCG"]
    # read idcg
    idcg = pd.read_csv(path_idcg)
    # attach idcg to dcg
    joined = pd.merge(dcg, idcg, on = "scrh_id")
    # calculate NDCG
    joined["NDCG"] = joined["DCG"]/joined["iDCG"]
    # calculate mean NDCG
    return joined["NDCG"].mean()

### XGBoost

In [None]:
# imports
import xgboost as xgb
from xgboost import DMatrix
from sklearn.model_selection import GroupShuffleSplit

In [None]:
# Only useful for experimenting

# split sample in 80 percent training and 20 percent test by srch_id

# gss = GroupShuffleSplit(test_size=.20, n_splits=1, random_state = 7).split(df, groups=df['srch_id'])

# X_train_inds, X_test_inds = next(gss)

In [None]:
# train_data= df.iloc[X_train_inds]

# Features: every column except the query id and the target
X_train = df.loc[:, (df.columns != 'srch_id') & (df.columns != 'score')]
# Target: the 'score' column, kept as a one-column DataFrame
y_train = df.loc[:, df.columns == 'score']

In [None]:
# Number of rows per srch_id, in ascending srch_id order (query group sizes for the ranker)
groups = df.groupby('srch_id').size().to_numpy()

In [None]:
# Pairwise learning-to-rank model using the histogram tree method.
# Only `learning_rate` is set: `eta` is XGBoost's alias for the same
# parameter and must not be passed alongside it with a different value.

model = xgb.XGBRanker(
    tree_method='hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42,
    learning_rate=0.1,
    colsample_bytree=0.9,
    max_depth=6,
    n_estimators=110,
    subsample=0.75,
    )

# `groups` must list the query sizes in the same order as the rows of X_train
model.fit(X_train, y_train, group=groups, verbose=True)

In [None]:
def predict(model, df):
    """
    Score the rows of df with model, leaving out the 'srch_id' column.

    model: object exposing a predict(features) method
    df: DataFrame of features, optionally containing 'srch_id'
    returns: whatever model.predict returns for the remaining columns
    """
    feature_columns = ~df.columns.isin(['srch_id'])
    return model.predict(df.loc[:, feature_columns])

In [None]:
# NOTE(review): the prediction cell below scores `testset`, not `df`, so
# dropping the target here is not required for inference. After this cell has
# run, the cells that build X_train / y_train cannot be re-run without
# recreating `df`.

del df['score']

In [None]:
# Score the test set one query (srch_id) at a time; the result holds one
# array of scores per srch_id, with the groups in ascending srch_id order.
# NOTE(review): later cells flatten these arrays back onto `testset` by
# position, which presumably relies on `testset` already being ordered by
# srch_id - confirm.
predictions = (testset.groupby('srch_id')
               .apply(lambda x: predict(model, x)))

In [None]:
# Inspect the per-query score arrays
predictions

In [None]:
# Plot the feature importances of the fitted ranker

xgb.plot_importance(model)

In [None]:
# Deliberate stop: raising here halts a "Run All" before the submission cells below
raise NotImplementedError("Stop running")

### Preparation for submission

In [None]:
# Prepare output file: one row per test-set (srch_id, prop_id) pair
output = pd.DataFrame({"srch_id": testset["srch_id"], "prop_id": properties})

# Add scores: `predictions` holds one array of scores per srch_id, so the
# arrays are concatenated in a single call instead of a nested Python loop.
# NOTE(review): the scores are attached by position, which assumes the rows of
# `testset` are in the same order as the grouped predictions - confirm.
output["pred_scores"] = np.concatenate(list(predictions))

In [None]:
# Sort on predicted score (descending) within each srch_id (ascending).
# One two-key sort gives the same row order as sorting every group separately,
# without the per-group overhead and without adding srch_id as an index level.
start = time.perf_counter()

out = output.sort_values(['srch_id', 'pred_scores'], ascending=[True, False])

end = time.perf_counter()
print(f"Execution took {end-start:.2f} seconds")

In [None]:
# The scores were only needed to order the rows; drop them from the output
del out["pred_scores"]

In [None]:
# Preview the ranked output
out.head()

In [None]:
# Write the ranked output to CSV; index=False leaves the DataFrame index out of the file

out.to_csv('data/15052021-2.csv', index=False)