# **Importing libraries**

In [128]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import timeit
import hashlib
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt



pd.set_option('display.max_columns', None)

In [129]:
# Load dataset with tqdm progress bar
df_amenities = pd.read_csv('../expedia_data/amenities.tsv.gz', compression='gzip', sep='\t')

with tqdm(total=1, desc="Loading dataset") as pbar:
   df = pd.read_csv('../expedia_data/main.tsv', sep='\t', nrows=1000)
   pbar.update(1)

print("Dataset loaded successfully!")

# Function to convert string to integer ID

def convert_to_int_id(string):
    # Convert string to bytes and hash it using MD5
    hash_object = hashlib.md5(string.encode())
    # Convert hexadecimal digest to integer
    int_id = int(hash_object.hexdigest(), 16)
    return int_id


df['search_id'] = pd.factorize(df['search_id'].apply(convert_to_int_id))[0]
df['user_id'] = pd.factorize(df['user_id'].apply(convert_to_int_id))[0]


Loading dataset: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 68.43it/s]

Dataset loaded successfully!





# **Introducing new features**

- length of stay (checkout_date-checkin_date)
- booking window (day of year(checkin)-day of year(serch_timestamp))
- search_day_of_week= df['search_timestamp'].dt.dayofweek
- search_hour_of_day= df['search_timestamp'].dt.hour

In [130]:
df['length_of_stay']=(pd.to_datetime(df['checkout_date'])-pd.to_datetime(df['checkin_date'])).dt.days
df['booking_window'] = ((pd.to_datetime(df['checkin_date'], utc=True))  - pd.to_datetime(df['search_timestamp'], utc=True)).dt.days
df['booking_window'] = df['booking_window'].apply(lambda x: 0 if x < 0 else x) # ensures that all negative values in the 'booking_window' column are set to 0.
df.dropna(subset=['booking_window', 'checkin_date', 'checkout_date'], inplace=True)# drop queries where no information on checkin_date and checkout_date
df['search_timestamp']=pd.to_datetime(df['search_timestamp'])
df['search_day_of_week'], df['search_hour_of_day']=df['search_timestamp'].dt.dayofweek, df['search_timestamp'].dt.hour
df['checkin_day']=pd.to_datetime(df['checkin_date']).dt.dayofweek
df['checkout_day']=pd.to_datetime(df['checkin_date']).dt.dayofweek



# **Explode of impression columns**

In [131]:
from tqdm import tqdm
impression_column_names=['rank','prop_id','is_travel_ad','review_rating','review_count','star_rating','is_free_cancellation', 'is_drr','price_bucket','num_clicks','is_trans']

# Assuming you already have 'df' loaded and 'impression_column_names' defined
base_columns = list(df.columns)
exploded_column_names = base_columns + impression_column_names
exploded_rows = []

for index, row in tqdm(df.iterrows(), total=len(df), desc="Splitting impressions", unit="row"):
    impressions = row['impressions'].split('|')
    for impression in impressions:
        impression_data = impression.split(',')
        full_row_data = list(row.values) + impression_data  # Ensure this concatenates correctly
        if len(full_row_data) != len(exploded_column_names):
            print(f"Data length mismatch at index {index}: expected {len(exploded_column_names)}, got {len(full_row_data)}")
        else:
            exploded_rows.append(full_row_data)

# Only create DataFrame if data lengths match
if exploded_rows and len(exploded_rows[0]) == len(exploded_column_names):
    df1 = pd.DataFrame(exploded_rows, columns=exploded_column_names)
else:
    print("Error: Data length mismatch detected, DataFrame not created.")

Splitting impressions: 100%|█████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3337.60row/s]


In [132]:
df1=df1.drop({'impressions','applied_filters'},axis=1)
df_searchid_grouped = df1.groupby('search_id', as_index=False).agg({'num_clicks':'sum', 'is_trans':'sum'})

searchid_tokeep = df_searchid_grouped[df_searchid_grouped['num_clicks'] != 0]['search_id'].tolist()  #filters out rows in df_m where the sum of 'num_clicks' isn't zero, then extracts the corresponding 'search_id' values into a list.
df1 = df1[df1['search_id'].isin(searchid_tokeep)]

# Grouping the DataFrame by 'search_id' and filtering out groups with only one row
df1 = df1.groupby('search_id').filter(lambda x: len(x) > 1)

columns_converted = ['rank', 'is_travel_ad', 'review_rating', 'review_count', 'star_rating',
                     'is_free_cancellation', 'is_drr', 'price_bucket', 'num_clicks', 'is_trans']


# Use tqdm to track progress across columns
for column in tqdm(columns_converted, desc="Converting columns"):
    df1[column] = pd.to_numeric(df1[column], errors='coerce')

searchid_withtrans_nobooking = df1[((df1['is_trans'] > 0) & (df1['num_clicks'] == 0))]['search_id'].tolist()
# Filter 'df1' to remove rows with 'search_id's in 'searchid_withtrans_nobooking'
df1 = df1[~df1['search_id'].isin(searchid_withtrans_nobooking)]

# Keep properties without travel advertisement
df1 = df1[df1['is_travel_ad'] == 0]

df_amenities['prop_id'] = df_amenities['prop_id'].astype(int)
df1['prop_id'] = df1['prop_id'].astype(int)

df_merged = df1.drop(columns=['search_timestamp'], axis=1).merge(df_amenities, on='prop_id', how='left')

for col in df_amenities.columns:
    if col in df_merged.columns:
        df_merged[col] = df_merged[col]


# Clip values of 'star_rating' column to a range of 0 to 5
df_merged['star_rating'] = df_merged['star_rating'].clip(upper=5)
# Convert 'review_count' to numeric type
df_merged['review_count'] = pd.to_numeric(df_merged['review_count'], errors='coerce')

# Drop rows where 'review_count' is NA
df_merged.dropna(subset=['review_count'], inplace=True)

# Fill NA values based on conditions
df_merged.loc[df_merged['review_count'] == 0, 'star_rating'] = df_merged.loc[df_merged['review_count'] == 0, 'star_rating'].fillna('Not Available')
df_merged.loc[(df_merged['review_count'] > 0) & (df_merged['star_rating'].isna()), 'star_rating'] = 0

# Fill NA values based on conditions
df_merged.loc[df_merged['review_count'] == 0, 'review_rating'] = df_merged.loc[df_merged['review_count'] == 0, 'review_rating'].fillna('Not Available')
df_merged.loc[(df_merged['review_count'] > 0) & (df_merged['review_rating'].isna()), 'review_rating'] = 0

df_merged.dropna(subset=['price_bucket'], inplace=True)

df_searchid_grouped = df_merged.groupby('search_id', as_index=False).agg({'num_clicks':'sum', 'is_trans':'sum'})

searchid_tokeep = df_searchid_grouped[df_searchid_grouped['num_clicks'] != 0]['search_id'].tolist()  #filters out rows in df_m where the sum of 'num_clicks' isn't zero, then extracts the corresponding 'search_id' values into a list.
df_merged = df_merged[df_merged['search_id'].isin(searchid_tokeep)]

# Group by search_id and aggregate the sum of num_clicks and count of rows per search_id
df_searchid_grouped = df_merged.groupby('search_id', as_index=False).agg({'num_clicks':'sum', 'is_trans':'count'})

# Filter search_ids with 0 clicks or just one value
searchids_to_keep = df_searchid_grouped[(df_searchid_grouped['num_clicks'] != 0) & (df_searchid_grouped['is_trans'] > 1)]['search_id'].tolist()

# Filter df_merged based on search_ids to keep
df_filtered = df_merged[df_merged['search_id'].isin(searchids_to_keep)]

grouped_df = df_amenities.groupby('prop_id').sum()

# Calculate the number_of_amenities column as the sum of all other columns
grouped_df['number_of_amenities'] = grouped_df.sum(axis=1)

df_filtered = pd.merge(df_filtered, grouped_df[['number_of_amenities']], left_on='prop_id', right_index=True, how='left')

#df_filtered['number_of_amenities'].fillna(0, inplace=True)

df_filtered['number_of_amenities'] = df_filtered['number_of_amenities'].fillna(0)



Converting columns: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 69.12it/s]
  df_merged.loc[df_merged['review_count'] == 0, 'star_rating'] = df_merged.loc[df_merged['review_count'] == 0, 'star_rating'].fillna('Not Available')


In [133]:
df_filtered.to_csv('../expedia_data/HandoverTest/preprocess.csv')

In [134]:
df_preprocess=pd.read_csv('../expedia_data/HandoverTest/preprocess.csv')

In [135]:
main_columns = {
    'qid': int,
    'point_of_sale': 'O',
    'geo_location_country': 'O',
    'is_mobile': bool,
    'destination_id': 'O',
    'adult_count': float,
    'child_count': float,
    'infant_count': float,
    'room_count': float,
    'sort_type': 'O',
    'length_of_stay': float,
    'booking_window': float,
    'search_day_of_week': 'O',
    'search_hour_of_day': 'O',
    'checkin_day': 'O',
    'checkout_day': 'O'
}

impression_columns = {
    'rank': float,
    'is_travel_ad': bool,
    'review_rating': 'O',
    'review_count': float,
    'star_rating': 'O',
    'is_free_cancellation': bool,
    'is_drr': bool,
    'price_bucket': float,
    'num_clicks': int,
    'is_trans': bool
}

amenities_columns = {
    'prop_id': 'O',
    'AirConditioning': bool,
    'AirportTransfer': bool,
    'Bar': bool,
    'FreeAirportTransportation': bool,
    'FreeBreakfast': bool,
    'FreeParking': bool,
    'FreeWiFi': bool,
    'Gym': bool,
    'HighSpeedInternet': bool,
    'HotTub': bool,
    'LaundryFacility': bool,
    'Parking': bool,
    'PetsAllowed': bool,
    'PrivatePool': bool,
    'SpaServices': bool,
    'SwimmingPool': bool,
    'WasherDryer': bool,
    'WiFi': bool
}

features_engineered= {'user_id': 'O',
 'search_id': 'O',
 'checkin_date': 'datetime',
 'checkout_date': 'datetime',
 'rank_noad': float,
 'number_of_amenities': float}

# Given list of all columns
all_columns = df_preprocess.columns.tolist()
# Columns present in any of the dictionaries
present_columns = set(list(main_columns.keys()) + list(impression_columns.keys()) + list(amenities_columns.keys()))

# Columns not present in any of the dictionaries
#features_engineered = {column: None for column in all_columns if column not in present_columns}

for column in tqdm(df_preprocess.columns, desc="Converting columns"):
    if column in main_columns:
        df_preprocess[column] = df_preprocess[column].astype(main_columns[column])
    elif column in impression_columns:
        df_preprocess[column] = df_preprocess[column].astype(impression_columns[column])
    elif column in amenities_columns:
        df_preprocess[column] = df_preprocess[column].astype(amenities_columns[column])

df_preprocess.rename(columns={'search_id':'qid'}, inplace=True)

df_preprocess['relevance'] = df_preprocess['num_clicks'] + 2 * df_preprocess['is_trans']
df_preprocess.loc[df_preprocess['relevance'] > 3, 'relevance'] = 3


Converting columns: 100%|█████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 5903.98it/s]


# **Columns to be dropped**

Columns to be not dropped are '#' or commented

In [136]:
main_columns_dropped = {
    #'qid': int, # to be dropped after train, test split
    'point_of_sale': 'O',
    'geo_location_country': 'O',
    #'is_mobile': bool,
    #'destination_id': 'O', # dropping destination_id
    #'adult_count': float,
    #'child_count': float,
    #'infant_count': float,
    #'room_count': float,
    #'sort_type': 'O',
    'length_of_stay': int,
    'booking_window': int,
    'search_day_of_week': 'O',
    'search_hour_of_day': 'O',
    #'checkin_day': 'O',
    #'checkout_day': 'O'
}

impression_columns_dropped = {
    #'rank': float, # rank_dropped
    #'is_travel_ad': bool,
    #'review_rating': 'O',
    #'review_count': float,
    #'star_rating': 'O',
    #'is_free_cancellation': bool,
    #'is_drr': bool,
    #'price_bucket': float,
    'num_clicks': int,
    'is_trans': bool
}
amenities_columns_dropped={'prop_id': 'O'} #
'''
amenities_columns_dropped = {

    'prop_id': 'O',
    'AirConditioning': bool,
    'AirportTransfer': bool,
    'Bar': bool,
    'FreeAirportTransportation': bool,
    'FreeBreakfast': bool,
    'FreeParking': bool,
    'FreeWiFi': bool,
    'Gym': bool,
    'HighSpeedInternet': bool,
    'HotTub': bool,
    'LaundryFacility': bool,
    'Parking': bool,
    'PetsAllowed': bool,
    'PrivatePool': bool,
    'SpaServices': bool,
    'SwimmingPool': bool,
    'WasherDryer': bool,
    'WiFi': bool

}
'''

features_engineered_dropped= {
    'user_id': 'O',
    'search_id': 'O',
    'checkin_date': 'datetime',
    'checkout_date': 'datetime',
    #'rank_noad': float,
    #'number_of_amenities': int,
    'checkin_day' :int,
    'checkout_day':int,
    #'relevance':int
 }


In [137]:

# Define the lists of columns to be dropped
columns_to_drop = {**main_columns_dropped, **impression_columns_dropped,
                   **amenities_columns_dropped, **features_engineered_dropped}

# Loop through each column in df_model and perform conversions
for column in tqdm(df_preprocess.columns, desc="Converting columns"):
    if column in columns_to_drop:
        df_preprocess.drop(column, axis=1, inplace=True)
    elif column in main_columns:
        df_preprocess[column] = df_preprocess[column].astype(main_columns[column])
    elif column in impression_columns:
        df_preprocess[column] = df_preprocess[column].astype(impression_columns[column])
    elif column in amenities_columns:
        df_preprocess[column] = df_preprocess[column].astype(amenities_columns[column])


Converting columns: 100%|█████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 1912.21it/s]


# **Numeric and Categoric data handling**

# Train, validation and test split

In [138]:
df_model=df_preprocess.copy()

In [139]:
from sklearn.model_selection import GroupShuffleSplit

# Define the function for splitting based on qids
def split_by_qid(df, test_size):
    gss = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=42)
    ids = df['qid'].unique()
    train_idx, test_idx = next(gss.split(ids, groups=ids))
    return df[df['qid'].isin(ids[train_idx])], df[df['qid'].isin(ids[test_idx])]

# Perform the splits
train_df, temp_df = split_by_qid(df_model, test_size=0.4)
valid_df, test_df = split_by_qid(temp_df, test_size=0.5)

# Display shapes
train_df.shape, valid_df.shape, test_df.shape

((6819, 37), (2165, 37), (2085, 37))

In [140]:
# Reordering the columns
columns_reordered = ['relevance'] + [col for col in train_df.columns if col != 'relevance']
train_df = train_df[columns_reordered]
valid_df = valid_df[columns_reordered]
test_df = test_df[columns_reordered]

X_train = train_df.drop(columns=['relevance'],axis=1)
y_train = train_df['relevance']

X_valid = valid_df.drop(columns=['relevance'],axis=1)
y_valid = valid_df['relevance']

X_test = test_df.drop(columns=['relevance'],axis=1)
y_test = test_df['relevance']

query_id_train = X_train['qid']
query_id_valid = X_valid['qid']
query_id_test = X_test['qid']

# Handling categorical with pd.get_dummies(OHE) and numerical columns with standard scaling

In [141]:
categorical_features = X_train.select_dtypes(include=['object','bool']).columns.tolist()
numerical_features = X_train.select_dtypes(include=['int','float']).columns.tolist()

combined_df=pd.concat([X_train, X_valid, X_test])
combined_df=pd.get_dummies(combined_df, columns=categorical_features)

X_train=combined_df[:len(X_train)]
X_valid=combined_df[len(X_train): len(X_train)+len(X_valid)]
X_test=combined_df[len(X_train)+len(X_valid):]
X_train.shape, X_test.shape, X_valid.shape

((6819, 227), (2085, 227), (2165, 227))

In [142]:
def check_qids_unique(X_train, X_test, X_valid):
    train_qids = set(X_train.iloc[:, 1])  # Assuming qid is at index 1
    test_qids = set(X_test.iloc[:, 1])    # Assuming qid is at index 1
    valid_qids = set(X_valid.iloc[:, 1])  # Assuming qid is at index 1

    # Check if the lengths of sets of qids are equal to the lengths of arrays
    return len(train_qids) == X_train.shape[0] and \
           len(test_qids) == X_test.shape[0] and \
           len(valid_qids) == X_valid.shape[0]

# Example usage:
# Replace X_train, X_test, X_valid with your actual datasets
# Replace the index 1 with the index of qid column in your datasets
check_qids_unique(X_train, X_test, X_valid)


False

In [143]:
X_train_2 = X_train.drop(columns=['qid'])
X_valid_2 = X_valid.drop(columns=['qid'])
X_test_2 = X_test.drop(columns=['qid'])

In [144]:
scaler=StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_valid[numerical_features] = scaler.transform(X_valid[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[numerical_features] = scaler.transform(X_valid[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_t

In [145]:
X_train.shape, X_test.shape, X_valid.shape

((6819, 227), (2085, 227), (2165, 227))

In [146]:

# Define function to replace values in DataFrame
def replace_values(df):
    # Convert DataFrame to numpy array
    df_array = df.values

    # Use tqdm to track progress
    for i in tqdm(range(df_array.shape[0]), desc="Replacing values"):
        for j in range(df_array.shape[1]):
            if df_array[i, j] == False:
                df_array[i, j] = 0
            elif df_array[i, j] == True:
                df_array[i, j] = 1

    # Convert numpy array back to DataFrame
    df_result = pd.DataFrame(df_array, columns=df.columns)
    return df_result

# Replace values in X_train, X_valid, X_test
X_train, X_valid, X_test = replace_values(X_train), replace_values(X_valid), replace_values(X_test)



Replacing values: 100%|██████████████████████████████████████████████████████████| 6819/6819 [00:00<00:00, 44645.96it/s]
Replacing values: 100%|██████████████████████████████████████████████████████████| 2165/2165 [00:00<00:00, 45436.51it/s]
Replacing values: 100%|██████████████████████████████████████████████████████████| 2085/2085 [00:00<00:00, 46071.58it/s]


In [147]:
from sklearn.datasets import dump_svmlight_file

# Save X_train as SVMLight file with .txt extension
file_path_train = 'path/train.txt'
dump_svmlight_file(X_train, y_train, f=file_path_train, query_id=query_id_train)

# Save X_test as SVMLight file with .txt extension
file_path_test = 'path/test.txt'
dump_svmlight_file(X_test, y_test, f=file_path_test, query_id=query_id_test)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Expedia/allRankNew/allRank/expedia_data/train_574C_rank.txt'

# **Light GBM Implementation**

In [None]:
import lightgbm as lgb

X_train_arr = X_train_2.values, X_valid_arr=X_valid_2.values

y_train_array, y_valid_array = y_train.values.ravel(), y_valid.values.ravel()

query_group_sizes_train = []
for i in X_train['qid'].value_counts():
  query_group_sizes_train.append(i)

query_group_sizes_train_arr = np.array(query_group_sizes_train)


# Define LightGBM parameters for ranker
params = {
    'objective': 'lambdarank',  # Ranking objective
    'metric': 'ndcg',  # Evaluation metric
    'learning_rate': 0.01,
    'verbose': 0
}


train_data = lgb.Dataset(X_train_arr, label=y_train_array, group=query_group_sizes_train_arr)


num_rounds = 1000
ranker_model = lgb.train(params, train_data, num_rounds)


In [None]:
X_test_2 = X_test.drop(columns=['qid'])
X_test_arr = X_test_2.values

y_test_array = y_test.values.ravel()

query_group_sizes_test = []
for i in X_test['qid'].value_counts():
  query_group_sizes_test.append(i)
query_group_sizes_test_arr = np.array(query_group_sizes_test)

test_data = lgb.Dataset(X_test_arr, label=y_test_array, group=query_group_sizes_test_arr, reference=train_data)


In [None]:
X_train_arr.shape, X_test_arr.shape

In [None]:
test_predictions = ranker_model.predict(X_test_arr.astype(float))

y_test_df = pd.DataFrame({'relevance_score': y_test, 'predicted_ranking': test_predictions})
y_test_df.head()

test_df['predicted_score'] = test_predictions
y_test_df.shape

In [None]:
test_df['relevance'], test_df['predicted_score']

# **Visualization on amenities**

In [None]:
# Select columns from df8 based on the selected column names
selected_columns = ['price_bucket', 'AirConditioning', 'AirportTransfer', 'Bar', 'FreeAirportTransportation', 'FreeBreakfast', 'FreeParking', 'FreeWiFi', 'Gym', 'HighSpeedInternet', 'HotTub', 'LaundryFacility', 'Parking', 'PetsAllowed', 'PrivatePool', 'SpaServices', 'SwimmingPool', 'WasherDryer', 'WiFi']
df8_amen_plot = df_preprocess[selected_columns]

amenities_correlation = df8_amen_plot.corr()

mask = np.triu(np.ones_like(amenities_correlation, dtype=bool))

plt.figure(figsize=(16, 12))

sns.heatmap(amenities_correlation, mask=mask, cmap='coolwarm', annot=True, fmt=".2f")

plt.title('Correlation between Amenities', fontdict={'size': 28, 'color': 'black'})
plt.tight_layout()

plt.xlabel('Amenities', fontdict={'size': 16, 'color': 'black'})
plt.ylabel('Amenities', fontdict={'size': 16, 'color': 'black'})


plt.show()
