# **Mounting google drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **Importing libraries**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import timeit
import hashlib
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import dump_svmlight_file
import lightgbm as lgb
from sklearn.metrics import ndcg_score
import plotly.graph_objects as go
from tqdm.auto import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

pd.set_option('display.max_columns', None)

In [5]:
df_amenities = pd.read_csv('/content/drive/MyDrive/Expedia/Data/amenities.tsv.gz', compression='gzip', sep='\t')


In [3]:
df7=pd.read_csv('/content/drive/My Drive/Expedia/Data/workstation_1millionrows/millionrows_Top500Destinations.csv')

In [6]:
df8=df7.copy()

In [7]:
for col in df_amenities.columns:
    if col != 'prop_id':
        df8[col] = df8[col].map({'True': 1, 'False': 0, '0': 0})





In [8]:
df8['number_of_amenities'] = df8[df_amenities.columns.difference(['prop_id'])].sum(axis=1)

In [10]:
df8 = df8[~df8['star_rating'].isin(['0', '0.0', 'Not Available'])]
df8['star_rating'] = df8['star_rating'].astype(float)
df8 = df8[~df8['review_rating'].isin([0])]
df8 = df8[df8['is_travel_ad'] == 0]

df8 = df8[df8['sort_type']=='RECOMMENDED']
df8['checkin_month']=pd.to_datetime(df8['checkin_date']).dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df8['star_rating'] = df8['star_rating'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df8['checkin_month']=pd.to_datetime(df8['checkin_date']).dt.month


In [11]:
# Define conditions
condition_1 = (df8['num_clicks'] > 0) & (df8['is_trans'] == 0)
condition_2 = (df8['num_clicks'] > 0) & (df8['is_trans'] > 0)

# Apply conditions and assign values using numpy's where
df8['relevance'] = np.where(condition_1, 1, np.where(condition_2, 2, 0))

# Count the values
df8['relevance'].value_counts()


df8['price_bucket_category'] = np.where(df8['price_bucket'] <= 3, 1, 2)
df8['comb_click_pricebucket_relevance'] = np.where(df8['relevance'].isin([1, 2]), 3 * df8['relevance'] + df8['price_bucket_category'], df8['relevance'])
df8['comb_click_pricebucket_relevance'].value_counts()


comb_click_pricebucket_relevance
0    1443900
4      61008
5      35614
7       5395
8       2964
Name: count, dtype: int64

In [12]:
df8['review_rating_category'] = np.where(df8['review_rating'] <= 3, 0, np.where(df8['review_rating'] == 4, 1, 2))
df8['combined_click_reviewrating_relevance'] = np.where(df8['relevance'].isin([1, 2]), 3 * df8['relevance'] + df8['review_rating_category'], df8['relevance'])
df8['combined_click_reviewrating_relevance'].value_counts()


combined_click_reviewrating_relevance
0    1443900
4      67066
5      23444
3       6112
7       5992
8       1885
6        482
Name: count, dtype: int64

In [13]:
df8['combined_click_reviewrating_review_count_relevance'] = np.where(df8['relevance'] != 0,
    np.where(
        (df8['review_count'] > 200) & (df8['review_rating'] == 4),
        df8['combined_click_reviewrating_relevance'] + 2,
        np.where(
            (df8['review_count'] > 200) & (df8['review_rating'] == 5),
            df8['combined_click_reviewrating_relevance'] + 3,
            np.where(
                (df8['review_count'] > 200) & (df8['review_rating'] <= 3),
                df8['combined_click_reviewrating_relevance'] - 1,
                np.where(
                    (df8['review_count'] < 200) & ((df8['review_rating'] == 4) | (df8['review_rating'] == 5)),
                    df8['combined_click_reviewrating_relevance'] + 1,
                    df8['combined_click_reviewrating_relevance']
                )
            )
        )
    ),
    0
)


# **Data types for amenities and impression columns**

In [14]:
main_columns = {
    'qid': int,
    'point_of_sale': 'O',
    'geo_location_country': 'O',
    'is_mobile': int,
    'destination_id': 'O',
    'adult_count': float,
    'child_count': float,
    'infant_count': float,
    'room_count': float,
    'sort_type': 'O',
    'length_of_stay': float,
    'booking_window': float,
    'search_day_of_week': 'O',
    'search_hour_of_day': 'O',
    'checkin_day': 'O',
    'checkout_day': 'O'
}

impression_columns = {
    'rank': int,
    'is_travel_ad': int,
    'review_rating': int,
    'review_count': float,
    'star_rating': 'int',
    'is_free_cancellation': int,
    'is_drr': int,
    'price_bucket': int,
    'num_clicks': int,
    'is_trans': bool
}

amenities_columns = {
    'prop_id': 'O',
    'AirConditioning': int,
    'AirportTransfer': int,
    'Bar': int,
    'FreeAirportTransportation': int,
    'FreeBreakfast': int,
    'FreeParking': int,
    'FreeWiFi': int,
    'Gym': int,
    'HighSpeedInternet': int,
    'HotTub': int,
    'LaundryFacility': int,
    'Parking': int,
    'PetsAllowed': int,
    'PrivatePool': int,
    'SpaServices': int,
    'SwimmingPool': int,
    'WasherDryer': int,
    'WiFi': int
}

features_engineered= {'user_id': 'O',
 'search_id': 'O',
 'checkin_date': 'datetime',
 'checkout_date': 'datetime',
 'checkin_month': 'O',
 'rank_noad': int,
 'number_of_amenities': float,
 'relevance': int,
 'price_bucket_category': int,
 'comb_click_pricebucket_relevance': int,
 'review_rating_category': int,
 'combined_click_reviewrating_relevance': int,
 'combined_click_reviewrating_review_count_relevance': int
                      }

# Given list of all columns
all_columns = df8.columns.tolist()
# Columns present in any of the dictionaries
present_columns = set(list(main_columns.keys()) + list(impression_columns.keys()) + list(amenities_columns.keys()))

# Columns not present in any of the dictionaries
features_engineered = {column: None for column in all_columns if column not in present_columns}




In [15]:
for column in tqdm(df8.columns, desc="Converting columns"):
    if column in main_columns:
        df8[column] = df8[column].astype(main_columns[column])
    elif column in impression_columns:
        df8[column] = df8[column].astype(impression_columns[column])
    elif column in amenities_columns:
        df8[column] = df8[column].astype(amenities_columns[column])


Converting columns:   0%|          | 0/57 [00:00<?, ?it/s]

In [16]:
df8.rename(columns={'search_id':'qid'}, inplace=True)


# **Target encode prop_id, destination_id, geo_location_country,point_of_sale**

In [17]:
!pip install -q category_encoders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
import category_encoders as ce
target_encoder = ce.TargetEncoder(cols=['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id'])
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['relevance'])
df8 = df8.join(encoded_data.add_suffix('_target'))
df8.head()

Unnamed: 0,user_id,qid,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,child_count,infant_count,room_count,sort_type,length_of_stay,booking_window,search_day_of_week,search_hour_of_day,checkin_day,checkout_day,rank,prop_id,is_travel_ad,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,num_clicks,is_trans,rank_noad,AirConditioning,AirportTransfer,Bar,FreeAirportTransportation,FreeBreakfast,FreeParking,FreeWiFi,Gym,HighSpeedInternet,HotTub,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,number_of_amenities,checkin_month,relevance,price_bucket_category,comb_click_pricebucket_relevance,review_rating_category,combined_click_reviewrating_relevance,combined_click_reviewrating_review_count_relevance,point_of_sale_target,geo_location_country_target,destination_id_target,prop_id_target
0,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,1,1624072,0,4,1250.0,4,1,1,2,1,False,1.0,1,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,5,7,1,1,4,1,4,6,0.073831,0.074679,0.062312,0.039179
3,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,4,5866319,0,4,2225.0,4,1,1,1,0,False,2.0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,6,7,0,1,0,1,0,0,0.073831,0.074679,0.062312,0.041522
4,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,5,2935403,0,5,1050.0,4,1,1,2,0,False,3.0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,6,7,0,1,0,2,0,0,0.073831,0.074679,0.062312,0.025943
5,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,6,4518551,0,5,1375.0,4,1,1,1,0,False,4.0,0,0,1,0,0,0,1,1,1,0,1,0,1,0,0,0,0,0,6,7,0,1,0,2,0,0,0.073831,0.074679,0.062312,0.098901
6,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,7,3704800,0,4,1700.0,4,1,1,2,0,False,5.0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,4,7,0,1,0,1,0,0,0.073831,0.074679,0.062312,0.033248


 'price_bucket_category': int,
 'comb_click_pricebucket_relevance': int,
 'review_rating_category': int,
 'combined_click_reviewrating_relevance': int,
 'combined_click_reviewrating_review_count_relevance': int

In [19]:
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['price_bucket_category'])
df8= df8.join(encoded_data.add_suffix('_pbc'))
df8.head()

Unnamed: 0,user_id,qid,point_of_sale,geo_location_country,is_mobile,destination_id,checkin_date,checkout_date,adult_count,child_count,infant_count,room_count,sort_type,length_of_stay,booking_window,search_day_of_week,search_hour_of_day,checkin_day,checkout_day,rank,prop_id,is_travel_ad,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,num_clicks,is_trans,rank_noad,AirConditioning,AirportTransfer,Bar,FreeAirportTransportation,FreeBreakfast,FreeParking,FreeWiFi,Gym,HighSpeedInternet,HotTub,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,number_of_amenities,checkin_month,relevance,price_bucket_category,comb_click_pricebucket_relevance,review_rating_category,combined_click_reviewrating_relevance,combined_click_reviewrating_review_count_relevance,point_of_sale_target,geo_location_country_target,destination_id_target,prop_id_target,point_of_sale_pbc,geo_location_country_pbc,destination_id_pbc,prop_id_pbc
0,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,1,1624072,0,4,1250.0,4,1,1,2,1,False,1.0,1,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,5,7,1,1,4,1,4,6,0.073831,0.074679,0.062312,0.039179,1.395464,1.39048,1.326515,1.117537
3,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,4,5866319,0,4,2225.0,4,1,1,1,0,False,2.0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,6,7,0,1,0,1,0,0,0.073831,0.074679,0.062312,0.041522,1.395464,1.39048,1.326515,1.020761
4,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,5,2935403,0,5,1050.0,4,1,1,2,0,False,3.0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,6,7,0,1,0,2,0,0,0.073831,0.074679,0.062312,0.025943,1.395464,1.39048,1.326515,1.150943
5,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,6,4518551,0,5,1375.0,4,1,1,1,0,False,4.0,0,0,1,0,0,0,1,1,1,0,1,0,1,0,0,0,0,0,6,7,0,1,0,2,0,0,0.073831,0.074679,0.062312,0.098901,1.395464,1.39048,1.326515,1.082418
6,2,2,1,1,0,49,2021-07-07,2021-07-12,2.0,0.0,0.0,1.0,RECOMMENDED,5.0,26.0,3,0,2,2,7,3704800,0,4,1700.0,4,1,1,2,0,False,5.0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,4,7,0,1,0,1,0,0,0.073831,0.074679,0.062312,0.033248,1.395464,1.39048,1.326515,1.01023


In [20]:
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['comb_click_pricebucket_relevance'])
df8 = df8.join(encoded_data.add_suffix('_cpbc'))

In [21]:
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['review_rating_category'])
df8 = df8.join(encoded_data.add_suffix('_rrc'))

In [22]:
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['combined_click_reviewrating_relevance'])
df8 = df8.join(encoded_data.add_suffix('_ccrr'))

In [23]:
encoded_data = target_encoder.fit_transform(df8[['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']], df8['combined_click_reviewrating_review_count_relevance'])
df8 = df8.join(encoded_data.add_suffix('_ccrrcr'))

In [35]:
df8['combined_click_reviewrating_review_count_relevance']

0          6
3          0
4          0
5          0
6          0
          ..
2049513    0
2049514    0
2049515    0
2049516    0
2049517    0
Name: combined_click_reviewrating_review_count_relevance, Length: 1548881, dtype: int64

In [24]:
df8.shape

(1548881, 81)

# **Columns to be dropped**

Columns to be not dropped are '#' or commented

In [25]:
main_columns_dropped = {
    #'qid': int, # to be dropped after train, test split
    'point_of_sale': 'O',
    'geo_location_country': 'O',
    #'is_mobile': int,
    #'destination_id': 'O',
    #'adult_count': float,
    #'child_count': float, # dropping child count and infant count
    'infant_count': float,
    'room_count': float,
    'sort_type': 'O',
    #'length_of_stay': int,
    #'booking_window': int,
    'search_day_of_week': 'O',
    'search_hour_of_day': 'O',
    #'checkin_day': 'O',
    #'checkout_day': 'O',
    'prop_id': 'int'
}


impression_columns_dropped = {
    'rank': int, # rank_dropped
    'is_travel_ad': int,
    #'review_rating': int,
    #'review_count': float,
    #'star_rating': 'int',
    #'is_free_cancellation': int,
    #'is_drr': int,
    #'price_bucket': int,
    'num_clicks': int,
    'is_trans': bool
}
amenities_columns_dropped={} #
'''
amenities_columns_dropped = {

    'prop_id': 'O',
    'AirConditioning': int,
    'AirportTransfer': int,
    'Bar': int,
    'FreeAirportTransportation': int,
    'FreeBreakfast': int,
    'FreeParking': int,
    'FreeWiFi': int,
    'Gym': int,
    'HighSpeedInternet': int,
    'HotTub': int,
    'LaundryFacility': int,
    'Parking': int,
    'PetsAllowed': int,
    'PrivatePool': int,
    'SpaServices': int,
    'SwimmingPool': int,
    'WasherDryer': int,
    'WiFi': int

}
'''

features_engineered_dropped= {
    'user_id': 'O',
    'search_id': 'O',
    'checkin_date': 'datetime',
    'checkout_date': 'datetime',
    #'rank_noad': float,
    #'number_of_amenities': int,
    #'checkin_day' :int,
    #'checkout_day':int,
    #'relevance':int
    #'price_bucket_category': int,
    #'comb_click_pricebucket_relevance': int,
    #'review_rating_category': int,
    #'combined_click_reviewrating_relevance': int,
    #'combined_click_reviewrating_review_count_relevance': int
 }

df8_dropped=df8.copy()

In [36]:
df8['combined_click_reviewrating_review_count_relevance']

0          6
3          0
4          0
5          0
6          0
          ..
2049513    0
2049514    0
2049515    0
2049516    0
2049517    0
Name: combined_click_reviewrating_review_count_relevance, Length: 1548881, dtype: int64

In [26]:
df8_dropped.isna().sum()

user_id                        0
qid                            0
point_of_sale                  0
geo_location_country           0
is_mobile                      0
                              ..
prop_id_ccrr                   0
point_of_sale_ccrrcr           0
geo_location_country_ccrrcr    0
destination_id_ccrrcr          0
prop_id_ccrrcr                 0
Length: 81, dtype: int64

In [37]:

# Define the lists of columns to be dropped
columns_to_drop = {**main_columns_dropped, **impression_columns_dropped,
                   **amenities_columns_dropped, **features_engineered_dropped}

# Loop through each column in df_model and perform conversions
for column in tqdm(df8_dropped.columns, desc="Converting columns"):
    if column in columns_to_drop:
        df8_dropped.drop(column, axis=1, inplace=True)
    elif column in main_columns:
        df8_dropped[column] = df8_dropped[column].astype(main_columns[column])
    elif column in impression_columns:
        df8_dropped[column] = df8_dropped[column].astype(impression_columns[column])
    elif column in amenities_columns:
        df8_dropped[column] = df8_dropped[column].astype(amenities_columns[column])


Converting columns:   0%|          | 0/66 [00:00<?, ?it/s]

# **Numeric and Categoric data handling**

# Train, validation and test split

In [75]:
df_model=df8_dropped.copy()

In [76]:
df_model=df_model[df_model['rank_noad']<=30]

In [77]:
# Group by 'qid' and calculate the sum of 'relevance'
df_temp = df_model.groupby('qid').agg(sum_relevance=('relevance', 'sum'))

# Extract qids where sum_relevance is 0
qids_to_remove = df_temp[df_temp['sum_relevance'] == 0].index

# Remove qids from df_model
df_model = df_model[~df_model['qid'].isin(qids_to_remove)]


In [78]:
from sklearn.model_selection import GroupShuffleSplit

# Define the function for splitting based on qids
def split_by_qid(df, test_size):
    gss = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=42)
    ids = df['qid'].unique()
    train_idx, test_idx = next(gss.split(ids, groups=ids))
    return df[df['qid'].isin(ids[train_idx])], df[df['qid'].isin(ids[test_idx])]

# Perform the splits
train_df, temp_df = split_by_qid(df_model, test_size=0.4)
valid_df, test_df = split_by_qid(temp_df, test_size=0.5)

# Display shapes
train_df.shape, valid_df.shape, test_df.shape

((876875, 66), (290786, 66), (293086, 66))

In [56]:
train_df['combined_click_reviewrating_review_count_relevance']

35         0
36         0
37         8
38         0
39         0
          ..
2049491    0
2049492    0
2049493    0
2049494    0
2049495    0
Name: combined_click_reviewrating_review_count_relevance, Length: 876875, dtype: int64

In [57]:
test2=df_model.copy()

In [94]:
target_relevance = {
    'relevance': '_target',
    'price_bucket_category': '_pbc',
    'comb_click_pricebucket_relevance': '_cpbc',
    'review_rating_category': '_rrc',
    'combined_click_reviewrating_relevance': '_ccrr',
    'combined_click_reviewrating_review_count_relevance': '_ccrrcr'
}

selected_key = 'combined_click_reviewrating_review_count_relevance'
selected_value = target_relevance[selected_key]

# Extract names of keys to be removed
non_selected_keys = [key for key in target_relevance.keys() if key != selected_key]

# Extract columns names of values to be removed, columns are named with _suffix
#non_selected_values = [value for value in target_relevance.values() if value != selected_value]

non_selected_values = [target_relevance[key] for key in non_selected_keys]

# Extract columns names of keys to be removed
columns_to_remove_keys = non_selected_keys


# Extract columns names of values to be removed, columns are named with _suffix
columns_to_remove_values = [col for col in train_df.columns if any(col.endswith(suffix) for suffix in non_selected_values)]

# Combine both lists to get the final columns to remove
columns_to_remove = columns_to_remove_keys + columns_to_remove_values

columns_to_remove

['relevance',
 'price_bucket_category',
 'comb_click_pricebucket_relevance',
 'review_rating_category',
 'combined_click_reviewrating_relevance',
 'point_of_sale_target',
 'geo_location_country_target',
 'destination_id_target',
 'prop_id_target',
 'point_of_sale_pbc',
 'geo_location_country_pbc',
 'destination_id_pbc',
 'prop_id_pbc',
 'point_of_sale_cpbc',
 'geo_location_country_cpbc',
 'destination_id_cpbc',
 'prop_id_cpbc',
 'point_of_sale_rrc',
 'geo_location_country_rrc',
 'destination_id_rrc',
 'prop_id_rrc',
 'point_of_sale_ccrr',
 'geo_location_country_ccrr',
 'destination_id_ccrr',
 'prop_id_ccrr']

In [95]:
# Remove columns with keys and suffixes to be removed

train_df = train_df.drop(columns=columns_to_remove)
test_df = test_df.drop(columns=columns_to_remove)
valid_df = valid_df.drop(columns=columns_to_remove)

# Reorder columns
columns_reordered = [selected_key] + [col for col in train_df.columns if col != selected_key]
train_df = train_df[columns_reordered]
test_df = test_df[columns_reordered]
valid_df = valid_df[columns_reordered]

In [96]:
train_df.shape, valid_df.shape, test_df.shape

((876875, 41), (290786, 41), (293086, 41))

In [97]:
X_train = train_df.drop(columns=[selected_key],axis=1)
y_train = train_df[selected_key]

X_valid = valid_df.drop(columns=[selected_key],axis=1)
y_valid = valid_df[selected_key]

X_test = test_df.drop(columns=[selected_key],axis=1)
y_test = test_df[selected_key]

query_id_train = X_train['qid']
query_id_valid = X_valid['qid']
query_id_test = X_test['qid']


# Handling categorical with pd.get_dummies(OHE) and numerical columns with standard scaling

In [98]:
categorical_features = ['checkin_day','checkout_day','checkin_month']
numerical_features = ['adult_count','length_of_stay','booking_window','review_count']


In [99]:
combined_df=pd.concat([X_train, X_valid, X_test])
combined_df=pd.get_dummies(combined_df, columns=categorical_features)

In [100]:
X_train=combined_df[:len(X_train)]
X_valid=combined_df[len(X_train): len(X_train)+len(X_valid)]
X_test=combined_df[len(X_train)+len(X_valid):]
X_train.shape, X_test.shape, X_valid.shape

((876875, 63), (293086, 63), (290786, 63))

In [None]:
X_train.head()

Unnamed: 0,qid,is_mobile,destination_id,adult_count,child_count,length_of_stay,booking_window,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,rank_noad,AirConditioning,AirportTransfer,Bar,FreeAirportTransportation,FreeBreakfast,FreeParking,FreeWiFi,Gym,HighSpeedInternet,HotTub,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,number_of_amenities,point_of_sale_target,geo_location_country_target,destination_id_target,prop_id_target,checkin_day_0,checkin_day_1,checkin_day_2,checkin_day_3,checkin_day_4,checkin_day_5,checkin_day_6,checkout_day_0,checkout_day_1,checkout_day_2,checkout_day_3,checkout_day_4,checkout_day_5,checkout_day_6,checkin_month_1,checkin_month_2,checkin_month_3,checkin_month_4,checkin_month_5,checkin_month_6,checkin_month_7,checkin_month_8,checkin_month_9,checkin_month_10,checkin_month_11,checkin_month_12
0,2,0,49,2.0,0.0,5.0,26.0,4,1250.0,4,1,1,2,1.0,1,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,5,0.073831,0.074679,0.062312,0.039179,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,2,0,49,2.0,0.0,5.0,26.0,4,2225.0,4,1,1,1,2.0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,6,0.073831,0.074679,0.062312,0.041522,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,2,0,49,2.0,0.0,5.0,26.0,5,1050.0,4,1,1,2,3.0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,6,0.073831,0.074679,0.062312,0.025943,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
5,2,0,49,2.0,0.0,5.0,26.0,5,1375.0,4,1,1,1,4.0,0,0,1,0,0,0,1,1,1,0,1,0,1,0,0,0,0,0,6,0.073831,0.074679,0.062312,0.098901,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
6,2,0,49,2.0,0.0,5.0,26.0,4,1700.0,4,1,1,2,5.0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,4,0.073831,0.074679,0.062312,0.033248,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [101]:
def check_qids_unique(X_train, X_test, X_valid):
    train_qids = set(X_train.iloc[:, 1])  # Assuming qid is at index 1
    test_qids = set(X_test.iloc[:, 1])    # Assuming qid is at index 1
    valid_qids = set(X_valid.iloc[:, 1])  # Assuming qid is at index 1

    # Check if the lengths of sets of qids are equal to the lengths of arrays
    return len(train_qids) == X_train.shape[0] and \
           len(test_qids) == X_test.shape[0] and \
           len(valid_qids) == X_valid.shape[0]

# Example usage:
# Replace X_train, X_test, X_valid with your actual datasets
# Replace the index 1 with the index of qid column in your datasets
check_qids_unique(X_train, X_test, X_valid)


False

In [102]:
X_train_2 = X_train.drop(columns=['qid'])
X_valid_2 = X_valid.drop(columns=['qid'])
X_test_2 = X_test.drop(columns=['qid'])

In [103]:
scaler=StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_valid[numerical_features] = scaler.transform(X_valid[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

'''
X_train = pd.get_dummies(X_train, columns=categorical_features)
X_valid = pd.get_dummies(X_valid, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features)
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[numerical_features] = scaler.transform(X_valid[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_t

'\nX_train = pd.get_dummies(X_train, columns=categorical_features)\nX_valid = pd.get_dummies(X_valid, columns=categorical_features)\nX_test = pd.get_dummies(X_test, columns=categorical_features)\n'

In [104]:
encoded_columns = ['point_of_sale', 'geo_location_country', 'destination_id', 'prop_id']

# Remove columns from X_train, X_valid, and X_test
for col in encoded_columns:
    if col in X_train.columns:
        X_train.drop(columns=[col], inplace=True)
    if col in X_valid.columns:
        X_valid.drop(columns=[col], inplace=True)
    if col in X_test.columns:
        X_test.drop(columns=[col], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=[col], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid.drop(columns=[col], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=[col], inplace=True)


In [105]:
X_train.shape, X_test.shape, X_valid.shape

((876875, 62), (293086, 62), (290786, 62))

In [106]:
X_train[X_train['rank_noad']<=30]

Unnamed: 0,qid,is_mobile,adult_count,child_count,length_of_stay,booking_window,review_rating,review_count,star_rating,is_free_cancellation,is_drr,price_bucket,rank_noad,AirConditioning,AirportTransfer,Bar,FreeAirportTransportation,FreeBreakfast,FreeParking,FreeWiFi,Gym,HighSpeedInternet,HotTub,LaundryFacility,Parking,PetsAllowed,PrivatePool,SpaServices,SwimmingPool,WasherDryer,WiFi,number_of_amenities,point_of_sale_ccrrcr,geo_location_country_ccrrcr,destination_id_ccrrcr,prop_id_ccrrcr,checkin_day_0,checkin_day_1,checkin_day_2,checkin_day_3,checkin_day_4,checkin_day_5,checkin_day_6,checkout_day_0,checkout_day_1,checkout_day_2,checkout_day_3,checkout_day_4,checkout_day_5,checkout_day_6,checkin_month_1,checkin_month_2,checkin_month_3,checkin_month_4,checkin_month_5,checkin_month_6,checkin_month_7,checkin_month_8,checkin_month_9,checkin_month_10,checkin_month_11,checkin_month_12
35,47,0,1.020496,1.0,-0.315826,-0.393425,4,-0.157157,3,1,0,2,1.0,1,0,1,0,0,0,1,1,1,0,1,1,0,0,0,1,0,0,8,0.454103,0.433497,0.639255,0.431818,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
36,47,0,1.020496,1.0,-0.315826,-0.393425,4,-0.157157,3,1,1,1,2.0,0,0,0,0,1,0,1,1,1,0,1,1,1,0,0,0,0,0,7,0.454103,0.433497,0.639255,0.693902,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
37,47,0,1.020496,1.0,-0.315826,-0.393425,5,-0.128536,4,1,0,3,3.0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,0,9,0.454103,0.433497,0.639255,1.605834,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
38,47,0,1.020496,1.0,-0.315826,-0.393425,4,-0.347965,4,1,0,5,4.0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,6,0.454103,0.433497,0.639255,0.106880,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
39,47,0,1.020496,1.0,-0.315826,-0.393425,4,-0.395666,4,1,0,4,5.0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,4,0.454103,0.433497,0.639255,0.437818,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049491,999205,0,-0.209876,2.0,0.101863,-0.641414,4,-0.491070,4,1,0,5,20.0,1,0,1,0,1,1,1,1,0,0,0,0,1,0,1,1,0,0,9,0.322511,0.352471,0.449880,0.160000,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
2049492,999205,0,-0.209876,2.0,0.101863,-0.641414,4,-0.414747,4,0,1,4,21.0,1,0,1,0,0,1,1,1,0,0,1,0,0,0,1,0,0,0,7,0.322511,0.352471,0.449880,0.336449,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
2049493,999205,0,-0.209876,2.0,0.101863,-0.641414,4,0.186297,5,1,0,5,22.0,1,0,1,0,1,1,1,1,0,0,1,0,0,0,1,0,0,0,8,0.322511,0.352471,0.449880,0.197740,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
2049494,999205,0,-0.209876,2.0,0.101863,-0.641414,4,-0.157157,4,0,0,4,23.0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,5,0.322511,0.352471,0.449880,0.304569,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [108]:
from sklearn.datasets import dump_svmlight_file

# Save X_train as SVMLight file with .txt extension
file_path_train = '/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/train.txt'
dump_svmlight_file(X_train, y_train, f=file_path_train, query_id=query_id_train)

# Save X_valid as SVMLight file with .txt extension
file_path_valid = '/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/vali.txt'
dump_svmlight_file(X_valid, y_valid, f=file_path_valid, query_id=query_id_valid)

# Save X_test as SVMLight file with .txt extension
file_path_test = '/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/test.txt'
dump_svmlight_file(X_test, y_test, f=file_path_test, query_id=query_id_test)

In [109]:
# Save X_train as CSV file
X_train.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/train.csv', index=False)

# Save X_valid as CSV file
X_valid.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/vali.csv', index=False)

# Save X_test as CSV file
X_test.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/test.csv', index=False)


In [110]:
# Save X_train as CSV file
y_train.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/y_train.csv', index=False)

# Save X_valid as CSV file
y_valid.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/y_vali.csv', index=False)

# Save X_test as CSV file
y_test.to_csv('/content/drive/MyDrive/Expedia/Data/workstation_1millionrows/target_combined_click_reviewrating_review_count_relevance/y_test.csv', index=False)