In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


1. Data Preparation

a. Load and Merge Tables

In [None]:
import pandas as pd

TRAIN_OR_VAL = "train"
PATH = "/content/drive/MyDrive/kaggle_competition_data"

matches_df = pd.read_csv(f"{PATH}/data/{TRAIN_OR_VAL}_matches.csv")
reviews_df = pd.read_csv(f"{PATH}/data/{TRAIN_OR_VAL}_reviews.csv")
users_df = pd.read_csv(f"{PATH}/data/{TRAIN_OR_VAL}_users.csv")

In [None]:
import pandas as pd

TRAIN_OR_VAL = "train"
PATH = "/content/drive/MyDrive/kaggle_competition_data"

In [None]:
import calendar

#Here we modify the column location
def transform_location(row):
    location = ""
    if row["location_is_ski"] == 1:
        location = "in a ski resort"
    elif row["location_is_beach"] == 1:
        location = "near a beach"
    elif row["location_is_city_center"] == 1:
        location = "in a city center"
    else:
        location = "not mentionned"
    return location

def get_review_text_representation(row):
  review = ""
  if row['review_title']:
      review += f"review_title: {row['review_title']} / "
  if row['review_positive']:
      review = f"positive review: {row['review_positive']} / "
  if row['review_negative']:
      review += f"negative review: {row['review_negative']}"
  text = f"""The guest gave a score of {row['review_score']}/10 to the accommodation, Here is what the guest thought about the accommodation during his stay: {review}. The guest's review has been helpful for {row['review_helpful_votes']} other guests."""
  return text

def get_accommodation_text_representation(row):
  month = calendar.month_name[row["month"]]
  text = f"""The guest was a {row['guest_type']} coming from {row['guest_country']}, he booked a {row['accommodation_type']} in {row['accommodation_country']} for {row['room_nights']} nights during the month of {month}.
  The accommodation he booked had an everage grade of {row['accommodation_score']}/10 and a star rating of {row['accommodation_star_rating']} stars. The location of the accommodation was {row['location']}.
  """
  return text

In [None]:
users_df['location'] = users_df.apply(transform_location, axis=1)
users_df['accommodation_text'] = users_df.apply(get_accommodation_text_representation, axis=1)
reviews_df['review_text'] = reviews_df.apply(get_review_text_representation, axis=1)
reviews_df = reviews_df.drop(columns=["review_title", "review_positive", "review_negative"])

In [None]:
# # Step 1: Merge `matches` with `users` on `user_id` and `accommodation_id`
merged_df = users_df.merge(
    matches_df,
    on="user_id",
    how="left",
)

# Step 2: Merge the resulting DataFrame with `reviews` on `review_id` and `accommodation_id`
unified_dataset = merged_df.merge(
    reviews_df,
    on="review_id",
    how="inner",
)

# Check the result
unified_dataset = unified_dataset.drop(columns=["accommodation_id_x", "accommodation_id_y"])

id_columns = ["user_id", "accommodation_id", "review_id"]
columns = id_columns + [c for c in unified_dataset.columns if c not in id_columns]

unified_dataset = unified_dataset[columns]

unified_dataset['text_representation'] = unified_dataset['accommodation_text'] + unified_dataset['review_text']
unified_dataset = unified_dataset.drop(columns=["accommodation_text", "review_text"])

2. Generate labels

In [None]:
# At that point, if the rows exists in the table it means that there is a match between (accommodation_id and user_id) and a review_id. Therefore we can add a label = 1 for all the rows in the current dataset.add

label = [1 for _ in range(len(unified_dataset))]

unified_dataset["label"] = label

unified_dataset.head(5)

Unnamed: 0,user_id,accommodation_id,review_id,guest_type,guest_country,room_nights,month,accommodation_type,accommodation_country,accommodation_score,accommodation_star_rating,location_is_ski,location_is_beach,location_is_city_center,location,review_score,review_helpful_votes,text_representation,label
0,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,ca87eb9f-3b2a-4e3b-9f86-6a8d14722a9e,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,10.0,0,The guest was a Family with children coming fr...,1
1,776accc3-9f1b-4a2e-8616-aafcac7eeb1d,-1189343073,b555f9d4-176a-4813-b88d-54e0b676be9c,Solo traveller,Gobuf,2,3,Hotel,Italy,8.0,4.0,0,0,0,not mentionned,8.0,0,The guest was a Solo traveller coming from Gob...,1
2,e32f90a1-1580-4a87-86a1-7602d5543a15,-1454980525,baf7a395-049f-4ea7-a380-58e9836b5da2,Couple,Qehoj,3,6,Hotel,Germany,8.5,4.0,0,0,1,in a city center,10.0,0,"The guest was a Couple coming from Qehoj, he b...",1
3,9f246c70-def0-4b93-8c8a-296f5014f6fc,-773005129,a29ebd72-c49e-4a4c-a68e-093d88c87cf3,Family with children,Mejok,1,8,Apartment,United Kingdom,9.3,0.0,0,0,0,not mentionned,10.0,0,The guest was a Family with children coming fr...,1
4,2958080f-b80d-4bcb-84e9-2e053c314e38,462909752,da5f55ce-43c3-4ce5-8f7c-3ef02bedcf68,Family with children,Zuc,6,6,Hotel,Australia,8.7,4.0,0,1,0,near a beach,10.0,0,The guest was a Family with children coming fr...,1


In [None]:
# Now we need to generate the negative samples
# Apply Mixed Negative Sampling Strategy
# 1. Random Sampling (50%):
# - Assign a random review from other accommodations to a user-accommodation pair.
# 2. Similar Accommodation Sampling (30%):
# - Assign reviews from accommodations with similar features (e.g., location, type, or star rating) but not the exact match.
# 3. Feature-Based Sampling (20%):
# - Pair users with reviews from accommodations with significantly different features (e.g., ski resort reviews for beach accommodations).


# ALL THE METHODS NEEDS TO BE IMPLEMENTED HERE

# import pandas as pd
# import random

# import random

# # Function to sample a fraction of user accommodations
# def sample_user_accommodations(user_accommodations, fraction=0.5):
#     num_samples = int(len(user_accommodations) * fraction)
#     sampled_user_ids = random.sample(list(user_accommodations.keys()), num_samples)
#     sampled_user_accommodations = {user_id: user_accommodations[user_id] for user_id in sampled_user_ids}
#     return sampled_user_accommodations


# def random_sampling(reviews_df, all_accommodations, user_accommodations):
#     negative_matches = []
#     counter = 0
#     total = len(user_accommodations.keys())
#     for user_id, stayed_accommodations in user_accommodations.items():
#         counter +=1
#         if counter % 1000 == 0:
#             print(f"{counter}/{total}")
#         # Find accommodations not visited by this user
#         negative_accommodations = list(all_accommodations - stayed_accommodations)

#         # Randomly sample accommodations
#         for accommodation_id in random.sample(negative_accommodations, len(stayed_accommodations)):
#             # Pick a random review for the negative accommodation
#             possible_reviews = reviews_df[reviews_df['accommodation_id'] == accommodation_id]
#             if not possible_reviews.empty:
#                 review_id = possible_reviews.sample(1)['review_id'].iloc[0]
#                 negative_matches.append({
#                     'user_id': user_id,
#                     'accommodation_id': accommodation_id,
#                     'review_id': review_id
#                 })
#     return pd.DataFrame(negative_matches)


# def similar_accommodation_sampling(unified_dataset, grouped_dict, user_accommodations):
#     negative_matches = []
#     counter = 0
#     total = len(user_accommodations.keys())
#     for user_id, stayed_accommodations in user_accommodations.items():
#         counter +=1
#         if counter % 1000 == 0:
#             print(f"{counter}/{total}")
#         for accommodation_id in stayed_accommodations:
#             accommodation = unified_dataset.loc[accommodation_id].iloc[0].to_dict()
#             similar_accommodations = grouped_dict[accommodation["accommodation_features"]]

#             for sa in similar_accommodations:
#                 if sa["accommodation_id"] != accommodation_id:
#                     review_id = sa['review_id']
#                     negative_matches.append({
#                         'user_id': user_id,
#                         'accommodation_id': sa['accommodation_id'],
#                         'review_id': review_id
#                     })
#                     break
#     return pd.DataFrame(negative_matches)


# def feature_based_sampling(unified_dataset, grouped_dict, user_accommodations):
#     negative_matches = []
#     counter = 0
#     total = len(user_accommodations.keys())
#     for user_id, stayed_accommodations in user_accommodations.items():
#         counter +=1
#         if counter % 1000 == 0:
#             print(f"{counter}/{total}")
#         for accommodation_id in stayed_accommodations:
#             accommodation = unified_dataset.loc[accommodation_id].iloc[0].to_dict()
#             different_accommodation_features = random.choice([key for key in grouped_dict.keys() if key != accommodation["accommodation_features"]])
#             different_accommodation = grouped_dict[different_accommodation_features][0]
#             # Find accommodations with significantly different features
#             review_id = different_accommodation['review_id']
#             negative_matches.append({
#                 'user_id': user_id,
#                 'accommodation_id': different_accommodation['accommodation_id'],
#                 'review_id': review_id
#             })
#     return pd.DataFrame(negative_matches)



In [None]:
# Set up initial data
# all_accommodations = set(reviews_df['accommodation_id'].unique())
# user_accommodations = users_df.groupby('user_id')['accommodation_id'].apply(set).to_dict()

In [None]:
# # Example usage:
# user_accommodations_20_percent = sample_user_accommodations(user_accommodations, fraction=0.2)

# # Generate negative samples
# random_negatives = random_sampling(reviews_df, all_accommodations, user_accommodations_20_percent)

In [None]:
# #Here we modify the column location
# def unify_accommodation_features(row):
#     return f"""{row["accommodation_type"]} {row["accommodation_country"]} {row["location"]}"""

# #optimize data
# unified_dataset_optimized = unified_dataset.copy()
# unified_dataset_optimized["accommodation_features"] = unified_dataset_optimized.apply(unify_accommodation_features, axis=1)

# grouped_dict = (
#     unified_dataset_optimized.groupby("accommodation_features")
#     .apply(lambda group: group.to_dict(orient="records"))
#     .to_dict()
# )

# unified_dataset_optimized = unified_dataset_optimized.set_index(['accommodation_id'])

# # Example usage:
# user_accommodations_30_percent = sample_user_accommodations(user_accommodations, fraction=0.3)


# similar_negatives = similar_accommodation_sampling(unified_dataset_optimized, grouped_dict, user_accommodations_30_percent)

In [None]:
# # Example usage:
# user_accommodations_20_percent = sample_user_accommodations(user_accommodations, fraction=0.2)

# feature_negatives = feature_based_sampling(unified_dataset_optimized, grouped_dict, user_accommodations_20_percent)

In [None]:
# # Merge all negative samples
# negative_matches = pd.concat([random_negatives, similar_negatives, feature_negatives], ignore_index=True)

In [None]:
import pandas as pd
import random

def same_accommodation_sampling(accommodation_reviews_dict, matches_df):
    negative_matches = []
    counter = 0
    total = len(matches_df)
    for row in matches_df.iterrows():
        counter +=1
        if counter % 10000 == 0:
            print(f"{counter}/{total}")
        accommodation_id = row[1]["accommodation_id"]
        user_id = row[1]["user_id"]
        review_id = row[1]["review_id"]

        all_accommodation_reviews = accommodation_reviews_dict[accommodation_id]
        selected_reviews = random.sample(all_accommodation_reviews, min(10, len(all_accommodation_reviews)))
        for selected_review in selected_reviews:
          if selected_review != review_id:
            negative_matches.append({
                'user_id': user_id,
                'accommodation_id': accommodation_id,
                'review_id': selected_review
            })
    return pd.DataFrame(negative_matches)

In [None]:
accommodation_reviews_dict = matches_df.groupby("accommodation_id")["review_id"].apply(list).to_dict()

negative_matches = same_accommodation_sampling(accommodation_reviews_dict, matches_df)

10000/1628989
20000/1628989
30000/1628989
40000/1628989
50000/1628989
60000/1628989
70000/1628989
80000/1628989
90000/1628989
100000/1628989
110000/1628989
120000/1628989
130000/1628989
140000/1628989
150000/1628989
160000/1628989
170000/1628989
180000/1628989
190000/1628989
200000/1628989
210000/1628989
220000/1628989
230000/1628989
240000/1628989
250000/1628989
260000/1628989
270000/1628989
280000/1628989
290000/1628989
300000/1628989
310000/1628989
320000/1628989
330000/1628989
340000/1628989
350000/1628989
360000/1628989
370000/1628989
380000/1628989
390000/1628989
400000/1628989
410000/1628989
420000/1628989
430000/1628989
440000/1628989
450000/1628989
460000/1628989
470000/1628989
480000/1628989
490000/1628989
500000/1628989
510000/1628989
520000/1628989
530000/1628989
540000/1628989
550000/1628989
560000/1628989
570000/1628989
580000/1628989
590000/1628989
600000/1628989
610000/1628989
620000/1628989
630000/1628989
640000/1628989
650000/1628989
660000/1628989
670000/1628989
6800

In [None]:
len(negative_matches)

15889548

In [None]:
# Now that we have our negative matches, we need to add those lines to the unified dataset with a label of 0

# # Step 1: Merge `matches` with `users` on `user_id` and `accommodation_id`
merged_df = users_df.merge(
    negative_matches,
    on="user_id",
    how="left",
)

# Step 2: Merge the resulting DataFrame with `reviews` on `review_id` and `accommodation_id`
unified_dataset_negative_matches = merged_df.merge(
    reviews_df,
    on="review_id",
    how="inner",
)

# Check the result
unified_dataset_negative_matches = unified_dataset_negative_matches.drop(columns=["accommodation_id_x", "accommodation_id_y"])

id_columns = ["user_id", "accommodation_id", "review_id"]
columns = id_columns + [c for c in unified_dataset_negative_matches.columns if c not in id_columns]

unified_dataset_negative_matches = unified_dataset_negative_matches[columns]

label = [0 for _ in range(len(unified_dataset_negative_matches))]

unified_dataset_negative_matches["label"] = label
unified_dataset_negative_matches['text_representation'] = unified_dataset_negative_matches['accommodation_text'] + unified_dataset_negative_matches['review_text']
unified_dataset_negative_matches = unified_dataset_negative_matches.drop(columns=["accommodation_text", "review_text"])

unified_dataset_negative_matches.head(5)

Unnamed: 0,user_id,accommodation_id,review_id,guest_type,guest_country,room_nights,month,accommodation_type,accommodation_country,accommodation_score,accommodation_star_rating,location_is_ski,location_is_beach,location_is_city_center,location,review_score,review_helpful_votes,label,text_representation
0,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,3e20b407-5b69-4b7a-8fb7-ad4d917dbe36,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,10.0,0,0,The guest was a Family with children coming fr...
1,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,aee8babe-b202-4e8f-9ae7-07406907eed3,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,10.0,0,0,The guest was a Family with children coming fr...
2,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,b9ab386d-9942-4bc9-8a0a-ba01555dd49d,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,10.0,0,0,The guest was a Family with children coming fr...
3,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,58276e4f-0c53-43e1-a7da-25b745a63ecd,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,9.0,0,0,The guest was a Family with children coming fr...
4,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,6c3361a3-6417-4143-b35f-9ba518e5b034,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,not mentionned,8.0,1,0,The guest was a Family with children coming fr...


In [None]:
final_dataset = pd.concat([unified_dataset, unified_dataset_negative_matches], ignore_index=True)

len(final_dataset)

17518537

3. Generate text representation

In [None]:
# import calendar

# # THESE FUNCTION NEEDS TO BE APPLIED TO EACH ROW #
# def transform_to_text(row):
#     month = calendar.month_name[row["month"]]
#     review = ""
#     if row['review_title']:
#         review += f"review_title: {row['review_title']} / "
#     if row['review_positive']:
#         review = f"positive review: {row['review_positive']} / "
#     if row['review_negative']:
#         review += f"negative review: {row['review_negative']}"
#     text = f"""The guest was a {row['guest_type']} coming from {row['guest_country']}, he booked a {row['accommodation_type']} in {row['accommodation_country']} for {row['room_nights']} nights during the month of {month}.
#     The accommodation he booked had an everage grade of {row['accommodation_score']}/10 and a star rating of {row['accommodation_star_rating']} stars. The location of the accommodation was {row['location']}.
#     The guest gave a score of {row['review_score']}/10 to the accommodation, Here is what the guest thought about the accommodation during his stay: {review}. The guest's review has been helpful for {row['review_helpful_votes']} other guests."""
#     return text

In [None]:
#Lets test the text representation function

# final_dataset['text_representation'] = final_dataset.apply(transform_to_text, axis=1)
final_dataset = final_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# final_dataset.to_csv(f"{PATH}/{TRAIN_OR_VAL}_final_dataset.csv", index=False)  # Use index=False to exclude row numbers
# print("DataFrame saved to output.csv.")

In [None]:
# text_representation_dataset = final_dataset[['text_representation', 'label']]
# text_representation_dataset.to_csv(f"{PATH}/{TRAIN_OR_VAL}_final_dataset_text_representation.csv", index=False)  # Use index=False to exclude row numbers
# print("DataFrame saved to output.csv.")

In [None]:
TRAIN_OR_VAL = "train"

In [None]:
import pandas as pd

# final_dataset_text_representation = pd.read_csv(f"/content/drive/MyDrive/kaggle_competition_data/{TRAIN_OR_VAL}_final_dataset_text_representation.csv")

text_representation_list = list(final_dataset['text_representation'])
text_representation_list[:5]

["The guest was a Couple coming from Xazas, he booked a ApartHotel in Greece for 3 nights during the month of July.\n  The accommodation he booked had an everage grade of 9.1/10 and a star rating of 0.0 stars. The location of the accommodation was not mentionned.\n  The guest gave a score of 10.0/10 to the accommodation, Here is what the guest thought about the accommodation during his stay: positive review: Everything was perfect. Everyone from hotel staff was so kind and helpful - especially a guy from reception - Vaggelis. They managed everything what we needed, they even managed our very specific tasks. Anessis is a great place Where to stay while you are in Santorini - they have everything u need, it is really close to center of Fira and big plus is privacy - tourists cant see you in the pool while you are out of summer shape. / negative review: nan. The guest's review has been helpful for 0 other guests.",
 "The guest was a Couple coming from Mejok, he booked a Guest house in Uni

In [None]:
!pip install sentence-transformers



In [None]:
import torch
print("Using GPU:", torch.cuda.is_available())  # Should return True

Using GPU: True


In [None]:
from sentence_transformers import SentenceTransformer

# Load the model onto the GPU
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader

# Create a DataLoader for batching
batch_size = 1000000
data_loader = DataLoader(text_representation_list[4000000:], batch_size=batch_size)

embeddings = []
count = 0
part = 5
for batch in data_loader:
    count += 1
    print(count)
    batch_embeddings = embedding_model.encode(batch)  # Convert to tensor for GPU
    torch.save(batch_embeddings, f"/content/drive/MyDrive/kaggle_competition_data/train_embeddings_part{part}.pt")
    part += 1

# Combine all embeddings
# embeddings = torch.cat(embeddings)


1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [None]:
# embeddings

tensor([[ 0.0144,  0.0627, -0.0015,  ...,  0.0628, -0.0476, -0.0324],
        [ 0.0743,  0.0518, -0.0506,  ..., -0.0073, -0.0984, -0.0111],
        [ 0.0534,  0.0634, -0.0560,  ...,  0.0160, -0.0923, -0.0096],
        ...,
        [ 0.0534,  0.0819,  0.0746,  ...,  0.0268, -0.1076, -0.0055],
        [ 0.0464,  0.1151, -0.0068,  ...,  0.0205, -0.1077,  0.0207],
        [ 0.0511,  0.0677, -0.0330,  ...,  0.0362, -0.1597, -0.0048]],
       device='cuda:0')

In [None]:
# import torch

# torch.save(embeddings, f"/content/drive/MyDrive/kaggle_competition_data/{TRAIN_OR_VAL}_embeddings.pt")

LETS START TRAINING OUR MODEL


In [None]:
import pandas as pd

TRAIN_OR_VAL = "train"
PATH = "/content/drive/MyDrive/kaggle_competition_data"

In [None]:
# import torch

# Load the tensor from the file
# embeddings = torch.load(f"{PATH}/{TRAIN_OR_VAL}_embeddings.pt")
# print(embeddings.shape)  # Check tensor shape


  embeddings = torch.load(f"{PATH}/{TRAIN_OR_VAL}_embeddings.pt")


torch.Size([3251256, 384])


In [None]:
# embeddings[:100]

tensor([[ 0.0144,  0.0627, -0.0015,  ...,  0.0628, -0.0476, -0.0324],
        [ 0.0743,  0.0518, -0.0506,  ..., -0.0073, -0.0984, -0.0111],
        [ 0.0534,  0.0634, -0.0560,  ...,  0.0160, -0.0923, -0.0096],
        ...,
        [ 0.0578,  0.0308, -0.0312,  ...,  0.0562, -0.1003,  0.0087],
        [ 0.0918,  0.0130, -0.0295,  ..., -0.0210, -0.0800,  0.0409],
        [ 0.0350,  0.0658, -0.0414,  ...,  0.0298, -0.1084,  0.0019]],
       device='cuda:0')

In [None]:
# sample_embeddings = embeddings[:100]
# sample_embeddings.shape

torch.Size([100, 384])

In [None]:
import json

# labels = list(final_dataset['label'])

with open(f"{PATH}/train_labels.json", "r") as file:
    labels = json.load(file)

# json.dump(labels, open(f"{PATH}/train_labels.json", 'w'))
# sample_labels = labels[:100]
# len(sample_labels)

In [None]:
len(labels)

17518537

In [None]:
import torch

N = 5

embeddings = []
for i in range(1, N + 1):
  print(i)
  embedding = torch.load(f"{PATH}/train_embeddings_part{i}.pt")
  embeddings.append(embedding)

embeddings = torch.cat(embeddings)

1


  embedding = torch.load(f"{PATH}/train_embeddings_part{i}.pt")


2
3
4
5


TypeError: expected Tensor as element 4 in argument 0, but got numpy.ndarray

In [None]:
embeddings_list = [emb.tolist() if isinstance(emb, (torch.Tensor, np.ndarray)) else emb for emb in embeddings]



In [None]:
embeddings

NameError: name 'embeddings' is not defined

In [None]:
import torch
import numpy as np

embeddings_tensor_format = [
    tensor.to('cuda:0') if isinstance(tensor, torch.Tensor) else torch.tensor(tensor, device='cuda:0')
    for tensor in embeddings
]



In [None]:
torch.save(embeddings_tensor_format, "tensors.pt")
torch.save(batch_embeddings, f"/content/drive/MyDrive/kaggle_competition_data/train_embeddings_complete.pt")

In [None]:
embeddings_tensor_format = torch.cat(embeddings_tensor_format)

In [None]:
embeddings_tensor_format

NameError: name 'embeddings_tensor_format' is not defined

In [None]:
torch.save(embeddings_tensor_format, f"/content/drive/MyDrive/kaggle_competition_data/train_embeddings_complete.pt")

In [None]:
embeddings_tensor_format.shape[0]

9000000

In [None]:
labels = labels[:embeddings_tensor_format.shape[0]]

LETS TRY WITH XGBOOST


In [None]:
import torch
import json

TRAIN_OR_VAL = "train"
PATH = "/content/drive/MyDrive/kaggle_competition_data"

embeddings = torch.load(f"{PATH}/final_train_embeddings.pt")

# labels = list(final_dataset['label'])

with open(f"{PATH}/train_labels.json", "r") as file:
    labels = json.load(file)

labels = labels[:embeddings.shape[0]]

  embeddings = torch.load(f"{PATH}/final_train_embeddings.pt")


In [None]:
torch.save(embeddings, f"/content/drive/MyDrive/kaggle_competition_data/final_train_embeddings.pt")

In [None]:
# import xgboost as xgb

# # Calculate scale_pos_weight
# scale_pos_weight = 5.0

# # Initialize the model
# xgb_model = xgb.XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='logloss',
#     use_label_encoder=False,
#     scale_pos_weight=scale_pos_weight,
#     learning_rate=0.01, # Step size shrinkage
# )

# # Train the model
# xgb_model.fit(embeddings, labels)



Parameters: { "use_label_encoder" } are not used.



AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Calculate scale_pos_weight
# scale_pos_weight = 5.0

# Initialize the model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    learning_rate=0.01, # Step size shrinkage
    max_depth=10,
)

X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.05, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions (probabilities)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # Get probabilities for class 1

# Convert probabilities to binary labels if needed
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.906488


In [None]:
y_pred_proba[:100]

array([0.11733637, 0.11831962, 0.11839627, 0.11817162, 0.11933474,
       0.11847782, 0.11836696, 0.11402187, 0.11768287, 0.12407191,
       0.112363  , 0.11657668, 0.12424216, 0.11845138, 0.12205874,
       0.118581  , 0.11800554, 0.11687811, 0.11969081, 0.12066403,
       0.11445337, 0.11248534, 0.12114537, 0.1225624 , 0.12206762,
       0.1226998 , 0.11321754, 0.11655415, 0.11737301, 0.11780299,
       0.11928795, 0.12017022, 0.1178345 , 0.12463495, 0.11911879,
       0.11948612, 0.12311982, 0.12144068, 0.11910941, 0.12059721,
       0.11733387, 0.12241323, 0.11849786, 0.12131378, 0.11730786,
       0.11706787, 0.12006275, 0.11801354, 0.11785412, 0.11729328,
       0.11466236, 0.1197876 , 0.11883122, 0.11852495, 0.11724789,
       0.12061368, 0.11934851, 0.12185351, 0.1183395 , 0.11465792,
       0.11939625, 0.11330035, 0.12053665, 0.11877012, 0.12104145,
       0.11855411, 0.12226178, 0.11773928, 0.12270951, 0.11926609,
       0.1195626 , 0.11623127, 0.11486035, 0.1166375 , 0.11876

In [None]:
import pickle

with open(f"{PATH}/xgboost_model.pkl", 'wb') as file:
    pickle.dump(xgb_model, file)

In [None]:
# import pickle

# with open(f"{PATH}/xgboost_model.pkl", 'rb') as file:
#     xgb_model = pickle.load(file)

# Make predictions with the loaded model
# y_pred = loaded_model.predict_proba(X_test)

In [None]:
# y_pred

array([[0.24688643, 0.75311357],
       [0.50252044, 0.4974796 ],
       [0.3978541 , 0.6021459 ],
       ...,
       [0.34072846, 0.65927154],
       [0.4512365 , 0.5487635 ],
       [0.3464067 , 0.6535933 ]], dtype=float32)

In [None]:
# y_test[:20]

[1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1]

In [None]:
# Convert probabilities to binary labels if needed
# y_pred = (y_pred_proba > 0.5).astype(int)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy}')

Accuracy: 0.6894292569524733


NOW LETS TEST THE MODEL ON VALUATION DATASET WITH THE COMPETITION MEASURE

In [None]:
import pandas as pd

TRAIN_OR_VAL = "test"
PATH = "/content/drive/MyDrive/kaggle_competition_data"

reviews_df = pd.read_csv(f"{PATH}/data/{TRAIN_OR_VAL}_reviews.csv")
users_df = pd.read_csv(f"{PATH}/data/{TRAIN_OR_VAL}_users.csv")

users_df['location'] = users_df.apply(transform_location, axis=1)
users_df = users_df.drop(columns=["location_is_ski", "location_is_beach", "location_is_city_center"])
users_df.head()

NameError: name 'transform_location' is not defined

In [None]:
def get_review_text_representation(row):
  review = ""
  if row['review_title']:
      review += f"review_title: {row['review_title']} / "
  if row['review_positive']:
      review = f"positive review: {row['review_positive']} / "
  if row['review_negative']:
      review += f"negative review: {row['review_negative']}"
  text = f"""The guest gave a score of {row['review_score']}/10 to the accommodation, Here is what the guest thought about the accommodation during his stay: {review}. The guest's review has been helpful for {row['review_helpful_votes']} other guests."""
  return text

In [None]:
import calendar

def get_accommodation_text_representation(row):
  month = calendar.month_name[row["month"]]
  text = f"""The guest was a {row['guest_type']} coming from {row['guest_country']}, he booked a {row['accommodation_type']} in {row['accommodation_country']} for {row['room_nights']} nights during the month of {month}.
  The accommodation he booked had an everage grade of {row['accommodation_score']}/10 and a star rating of {row['accommodation_star_rating']} stars. The location of the accommodation was {row['location']}.
  """
  return text

In [None]:
users_df['accommodation_text'] = users_df.apply(get_accommodation_text_representation, axis=1)
reviews_df['review_text'] = reviews_df.apply(get_review_text_representation, axis=1)


In [None]:
reviews_df = reviews_df.drop(columns=["review_title", "review_positive", "review_negative"])

In [None]:
# reviews_df.head(5)

Unnamed: 0,review_id,accommodation_id,review_score,review_helpful_votes,review_text
0,d50f830f-fd60-492f-924b-46f948c2d90d,-663110570,7.0,0,The guest gave a score of 7.0/10 to the accomm...
1,1f03e5f0-f15e-4c7b-997e-0490ce121de8,-558978085,10.0,0,The guest gave a score of 10.0/10 to the accom...
2,01121198-c633-47df-aa4a-b42ef0a31efe,1477624081,10.0,0,The guest gave a score of 10.0/10 to the accom...
3,73e3f388-3f51-4091-ad3a-bb6db106348e,-1273110867,9.0,0,The guest gave a score of 9.0/10 to the accomm...
4,f181394d-35a0-4e27-9cdc-84427c53d321,1244380562,10.0,1,The guest gave a score of 10.0/10 to the accom...


In [None]:
# # Step 1: Merge `matches` with `users` on `user_id` and `accommodation_id`
unified_dataset = users_df.merge(
    reviews_df,
    on="accommodation_id",
    how="left",
)
id_columns = ["user_id", "accommodation_id", "review_id"]
columns = id_columns + [c for c in unified_dataset.columns if c not in id_columns]

unified_dataset = unified_dataset[columns]


In [None]:
unified_dataset['text_representation'] = unified_dataset['accommodation_text'] + unified_dataset['review_text']
unified_dataset = unified_dataset.drop(columns=["accommodation_text", "review_text"])

In [None]:
# from sentence_transformers import SentenceTransformer

# # Load the model onto the GPU
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

In [None]:
# import torch

In [None]:
# import torch
# from torch.utils.data import DataLoader

# text_representation_list = list(unified_dataset['text_representation'])
# text_representation_list = text_representation_list[15400000:]
# print(len(text_representation_list))
# # Create a DataLoader for batching
# batch_size = 1000000
# data_loader = DataLoader(text_representation_list, batch_size=batch_size)

# count = 0
# part = 17
# for batch in data_loader:
#     count += 1
#     print(count)
#     batch_embeddings = embedding_model.encode(batch, convert_to_tensor=True)
#     torch.save(batch_embeddings, f"/content/drive/MyDrive/kaggle_competition_data/test_embeddings_part{part}.pt")
#     part += 1

# # # Combine all embeddings
# # embeddings = torch.cat(embeddings)


8666438
1
2
3
4
5
6
7
8
9


Now that we have the embeddings for the test set, we need to generate the predictions

In [None]:
import pickle

with open(f"{PATH}/xgboost_model.pkl", 'rb') as file:
    xgb_model = pickle.load(file)

In [None]:
import numpy as np
import torch

predictions = np.array([])
for i in range(1, 26):
  print(i)
  embedding = torch.load(f"{PATH}/test_embeddings_part{i}.pt")
  batch_predictions = xgb_model.predict_proba(embedding)[:, 1]
  predictions = np.concatenate((predictions, batch_predictions))


1


  embedding = torch.load(f"{PATH}/test_embeddings_part{i}.pt")
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




2


  embedding = torch.load(f"{PATH}/test_embeddings_part{i}.pt")


3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [None]:
print(len(predictions))
print(len(unified_dataset))

24066438
24066438


In [None]:
unified_dataset['predictions'] = predictions

NOW THAT WE HAVE THE PREDICTIONS, WE NEED TO FILTER THE DATA TO KEEP ONLY THE REVIEWS THAT RECEIVED THE 10 HIGHEST RANKS.

In [None]:
df = unified_dataset.copy()

In [None]:
df['accommodation_user'] = df['accommodation_id'].astype(str) + '<SEP>' + df['user_id'].astype(str)
unique_accommodation_user = df["accommodation_user"].unique()
result_dict = {id: {"reviews": [], "predictions": []} for id in unique_accommodation_user}

counter = 0
for index, row in unified_dataset.iterrows():
    counter += 1
    if counter % 10000 == 0:
      print(counter)
    accommodation_id = row['accommodation_id']
    user_id = row['user_id']
    id = f"{accommodation_id}<SEP>{user_id}"
    review_id = row['review_id']
    prediction = row['predictions']
    result_dict[id]["reviews"].append(review_id)
    result_dict[id]["predictions"].append(prediction)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390

In [None]:
# len(result_dict.keys())
# len(users_df)

199138

In [None]:
import json
# Specify the file path
file_path = f"{PATH}/result_dict.json"

# Save the dictionary as a JSON file
with open(file_path, "w") as json_file:
    json.dump(result_dict, json_file, indent=4)

In [None]:
import numpy as np

final_list = []

counter = 0
for key, data in result_dict.items():
    counter += 1
    if counter % 10000 == 0:
      print(counter)
    # Split the key into accommodation_id and user_id
    accommodation_id, user_id = key.split("<SEP>")

    # Get reviews and predictions
    reviews = data["reviews"]
    predictions = data["predictions"]

    # Convert predictions to a NumPy array for efficient processing
    predictions_array = np.array(predictions)

    # Get the indices of the 10 highest predictions
    top_indices = np.argsort(predictions_array)[-10:][::-1]  # Sort in descending order

    # Create a dictionary for this ID
    result_entry = {
        "accommodation_id": accommodation_id,
        "user_id": user_id
    }

    # Add the top 10 reviews directly to the dictionary
    for i in range(10):
        result_entry[f"review_{i+1}"] = reviews[top_indices[i]]

    # Append to the final list
    final_list.append(result_entry)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


In [None]:
final_df = pd.DataFrame(final_list)
final_df.head()

Unnamed: 0,accommodation_id,user_id,review_1,review_2,review_3,review_4,review_5,review_6,review_7,review_8,review_9,review_10
0,2086452554,5f83c2ae-d803-4b4c-9d25-1226f90297ce,10ca6f60-f4a9-4430-8d44-d3d7edf653c7,1414b585-d53b-4995-9a90-c05ca15e6753,f85692be-7f0b-41d9-af54-6462860ea180,faa37671-0d43-4eaf-8f84-fd44f4e45361,143838bf-5cef-40ba-97d3-3098d92e853c,7ecf0ac8-5ca5-4c37-9684-d941e2e25de4,9baed10b-106d-4eca-973d-336ed89bd0fc,064951f3-cd47-464a-9381-2b668e030e55,3d60e063-62ab-449e-943a-1df52dfb50ae,d41d4102-a177-48fb-ac69-b68da950f196
1,-202362622,a194a2ef-9487-4cf0-8828-dd5803c8b9d1,9737170f-7a42-4ec4-b61f-b5fd0eb75297,330f2f8d-18fa-41bf-b0be-68740b3e3947,1f408a47-7089-4c1f-a46d-c8c752cacdd3,d9199ab6-e1c2-48bf-9e74-b270085c5ac7,16b2abb9-c90b-465b-8e2c-56c56bb8eb1e,5331a5d3-761b-4b2c-bc92-6be1ffa97be4,68985299-0984-4bb3-a55c-6976ad977c1f,c4d1101a-48ba-41ea-9792-a5c35e791741,c0f3caa8-8d62-490a-8f2f-eed5bde7e116,3d21b3b7-a35c-4258-9dc8-1be2fddd14d3
2,-1390928232,cfb878d0-af56-4b0d-90ff-87095b1a56d6,9827f8c3-c590-45c4-b4c8-f89c5f208a4c,e4ae957e-ccef-4472-aba6-24b5dc0bc9f9,1580b153-e202-44b5-a347-ef64ecd0c62f,46a1a946-b150-4008-bedc-89e9fedd5af4,973cd547-243c-48b5-aa17-00156bd01684,e37981ac-94ab-4136-ad3c-a0f25b2bd2d5,dbae01e2-627f-4e53-9d9e-05499b21e203,7247d4dc-0fa4-4afd-b677-0b0b9fc99fb1,841132d6-0806-479c-9915-227a208d4253,ef97ef4d-5c16-4a66-896d-b9b89eef3e7e
3,1007230055,19ffcbff-8500-482a-b5af-c55cb4235259,0ac123d1-b4f0-4836-b4e5-b10ba3508598,f0fbd723-0ffb-40c0-894a-ad6f551265b6,7d2c70ab-05e3-4286-b6df-da37df928991,ec22bed5-41ee-46a5-953c-64ce6a0575db,dc29ecab-2b0f-4513-bde8-86d1daa45358,6c60eb0c-de22-4198-9a48-29610953dcbb,a0c92308-5e38-450e-8b16-b44331c4ba90,645e97b0-7adc-4812-b74a-6d5ec61df8b4,b4e6ec65-11ed-4c4e-85c1-5542cb11e88a,0ada749c-a2eb-4ccf-acbf-8b0fc7d94036
4,135365139,98d6a06b-131c-464d-86e7-b74dd4894ae2,0d8d6f2b-bcab-4986-986c-44f467612092,7d601be5-7d58-4c40-9a43-8ec0c97e5ed2,4d8dc561-8628-413a-b696-a4c9aa752891,c91f57d1-1a2d-4d1f-9676-3cd0a0a6e75a,24b42024-4685-48b5-8f79-200047201fd8,6f0d6476-2e3c-4ddd-9a82-dc635860256a,0c29773c-8ae0-434b-8659-de75fb06aafa,b77a992f-5280-4b46-b8b2-684e68d741ba,801500fd-2d33-426f-ae94-e2442fd17206,d2a396ea-a414-4c17-a565-4062598ada28


In [None]:
if 'ID' not in final_df.columns:
    final_df.insert(0, 'ID', range(1, len(final_df) + 1))

In [None]:
final_df.to_csv(f"{PATH}/submission.csv")