## Data Preprocessing

In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
# Lift pandas' display limits (value None) for row count, column count and
# per-cell text width when frames are rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [3]:
# Load the hotel dataset, identified by its repository id.
dataset = load_dataset(path="traversaal-ai-hackathon/hotel_datasets")

In [4]:
# Build the working DataFrame from the dataset's 'train' split.
train_split = dataset['train']
df = pd.DataFrame(train_split)

### Clean the 'price_range' column by converting it to a descriptive label

In [5]:
def convert_price_range(price_range):
    """Map a raw ``price_range`` value to a short descriptive label.

    The recognised raw values are one to three ``$`` signs followed by a fixed
    explanatory suffix; they become "Cheap", "Moderate" and "Expensive".

    A value that is already one of those labels is returned unchanged, so
    applying the conversion a second time (for example when the notebook cell
    is re-run against the already-converted column) does not collapse the
    column to "Unknown". Surrounding whitespace is ignored.

    Args:
        price_range: Raw cell value. Anything that is not a string
            (``None``, ``NaN``, ...) is treated as missing.

    Returns:
        "Cheap", "Moderate" or "Expensive" for a recognised value,
        otherwise "Unknown".
    """
    suffix = " (Based on Average Nightly Rates for a Standard Room from our Partners)"
    mapping = {
        "$" + suffix: "Cheap",
        "$$" + suffix: "Moderate",
        "$$$" + suffix: "Expensive",
    }

    # Missing values have no .strip(); report them as "Unknown" directly.
    if not isinstance(price_range, str):
        return "Unknown"

    cleaned = price_range.strip()

    # Already converted: keep the label so the conversion is idempotent.
    if cleaned in mapping.values():
        return cleaned

    return mapping.get(cleaned, "Unknown")

# Replace every raw price string with its descriptive label, element by element.
df['price_range'] = df['price_range'].map(convert_price_range)

### Clean the 'country' column by normalizing 'Turkiye'/'Türkiye' to 'Turkey'

In [8]:
# Fold both spellings of the country name into the single value 'Turkey'.
turkey_spellings = {'Turkiye': 'Turkey', 'Türkiye': 'Turkey'}
df['country'] = df['country'].replace(turkey_spellings)

### Make Cleaned Dataset Text File for RAG

In [9]:
# Columns that feed the per-review text record.
rag_columns = [
    'hotel_name', 'hotel_description', 'review_title', 'review_text', 'price_range',
    'street_address', 'locality', 'country', 'rate', 'tripdate', 'rating_value', 'review_count',
]
df_subset = df[rag_columns].copy()


def _format_hotel_record(row):
    """Render one row as a block of labelled ``Field: value`` lines."""
    return (
        f"Hotel Name: {row['hotel_name']}\n"
        f"Hotel Description: {row['hotel_description']}\n"
        f"Review Title: {row['review_title']}\n"
        f"Review Text: {row['review_text']}\n"
        f"Trip Date: {row['tripdate']}\n"
        f"Price: {row['price_range']}\n"
        f"User Rating: {row['rate']}\n"
        f"Average Rating: {row['rating_value']}\n"
        f"Total Review Count: {row['review_count']}\n"
        f"Address: {row['street_address']}, {row['locality']}, {row['country']}\n"
    )


# One combined text value per row, stored in a new column.
df_subset['combined_text'] = df_subset.apply(_format_hotel_record, axis=1)

# Write every record to the text file, each followed by two extra newlines.
with open('hotel_data_for_vector_db.txt', 'w', encoding='utf-8') as file:
    file.writelines(text + "\n\n" for text in df_subset['combined_text'])

In [10]:
file_path = 'hotel_data_for_vector_db.txt'
preview_line_count = 40  # number of lines from the start of the file to show

# Preview the beginning of the exported file. Iterating over the file object
# (instead of calling readline() a fixed number of times) stops at end-of-file,
# so a file shorter than the preview length does not produce empty prints.
with open(file_path, 'r', encoding='utf-8') as file:
    for _, line in zip(range(preview_line_count), file):
        # Each line already carries its own newline; do not add a second one.
        print(line, end='')


Hotel Name: Romance Istanbul Hotel

Hotel Description: Romance Istanbul Hotel has 39 rooms.Every room is elegantly furnished and harmonizes the modern life style with the traditional Ottoman touch. Romance Istanbul sits at the intersection of the old city’s most important part. With its luxuriously inspiring design and landmark old city location, steeped in the history of its surroundings, Romance Istanbul Hotel welcomes you with exceptional designed rooms and world-renowned Turkish hospitality. Our colleagues deliver the most personal service. It is perfectly placed and perfectly designed to enhance all that Istanbul has to offer. Each room offers a private bathroom and shower. Each is equipped with a satellite TV and free wifi connection. The rooms size change between 20 m2 and 45 m2. It includes 7 suite rooms: 1 Royal Suite, 4 Grand Suite, 1 Romance Suite and 1 Premium Suite, 2 Luxury Room With Terrace, 22 Deluxe Room, 8 City Room.

Review Title: An exceptional boutique hotel, great