In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse, parse_qs
from datetime import datetime, timedelta
import html2text
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright



In [5]:
def _set_query_param(url, key, value):
    """Return ``url`` with query parameter ``key`` set to ``value``, leaving the rest untouched."""
    # Only match the key when it starts a parameter (preceded by '?' or '&').
    pattern = r'(?<=[?&])' + re.escape(key) + r'=[^&#]*'
    updated, hits = re.subn(pattern, lambda _match: f'{key}={value}', url)
    if hits:
        return updated

    # Parameter was absent: append it in front of any '#fragment'.
    base, hash_mark, fragment = url.partition('#')
    separator = '&' if '?' in base else '?'
    return f'{base}{separator}{key}={value}{hash_mark}{fragment}'


def best_hotels_and_dates(url, sort, from_, to_, holiday_length):
    """Scrape every stay of ``holiday_length`` days inside a date window and rank the results.

    For each possible check-in day between ``from_`` and ``to_`` the search URL is
    rewritten with the new check-in/check-out dates and handed to
    ``best_value_hotels`` (with ``prepare=False``). The per-date frames are
    concatenated, a Value-for-Money score is added and the rows are sorted.

    Parameters
    ----------
    url : str
        Search-results URL. ``order=popularity`` in it is replaced by the
        ordering matching ``sort``; its ``checkin``/``checkout`` query
        parameters are overwritten for every scanned stay.
    sort : str
        One of ``'price'``, ``'rating'`` or ``'price & rating'``.
    from_, to_ : str
        First check-in day and last allowed check-out day, ``YYYY-MM-DD``.
    holiday_length : int
        Number of days between check-in and check-out.

    Returns
    -------
    pandas.DataFrame
        The columns produced by ``best_value_hotels`` plus an integer
        ``vm_score`` column, de-duplicated on name/rating/reviews/dates.

    Raises
    ------
    KeyError
        If ``sort`` is not one of the supported values.
    ValueError
        If the dates are malformed or no stay of ``holiday_length`` days fits
        between ``from_`` and ``to_``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    order_map = {'price': 'order=price', 'rating': 'order=bayesian_review_score',
                 'price & rating': 'order=review_score_and_price'}

    url = url.replace('order=popularity', order_map[sort])

    first_checkin = datetime.strptime(from_, "%Y-%m-%d")
    window_end = datetime.strptime(to_, "%Y-%m-%d")

    # Number of distinct check-in days whose stay still ends on or before `to_`.
    n_stays = (window_end - first_checkin).days + 1 - holiday_length
    if n_stays <= 0:
        raise ValueError(
            f"No stay of holiday_length={holiday_length} days fits between {from_} and {to_}"
        )

    hotels_list = []
    for offset in tqdm(range(n_stays)):
        checkin_day = first_checkin + timedelta(days=offset)
        checkout_day = checkin_day + timedelta(days=holiday_length)

        checkin_new = checkin_day.strftime("%Y-%m-%d")
        checkout_new = checkout_day.strftime("%Y-%m-%d")

        # Rewrite each date parameter on its own so the substitution does not
        # depend on `checkin` and `checkout` being adjacent in the URL.
        url_new = _set_query_param(url, 'checkin', checkin_new)
        url_new = _set_query_param(url_new, 'checkout', checkout_new)

        best_hotels = best_value_hotels(url_new, sort, headers=headers,
                                        dates=[checkin_new, checkout_new], prepare=False)

        # `best_value_hotels` may be a coroutine function; resolve it here so
        # the loop always collects DataFrames. Inside Jupyter this relies on
        # nest_asyncio.apply() having been called.
        if asyncio.iscoroutine(best_hotels):
            best_hotels = asyncio.run(best_hotels)

        hotels_list.append(best_hotels)

    all_best_hotels = pd.concat(hotels_list)

    # Nothing scraped at all: return the empty frame with the expected columns.
    if all_best_hotels.empty:
        all_best_hotels['vm_score'] = pd.Series(dtype=int)
        return all_best_hotels

    # Scale price by the most expensive offer and rating by 0.1
    # (presumably ratings are on a 0-10 scale — confirm against the scraped data).
    price_scaled = all_best_hotels['price'] / all_best_hotels['price'].max()
    rating_scaled = all_best_hotels['rating'] * 0.1

    # VM (Value for Money) score: cheap and well rated scores high.
    ranked = all_best_hotels.assign(
        vm_score_unrounded=100 * (((1 - price_scaled) + rating_scaled) / 2)
    )

    if sort == 'price & rating':
        ranked = ranked.sort_values(by='vm_score_unrounded', ascending=False)
    elif sort == 'price':
        ranked = ranked.sort_values(by='price', ascending=True)
    else:
        ranked = ranked.sort_values(by='rating', ascending=False)

    # Sorting used the unrounded score; only the rounded integer is kept.
    ranked['vm_score'] = ranked['vm_score_unrounded'].round().astype(int)
    ranked = ranked.drop(columns=['vm_score_unrounded'])

    return ranked.drop_duplicates(subset=['name', 'rating', 'reviews',
                                          'checkin date', 'checkout date'])

In [14]:
def best_value_hotels(url, sort, headers=False, dates=False, prepare=True):
    """Fetch one search-results page with ``requests`` and return its hotels as a DataFrame.

    The HTML is converted to Markdown-like text with ``html2text`` and split on
    ``'\\n### '``; each chunk is parsed for name, link, location, rating,
    review count and price. Chunks that do not match the expected layout are
    skipped.

    NOTE(review): a later cell defines an ``async`` function with the same
    name, which replaces this one once that cell has run.

    Parameters
    ----------
    url : str
        Search-results URL.
    sort : str
        ``'price'``, ``'rating'`` or ``'price & rating'``.
    headers : dict or False
        HTTP headers for the request; overwritten with defaults when
        ``prepare`` is true.
    dates : sequence or False
        ``[checkin, checkout]`` labels for the rows. When falsy, the dates are
        read from the URL's ``checkin``/``checkout`` query parameters.
    prepare : bool
        When true, apply default headers, rewrite ``order=popularity`` in the
        URL according to ``sort``, and add/sort by the ``vm_score`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``name, location, checkin date, checkout date, link, price,
        rating, reviews`` (plus ``vm_score`` when ``prepare`` is true).
    """
    if prepare:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5',
            'Accept-Encoding': 'gzip, deflate'
        }

        order_map = {'price': 'order=price', 'rating': 'order=bayesian_review_score',
                     'price & rating': 'order=review_score_and_price'}

        url = url.replace('order=popularity', order_map[sort])

    if not dates:
        # Fall back to the dates encoded in the URL itself.
        query_params = parse_qs(urlparse(url).query)
        checkin = query_params.get('checkin', [''])[0]
        checkout = query_params.get('checkout', [''])[0]
    else:
        checkin, checkout = dates[0], dates[1]

    # `headers or None` keeps the False default from reaching requests;
    # the timeout stops a stalled connection from hanging the notebook.
    response = requests.get(url, headers=headers or None, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = False  # Keep links
    text_maker.body_width = 0        # Don't wrap lines
    page_text = text_maker.handle(str(soup))

    columns = ['name', 'location', 'checkin date', 'checkout date',
               'link', 'price', 'rating', 'reviews']
    records = []

    for hotel in page_text.split('\n### '):
        try:
            name = hotel.split('Opens in new')[0].split('[')[1].strip()
            link = hotel.split(')')[0].split('(')[1].strip()
            location = hotel.split('Show on map')[0].split('[')[-1].strip()
            rating = float(hotel.split('Scored')[1][:5].strip())
            review = int(re.sub(r'[^0-9]', '', hotel.split('reviews')[0][-8:]))
            try:
                price = int(re.sub(r'[^0-9]', '', hotel.split('Price')[1].split('\n')[0].strip()))
            except (IndexError, ValueError):
                # Some listings use a 'Current price' label instead.
                price = int(re.sub(r'[^0-9]', '', hotel.split('Current price')[1].split('\n')[0].strip()))
        except (IndexError, ValueError):
            # Chunk is not a hotel card (or lacks a field): skip it.
            continue

        records.append({'name': name, 'location': location,
                        'checkin date': checkin, 'checkout date': checkout,
                        'link': link, 'price': price,
                        'rating': rating, 'reviews': review})

    hotels_df = pd.DataFrame(records, columns=columns)
    hotels_df = hotels_df.drop_duplicates(subset=['name', 'rating', 'reviews',
                                                  'checkin date', 'checkout date'])

    if not prepare:
        return hotels_df

    # Nothing parsed: keep the expected columns and skip the arithmetic.
    if hotels_df.empty:
        hotels_df['vm_score'] = pd.Series(dtype=int)
        return hotels_df

    # Scale price by the most expensive hotel and rating by 0.1
    # (presumably ratings are on a 0-10 scale — confirm against the scraped data).
    price_scaled = hotels_df['price'] / hotels_df['price'].max()
    rating_scaled = hotels_df['rating'] * 0.1

    # VM (Value for Money) score: cheap and well rated scores high.
    hotels_df = hotels_df.assign(
        vm_score_unrounded=100 * (((1 - price_scaled) + rating_scaled) / 2)
    )

    if sort == 'price & rating':
        hotels_df = hotels_df.sort_values(by='vm_score_unrounded', ascending=False)
    elif sort == 'price':
        hotels_df = hotels_df.sort_values(by='price', ascending=True)
    else:
        hotels_df = hotels_df.sort_values(by='rating', ascending=False)

    # Sorting used the unrounded score; only the rounded integer is kept.
    hotels_df['vm_score'] = hotels_df['vm_score_unrounded'].round().astype(int)
    return hotels_df.drop(columns=['vm_score_unrounded'])

In [18]:
nest_asyncio.apply()  # Allows nested async loops (for Jupyter)


def _parse_hotel_listings(page_text, checkin, checkout):
    """Parse html2text output of a results page into one DataFrame row per hotel card."""
    columns = ['name', 'location', 'checkin date', 'checkout date',
               'link', 'price', 'rating', 'reviews']
    records = []

    for hotel in page_text.split('\n### '):
        try:
            name = hotel.split('Opens in new')[0].split('[')[1].strip()
            link = hotel.split(')')[0].split('(')[1].strip()
            location = hotel.split('Show on map')[0].split('[')[-1].strip()
            rating = float(hotel.split('Scored')[1][:5].strip())
            review = int(re.sub(r'[^0-9]', '', hotel.split('reviews')[0][-8:]))
            try:
                price = int(re.sub(r'[^0-9]', '', hotel.split('Price')[1].split('\n')[0].strip()))
            except (IndexError, ValueError):
                # Some listings use a 'Current price' label instead.
                price = int(re.sub(r'[^0-9]', '', hotel.split('Current price')[1].split('\n')[0].strip()))
        except (IndexError, ValueError):
            # Chunk is not a hotel card (or lacks a field): skip it.
            continue

        records.append({'name': name, 'location': location,
                        'checkin date': checkin, 'checkout date': checkout,
                        'link': link, 'price': price,
                        'rating': rating, 'reviews': review})

    hotels_df = pd.DataFrame(records, columns=columns)
    return hotels_df.drop_duplicates(subset=['name', 'rating', 'reviews',
                                             'checkin date', 'checkout date'])


def _rank_hotels(hotels_df, sort):
    """Add the integer ``vm_score`` column and sort the frame according to ``sort``."""
    # Nothing parsed: keep the expected columns and skip the arithmetic.
    if hotels_df.empty:
        hotels_df = hotels_df.copy()
        hotels_df['vm_score'] = pd.Series(dtype=int)
        return hotels_df

    # Scale price by the most expensive hotel and rating by 0.1
    # (presumably ratings are on a 0-10 scale — confirm against the scraped data).
    price_scaled = hotels_df['price'] / hotels_df['price'].max()
    rating_scaled = hotels_df['rating'] * 0.1

    # VM (Value for Money) score: cheap and well rated scores high.
    ranked = hotels_df.assign(
        vm_score_unrounded=100 * (((1 - price_scaled) + rating_scaled) / 2)
    )

    if sort == 'price & rating':
        ranked = ranked.sort_values(by='vm_score_unrounded', ascending=False)
    elif sort == 'price':
        ranked = ranked.sort_values(by='price', ascending=True)
    else:
        ranked = ranked.sort_values(by='rating', ascending=False)

    # Sorting used the unrounded score; only the rounded integer is kept.
    ranked['vm_score'] = ranked['vm_score_unrounded'].round().astype(int)
    return ranked.drop(columns=['vm_score_unrounded'])


async def best_value_hotels(url, sort, headers=False, dates=False, prepare=True):
    """Render one search-results page in headless Chromium and return its hotels.

    The page is loaded with Playwright, given five seconds for JavaScript
    content, converted to text with ``html2text`` and parsed into one row per
    hotel card.

    Parameters
    ----------
    url : str
        Search-results URL.
    sort : str
        ``'price'``, ``'rating'`` or ``'price & rating'``.
    headers : dict or False
        HTTP headers for the browser page; overwritten with defaults when
        ``prepare`` is true. A ``User-Agent`` entry becomes the page's user
        agent, the remaining entries are sent as extra HTTP headers.
    dates : sequence or False
        ``[checkin, checkout]`` labels for the rows. When falsy, the dates are
        read from the URL's ``checkin``/``checkout`` query parameters.
    prepare : bool
        When true, apply default headers, rewrite ``order=popularity`` in the
        URL according to ``sort``, and add/sort by the ``vm_score`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``name, location, checkin date, checkout date, link, price,
        rating, reviews`` (plus ``vm_score`` when ``prepare`` is true).
    """
    if prepare:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5',
            'Accept-Encoding': 'gzip, deflate'
        }

        order_map = {'price': 'order=price', 'rating': 'order=bayesian_review_score',
                     'price & rating': 'order=review_score_and_price'}

        url = url.replace('order=popularity', order_map[sort])

    if not dates:
        # Fall back to the dates encoded in the URL itself.
        query_params = parse_qs(urlparse(url).query)
        checkin = query_params.get('checkin', [''])[0]
        checkout = query_params.get('checkout', [''])[0]
    else:
        checkin, checkout = dates[0], dates[1]

    # Translate the headers mapping into Playwright page options so the
    # requested headers are actually sent (they were previously ignored).
    page_options = {}
    if headers:
        user_agent = None
        extra_headers = {}
        for header_name, header_value in headers.items():
            if header_name.lower() == 'user-agent':
                user_agent = header_value
            else:
                extra_headers[header_name] = header_value
        if user_agent:
            page_options['user_agent'] = user_agent
        if extra_headers:
            page_options['extra_http_headers'] = extra_headers

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            browser_page = await browser.new_page(**page_options)
            await browser_page.goto(url)
            await browser_page.wait_for_timeout(5000)  # 5 sec wait for JS content
            content = await browser_page.content()
        finally:
            # Close the browser even when navigation fails.
            await browser.close()

    soup = BeautifulSoup(content, 'lxml')
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = False  # Keep links
    text_maker.body_width = 0        # Don't wrap lines
    page_text = text_maker.handle(str(soup))

    hotels_df = _parse_hotel_listings(page_text, checkin, checkout)

    if prepare:
        hotels_df = _rank_hotels(hotels_df, sort)

    return hotels_df

In [19]:
# Search URL for one fixed stay; the function rewrites `order=popularity`.
url = 'https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaFCIAQGYAQm4ARnIAQzYAQHoAQH4AQ2IAgGoAgO4AtKAk8AGwAIB0gIkMDdkMTgwNTQtNDNjMS00YjBmLWE3Y2ItMjVjZTFmOTkyMzBh2AIG4AIB&sid=8bd3ce886640970a93855d9ffdfd4543&aid=304142&ss=Amsterdam+City+Centre%2C+Amsterdam%2C+Noord-Holland%2C+Netherlands&map=1&efdco=1&lang=en-gb&dest_id=145&dest_type=district&ac_position=1&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=42a34466b7e00601&checkin=2025-05-15&checkout=2025-05-19&group_adults=2&no_rooms=1&group_children=0&nflt=mealplan%3D1%3Breview_score%3D70%3Bht_id%3D204&order=popularity'

# Wall-clock timing of a single scrape through the async implementation.
start_time = time.time()
hotels_df = asyncio.run(
    best_value_hotels(url, sort='price & rating')
)
end_time = time.time()

print('Execution Time: %ss' % (end_time - start_time))

Execution Time: 51.68667936325073s


In [20]:
# Display the frame returned by the timed best_value_hotels call.
hotels_df

Unnamed: 0,name,location,checkin date,checkout date,link,price,rating,reviews,vm_score


In [4]:
# Same search URL; its checkin/checkout values are replaced for every scanned stay.
url = 'https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaFCIAQGYAQm4ARnIAQzYAQHoAQH4AQ2IAgGoAgO4AtKAk8AGwAIB0gIkMDdkMTgwNTQtNDNjMS00YjBmLWE3Y2ItMjVjZTFmOTkyMzBh2AIG4AIB&sid=8bd3ce886640970a93855d9ffdfd4543&aid=304142&ss=Amsterdam+City+Centre%2C+Amsterdam%2C+Noord-Holland%2C+Netherlands&map=1&efdco=1&lang=en-gb&dest_id=145&dest_type=district&ac_position=1&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=42a34466b7e00601&checkin=2025-05-15&checkout=2025-05-19&group_adults=2&no_rooms=1&group_children=0&nflt=mealplan%3D1%3Breview_score%3D70%3Bht_id%3D204&order=popularity'

# Scan every 4-day stay between 15 May and 30 June 2025, ranked by value for money.
hotels_and_dates_df = best_hotels_and_dates(
    url,
    sort='price & rating',
    from_='2025-05-15',
    to_='2025-06-30',
    holiday_length=4,
)

100%|███████████████████████████████████████████| 43/43 [03:27<00:00,  4.82s/it]


In [49]:
# Display the ranked hotel/date combinations returned by best_hotels_and_dates.
hotels_and_dates_df

Unnamed: 0,name,location,checkin date,checkout date,link,price,rating,reviews,vm_score
1,Hotelboot Zwaan,"Amsterdam City Centre, Amsterdam",2025-06-12,2025-06-16,https://www.booking.com/hotel/nl/hotelboot-zwa...,691,9.0,232,90
1,Hotelboot Zwaan,"Amsterdam City Centre, Amsterdam",2025-06-26,2025-06-30,https://www.booking.com/hotel/nl/hotelboot-zwa...,691,9.0,232,90
1,Hotelboot Zwaan,"Amsterdam City Centre, Amsterdam",2025-05-22,2025-05-26,https://www.booking.com/hotel/nl/hotelboot-zwa...,730,9.0,232,90
1,Hotelboot Zwaan,"Amsterdam City Centre, Amsterdam",2025-05-15,2025-05-19,https://www.booking.com/hotel/nl/hotelboot-zwa...,730,9.0,232,90
1,'t Hotel,"Amsterdam City Centre, Amsterdam",2025-06-22,2025-06-26,https://www.booking.com/hotel/nl/t.en-gb.html?...,986,9.2,857,89
...,...,...,...,...,...,...,...,...,...
0,Waldorf Astoria Amsterdam,"Amsterdam City Centre, Amsterdam",2025-06-26,2025-06-30,https://www.booking.com/hotel/nl/waldorf-astor...,5292,9.5,234,62
0,Waldorf Astoria Amsterdam,"Amsterdam City Centre, Amsterdam",2025-06-24,2025-06-28,https://www.booking.com/hotel/nl/waldorf-astor...,7329,9.5,234,49
0,Waldorf Astoria Amsterdam,"Amsterdam City Centre, Amsterdam (Canal Belt)",2025-06-21,2025-06-25,https://www.booking.com/hotel/nl/waldorf-astor...,7358,9.5,234,49
0,Waldorf Astoria Amsterdam,"Amsterdam City Centre, Amsterdam",2025-06-23,2025-06-27,https://www.booking.com/hotel/nl/waldorf-astor...,7412,9.5,234,48


In [52]:
# Number of distinct hotel names across all scanned stays.
len(hotels_and_dates_df['name'].drop_duplicates())

39