In [55]:
import asyncio
import nest_asyncio
from collections import defaultdict
import json
from pathlib import Path
import re
import os
from typing import List, Optional
from urllib.parse import urlencode
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapeApiResponse
from parsel import Selector
import tracemalloc
from datetime import datetime, timedelta
import time

In [56]:
# Patch asyncio so event loops can be nested; the notebook kernel already runs
# a loop, and the scraping helpers below are coroutines driven from cells.
nest_asyncio.apply()
# Start tracing Python memory allocations for the rest of the session.
# NOTE(review): presumably enabled so asyncio/resource warnings can report
# where an object was allocated - confirm it is still needed, tracing adds overhead.
tracemalloc.start()

In [57]:
# Shared Scrapfly client used by every scraping coroutine in this notebook.
# The API key is read from the environment so the secret never lives in the
# notebook source or its saved outputs; export SCRAPFLY_KEY before starting
# the kernel.
_scrapfly_key = os.environ.get("SCRAPFLY_KEY")
if not _scrapfly_key:
    raise RuntimeError("Set the SCRAPFLY_KEY environment variable before running this notebook")
scrapfly = ScrapflyClient(key=_scrapfly_key)

In [58]:
async def request_hotels_page(
    query,
    checkin: str = "",
    checkout: str = "",
    number_of_rooms=1,
    offset: int = 0,
):
    """Scrape a single hotel search results page of booking.com.

    Args:
        query: Free-text search string, sent as the ``ss`` query parameter.
        checkin: Check-in date as ``YYYY-MM-DD``; empty string for no date.
        checkout: Check-out date as ``YYYY-MM-DD``; empty string for no date.
        number_of_rooms: Value of the ``no_rooms`` query parameter.
        offset: Value of the ``offset`` query parameter (pagination).

    Returns:
        Whatever ``scrapfly.async_scrape`` returns for the built search URL.

    Raises:
        ValueError: If a non-empty date does not split into exactly three
            ``-``-separated parts.
    """
    # The fallback must be a parenthesised 3-tuple: without the parentheses the
    # trailing `, "", ""` binds to the whole right-hand side, so the year would
    # receive the entire split list and month/day would always be empty.
    checkin_year, checkin_month, checkin_day = checkin.split("-") if checkin else ("", "", "")
    checkout_year, checkout_month, checkout_day = checkout.split("-") if checkout else ("", "", "")

    url = "https://www.booking.com/searchresults.html"
    url += "?" + urlencode(
        {
            "ss": query,
            "checkin_year": checkin_year,
            "checkin_month": checkin_month,
            "checkin_monthday": checkin_day,
            "checkout_year": checkout_year,
            "checkout_month": checkout_month,
            "checkout_monthday": checkout_day,
            "no_rooms": number_of_rooms,
            "offset": offset,
        }
    )
    return await scrapfly.async_scrape(ScrapeConfig(url, country="US"))

In [59]:
def parse_search_total_results(html):
    """Return the number of properties announced in the page's ``<h1>`` text.

    Looks for "<number> properties found" (thousands separators allowed) in
    the first ``h1`` text node. Returns 0 when there is no heading text or the
    phrase is absent.
    """
    heading = Selector(text=html).css("h1::text").get()
    found = re.search(r"([\d,]+)\s+properties\s+found", heading) if heading else None
    if found is None:
        # No heading, or the heading does not carry a result count.
        return 0
    digits = found.group(1).replace(",", "")
    return int(digits)

In [60]:
def parse_search_hotels(html: str):
    """Extract hotel preview data from a search results page.

    Returns a dict keyed by each property card's title-link URL (query string
    removed). Every value holds ``name``, ``location``, ``score``,
    ``review_count`` (strings, "" when absent), ``stars`` (number of
    rating-star spans) and ``image`` (``src`` or None). Cards that resolve to
    the same URL overwrite one another.
    """

    def first(node, query, default=""):
        # First match of `query` below `node`, or `default` if nothing matches.
        return node.xpath(query).get(default)

    previews = {}
    cards = Selector(text=html).xpath('//div[@data-testid="property-card"]')
    for card in cards:
        link = first(card, './/h3/a[@data-testid="title-link"]/@href')
        star_icons = card.xpath('.//div[@data-testid="rating-stars"]/span').getall()
        summary = {
            "name": first(card, './/h3/a[@data-testid="title-link"]/div/text()'),
            "location": first(card, './/span[@data-testid="address"]/text()'),
            "score": first(card, './/div[@data-testid="review-score"]/div/text()'),
            "review_count": first(card, './/div[@data-testid="review-score"]/div[2]/div[2]/text()'),
            "stars": len(star_icons),
            "image": first(card, './/img[@data-testid="image"]/@src', None),
        }
        previews[link.split("?")[0]] = summary
    return previews

In [61]:
async def scrape_search(
    query,
    checkin: str = "",
    checkout: str = "",
    number_of_rooms=1,
    max_results: Optional[int] = None,
):
    """Scrape hotel previews for a search query across all result pages.

    The first page is fetched on its own to learn the total result count; the
    remaining pages (offsets 25, 50, ...) are then requested concurrently.

    ``max_results`` only caps the total used to decide how many pages are
    requested; the returned dict is not trimmed, so it can hold more entries
    than ``max_results``.

    Returns:
        Dict of hotel previews keyed by hotel URL, merged over all pages.
    """
    search_args = dict(
        query=query, checkin=checkin, checkout=checkout, number_of_rooms=number_of_rooms
    )

    landing = await request_hotels_page(**search_args)
    collected = parse_search_hotels(landing.content)
    total = parse_search_total_results(landing.content)
    if max_results and max_results < total:
        total = max_results

    # One request per remaining page, all issued at once.
    pending = [
        request_hotels_page(offset=page_offset, **search_args)
        for page_offset in range(25, total, 25)
    ]
    for page in await asyncio.gather(*pending):
        collected.update(parse_search_hotels(page.content))
    return collected

In [62]:
def parse_hotel(html: str):
    """Parse a hotel page into a dict of its core details.

    Args:
        html: HTML source of the hotel page.

    Returns:
        Dict with ``title``, ``description``, ``address``, ``lat``, ``lng``
        (strings; ``lat``/``lng`` are "" when the map link carries no
        coordinates), ``features`` (facility group name -> list of facility
        strings) and ``id`` (value of ``b_hotel_id`` found in the page source).

    Raises:
        IndexError: If no ``b_hotel_id`` is present in ``html``.
    """
    sel = Selector(text=html)

    def css(selector, sep=""):
        # All matches joined with `sep`, surrounding whitespace trimmed.
        return sep.join(sel.css(selector).getall()).strip()

    def css_first(selector):
        # First match, or "" when nothing matches.
        return sel.css(selector).get("")

    # partition() tolerates a missing attribute ("" -> "", ""), where unpacking
    # the result of split(",") would abort the whole parse with a ValueError.
    lat, _, lng = css_first(".show_map_hp_link::attr(data-atlas-latlng)").partition(",")

    features = {}
    for feat_box in sel.css("[data-capla-component*=FacilitiesBlock]>div>div>div"):
        # The group title is the text next to the facility-group icon; it is
        # None when the box has no such icon.
        type_ = feat_box.xpath('.//span[contains(@data-testid, "facility-group-icon")]/../text()').get()
        features[type_] = [f.strip() for f in feat_box.css("li ::text").getall() if f.strip()]

    id_match = re.search(r"b_hotel_id:\s*'(.+?)'", html)
    if id_match is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that says what is missing.
        raise IndexError("b_hotel_id not found in hotel page HTML")

    return {
        "title": css("h2#hp_hotel_name::text"),
        "description": css("div#property_description_content ::text", "\n"),
        "address": css(".hp_address_subtitle::text"),
        "lat": lat,
        "lng": lng,
        "features": features,
        "id": id_match.group(1),
    }

In [63]:
async def scrape_prices(hotel_id, csrf_token, hotel_url, start_date, duration, days_to_check):
    """Fetch the price histogram for consecutive check-in dates, one request per date.

    For each of ``days_to_check`` days starting at ``start_date``
    (``%Y-%m-%d``), POSTs a ``hotel.availability_calendar`` request to
    booking.com's ``fragment.json`` endpoint with ``n_days`` and ``los`` both
    set to ``duration``, then waits one second before the next request.

    Args:
        hotel_id: Hotel id sent in the form data.
        csrf_token: Value for the ``X-Booking-CSRF`` request header.
        hotel_url: Hotel page URL; its last path segment (up to the first
            ".") is used as the Scrapfly session name.
        start_date: First check-in date, ``%Y-%m-%d``.
        duration: Stay length sent as ``n_days`` and ``los``.
        days_to_check: Number of consecutive check-in dates to query.

    Returns:
        Dict mapping each check-in date (``YYYY-MM-DD``) to the ``data``
        member of the JSON response.
    """
    first_checkin = datetime.strptime(start_date, "%Y-%m-%d")
    # The occupancy settings never change between requests, so serialise once.
    occupancy = json.dumps(
        {
            "b_adults_total": 2,
            "b_nr_rooms_needed": 1,
            "b_children_total": 0,
            "b_children_ages_total": [],
            "b_is_group_search": 0,
            "b_pets_total": 0,
            "b_rooms": [{"b_adults": 2, "b_room_order": 1}],
        }
    )

    calendar = {}
    for day_offset in range(days_to_check):
        checkin = (first_checkin + timedelta(days=day_offset)).strftime("%Y-%m-%d")
        form = {
            "name": "hotel.availability_calendar",
            "result_format": "price_histogram",
            "hotel_id": hotel_id,
            "search_config": occupancy,
            "checkin": checkin,
            "n_days": duration,
            "respect_min_los_restriction": 1,
            "los": duration,
        }
        session_name = hotel_url.split("/")[-1].split(".")[0]
        request = ScrapeConfig(
            url="https://www.booking.com/fragment.json?cur_currency=usd",
            method="POST",
            data=form,
            headers={"X-Booking-CSRF": csrf_token},
            session=session_name,
            country="US",
        )
        response = await scrapfly.async_scrape(request)
        calendar[checkin] = json.loads(response.content)["data"]

        # Pause between requests so they are not all fired back to back.
        await asyncio.sleep(1)

    return calendar

In [64]:
async def scrape_hotel_availability(url: str, start_date: str, durations: List[int], days_to_check: int):
    """Scrape one hotel page plus its price data for several stay lengths.

    Args:
        url: Hotel page URL; its last path segment (up to the first ".") is
            used as the Scrapfly session name.
        start_date: First check-in date, forwarded to ``scrape_prices``.
        durations: Stay lengths to query, processed one after another.
        days_to_check: Number of check-in dates per stay length.

    Returns:
        The ``parse_hotel`` dict extended with ``url`` (taken from the scrape
        result's context) and ``availability`` (stay length -> result of
        ``scrape_prices``).

    Raises:
        IndexError: If the page source contains no ``b_csrf_token``.
    """
    session_name = url.split("/")[-1].split(".")[0]
    page = await scrapfly.async_scrape(ScrapeConfig(url, session=session_name, country="US"))

    hotel = parse_hotel(page.content)
    hotel["url"] = page.context['url']
    # The price requests must carry the CSRF token embedded in the page source.
    token = re.findall(r"b_csrf_token:\s*'(.+?)'", page.content)[0]

    availability = {}
    hotel["availability"] = availability
    for stay_length in durations:
        availability[stay_length] = await scrape_prices(
            csrf_token=token,
            hotel_id=hotel["id"],
            hotel_url=url,
            start_date=start_date,
            duration=stay_length,
            days_to_check=days_to_check,
        )

    return hotel

In [65]:
async def run_hotel_scraper(hotel_urls: List[str], start_date: str, durations: List[int], days_to_check: int):
    """Scrape availability for every hotel and save it as one JSON file.

    Hotels are processed one at a time, with a 5 second pause between
    consecutive hotels. The combined list is written to
    ``results/hotel_availability.json`` under the current working directory
    once every hotel has been scraped.

    Args:
        hotel_urls: Hotel page URLs to scrape, in order.
        start_date: First check-in date, forwarded to ``scrape_hotel_availability``.
        durations: Stay lengths, forwarded to ``scrape_hotel_availability``.
        days_to_check: Number of check-in dates, forwarded likewise.
    """
    out = Path(os.getcwd()) / "results"
    out.mkdir(exist_ok=True)

    results = []
    for position, url in enumerate(hotel_urls):
        if position:
            # Pause between hotels to avoid being too aggressive; there is
            # nothing to wait for before the first or after the last one.
            await asyncio.sleep(5)
        results.append(await scrape_hotel_availability(url, start_date, durations, days_to_check))

    # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
    # encoding is pinned instead of relying on the platform default.
    out.joinpath("hotel_availability.json").write_text(
        json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8"
    )


In [66]:
async def scrape_hotels(urls: List[str], price_start_dt: str, price_n_days=30):
    """Scrape several hotel pages concurrently, each with one price calendar.

    Args:
        urls: Hotel page URLs.
        price_start_dt: Sent unchanged as the ``checkin`` value of the price request.
        price_n_days: Sent as the ``n_days`` value of the price request.

    Returns:
        List with one dict per URL, in input order: the ``parse_hotel`` result
        extended with ``url`` (from the scrape result's context) and ``price``
        (the ``data`` member of the price response).
    """

    def session_for(hotel_url: str):
        # Last path segment up to its first ".", used as the Scrapfly session name.
        return hotel_url.split("/")[-1].split(".")[0]

    async def fetch_price_calendar(hotel_id, csrf_token, hotel_url):
        # Single POST for a one-night price histogram starting at price_start_dt.
        form = {
            "name": "hotel.availability_calendar",
            "result_format": "price_histogram",
            "hotel_id": hotel_id,
            "search_config": json.dumps(
                {
                    # we can adjust pricing configuration here but this is the default
                    "b_adults_total": 2,
                    "b_nr_rooms_needed": 1,
                    "b_children_total": 0,
                    "b_children_ages_total": [],
                    "b_is_group_search": 0,
                    "b_pets_total": 0,
                    "b_rooms": [{"b_adults": 2, "b_room_order": 1}],
                }
            ),
            "checkin": price_start_dt,
            "n_days": price_n_days,
            "respect_min_los_restriction": 1,
            "los": 1,
        }
        request = ScrapeConfig(
            url="https://www.booking.com/fragment.json?cur_currency=usd",
            method="POST",
            data=form,
            headers={"X-Booking-CSRF": csrf_token},
            session=session_for(hotel_url),
            country="US",
        )
        response = await scrapfly.async_scrape(request)
        return json.loads(response.content)["data"]

    async def scrape_one(url: str):
        # Hotel page first (it carries the CSRF token), then its prices.
        page = await scrapfly.async_scrape(ScrapeConfig(url, session=session_for(url), country="US"))
        hotel = parse_hotel(page.content)
        hotel["url"] = page.context['url']
        token = re.findall(r"b_csrf_token:\s*'(.+?)'", page.content)[0]
        hotel["price"] = await fetch_price_calendar(csrf_token=token, hotel_id=hotel["id"], hotel_url=url)
        return hotel

    return await asyncio.gather(*[scrape_one(url) for url in urls])

In [67]:
# Hotels whose availability should be scraped.
hotel_urls = [
    "https://www.booking.com/hotel/mx/century-zona-rosa.es.html",
    "https://www.booking.com/hotel/mx/bristol.es.html",
    # Add more hotel URLs as needed
]

# First check-in date, zero-padded YYYY-MM-DD so it matches the "%Y-%m-%d"
# format used by the scraper and is also valid when sent to the site unparsed.
start_date = "2024-08-08"
# Stay lengths (nights) to price for every check-in date.
durations = [1, 3, 7]
# Number of consecutive check-in dates to query, starting at start_date.
days_to_check = 90

In [68]:
# Run the full scrape for the configured hotels; the results are written to
# results/hotel_availability.json only after every hotel has finished.
# NOTE(review): the recorded output of this cell shows the fragment.json price
# request being rejected with 400 Bad Request - confirm the endpoint and
# payload are still accepted by the site before relying on a re-run.
await run_hotel_scraper(hotel_urls, start_date, durations, days_to_check)

CRITICAL:root:<-- 200 | ERR::SCRAPE::BAD_UPSTREAM_RESPONSE - The website you target respond with an unexpected status code (>400) - The scrapped url: https://www.booking.com/fragment.json?cur_currency=usd respond with 400 - Bad Request: . Checkout the related doc: https://scrapfly.io/docs/scrape-api/error/ERR::SCRAPE::BAD_UPSTREAM_RESPONSE


UpstreamHttpClientError: Target website responded with 400 - Bad Request