In [1]:
from __future__ import annotations

import os
import re
import argparse
import random
from time import sleep
from datetime import datetime
from typing import Dict, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
# ---------------------- Default configuration ---------------------- #

# Output file and its field delimiter (semicolon-separated CSV).
DEFAULT_CSV = "expedia_scores.csv"
DEFAULT_SEP = ";"

# HTTP settings: request timeout, bounds of the random pause, retry count.
DEFAULT_TIMEOUT = 20
DEFAULT_MIN_DELAY = 2.0
DEFAULT_MAX_DELAY = 5.0
DEFAULT_RETRIES = 2

# Matches date strings of the form YYYY-MM-DD.
DATE_COL_RE = re.compile(r"\d{4}-\d{2}-\d{2}")

# Hotel display name -> Expedia URL.
EXPEDIA_URLS: Dict[str, str] = {
    "Ananea Castelo Suites Hotel": (
        "https://euro.expedia.net/Albufeira-Hotels-Castelo-Suites-Hotel.h111521689.Hotel-Information?pwaDialog=product-reviews"
    ),
    "PortoBay Falésia": (
        "https://euro.expedia.net/Albufeira-Hotels-PortoBay-Falesia.h1787641.Hotel-Information?pwaDialog=product-reviews"
    ),
    "Regency Salgados Hotel & Spa": (
        "https://euro.expedia.net/Albufeira-Hotels-Regency-Salgados-Hotel-Spa.h67650702.Hotel-Information?pwaDialog=product-reviews"
    ),
    "NAU São Rafael Atlântico": (
        "https://euro.expedia.net/Albufeira-Hotels-Sao-Rafael-Suite-Hotel.h1210300.Hotel-Information?pwaDialog=summary-reviews-1210300"
    ),
    # No URL recorded for this hotel yet.
    "NAU Salgados Dunas Suites": "",
    # NOTE(review): this key ends with a trailing space; kept as-is because the
    # key is the hotel's label — confirm whether the space is intentional.
    "Vidamar Resort Hotel Algarve ": (
        "https://euro.expedia.net/Albufeira-Hotels-VidaMar-Resort-Hotel-Algarve.h5670748.Hotel-Information?pwaDialog=summary-reviews-5670748"
    ),
}


In [4]:
# User-Agent string of a desktop Safari browser, so requests look like they
# come from a real browser; adjust if needed.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/17.0 Safari/605.1.15"
)

# Headers sent with every page request.
HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9",
}


In [5]:
def fetch_page(url: str, timeout: int, retries: int, retry_delay: float = 1.5) -> Optional[str]:
    """
    Fetch the HTML for a given Expedia hotel URL with simple retry logic.

    Parameters
    ----------
    url : str
        Page to download. An empty URL is treated as "nothing to fetch".
    timeout : int
        Passed to ``requests.get`` as its ``timeout`` argument.
    retries : int
        Number of additional attempts after the first one fails. Negative
        values are treated as 0, so one attempt is always made.
    retry_delay : float, optional
        Pause between consecutive attempts, handed to ``sleep`` (default 1.5).

    Returns
    -------
    str or None
        The response text on success; None if the URL is empty or every
        attempt failed (the last error is printed in that case).
    """
    if not url:
        return None

    # Clamp so that a negative retry count cannot skip the request altogether.
    attempts = max(retries, 0) + 1
    last_error: Optional[Exception] = None

    for attempt in range(attempts):
        if attempt:
            # Only pause between attempts, never before the first one.
            sleep(retry_delay)
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()  # treat HTTP error statuses as failures too
            return resp.text
        except Exception as exc:
            # Deliberately broad: this is a best-effort scraper, and a failed
            # page must yield None rather than abort the whole run.
            last_error = exc

    print(f"   ERROR: failed to fetch after {attempts} attempts: {last_error}")
    return None


In [7]:

# ---------------------------- CSV logic ---------------------------- #

def ensure_csv(csv_path: str, sep: str, hotels: list[str]) -> pd.DataFrame:
    """
    Create or load the scores CSV and return it as a DataFrame.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file. If it does not exist it is created.
    sep : str
        Field delimiter used for both reading and writing.
    hotels : list[str]
        Hotel names that must be present in the 'Hotel' index.

    Returns
    -------
    pd.DataFrame
        Frame indexed by 'Hotel' that contains every name in ``hotels`` and
        an 'Average Score' column. Rows/columns added here are empty (NA).
        Additions made to an already existing file are NOT written back to
        disk by this function.
    """
    if not os.path.exists(csv_path):
        print(f"Creating {csv_path} …")
        df = pd.DataFrame(index=hotels)
        df.index.name = "Hotel"
        df["Average Score"] = pd.NA
        df.to_csv(csv_path, sep=sep, index_label="Hotel")
        return df

    df = pd.read_csv(csv_path, sep=sep, index_col="Hotel")

    # Add all missing hotels in one step. Row-by-row `df.loc[h] = Series`
    # raises "cannot set a frame with no defined columns" when the file holds
    # only the Hotel column; reindex works regardless of the column count.
    # dict.fromkeys drops repeated names while keeping their order.
    missing = [h for h in dict.fromkeys(hotels) if h not in df.index]
    if missing:
        # reindex needs unique existing labels (one row per hotel).
        df = df.reindex(list(df.index) + missing)
        df.index.name = "Hotel"

    if "Average Score" not in df.columns:
        df["Average Score"] = pd.NA

    return df

In [8]:

def update_average(df: pd.DataFrame) -> None:
    """
    Refresh the 'Average Score' column in place.

    The average is the row-wise mean (rounded to two decimals) over every
    column whose name is a string fully matching DATE_COL_RE, i.e. looks like
    YYYY-MM-DD. All other columns are ignored. If no such column exists the
    frame is left untouched.
    """
    dated_columns = []
    for column in df.columns:
        if isinstance(column, str) and DATE_COL_RE.fullmatch(column):
            dated_columns.append(column)

    if not dated_columns:
        return

    row_means = df[dated_columns].mean(axis=1, numeric_only=True)
    df["Average Score"] = row_means.round(2)



In [None]:

def get_expedia_score(url: str, timeout: int = DEFAULT_TIMEOUT, retries: int = DEFAULT_RETRIES) -> Optional[float]:
    """
    Fetch the overall Expedia score (0–10 scale) from a hotel page.

    The page is downloaded with ``fetch_page``, reduced to its visible text
    and searched for the first phrase of the form "<number> out of 10".

    This is heuristic; you may need to adjust the regex after inspecting the
    HTML of a few hotel pages.

    Parameters
    ----------
    url : str
        Hotel page URL, forwarded to ``fetch_page``.
    timeout : int
        Forwarded to ``fetch_page``.
    retries : int
        Forwarded to ``fetch_page``.

    Returns
    -------
    float or None
        Score if found, else None (also None when the page could not be
        fetched).
    """
    html = fetch_page(url, timeout=timeout, retries=retries)
    if html is None:
        return None

    page_text = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)

    # Common Expedia pattern is like "8.8 out of 10". The (?!\d) lookahead
    # keeps phrases such as "8 out of 100 ..." from being read as "8 out of 10".
    match = re.search(r"(\d+(?:\.\d+)?)\s+out of\s+10(?!\d)", page_text)
    if match is None:
        # If this fails, inspect the page and refine the regex above.
        return None

    # The captured group is digits with an optional fractional part, which
    # float() always accepts.
    return float(match.group(1))


In [17]:
html = get_expedia_score(
            EXPEDIA_URLS["Ananea Castelo Suites Hotel"],
            timeout=DEFAULT_TIMEOUT,
            retries=DEFAULT_RETRIES,
        )

In [18]:
html

8.8

In [10]:
# NOTE(review): in the visible notebook `score` is only assigned inside the
# scraping loop of a later cell, so on a fresh kernel (Restart & Run All) this
# cell would raise NameError — confirm whether this leftover cell can go.
score

In [None]:






# ------------------------ Scraper logic ---------------------------- #

# Scrape every configured hotel once and collect today's scores.
hotels = list(EXPEDIA_URLS.keys())
df = ensure_csv(DEFAULT_CSV, sep=DEFAULT_SEP, hotels=hotels)

# Today's date as YYYY-MM-DD, the column-name format DATE_COL_RE recognises.
today_col = datetime.now().strftime("%Y-%m-%d")
new_scores: dict[str, Optional[float]] = {}

print(f"Writing Expedia scores into column: {today_col}\n")

for i, (hotel, url) in enumerate(EXPEDIA_URLS.items(), start=1):
    print(f"{i:02d}/{len(EXPEDIA_URLS)} → {hotel}")
    score = get_expedia_score(
        url,
        timeout=DEFAULT_TIMEOUT,
        retries=DEFAULT_RETRIES,
    )
    new_scores[hotel] = score

    if score is not None:
        # get_expedia_score parses "N out of 10", so the scale is 10, not 5.
        print(f"   {score}/10")
    else:
        print("   (no score)")

    # Random pause between hotels so requests are not fired back to back.
    delay = random.uniform(DEFAULT_MIN_DELAY, DEFAULT_MAX_DELAY)
    sleep(delay)

# NOTE(review): the scores are only collected in `new_scores` here; nothing in
# this cell writes them into `df[today_col]`, refreshes the average or saves
# the CSV — confirm whether that step lives elsewhere or still has to be added.


