# Expedia Debug Notebook

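Step-by-step checks for `src/sites/expedia.py`: import the module, fetch the score for each configured hotel, inspect the score-extraction candidates, save the raw HTML of two hotel pages for comparison, and finally run the module as a script.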

In [1]:
from pathlib import Path
import sys
import os

# Resolve the project root: use the working directory unless it lacks a
# `src` folder and its parent has one (e.g. when launched from notebooks/).
here = Path.cwd()
use_parent = not (here / 'src').exists() and (here.parent / 'src').exists()
ROOT = here.parent if use_parent else here

# Make `src` importable as a package by putting the root first on sys.path.
root_str = str(ROOT)
if root_str not in sys.path:
    sys.path.insert(0, root_str)

print('Project root:', ROOT)


Project root: /Users/laurabquintas/Documents/GitHub/reputation-analyzer


In [2]:
from src.sites import expedia as site

In [3]:
# Display the hotel-name -> Expedia URL mapping exposed by the module.
site.EXPEDIA_URLS

{'Ananea Castelo Suites Hotel': 'https://euro.expedia.net/Albufeira-Hotels-Castelo-Suites-Hotel.h111521689.Hotel-Information?pwaDialog=product-reviews',
 'PortoBay Falésia': 'https://euro.expedia.net/Albufeira-Hotels-PortoBay-Falesia.h1787641.Hotel-Information?pwaDialog=product-reviews',
 'Regency Salgados Hotel & Spa': 'https://euro.expedia.net/Albufeira-Hotels-Regency-Salgados-Hotel-Spa.h67650702.Hotel-Information?pwaDialog=product-reviews',
 'NAU São Rafael Atlântico': 'https://euro.expedia.net/Albufeira-Hotels-Sao-Rafael-Suite-Hotel.h1210300.Hotel-Information?pwaDialogNested=PropertyDetailsReviewsBreakdownDialog',
 'NAU Salgados Dunas Suites': '',
 'Vidamar Resort Hotel Algarve': 'https://euro.expedia.net/Albufeira-Hotels-VidaMar-Resort-Hotel-Algarve.h5670748.Hotel-Information?pwaDialog=product-reviews'}

In [4]:
# Smoke-test the scraper on the first configured hotel only.
hotel = next(iter(site.EXPEDIA_URLS))
url = site.EXPEDIA_URLS[hotel]
print('Testing:', hotel)

score = site.get_expedia_score(url, timeout=20, retries=1)
print('Score:', score)


Testing: Ananea Castelo Suites Hotel
Score: 8.8


In [5]:
# Collect one entry per hotel: the fetched score, or an 'ERROR: ...' string
# when the call raises, so a single failure does not abort the whole sweep.
results = {}
for hotel, url in site.EXPEDIA_URLS.items():
    try:
        outcome = site.get_expedia_score(url, timeout=20, retries=1)
    except Exception as exc:
        outcome = f'ERROR: {exc}'
    results[hotel] = outcome
results


{'Ananea Castelo Suites Hotel': 8.8,
 'PortoBay Falésia': 9.2,
 'Regency Salgados Hotel & Spa': 9.2,
 'NAU São Rafael Atlântico': 8.4,
 'NAU Salgados Dunas Suites': None,
 'Vidamar Resort Hotel Algarve': 8.6}

In [6]:
# Look up the Vidamar page URL and run the module's candidate-score
# diagnostics on it; the returned value is displayed as the cell output.
vidamar_url = site.EXPEDIA_URLS['Vidamar Resort Hotel Algarve']
site.debug_expedia_score_candidates(vidamar_url, timeout=20, retries=1)


{'fetch_ok': True,
 'jsonld_score': None,
 'semantic_div_score': None,
 'textual_score': None,
 'embedded_json_score': 8.6,
 'contains_8_6': True}

In [7]:
# Download the raw page source of two hotels so they can be compared
vidamar_url = site.EXPEDIA_URLS['Vidamar Resort Hotel Algarve']
ananea_url = site.EXPEDIA_URLS['Ananea Castelo Suites Hotel']

fetch_opts = {'timeout': 30, 'retries': 1}
vidamar_html = site.fetch_page(vidamar_url, **fetch_opts)
ananea_html = site.fetch_page(ananea_url, **fetch_opts)

# Report fetch success and page size; a missing page counts as length 0.
for label, html in (('Vidamar fetched:', vidamar_html), ('Ananea fetched :', ananea_html)):
    print(label, html is not None, 'len=', len(html or ''))


Vidamar fetched: True len= 1053762
Ananea fetched : True len= 1045102


In [8]:
# Persist the fetched pages under notebooks/fetched_html for offline inspection
out_dir = ROOT / 'notebooks' / 'fetched_html'
out_dir.mkdir(parents=True, exist_ok=True)

vidamar_path = out_dir / 'expedia_vidamar.html'
ananea_path = out_dir / 'expedia_ananea.html'

# Only pages that were actually fetched are written; a None page is skipped.
for target, page in ((vidamar_path, vidamar_html), (ananea_path, ananea_html)):
    if page is None:
        continue
    target.write_text(page, encoding='utf-8')

print('Vidamar file:', vidamar_path)
print('Ananea file :', ananea_path)


Vidamar file: /Users/laurabquintas/Documents/GitHub/reputation-analyzer/notebooks/fetched_html/expedia_vidamar.html
Ananea file : /Users/laurabquintas/Documents/GitHub/reputation-analyzer/notebooks/fetched_html/expedia_ananea.html


In [None]:
# Quick compare snippets around score markers
import re

def show_markers(label, html, patterns=None):
    """Print whether each score-related marker occurs in ``html``, with context.

    For every regex in ``patterns`` the function prints a True/False hit
    (case-insensitive, first match only) and, on a hit, a one-line excerpt
    of the text around the match.

    Args:
        label: Heading printed above the report (e.g. the hotel name).
        html: Page source to scan. ``None`` or an empty string prints
            ``"<label>: no html"`` and returns immediately.
        patterns: Optional iterable of regex strings to search for. When
            omitted, the original marker set is used; it contains the
            literal score ``8.6``, so pass a custom list to probe a page
            for a different score.

    Returns:
        None. Everything is written to stdout.
    """
    if patterns is None:
        patterns = (
            r'8\.6', r'out\s+of\s+10', r'/10',
            r'aggregateRating', r'reviewScore', r'guestRating',
        )

    if not html:
        print(f'\n{label}: no html')
        return

    chars_before, chars_after = 160, 220  # context kept on each side of a hit

    print(f'\n{label}')
    for pat in patterns:
        m = re.search(pat, html, flags=re.IGNORECASE)
        print(f'  {pat}:', bool(m))
        if m:
            i = m.start()
            # The window is at most chars_before + chars_after characters, so
            # no further truncation is needed; newlines are flattened to keep
            # the excerpt on a single output line.
            snippet = html[max(0, i - chars_before):i + chars_after]
            print('   ...', snippet.replace('\n', ' '))

# Print the marker report for each fetched page, Vidamar first.
for page_label, page_html in (('Vidamar', vidamar_html), ('Ananea', ananea_html)):
    show_markers(page_label, page_html)


In [None]:
from datetime import datetime
import subprocess

# Run the scraper module as a script with the current interpreter, passing
# today's date (YYYY-MM-DD) via --date.
expedia_script = ROOT / 'src' / 'sites' / 'expedia.py'
date_col = datetime.now().strftime('%Y-%m-%d')

cmd = [sys.executable, str(expedia_script), '--date', date_col]
print('Running:', ' '.join(cmd))

# check=False: a non-zero exit does not raise; the CompletedProcess is shown
# as the cell output instead.
subprocess.run(cmd, check=False)
