In [2]:
# Vérifier les sitemaps autorisés
import requests
from bs4 import BeautifulSoup

sitemap_url = "https://www.booking.com/sitembk-hotel-review-index.xml"
response = requests.get(sitemap_url, headers={'User-Agent': 'Mozilla/5.0'})

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'xml')
    urls = [loc.text for loc in soup.find_all('loc')]
    print(f"Nombre d'URLs de commentaires trouvées: {len(urls)}")
else:
    print("Accès refusé au sitemap des commentaires")

Nombre d'URLs de commentaires trouvées: 167


In [4]:
import requests
from bs4 import BeautifulSoup
import time

# Liste pour stocker les commentaires
all_comments = []

# Parcourir les URLs (exemple sur les 5 premières pour test)
for url in urls[:5]:  # Limitez-vous pour ne pas surcharger le serveur
    try:
        print(f"Extraction depuis: {url}")
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Exemple d'extraction (à adapter au HTML réel)
        comments = soup.find_all('div', class_='review_item')  # Classe probable
        for comment in comments:
            text = comment.find('div', class_='review-body').get_text(strip=True)
            all_comments.append(text)
        
        time.sleep(10)  # Délai critique pour éviter le blocage
    except Exception as e:
        print(f"Erreur sur {url}: {str(e)}")

print(f"Total de commentaires extraits: {len(all_comments)}")

Extraction depuis: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz
Extraction depuis: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz
Extraction depuis: https://www.booking.com/sitembk-hotel-review-en-gb.0021.xml.gz
Extraction depuis: https://www.booking.com/sitembk-hotel-review-en-gb.0020.xml.gz
Extraction depuis: https://www.booking.com/sitembk-hotel-review-en-gb.0019.xml.gz
Total de commentaires extraits: 0


In [6]:
import requests
import gzip
from io import BytesIO
from bs4 import BeautifulSoup
import time

# Liste pour stocker les URLs de commentaires finales
review_urls = []

# Parcourir les URLs des sitemaps .gz
sitemap_urls = [
    "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
    "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz"
]

for sitemap_url in sitemap_urls:
    try:
        print(f"Traitement de : {sitemap_url}")
        
        # Télécharger et décompresser le .gz
        response = requests.get(sitemap_url, headers={'User-Agent': 'Mozilla/5.0'})
        compressed_file = BytesIO(response.content)
        with gzip.open(compressed_file, 'rb') as f:
            xml_content = f.read()
        
        # Parser le XML pour trouver les URLs
        soup = BeautifulSoup(xml_content, 'xml')  # Notez 'xml' et non 'html.parser'
        urls = [loc.text for loc in soup.find_all('loc')]
        
        # Filtrer seulement les URLs de commentaires
        for url in urls:
            if '/reviews/' in url:  # Exemple de filtre
                review_urls.append(url)
        
        time.sleep(10)  # Respecter les délais
        
    except Exception as e:
        print(f"Erreur sur {sitemap_url}: {str(e)}")

print(f"URLs de commentaires trouvées: {len(review_urls)}")

Traitement de : https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz
Traitement de : https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz
URLs de commentaires trouvées: 70353


In [10]:
import os
import gzip
import requests
from io import BytesIO
from bs4 import BeautifulSoup

# Méthode 2 : Récupération directe depuis les sitemaps (sans fichier requis)
def get_review_urls_from_sitemap(sitemap_url):
    try:
        print(f"Téléchargement du sitemap: {sitemap_url}")
        response = requests.get(sitemap_url, 
                             headers={'User-Agent': 'Mozilla/5.0'},
                             timeout=10)
        response.raise_for_status()
        
        compressed_file = BytesIO(response.content)
        with gzip.open(compressed_file, 'rb') as f:
            xml_content = f.read().decode('utf-8')
        
        soup = BeautifulSoup(xml_content, 'xml')
        urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
        print(f"Trouvé {len(urls)} URLs dans ce sitemap")
        return urls
        
    except Exception as e:
        print(f"Erreur avec {sitemap_url}: {str(e)}")
        return []

# Liste des sitemaps à traiter
sitemaps = [
    "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
    "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
    # Ajoutez d'autres sitemaps ici
]

# Récupération de toutes les URLs
review_urls = []
for sitemap in sitemaps:
    review_urls.extend(get_review_urls_from_sitemap(sitemap))

# Sauvegarde dans un fichier pour usage futur
if review_urls:
    with open('urls_commentaires.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(review_urls))
    print(f"\nTotal URLs trouvées: {len(review_urls)}")
    print("Sauvegardé dans urls_commentaires.txt")
else:
    print("Aucune URL trouvée - vérifiez les sitemaps")

Téléchargement du sitemap: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz
Trouvé 49366 URLs dans ce sitemap
Téléchargement du sitemap: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz
Trouvé 20987 URLs dans ce sitemap

Total URLs trouvées: 70353
Sauvegardé dans urls_commentaires.txt


In [18]:
import requests
import gzip
from io import BytesIO
from bs4 import BeautifulSoup
import time
import random

# 1. Trouver les sitemaps de commentaires (méthode corrigée)
def find_review_sitemaps():
    # Ces patterns couvrent la plupart des formats de sitemap Booking
    patterns = [
        "sitembk-hotel-review-",
        "sitembk-reviews-",
        "review-index"
    ]
    
    # URL de base des sitemaps
    sitemap_index = "https://www.booking.com/sitembk-index.xml"
    
    try:
        response = requests.get(sitemap_index, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        soup = BeautifulSoup(response.content, 'xml')
        
        review_sitemaps = []
        for sitemap in soup.find_all('sitemap'):
            loc = sitemap.find('loc').text
            if any(pattern in loc for pattern in patterns):
                review_sitemaps.append(loc)
        
        # Fallback si la méthode automatique échoue
        if not review_sitemaps:
            print("Méthode automatique échouée, utilisation des sitemaps connus")
            return [
                "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
                "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
                "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz"
            ]
        
        return review_sitemaps
    
    except Exception as e:
        print(f"Erreur lors de la recherche des sitemaps: {e}")
        return []

# 2. Extraire les URLs de commentaires
def extract_review_urls(sitemap_url):
    try:
        print(f"Traitement de {sitemap_url}")
        response = requests.get(sitemap_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        
        if sitemap_url.endswith('.gz'):
            compressed_file = BytesIO(response.content)
            with gzip.open(compressed_file, 'rb') as f:
                content = f.read().decode('utf-8')
        else:
            content = response.text
            
        soup = BeautifulSoup(content, 'xml')
        return [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
        
    except Exception as e:
        print(f"Erreur avec {sitemap_url}: {e}")
        return []

# 3. Exécution principale
def main():
    # Étape 1: Trouver les sitemaps
    sitemaps = find_review_sitemaps()
    print(f"Sitemaps trouvés: {len(sitemaps)}")
    
    # Étape 2: Extraire toutes les URLs
    all_review_urls = []
    for sitemap in sitemaps:
        urls = extract_review_urls(sitemap)
        print(f"→ {len(urls)} URLs extraites")
        all_review_urls.extend(urls)
        time.sleep(random.uniform(2, 5))  # Délai de politesse
    
    # Étape 3: Sauvegarde
    if all_review_urls:
        with open('urls_commentaires.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(all_review_urls))
        print(f"\nRésultat final: {len(all_review_urls)} URLs sauvegardées")
    else:
        print("Aucune URL trouvée")

if __name__ == "__main__":
    main()

Méthode automatique échouée, utilisation des sitemaps connus
Sitemaps trouvés: 3
Traitement de https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz
→ 49366 URLs extraites
Traitement de https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz
→ 20987 URLs extraites
Traitement de https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz
→ 49990 URLs extraites

Résultat final: 120343 URLs sauvegardées


In [30]:
!pip install requests beautifulsoup4



In [32]:
import requests
import gzip
from io import BytesIO
from bs4 import BeautifulSoup
import time
import random
import json
from urllib.parse import urlparse

# Configuration
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9'
}
REQUEST_DELAY = (3, 7)  # Délai aléatoire entre requêtes

# 1. Récupération des Sitemaps
def get_review_sitemaps():
    KNOWN_SITEMAPS = [
        "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
        "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
        "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
        "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
        "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
    ]
    
    try:
        # Essai de détection automatique
        sitemap_index = "https://www.booking.com/sitembk-index.xml"
        response = requests.get(sitemap_index, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'xml')
        
        auto_sitemaps = []
        for sitemap in soup.find_all('sitemap'):
            loc = sitemap.find('loc').text
            if 'hotel-review' in loc or 'reviews' in loc:
                auto_sitemaps.append(loc)
        
        return auto_sitemaps if auto_sitemaps else KNOWN_SITEMAPS
    
    except Exception as e:
        print(f"Detection automatique échouée: {str(e)}")
        return KNOWN_SITEMAPS

# 2. Extraction des URLs de commentaires
def extract_review_urls(sitemap_url):
    try:
        print(f"Traitement de: {sitemap_url}")
        
        response = requests.get(sitemap_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        
        # Gestion des fichiers .gz
        if sitemap_url.endswith('.gz'):
            with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
                content = f.read().decode('utf-8')
        else:
            content = response.text
        
        soup = BeautifulSoup(content, 'xml')
        urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
        
        print(f"→ {len(urls)} URLs trouvées")
        return urls
    
    except Exception as e:
        print(f"Erreur avec {sitemap_url}: {str(e)}")
        return []

# 3. Extraction des commentaires
def scrape_review_page(url):
    try:
        time.sleep(random.uniform(*REQUEST_DELAY))
        
        response = requests.get(url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Sélecteurs actualisés (à vérifier périodiquement)
        hotel_name = soup.select_one('h2#hp_hotel_name').text.strip() if soup.select_one('h2#hp_hotel_name') else None
        reviews = []
        
        for review in soup.select('div[data-testid="review-card"]'):
            try:
                reviews.append({
                    'hotel': hotel_name,
                    'author': review.select_one('span.bui-avatar-block__title').text.strip(),
                    'rating': float(review.select_one('div.bui-review-score__badge').text.strip()),
                    'date': review.select_one('span.c-review-block__date').text.strip(),
                    'title': review.select_one('h3.c-review-block__title').text.strip(),
                    'content': review.select_one('div.c-review__body').text.strip(),
                    'url': url
                })
            except Exception as e:
                print(f"Erreur dans un commentaire: {str(e)}")
                continue
        
        return reviews
    
    except Exception as e:
        print(f"Erreur sur {url}: {str(e)}")
        return []

# 4. Exécution principale
def main():
    # Étape 1: Récupérer les sitemaps
    sitemaps = get_review_sitemaps()
    print(f"\n{len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraire toutes les URLs
    all_review_urls = []
    for sitemap in sitemaps:
        all_review_urls.extend(extract_review_urls(sitemap))
        time.sleep(random.uniform(2, 5))
    
    # Sauvegarde des URLs
    with open('booking_review_urls.json', 'w') as f:
        json.dump(all_review_urls, f)
    print(f"\nTotal URLs sauvegardées: {len(all_review_urls)}\n")
    
    # Étape 3: Extraction des commentaires (exemple sur 5 URLs)
    sample_urls = random.sample(all_review_urls, min(5, len(all_review_urls)))
    all_reviews = []
    
    for url in sample_urls:
        all_reviews.extend(scrape_review_page(url))
    
    # Sauvegarde des résultats
    with open('booking_reviews_sample.json', 'w', encoding='utf-8') as f:
        json.dump(all_reviews, f, ensure_ascii=False, indent=2)
    
    print("\nExtraction terminée. Résultats sauvegardés dans:")
    print("- booking_review_urls.json (toutes les URLs)")
    print("- booking_reviews_sample.json (échantillon de commentaires)")

if __name__ == "__main__":
    main()


5 sitemaps à traiter

Traitement de: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz
→ 49366 URLs trouvées
Traitement de: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz
→ 20987 URLs trouvées
Traitement de: https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz
→ 49990 URLs trouvées
Traitement de: https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz
→ 49990 URLs trouvées
Traitement de: https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz
→ 49990 URLs trouvées

Total URLs sauvegardées: 220323


Extraction terminée. Résultats sauvegardés dans:
- booking_review_urls.json (toutes les URLs)
- booking_reviews_sample.json (échantillon de commentaires)


In [34]:
import requests
import gzip
import json
import time
import random
from io import BytesIO
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import os

# Configuration avancée
CONFIG = {
    'user_agents': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ],
    'timeouts': {
        'sitemap': 30,
        'review_page': 45
    },
    'delays': {
        'min': 3,
        'max': 10
    },
    'max_workers': 3,
    'sample_size': 0.001  # 0.1% des URLs pour l'échantillon
}

# 1. Gestion des sitemaps
class SitemapManager:
    @staticmethod
    def get_sitemaps():
        PRIMARY_SITEMAPS = [
            "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
        ]
        
        try:
            index = requests.get(
                "https://www.booking.com/sitembk-index.xml",
                headers={'User-Agent': random.choice(CONFIG['user_agents'])},
                timeout=CONFIG['timeouts']['sitemap']
            )
            soup = BeautifulSoup(index.content, 'xml')
            discovered = [loc.text for loc in soup.find_all('loc') if 'review' in loc.text.lower()]
            return discovered if discovered else PRIMARY_SITEMAPS
        except Exception as e:
            print(f"⚠️ Échec détection sitemaps: {str(e)}")
            return PRIMARY_SITEMAPS

# 2. Extraction des URLs
class URLHarvester:
    @staticmethod
    def process_sitemap(sitemap_url):
        try:
            print(f"🔍 Traitement de: {sitemap_url[:60]}...")
            
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                sitemap_url,
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            
            if sitemap_url.endswith('.gz'):
                content = gzip.decompress(response.content).decode('utf-8')
            else:
                content = response.text
                
            soup = BeautifulSoup(content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
            
            print(f"✅ {len(urls):,} URLs extraites")
            return urls
            
        except Exception as e:
            print(f"❌ Erreur sur {sitemap_url[:50]}: {str(e)}")
            return []

# 3. Extraction des commentaires
class ReviewScraper:
    @staticmethod
    def scrape_page(url):
        try:
            time.sleep(random.uniform(CONFIG['delays']['min'], CONFIG['delays']['max']))
            
            headers = {
                'User-Agent': random.choice(CONFIG['user_agents']),
                'Accept-Language': 'en-US,en;q=0.5'
            }
            
            response = requests.get(
                url,
                headers=headers,
                timeout=CONFIG['timeouts']['review_page']
            )
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Métadonnées de l'hôtel
            hotel_data = {
                'name': soup.select_one('h2#hp_hotel_name').get_text(strip=True) if soup.select_one('h2#hp_hotel_name') else None,
                'address': soup.select_one('span.hp_address_subtitle').get_text(strip=True) if soup.select_one('span.hp_address_subtitle') else None,
                'rating': soup.select_one('div.bui-review-score__badge').get_text(strip=True) if soup.select_one('div.bui-review-score__badge') else None
            }
            
            # Extraction des commentaires
            reviews = []
            for review in soup.select('div[data-testid="review-card"]'):
                try:
                    reviews.append({
                        'author': review.select_one('span.bui-avatar-block__title').get_text(strip=True),
                        'rating': float(review.select_one('div.bui-review-score__badge').get_text(strip=True)),
                        'date': review.select_one('span.c-review-block__date').get_text(strip=True),
                        'title': review.select_one('h3.c-review-block__title').get_text(strip=True),
                        'content': review.select_one('div.c-review__body').get_text(strip=True),
                        'trip_type': review.select_one('span.c-review-block__type').get_text(strip=True) if review.select_one('span.c-review-block__type') else None,
                        'response': review.select_one('div.c-review-block__response__inner').get_text(strip=True) if review.select_one('div.c-review-block__response__inner') else None
                    })
                except Exception as e:
                    continue
            
            return {
                'hotel': hotel_data,
                'reviews': reviews,
                'url': url,
                'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
            }
            
        except Exception as e:
            print(f"⚠️ Échec sur {url[:50]}: {str(e)}")
            return None

# 4. Gestion des données
class DataManager:
    @staticmethod
    def save_data(data, filename):
        os.makedirs('data', exist_ok=True)
        path = f"data/{filename}"
        
        try:
            with open(path, 'w', encoding='utf-8') as f:
                if filename.endswith('.json'):
                    json.dump(data, f, ensure_ascii=False, indent=2)
                else:
                    f.write('\n'.join(data))
            print(f"💾 Données sauvegardées: {path}")
        except Exception as e:
            print(f"❌ Erreur sauvegarde {path}: {str(e)}")

# 5. Orchestrateur principal
def main():
    print("🚀 Démarrage de l'extraction Booking.com")
    
    # Étape 1: Récupération des sitemaps
    sitemaps = SitemapManager.get_sitemaps()
    print(f"\n📦 {len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraction des URLs (parallélisée)
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        all_urls = list(executor.map(URLHarvester.process_sitemap, sitemaps))
    
    flat_urls = [url for sublist in all_urls for url in sublist]
    print(f"\n🌍 Total URLs trouvées: {len(flat_urls):,}")
    
    # Sauvegarde des URLs
    DataManager.save_data(flat_urls, 'booking_urls.txt')
    
    # Étape 3: Extraction des commentaires (échantillon)
    sample_size = max(5, int(len(flat_urls) * CONFIG['sample_size']))
    sample_urls = random.sample(flat_urls, sample_size)
    
    print(f"\n🔎 Extraction de {len(sample_urls)} commentaires...")
    reviews = []
    
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        results = executor.map(ReviewScraper.scrape_page, sample_urls)
        reviews = [r for r in results if r is not None]
    
    # Sauvegarde des résultats
    DataManager.save_data(reviews, 'booking_reviews.json')
    
    # Statistiques
    total_reviews = sum(len(hotel['reviews']) for hotel in reviews)
    print(f"\n🎉 Extraction terminée! {total_reviews} commentaires analysés")
    print("📂 Fichiers créés:")
    print("- data/booking_urls.txt (toutes les URLs)")
    print("- data/booking_reviews.json (commentaires détaillés)")

if __name__ == "__main__":
    main()

🚀 Démarrage de l'extraction Booking.com

📦 5 sitemaps à traiter

🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml....
✅ 20,987 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz...
✅ 39,832 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz...
✅ 0 URLs extraites
✅ 49,974 URLs extraites
✅ 49,990 URLs extraites

🌍 Total URLs trouvées: 160,783
💾 Données sauvegardées: data/booking_urls.txt

🔎 Extraction de 160 commentaires...
💾 Données sauvegardées: data/booking_reviews.json

🎉 Extraction terminée! 0 commentaires analysés
📂 Fichiers créés:
- data/booking_urls.txt (toutes les URLs)
- data/booking_reviews.json (commentaires détaillés)


In [1]:
!pip install requests beautifulsoup4 pymongo websockets



In [31]:
import requests
import gzip
import json
import time
import random
from io import BytesIO
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser') 

session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(
    max_retries=3,
    pool_connections=100,
    pool_maxsize=100
)) 
# Configuration avancée
# Configuration avancée
CONFIG = {
    # Ajoutez dans CONFIG
    'proxies': [
        "http://proxy1:port",
        "http://proxy2:port"
    ],
    
    'user_agents': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ],
    'timeouts': {
        'sitemap': 60,
        'review_page': 90
    },
    'delays': {
        'min': 10,
        'max': 30
    },
    'max_retries': 5,  # Ajoutez cette option
    'max_workers': 3,
    'sample_size': 0.001  # 0.1% des URLs pour l'échantillon
}

# 1. Gestion des sitemaps
class SitemapManager:
    @staticmethod
    def get_sitemaps():
        PRIMARY_SITEMAPS = [
            "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
        ]
        
        try:
            index = requests.get(
                "https://www.booking.com/sitembk-index.xml",
                headers={'User-Agent': random.choice(CONFIG['user_agents'])},
                timeout=CONFIG['timeouts']['sitemap']
            )
            soup = BeautifulSoup(index.content, 'xml')
            discovered = [loc.text for loc in soup.find_all('loc') if 'review' in loc.text.lower()]
            return discovered if discovered else PRIMARY_SITEMAPS
        except Exception as e:
            print(f"⚠️ Échec détection sitemaps: {str(e)}")
            return PRIMARY_SITEMAPS

# 2. Extraction des URLs
class URLHarvester:
    @staticmethod
    def process_sitemap(sitemap_url):
        try:
            print(f"🔍 Traitement de: {sitemap_url[:60]}...")
            
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                sitemap_url,
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            
            if sitemap_url.endswith('.gz'):
                content = gzip.decompress(response.content).decode('utf-8')
            else:
                content = response.text
                
            soup = BeautifulSoup(content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
            
            print(f"✅ {len(urls):,} URLs extraites")
            return urls
            
        except Exception as e:
            print(f"❌ Erreur sur {sitemap_url[:50]}: {str(e)}")
            return []

# 3. Extraction des commentaires
class ReviewScraper:
    @staticmethod
    # Dans la classe ReviewScraper, remplacez la méthode scrape_page par ceci :

    def scrape_page(url):
        try:
            time.sleep(random.uniform(CONFIG['delays']['min'], CONFIG['delays']['max']))
            
            headers = {
                'User-Agent': random.choice(CONFIG['user_agents']),
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            }
            
            response = requests.get(
                url,
                headers=headers,
                timeout=CONFIG['timeouts']['review_page']
            )
            
            # Debug: Sauvegarder le HTML pour inspection
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Nouveaux sélecteurs testés (mai 2024)
            hotel_data = {
                'name': soup.find('h2', {'id': 'hp_hotel_name'}).get_text(strip=True) if soup.find('h2', {'id': 'hp_hotel_name'}) else None,
                'address': soup.find('span', {'class': 'hp_address_subtitle'}).get_text(strip=True) if soup.find('span', {'class': 'hp_address_subtitle'}) else None,
                'rating': soup.find('div', {'class': 'bui-review-score__badge'}).get_text(strip=True) if soup.find('div', {'class': 'bui-review-score__badge'}) else None
            }
            
            reviews = []
            
            # Méthode plus robuste pour trouver les commentaires
            review_blocks = soup.find_all('div', {'class': 'review_item'}) or \
                           soup.find_all('div', {'class': 'review_list_new_item'}) or \
                           soup.find_all('div', {'data-testid': 'review-card'})
            
            for review in review_blocks:
                try:
                    author_elem = review.find('span', class_='bui-avatar-block__title') or \
                                 review.find('span', class_='reviewer_name')
                    
                    rating_elem = review.find('div', class_='bui-review-score__badge') or \
                                 review.find('div', class_='review-score-badge')
                    
                    date_elem = review.find('span', class_='c-review-block__date') or \
                               review.find('span', class_='review_item_date')
                    
                    content_elem = review.find('div', class_='c-review__body') or \
                                  review.find('div', class_='review_item_review_content')
                    
                    reviews.append({
                        'author': author_elem.get_text(strip=True) if author_elem else 'Anonyme',
                        'rating': float(rating_elem.get_text(strip=True)) if rating_elem else None,
                        'date': date_elem.get_text(strip=True) if date_elem else 'Date inconnue',
                        'title': review.find('h3').get_text(strip=True) if review.find('h3') else '',
                        'content': content_elem.get_text(strip=True) if content_elem else '',
                        'response': review.find('div', class_='c-review-block__response__inner').get_text(strip=True) 
                                    if review.find('div', class_='c-review-block__response__inner') else None
                    })
                except Exception as e:
                    print(f"Erreur traitement commentaire: {str(e)}")
                    continue
            
            return {
                'hotel': hotel_data,
                'reviews': reviews,
                'url': url,
                'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
                'html_saved': 'debug_page.html'  # Pour inspection
            }
            
        except Exception as e:
            print(f"⚠️ Échec grave sur {url[:50]}: {str(e)}")
            return None

# 4. Gestion des données
class DataManager:
    @staticmethod
    def save_data(data, filename):
        os.makedirs('data', exist_ok=True)
        path = f"data/{filename}"
        
        try:
            with open(path, 'w', encoding='utf-8') as f:
                if filename.endswith('.json'):
                    json.dump(data, f, ensure_ascii=False, indent=2)
                else:
                    f.write('\n'.join(data))
            print(f"💾 Données sauvegardées: {path}")
        except Exception as e:
            print(f"❌ Erreur sauvegarde {path}: {str(e)}")

# 5. Orchestrateur principal
def main():
    print("🚀 Démarrage de l'extraction Booking.com")
    
    # Étape 1: Récupération des sitemaps
    sitemaps = SitemapManager.get_sitemaps()
    print(f"\n📦 {len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraction des URLs (parallélisée)
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        all_urls = list(executor.map(URLHarvester.process_sitemap, sitemaps))
    
    flat_urls = [url for sublist in all_urls for url in sublist]
    print(f"\n🌍 Total URLs trouvées: {len(flat_urls):,}")
    
    # Sauvegarde des URLs
    DataManager.save_data(flat_urls, 'booking_urls.txt')
    
    # Étape 3: Extraction des commentaires (échantillon)
    sample_size = max(5, int(len(flat_urls) * CONFIG['sample_size']))
    sample_urls = random.sample(flat_urls, sample_size)
    
    print(f"\n🔎 Extraction de {len(sample_urls)} commentaires...")
    reviews = []
    
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        results = executor.map(ReviewScraper.scrape_page, sample_urls)
        reviews = [r for r in results if r is not None]
    
    # Sauvegarde des résultats
    DataManager.save_data(reviews, 'booking_reviews.json')
    
    # Statistiques
    total_reviews = sum(len(hotel['reviews']) for hotel in reviews)
    print(f"\n🎉 Extraction terminée! {total_reviews} commentaires analysés")
    print("📂 Fichiers créés:")
    print("- data/booking_urls.txt (toutes les URLs)")
    print("- data/booking_reviews.json (commentaires détaillés)")

if __name__ == "__main__":
    main()

NameError: name 'url' is not defined

In [33]:
import requests
import gzip
import json
import time
import random
from io import BytesIO
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configuration avancée
CONFIG = {
    'proxies': [
        "http://proxy1:port",
        "http://proxy2:port"
    ],
    'user_agents': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ],
    'timeouts': {
        'sitemap': 60,
        'review_page': 90
    },
    'delays': {
        'min': 10,
        'max': 30
    },
    'max_retries': 5,
    'max_workers': 3,
    'sample_size': 0.001  # 0.1% des URLs pour l'échantillon
}

# Initialize session
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(
    max_retries=3,
    pool_connections=100,
    pool_maxsize=100
))

# [Rest of your classes (SitemapManager, URLHarvester, etc.)...]
# 1. Gestion des sitemaps
class SitemapManager:
    @staticmethod
    def get_sitemaps():
        PRIMARY_SITEMAPS = [
            "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
        ]
        
        try:
            index = requests.get(
                "https://www.booking.com/sitembk-index.xml",
                headers={'User-Agent': random.choice(CONFIG['user_agents'])},
                timeout=CONFIG['timeouts']['sitemap']
            )
            soup = BeautifulSoup(index.content, 'xml')
            discovered = [loc.text for loc in soup.find_all('loc') if 'review' in loc.text.lower()]
            return discovered if discovered else PRIMARY_SITEMAPS
        except Exception as e:
            print(f"⚠️ Échec détection sitemaps: {str(e)}")
            return PRIMARY_SITEMAPS

# 2. Extraction des URLs
class URLHarvester:
    @staticmethod
    def process_sitemap(sitemap_url):
        try:
            print(f"🔍 Traitement de: {sitemap_url[:60]}...")
            
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                sitemap_url,
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            
            if sitemap_url.endswith('.gz'):
                content = gzip.decompress(response.content).decode('utf-8')
            else:
                content = response.text
                
            soup = BeautifulSoup(content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
            
            print(f"✅ {len(urls):,} URLs extraites")
            return urls
            
        except Exception as e:
            print(f"❌ Erreur sur {sitemap_url[:50]}: {str(e)}")
            return []

class ReviewScraper:
    @staticmethod
    def scrape_page(url):
        try:
            time.sleep(random.uniform(CONFIG['delays']['min'], CONFIG['delays']['max']))
            
            # Option 1: Use requests (faster)
            headers = {
                'User-Agent': random.choice(CONFIG['user_agents']),
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            }
            
            response = session.get(
                url,
                headers=headers,
                timeout=CONFIG['timeouts']['review_page'],
                proxies={'http': random.choice(CONFIG['proxies'])} if CONFIG.get('proxies') else None
            )
            
            # Option 2: Use Selenium only if needed (for JavaScript-rendered content)
            if "JavaScript" in response.text:  # Simple check if page needs JS
                options = Options()
                options.add_argument("--headless")
                driver = webdriver.Chrome(options=options)
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                driver.quit()
            else:
                soup = BeautifulSoup(response.text, 'html.parser')
            
            # [Rest of your scraping logic...]
            
        except Exception as e:
            print(f"⚠️ Échec grave sur {url[:50]}: {str(e)}")
            return None

# [Rest of your code...]
# 4. Gestion des données
class DataManager:
    @staticmethod
    def save_data(data, filename):
        os.makedirs('data', exist_ok=True)
        path = f"data/{filename}"
        
        try:
            with open(path, 'w', encoding='utf-8') as f:
                if filename.endswith('.json'):
                    json.dump(data, f, ensure_ascii=False, indent=2)
                else:
                    f.write('\n'.join(data))
            print(f"💾 Données sauvegardées: {path}")
        except Exception as e:
            print(f"❌ Erreur sauvegarde {path}: {str(e)}")

# 5. Orchestrateur principal
def main():
    print("🚀 Démarrage de l'extraction Booking.com")
    
    # Étape 1: Récupération des sitemaps
    sitemaps = SitemapManager.get_sitemaps()
    print(f"\n📦 {len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraction des URLs (parallélisée)
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        all_urls = list(executor.map(URLHarvester.process_sitemap, sitemaps))
    
    flat_urls = [url for sublist in all_urls for url in sublist]
    print(f"\n🌍 Total URLs trouvées: {len(flat_urls):,}")
    
    # Sauvegarde des URLs
    DataManager.save_data(flat_urls, 'booking_urls.txt')
    
    # Étape 3: Extraction des commentaires (échantillon)
    sample_size = max(5, int(len(flat_urls) * CONFIG['sample_size']))
    sample_urls = random.sample(flat_urls, sample_size)
    
    print(f"\n🔎 Extraction de {len(sample_urls)} commentaires...")
    reviews = []
    
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        results = executor.map(ReviewScraper.scrape_page, sample_urls)
        reviews = [r for r in results if r is not None]
    
    # Sauvegarde des résultats
    DataManager.save_data(reviews, 'booking_reviews.json')
    
    # Statistiques
    total_reviews = sum(len(hotel['reviews']) for hotel in reviews)
    print(f"\n🎉 Extraction terminée! {total_reviews} commentaires analysés")
    print("📂 Fichiers créés:")
    print("- data/booking_urls.txt (toutes les URLs)")
    print("- data/booking_reviews.json (commentaires détaillés)")

if __name__ == "__main__":
    main()

🚀 Démarrage de l'extraction Booking.com

📦 5 sitemaps à traiter

🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml....
✅ 21,009 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz...
✅ 49,380 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz...
✅ 49,990 URLs extraites
✅ 49,992 URLs extraites
✅ 49,964 URLs extraites

🌍 Total URLs trouvées: 220,335
💾 Données sauvegardées: data/booking_urls.txt

🔎 Extraction de 220 commentaires...
⚠️ Échec grave sur https://www.booking.com/reviews/it/hotel/b-amp-b-l: HTTPSConnectionPool(host='www.booking.com', port=443): Read timed out.
💾 Données sauvegardées: data/booking_reviews.json

🎉 Extraction terminée! 0 commentaires analysés
📂 Fichiers créés:
- data/booking_urls.txt (toutes les URLs)

In [34]:
import requests
import gzip
import json
import time
import random
from io import BytesIO
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configuration avancée
CONFIG = {
    'proxies': [
        "http://proxy1:port",
        "http://proxy2:port"
    ],
    'user_agents': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ],
    'timeouts': {
        'sitemap': 60,
        'review_page': 90
    },
    'delays': {
        'min': 10,
        'max': 30
    },
    'max_retries': 5,
    'max_workers': 3,
    'sample_size': 0.001  # 0.1% des URLs pour l'échantillon
}

# Initialize session
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(
    max_retries=3,
    pool_connections=100,
    pool_maxsize=100
))

# [Rest of your classes (SitemapManager, URLHarvester, etc.)...]
# 1. Gestion des sitemaps
class SitemapManager:
    @staticmethod
    def get_sitemaps():
        PRIMARY_SITEMAPS = [
            "https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
        ]
        
        try:
            index = requests.get(
                "https://www.booking.com/sitembk-index.xml",
                headers={'User-Agent': random.choice(CONFIG['user_agents'])},
                timeout=CONFIG['timeouts']['sitemap']
            )
            soup = BeautifulSoup(index.content, 'xml')
            discovered = [loc.text for loc in soup.find_all('loc') if 'review' in loc.text.lower()]
            return discovered if discovered else PRIMARY_SITEMAPS
        except Exception as e:
            print(f"⚠️ Échec détection sitemaps: {str(e)}")
            return PRIMARY_SITEMAPS

# 2. Extraction des URLs
class URLHarvester:
    @staticmethod
    def process_sitemap(sitemap_url):
        try:
            print(f"🔍 Traitement de: {sitemap_url[:60]}...")
            
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                sitemap_url,
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            
            if sitemap_url.endswith('.gz'):
                content = gzip.decompress(response.content).decode('utf-8')
            else:
                content = response.text
                
            soup = BeautifulSoup(content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
            
            print(f"✅ {len(urls):,} URLs extraites")
            return urls
            
        except Exception as e:
            print(f"❌ Erreur sur {sitemap_url[:50]}: {str(e)}")
            return []

class ReviewScraper:
    class ReviewScraper:
    @staticmethod
    def scrape_page(url):
        for attempt in range(CONFIG['max_retries']):
            try:
                time.sleep(random.uniform(CONFIG['delays']['min'], CONFIG['delays']['max']))
                
                headers = {
                    'User-Agent': random.choice(CONFIG['user_agents']),
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate',
                    'Connection': 'keep-alive'
                }
                
                try:
                    response = session.get(
                        url,
                        headers=headers,
                        timeout=CONFIG['timeouts']['review_page'],
                        proxies={'http': random.choice(CONFIG['proxies'])} if CONFIG.get('proxies') else None
                    )
                    response.raise_for_status()
                    
                    # Debug: print the first 500 chars of the page
                    print(f"Page content (first 500 chars): {response.text[:500]}...")
                    
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Extraction des données de l'hôtel
                    hotel_name = soup.find('h2', {'id': 'hp_hotel_name'}).get_text(strip=True) if soup.find('h2', {'id': 'hp_hotel_name'}) else "Nom non trouvé"
                    print(f"Extraction des commentaires pour: {hotel_name}")
                    
                    # Extraction des commentaires
                    reviews = []
                    review_blocks = soup.find_all('li', class_='review_list_new_item') or \
                                  soup.find_all('div', class_='c-review-block')
                    
                    for review in review_blocks:
                        try:
                            review_data = {
                                'author': review.find('span', class_='bui-avatar-block__title').get_text(strip=True) if review.find('span', class_='bui-avatar-block__title') else 'Anonyme',
                                'rating': review.find('div', class_='bui-review-score__badge').get_text(strip=True) if review.find('div', class_='bui-review-score__badge') else None,
                                'date': review.find('span', class_='c-review-block__date').get_text(strip=True) if review.find('span', class_='c-review-block__date') else None,
                                'content': review.find('div', class_='c-review__body').get_text(strip=True) if review.find('div', class_='c-review__body') else None
                            }
                            reviews.append(review_data)
                            
                            # Afficher le commentaire dans la console
                            print(f"\n⭐ Note: {review_data.get('rating', 'N/A')}")
                            print(f"📅 Date: {review_data.get('date', 'N/A')}")
                            print(f"👤 Auteur: {review_data.get('author', 'Anonyme')}")
                            print(f"📝 Commentaire: {review_data.get('content', 'Aucun contenu')[:200]}...\n")
                            
                        except Exception as e:
                            print(f"Erreur sur un commentaire: {str(e)}")
                            continue
                    
                    return {
                        'hotel_name': hotel_name,
                        'url': url,
                        'reviews': reviews,
                        'review_count': len(reviews)
                    }
                    
                except requests.exceptions.RequestException as e:
                    print(f"⚠️ Tentative {attempt + 1} échouée pour {url[:50]}...: {str(e)}")
                    if attempt == CONFIG['max_retries'] - 1:
                        return None
                    time.sleep(5 * (attempt + 1))  # Attente exponentielle
                    
            except Exception as e:
                print(f"⚠️ Erreur inattendue sur {url[:50]}: {str(e)}")
                return None
        return None

# [Rest of your code...]
# 4. Gestion des données
class DataManager:
    @staticmethod
    def save_data(data, filename):
        os.makedirs('data', exist_ok=True)
        path = f"data/{filename}"
        
        try:
            with open(path, 'w', encoding='utf-8') as f:
                if filename.endswith('.json'):
                    json.dump(data, f, ensure_ascii=False, indent=2)
                else:
                    f.write('\n'.join(data))
            print(f"💾 Données sauvegardées: {path}")
        except Exception as e:
            print(f"❌ Erreur sauvegarde {path}: {str(e)}")

# 5. Orchestrateur principal
def main():
    print("🚀 Démarrage de l'extraction Booking.com")
    
    # Étape 1: Récupération des sitemaps
    sitemaps = SitemapManager.get_sitemaps()
    print(f"\n📦 {len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraction des URLs (parallélisée)
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        all_urls = list(executor.map(URLHarvester.process_sitemap, sitemaps))
    
    flat_urls = [url for sublist in all_urls for url in sublist]
    print(f"\n🌍 Total URLs trouvées: {len(flat_urls):,}")
    
    # Sauvegarde des URLs
    DataManager.save_data(flat_urls, 'booking_urls.txt')
    
    # Étape 3: Extraction des commentaires (échantillon)
    sample_size = max(5, int(len(flat_urls) * CONFIG['sample_size']))
    sample_urls = random.sample(flat_urls, sample_size)
    
    print(f"\n🔎 Extraction de {len(sample_urls)} commentaires...")
    reviews = []
    
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        results = executor.map(ReviewScraper.scrape_page, sample_urls)
        reviews = [r for r in results if r is not None]
    
    # Sauvegarde des résultats
    DataManager.save_data(reviews, 'booking_reviews.json')
    
    # Statistiques
    total_reviews = sum(len(hotel['reviews']) for hotel in reviews)
    print(f"\n🎉 Extraction terminée! {total_reviews} commentaires analysés")
    print("📂 Fichiers créés:")
    print("- data/booking_urls.txt (toutes les URLs)")
    print("- data/booking_reviews.json (commentaires détaillés)")

def display_reviews_from_file(filename="data/booking_reviews.json"):
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        print(f"\n📊 Résultats de l'extraction ({len(data)} hôtels analysés)")
        
        for hotel in data:
            if hotel is None:
                continue
                
            print(f"\n🏨 Hôtel: {hotel.get('hotel_name', 'Nom inconnu')}")
            print(f"🔗 URL: {hotel.get('url', 'N/A')}")
            print(f"📊 Nombre de commentaires: {len(hotel.get('reviews', []))}")
            
            for i, review in enumerate(hotel.get('reviews', []), 1):
                print(f"\nCommentaire #{i}:")
                print(f"⭐ Note: {review.get('rating', 'N/A')}")
                print(f"📅 Date: {review.get('date', 'N/A')}")
                print(f"👤 Auteur: {review.get('author', 'Anonyme')}")
                print(f"📝 Contenu: {review.get('content', '')[200:]}")
                
    except Exception as e:
        print(f"Erreur lors de la lecture du fichier: {str(e)}")

# Modifiez la fin de votre main() pour:
if __name__ == "__main__":
    main()
    display_reviews_from_file()  # Ajoutez cette ligne

IndentationError: expected an indented block after class definition on line 102 (3193857946.py, line 103)

In [None]:
!pip install requests beautifulsoup4 selenium webdriver-manager

In [None]:
import requests
import gzip
import json
import time
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configuration avancée
CONFIG = {
    'proxies': [
        "http://proxy1:port",
        "http://proxy2:port"
    ],
    'user_agents': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ],
    'timeouts': {
        'sitemap': 60,
        'review_page': 90,
        'selenium': 30
    },
    'delays': {
        'min': 5,
        'max': 15
    },
    'max_retries': 3,
    'max_workers': 2,
    'sample_size': 0.001,
    'selenium_wait': 5
}

# Initialize session
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(
    max_retries=3,
    pool_connections=50,
    pool_maxsize=50
))

class SitemapManager:
    @staticmethod
    def get_sitemaps():
        PRIMARY_SITEMAPS = [
            "https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml.gz",
            "https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz"
        ]
        
        try:
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                "https://www.booking.com/sitembk-index.xml",
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            soup = BeautifulSoup(response.content, 'xml')
            discovered = [loc.text for loc in soup.find_all('loc') if 'review' in loc.text.lower()]
            return discovered if discovered else PRIMARY_SITEMAPS
        except Exception as e:
            print(f"⚠️ Échec détection sitemaps: {str(e)}")
            return PRIMARY_SITEMAPS

class URLHarvester:
    @staticmethod
    def process_sitemap(sitemap_url):
        try:
            print(f"🔍 Traitement de: {sitemap_url[:60]}...")
            
            headers = {'User-Agent': random.choice(CONFIG['user_agents'])}
            response = requests.get(
                sitemap_url,
                headers=headers,
                timeout=CONFIG['timeouts']['sitemap']
            )
            
            content = gzip.decompress(response.content).decode('utf-8') if sitemap_url.endswith('.gz') else response.text
            soup = BeautifulSoup(content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc') if '/reviews/' in loc.text]
            
            print(f"✅ {len(urls):,} URLs extraites")
            return urls
            
        except Exception as e:
            print(f"❌ Erreur sur {sitemap_url[:50]}: {str(e)}")
            return []

class ReviewScraper:
    @staticmethod
    def init_driver():
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument(f"user-agent={random.choice(CONFIG['user_agents'])}")
        driver = webdriver.Chrome(options=options)
        return driver

    @staticmethod
    def scrape_page(url):
        for attempt in range(CONFIG['max_retries']):
            try:
                time.sleep(random.uniform(CONFIG['delays']['min'], CONFIG['delays']['max']))
                
                # Essayer d'abord avec requests
                try:
                    headers = {
                        'User-Agent': random.choice(CONFIG['user_agents']),
                        'Accept-Language': 'en-US,en;q=0.5',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
                    }
                    
                    response = session.get(
                        url,
                        headers=headers,
                        timeout=CONFIG['timeouts']['review_page'],
                        proxies={'http': random.choice(CONFIG['proxies'])} if CONFIG.get('proxies') else None
                    )
                    response.raise_for_status()
                    
                    soup = BeautifulSoup(response.text, 'html.parser')
                    if "JavaScript" in response.text or not soup.find('div', {'data-testid': 'review-card'}):
                        raise Exception("Page nécessite JavaScript")
                        
                except Exception:
                    # Fallback à Selenium si requests échoue
                    driver = ReviewScraper.init_driver()
                    try:
                        driver.get(url)
                        WebDriverWait(driver, CONFIG['timeouts']['selenium']).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="review-card"]'))
                        time.sleep(CONFIG['selenium_wait'])
                        soup = BeautifulSoup(driver.page_source, 'html.parser')
                    finally:
                        driver.quit()
                
                # Extraction des données
                hotel_name = soup.find('h2', {'data-testid': 'heading-title'}).get_text(strip=True) if soup.find('h2', {'data-testid': 'heading-title'}) else "Nom non trouvé"
                
                reviews = []
                review_blocks = soup.find_all('div', {'data-testid': 'review-card'})
                
                for review in review_blocks:
                    try:
                        review_data = {
                            'author': review.find('span', {'data-testid': 'reviewer-name'}).get_text(strip=True) if review.find('span', {'data-testid': 'reviewer-name'}) else 'Anonyme',
                            'rating': review.find('div', {'data-testid': 'review-score'}).get_text(strip=True) if review.find('div', {'data-testid': 'review-score'}) else None,
                            'date': review.find('span', {'data-testid': 'date'}).get_text(strip=True) if review.find('span', {'data-testid': 'date'}) else None,
                            'content': review.find('div', {'data-testid': 'review-body'}).get_text(strip=True) if review.find('div', {'data-testid': 'review-body'}) else None
                        }
                        reviews.append(review_data)
                    except Exception as e:
                        print(f"Erreur traitement commentaire: {str(e)}")
                        continue
                
                return {
                    'hotel_name': hotel_name,
                    'url': url,
                    'reviews': reviews,
                    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                }
                
            except Exception as e:
                print(f"⚠️ Tentative {attempt + 1} échouée pour {url[:50]}...: {str(e)}")
                if attempt == CONFIG['max_retries'] - 1:
                    return None
                time.sleep(5 * (attempt + 1))
        
        return None

class DataManager:
    @staticmethod
    def save_data(data, filename):
        os.makedirs('data', exist_ok=True)
        path = f"data/{filename}"
        
        try:
            with open(path, 'w', encoding='utf-8') as f:
                if filename.endswith('.json'):
                    json.dump(data, f, ensure_ascii=False, indent=2)
                else:
                    f.write('\n'.join(data))
            print(f"💾 Données sauvegardées: {path}")
        except Exception as e:
            print(f"❌ Erreur sauvegarde {path}: {str(e)}")

def display_reviews(filename="data/booking_reviews.json"):
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        print(f"\n📊 Résultats de l'extraction ({len([h for h in data if h])} hôtels analysés)")
        
        for hotel in data:
            if not hotel:
                continue
                
            print(f"\n🏨 Hôtel: {hotel.get('hotel_name', 'Nom inconnu')}")
            print(f"🔗 URL: {hotel.get('url', 'N/A')}")
            print(f"📅 Date extraction: {hotel.get('timestamp', 'N/A')}")
            print(f"📊 Nombre de commentaires: {len(hotel.get('reviews', []))}")
            
            for i, review in enumerate(hotel.get('reviews', []), 1):
                print(f"\nCommentaire #{i}:")
                print(f"⭐ Note: {review.get('rating', 'N/A')}")
                print(f"📅 Date: {review.get('date', 'N/A')}")
                print(f"👤 Auteur: {review.get('author', 'Anonyme')}")
                print(f"📝 Contenu: {review.get('content', '')[:200]}...")
                
    except Exception as e:
        print(f"Erreur lecture fichier: {str(e)}")

def main():
    print("🚀 Démarrage de l'extraction Booking.com")
    
    # Étape 1: Récupération des sitemaps
    sitemaps = SitemapManager.get_sitemaps()
    print(f"\n📦 {len(sitemaps)} sitemaps à traiter\n")
    
    # Étape 2: Extraction des URLs
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        all_urls = list(executor.map(URLHarvester.process_sitemap, sitemaps))
    
    flat_urls = [url for sublist in all_urls for url in sublist if url]
    print(f"\n🌍 Total URLs trouvées: {len(flat_urls):,}")
    
    DataManager.save_data(flat_urls, 'booking_urls.txt')
    
    # Étape 3: Extraction des commentaires (échantillon)
    sample_size = max(5, int(len(flat_urls) * CONFIG['sample_size']))
    sample_urls = random.sample(flat_urls, sample_size)
    print(f"\n🔎 Extraction de {len(sample_urls)} commentaires...")
    
    reviews = []
    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        results = executor.map(ReviewScraper.scrape_page, sample_urls)
        reviews = [r for r in results if r]
    
    DataManager.save_data(reviews, 'booking_reviews.json')
    
    # Statistiques
    total_reviews = sum(len(h['reviews']) for h in reviews if h and 'reviews' in h)
    print(f"\n🎉 Extraction terminée! {total_reviews} commentaires analysés")
    print("📂 Fichiers créés:")
    print("- data/booking_urls.txt (toutes les URLs)")
    print("- data/booking_reviews.json (commentaires détaillés)")
    
    display_reviews()

if __name__ == "__main__":
    main()

🚀 Démarrage de l'extraction Booking.com

📦 5 sitemaps à traiter

🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-tr.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-bg.0000.xml.gz...
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-en-gb.0000.xml....
✅ 21,009 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-fr.0000.xml.gz...
✅ 49,380 URLs extraites
🔍 Traitement de: https://www.booking.com/sitembk-hotel-review-de.0000.xml.gz...
✅ 49,990 URLs extraites
✅ 49,992 URLs extraites
✅ 49,964 URLs extraites

🌍 Total URLs trouvées: 220,335
💾 Données sauvegardées: data/booking_urls.txt

🔎 Extraction de 220 commentaires...
Page content (first 500 chars): <!DOCTYPE html>
<!--
You know you could be getting paid to poke around in our code?
We're hiring designers and developers to work in Amsterdam:
https://careers.booking.com/
-->
<!-- wdot-802 -->
<script type="text/javascript" nonce="YmlzcnFbRcw8MSW">
document.addEven