In [1]:
import json
import re  # Added for regex cleaning
import time

import requests
from bs4 import BeautifulSoup

In [5]:
def safe_get_text(element, default="N/A"):
    """Return the stripped text of ``element``, or ``default`` when it is missing.

    Args:
        element: An object exposing ``get_text(strip=True)`` (e.g. the result
            of a BeautifulSoup ``find`` call), or a falsy value such as None.
        default: Value returned when ``element`` is falsy.

    Returns:
        The whitespace-stripped text of ``element``, or ``default``.
    """
    if not element:
        return default
    return element.get_text(strip=True)

def clean_hotel_name(name):
    """Strip a leading ranking prefix such as ``'1. '`` from a hotel name.

    Only a run of digits followed by a dot at the very start of the string
    (plus any whitespace after the dot) is removed; names without such a
    prefix are returned unchanged.
    """
    prefix = re.match(r'^\d+\.\s*', name)
    if prefix is None:
        return name
    return name[prefix.end():]

def clean_num_reviews(reviews_str):
    """Turn a review-count label into an integer (e.g. ``'(5,455reviews)'`` -> 5455).

    Returns None for the ``"N/A"`` placeholder and for any string that
    contains no digits at all.
    """
    # "N/A" is the placeholder used when the review element was missing.
    if reviews_str == "N/A":
        return None

    # Drop separators, brackets and words; keep only the digits.
    digits_only = re.sub(r'[^\d]', '', reviews_str)
    if not digits_only:
        return None
    return int(digits_only)

# Offset step between consecutive result pages (each page carries this many listings).
_RESULTS_PER_PAGE = 30

# The "Hotels-g<geo id>-" URL segment, plus an "oa<offset>-" marker if one is already present.
_GEO_SEGMENT_RE = re.compile(r'(Hotels-g\d+-)(?:oa\d+-)?')


def _build_page_url(base_url, offset):
    """Return the listing URL for ``offset``, or None if ``base_url`` cannot be paginated."""
    if offset == 0:
        return base_url
    paged_url, replaced = _GEO_SEGMENT_RE.subn(
        lambda m: f"{m.group(1)}oa{offset}-", base_url, count=1
    )
    # Without a geo segment the URL would stay identical for every offset,
    # which would make the caller re-scrape the first page forever.
    return paged_url if replaced else None


def _parse_hotel_listing(hotel):
    """Extract name, link, rating and review count from one listing element."""
    anchor = hotel.find('a', class_='BMQDV')
    if anchor is not None:
        link = "https://www.tripadvisor.com" + anchor['href']
    else:
        link = "N/A"
    return {
        'name': clean_hotel_name(safe_get_text(hotel.find('h3', class_='biGQs'))),
        'link': link,
        'rating': safe_get_text(hotel.find('div', {'data-automation': 'bubbleRatingValue'})),
        'num_reviews': clean_num_reviews(
            safe_get_text(hotel.find('div', {'data-automation': 'bubbleReviewCount'}))
        ),
    }


def scrape_tripadvisor_hotels(base_url, max_retries=10, delay_between_retries=2, timeout=30):
    """Scrape every page of a TripAdvisor hotel listing.

    Pages are requested in steps of 30 listings by inserting an ``oa<offset>-``
    marker after the ``Hotels-g<geo id>-`` segment of ``base_url``. A page that
    yields no listings (or fails to download) is retried up to ``max_retries``
    times; once a page exhausts its retries without listings, scraping stops.

    Args:
        base_url: URL of the first results page; it must contain a
            ``Hotels-g<digits>-`` segment for pages after the first to be built.
        max_retries: Maximum number of attempts per page.
        delay_between_retries: Seconds to sleep after each failed attempt.
        timeout: Seconds ``requests`` waits for the server before the attempt
            is treated as failed and retried.

    Returns:
        A list of dicts with the keys ``name``, ``link``, ``rating`` and
        ``num_reviews``, in page order. Empty if nothing could be scraped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    offset = 0
    all_hotels = []
    any_hotels_found = False

    while True:
        url = _build_page_url(base_url, offset)
        if url is None:
            print("⚠️ base_url has no 'Hotels-g<id>-' segment; cannot request further pages. Stopping.")
            break

        hotel_listings = []
        for attempt in range(1, max_retries + 1):
            try:
                print(f"Fetching (Attempt {attempt}): {url}")
                # A timeout keeps a stalled connection from blocking the scrape forever;
                # the resulting exception is a RequestException and is retried below.
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                hotel_listings = soup.find_all(
                    'div',
                    {'data-automation': lambda x: x and x.startswith('non-plus-hotel-offer')},
                )
                if hotel_listings:
                    break
                print(f"No hotels found on attempt {attempt}. Retrying...")
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Error (Attempt {attempt}): {e}")
            except Exception as e:
                print(f"⚠️ Unexpected error (Attempt {attempt}): {e}")
            time.sleep(delay_between_retries)

        if not hotel_listings:
            # Retries exhausted: either nothing was ever reachable or we ran past the last page.
            if not any_hotels_found:
                print("❌ No hotels found in any page. Stopping.")
            else:
                print("✅ No more hotels found. Stopping.")
            break

        print(f"✅ Successfully scraped {len(hotel_listings)} hotels at offset {offset}")
        any_hotels_found = True

        for hotel in hotel_listings:
            try:
                all_hotels.append(_parse_hotel_listing(hotel))
            except Exception as e:
                # Best effort: one malformed listing must not abort the whole page.
                print(f"⚠️ Skipping a hotel due to error: {e}")

        offset += _RESULTS_PER_PAGE

    return all_hotels

# Example usage: scrape the Tunisia hotel listing and print one record per hotel.
if __name__ == "__main__":
    base_url = "https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST"
    hotels = scrape_tripadvisor_hotels(base_url)

    if not hotels:
        print("No hotels found.")
    else:
        for i, hotel in enumerate(hotels, start=1):
            # One print call per hotel; sep="\n" keeps each field on its own line.
            print(
                f"{i}. {hotel['name']}",
                f"   Rating: {hotel['rating']}",
                f"   Reviews: {hotel['num_reviews']}",
                f"   Link: {hotel['link']}",
                "-" * 50,
                sep="\n",
            )
        print(f"Total hotels scraped: {len(hotels)}")

Fetching (Attempt 1): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 1. Retrying...
Fetching (Attempt 2): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 2. Retrying...
Fetching (Attempt 3): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 3. Retrying...
Fetching (Attempt 4): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 4. Retrying...
Fetching (Attempt 5): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 5. Retrying...
Fetching (Attempt 6): https://www.tripadvisor.com/Hotels-g293753-Tunisia-Hotels.html#SPLITVIEWLIST
✅ Successfully scraped 30 hotels at offset 0
Fetching (Attempt 1): https://www.tripadvisor.com/Hotels-g293753-oa30-Tunisia-Hotels.html#SPLITVIEWLIST
No hotels found on attempt 1. Retrying...
Fetch

In [7]:
def save_to_json(data, filename="tripadvisor_hotels_no_details_no_reviews.json"):
    """Save scraped data to a UTF-8 JSON file.

    The data is serialized completely before the target file is opened, so a
    serialization failure leaves an existing file untouched instead of
    truncating it and writing a partial dump.

    Args:
        data: JSON-serializable object (here, the list of hotel dicts).
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False keeps accented names readable in the file.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    # `hotels` is the list produced by the scraping cell above.
    if not hotels:
        print("No hotels found.")
    else:
        save_to_json(hotels)  # Persist the scraped listings as JSON
        print(f"Total hotels scraped: {len(hotels)}")

Data saved to tripadvisor_hotels_no_details_no_reviews.json
Total hotels scraped: 1533
