In [None]:
## ZIDHA EL LOCATION 

In [4]:
import json
import requests
from bs4 import BeautifulSoup
import time
import random
import os

# File paths
INPUT_FILE = 'tripadvisor_hotels_no_details_no_reviews.json'  # hotel list read at start-up
OUTPUT_FILE = 'hotels_details_no_reviews.json'  # default target of save_progress()
BACKUP_FILE = 'hotels_details_no_reviews_backup.json'  # second copy written by save_progress()

# Load the hotel list once; main() reads and updates this module-level list.
# The encoding is given explicitly so that non-ASCII text in the JSON is
# decoded the same way on every platform instead of using the locale default.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    hotels = json.load(f)

# Pool of browser User-Agent strings. get_random_user_agent() picks one of
# these at random so that successive requests do not all present the same
# User-Agent header.
USER_AGENTS = [
    # Chrome (Windows, macOS, Linux)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    
    # Firefox (Windows, macOS, Linux)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13.5; rv:120.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
        
    # Safari (macOS desktop and iPhone)
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1',    
    
    # Edge (Windows, macOS)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',   
    
    # Mobile
    # Android - Samsung Galaxy
    'Mozilla/5.0 (Linux; Android 13; SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
    # Android - Google Pixel
    'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
    # iPad
    'Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1',
]

# Proxy URLs that get_random_proxy() chooses from. Only one entry is
# configured here, so every request uses the same proxy.
# NOTE(review): this looks like a hard-coded public proxy address; confirm it
# is still reachable and trusted before relying on it.
PROXY_LIST = [
    'http://200.174.198.86:8888',
]

def get_random_user_agent():
    """Pick one User-Agent string at random from USER_AGENTS and return it."""
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent

def get_random_proxy():
    """Pick one proxy URL at random from PROXY_LIST and return it."""
    chosen_proxy = random.choice(PROXY_LIST)
    return chosen_proxy

def save_progress(data, filename=OUTPUT_FILE):
    """Write *data* as indented JSON to *filename* and to BACKUP_FILE.

    The data is serialized once, before any file is touched, so an
    unserializable object cannot leave a half-written file behind. Each
    target is then written to a sibling ``<target>.tmp`` file and moved into
    place with ``os.replace``, which overwrites an existing destination, so
    neither the main file nor the backup is ever left truncated by an
    interrupted write.

    This is best effort: any error is printed and swallowed so that a failed
    save does not abort the caller. If the main file cannot be written, the
    backup is not attempted.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the main output file.

    Returns:
        None.
    """
    temp_file = None  # temp path currently in flight, for cleanup on failure
    try:
        payload = json.dumps(data, indent=2)

        for target in (filename, BACKUP_FILE):
            temp_file = target + '.tmp'
            with open(temp_file, 'w', encoding='utf-8') as f:
                f.write(payload)
            # os.replace works whether or not the target already exists.
            os.replace(temp_file, target)
            temp_file = None

    except Exception as e:
        print(f"Error saving progress: {str(e)}")
        # Do not leave a stale temp file next to the real output.
        if temp_file is not None and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError as cleanup_err:
                print(f"Warning: could not remove {temp_file}: {cleanup_err}")

def load_existing_progress(filename=OUTPUT_FILE):
    """Return the parsed JSON content of *filename*, or None.

    None is returned when the file does not exist, and also when reading or
    parsing it fails; in the failure case a warning is printed first.
    """
    try:
        if not os.path.exists(filename):
            return None
        with open(filename, 'r') as progress_file:
            return json.load(progress_file)
    except Exception as err:
        print(f"Warning: Could not load existing progress file: {str(err)}")
        return None

def is_captcha_page(response_text):
    """Return True if *response_text* contains a CAPTCHA indicator phrase.

    The match is a case-insensitive substring search against a fixed list of
    phrases. Empty or missing text (``""`` or ``None``) is treated as "no
    CAPTCHA" and returns False.

    Args:
        response_text: Body of an HTTP response as a string.

    Returns:
        bool: True when any indicator occurs in the text, otherwise False.
    """
    if not response_text:
        return False

    # NOTE(review): "challenge" is a very generic word and may also match
    # ordinary page content; confirm this indicator is intended.
    captcha_indicators = (
        "captcha",
        "are you human",
        "verify you are not a robot",
        "recaptcha",
        "challenge",
    )

    # Lower-case the page once rather than once per indicator.
    lowered_text = response_text.lower()
    return any(indicator in lowered_text for indicator in captcha_indicators)


def scrape_hotel_details_with_retries(url, max_attempts=10, delay=2):
    """Call scrape_hotel_details() until it returns data or attempts run out.

    A fresh random User-Agent is chosen for every attempt. Between failed
    attempts the function waits ``delay * attempt_number`` seconds (linear
    backoff: delay, 2*delay, 3*delay, ...). No wait happens after the final
    attempt, since there is nothing left to retry.

    Args:
        url: Hotel page URL to scrape.
        max_attempts: Maximum number of scrape attempts.
        delay: Base wait in seconds, multiplied by the attempt number.

    Returns:
        The details dict from scrape_hotel_details(), or None if every
        attempt returned None.
    """
    for attempt in range(max_attempts):
        # Change user agent for each attempt
        # NOTE(review): 'br' is advertised in Accept-Encoding; presumably the
        # HTTP stack can only decode Brotli when a brotli package is
        # installed - confirm, or drop 'br' from the header.
        headers = {
            'User-Agent': get_random_user_agent(),
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'https://www.tripadvisor.com/',
            'DNT': '1'
        }

        details = scrape_hotel_details(url, headers)
        if details is not None:  # If we got data
            return details

        if attempt + 1 < max_attempts:
            # Linear backoff; report the wait that is actually applied.
            wait_seconds = delay * (attempt + 1)
            print(f"Attempt {attempt + 1} failed, retrying in {wait_seconds} seconds...")
            time.sleep(wait_seconds)
        else:
            print(f"Attempt {attempt + 1} failed")

    print(f"Failed after {max_attempts} attempts")
    return None

def scrape_hotel_details(url, headers=None, timeout=30):
    """Fetch one hotel page through a proxy and extract its detail fields.

    The page is requested via a proxy from get_random_proxy(). The response
    is rejected (None is returned) when the HTTP status is an error, when
    is_captcha_page() flags it, or when it contains TripAdvisor's "unable to
    fulfill your request" message. Otherwise the HTML is parsed and a dict
    of details is built; fields that cannot be found keep their empty
    default.

    NOTE(review): the CSS class names used below (e.g. 'vqEpQ', 'Jevoh')
    look like generated class names; presumably they change when the site is
    redeployed - verify against the live page.

    Args:
        url: Hotel page URL.
        headers: Optional request headers. When None, a minimal header set
            with a random User-Agent is used.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled proxy or server cannot block the scraper forever.

    Returns:
        dict with keys 'description', 'property_amenities', 'room_features',
        'room_types', 'hotel_class', 'languages_spoken' and 'hotel_style',
        or None on any network/HTTP error, block/CAPTCHA page, or unexpected
        parsing error.
    """
    if headers is None:
        headers = {
            'User-Agent': get_random_user_agent(),
            'Accept-Language': 'en-US,en;q=0.5',
        }

    try:
        proxy = get_random_proxy()
        print(proxy)
        print(f"Requesting URL: {url}")
        # Map the proxy for both schemes so http:// URLs are proxied as well,
        # and always bound the request with a timeout.
        response = requests.get(
            url,
            headers=headers,
            proxies={"http": proxy, "https": proxy},
            timeout=timeout,
        )
        response.raise_for_status()

        # Check for CAPTCHA first
        if is_captcha_page(response.text):
            print("CAPTCHA detected - manual intervention required")
            # You could add logic here to pause or notify you
            return None

        # Check if we got blocked
        if "Sorry, we were unable to fulfill your request" in response.text:
            print("Blocked by TripAdvisor")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize result dictionary; every key is present even if the
        # corresponding section is missing from the page.
        details = {
            'description': '',
            'property_amenities': [],
            'room_features': [],
            'room_types': [],
            'hotel_class': '',
            'languages_spoken': [],
            'hotel_style': []
        }

        # Extract description from the "about" tab container
        description_div = soup.find('div', {'data-tab': 'TABS_ABOUT', 'data-section-signature': 'about'})
        if description_div:
            description_text = description_div.find('div', class_='_T')
            if description_text:
                details['description'] = description_text.get_text(strip=True)

        # Extract amenities and other sections: each title div is followed
        # by a container div holding the individual items.
        sections = soup.find_all('div', class_='vqEpQ')
        for section in sections:
            section_title = section.get_text(strip=True)
            next_div = section.find_next('div', class_='Jevoh')

            if not next_div:
                continue

            items = [item.get_text(strip=True) for item in next_div.find_all('div', class_='gFttI')]

            if 'Property amenities' in section_title:
                details['property_amenities'] = items
            elif 'Room features' in section_title:
                details['room_features'] = items
            elif 'Room types' in section_title:
                details['room_types'] = items

        # Hotel class extraction. Three strategies, tried in order:
        # 1) the <title> inside the rating <svg>,
        # 2) any <title> element whose text contains "of 5 stars",
        # 3) counting <path> elements with class 'xHhmW'.
        # Strategies 2 and 3 are only tried when the svg itself is absent.
        stars_svg = soup.find('svg', class_='JXZuC', attrs={'data-automation': 'bubbleRatingImage'})
        if stars_svg:
            title_tag = stars_svg.find('title')
            if title_tag:
                rating_text = title_tag.get_text(strip=True)
                if "of 5 stars" in rating_text:
                    details['hotel_class'] = rating_text.split("of 5 stars")[0].strip() + " stars"
                else:
                    details['hotel_class'] = rating_text
        else:
            rating_title = soup.find('title', string=lambda text: text and "of 5 stars" in text)
            if rating_title:
                rating_text = rating_title.get_text(strip=True)
                details['hotel_class'] = rating_text.split("of 5 stars")[0].strip() + " stars"
            else:
                filled_stars = len(soup.find_all('path', class_='xHhmW'))
                if filled_stars > 0:
                    details['hotel_class'] = f"{filled_stars} stars"

        # Languages and hotel style
        good_to_know_section = soup.find('div', class_=['ruCQl', 'fqojJ'])
        if good_to_know_section:
            # Languages Spoken: a single div with a comma-separated list
            languages_title_div = good_to_know_section.find('div', class_='MTDER', string='Languages Spoken')
            if languages_title_div:
                languages_div = languages_title_div.find_next_sibling('div', class_='CMiVw')
                if languages_div:
                    details['languages_spoken'] = [lang.strip() for lang in languages_div.get_text().split(',')]

            # Hotel Style: one sibling div per style, so walk all of them
            hotel_style_title_div = good_to_know_section.find('div', class_='MTDER', string='HOTEL STYLE')
            if hotel_style_title_div:
                styles_list = []
                current_sibling = hotel_style_title_div.find_next_sibling('div', class_='CMiVw')
                while current_sibling:
                    styles_list.append(current_sibling.get_text(strip=True))
                    current_sibling = current_sibling.find_next_sibling('div', class_='CMiVw')
                details['hotel_style'] = styles_list

        return details

    except requests.exceptions.RequestException as req_err:
        # Covers connection errors, timeouts and HTTP error statuses.
        print(f"Network or HTTP error for {url}: {req_err}")
        return None
    except Exception as e:
        # Boundary catch: a parsing surprise on one page must not stop the run.
        print(f"Error scraping {url}: {str(e)}")
        return None

def main():
    """Scrape details for every hotel that has no 'description' yet.

    Steps:
      1. Load a previous output file, if any. If every entry in it already
         has a 'description', there is nothing to do.
      2. Otherwise merge the already-scraped entries into the module-level
         ``hotels`` list, matching on 'hotel_number'.
      3. Scrape each remaining hotel with retries, saving the whole list
         after every success, and pausing a random 2-5 seconds between
         hotels.
      4. Print a summary with the number of successes and any hotels that
         could not be scraped.
    """
    # Try to load existing progress
    existing_progress = load_existing_progress()

    # Check if we have complete valid data (all hotels have description)
    if existing_progress and all('description' in h for h in existing_progress):
        print("Found complete existing progress file. Nothing to do.")
        return

    # If we have partial progress, update our hotels list
    if existing_progress:
        print("Resuming from existing progress")

        # Create mapping of hotel numbers to indices
        hotel_mapping = {h['hotel_number']: i for i, h in enumerate(hotels)}

        # Update our hotels list with the scraped data we already have
        for existing_hotel in existing_progress:
            if 'hotel_number' in existing_hotel:
                hotel_idx = hotel_mapping.get(existing_hotel['hotel_number'])
                if hotel_idx is not None and 'description' in existing_hotel:
                    hotels[hotel_idx].update(existing_hotel)

    total_hotels = len(hotels)

    # Find hotels that need processing (missing description)
    hotels_to_process = [i for i, h in enumerate(hotels) if 'description' not in h]
    remaining = len(hotels_to_process)

    print(f"Found {remaining} hotels remaining to process")

    succeeded = 0
    failed_numbers = []

    for position, idx in enumerate(hotels_to_process):
        hotel = hotels[idx]
        print(f"\nProcessing hotel {hotel['hotel_number']} of {total_hotels}: {hotel['name']}...")

        # Use the retry wrapper
        details = scrape_hotel_details_with_retries(hotel['link'])

        if details:
            hotel.update(details)
            # Save progress after each successful scrape
            save_progress(hotels)
            print(f"Progress saved after hotel {hotel['hotel_number']}")
            succeeded += 1
        else:
            # Remember the failure so the summary does not claim full success.
            failed_numbers.append(hotel['hotel_number'])

        # Random delay between 2-5 seconds to appear more human-like;
        # skipped after the last hotel because no request follows it.
        if position < remaining - 1:
            time.sleep(random.uniform(2, 5))

    print(
        f"\nScraping complete! Scraped {succeeded} of {remaining} remaining hotels "
        f"({total_hotels} total). Data saved to {OUTPUT_FILE}"
    )
    if failed_numbers:
        print(f"Hotels that could not be scraped: {failed_numbers}")

if __name__ == "__main__":
    main()

Resuming from existing progress
Found 21 hotels remaining to process

Processing hotel 1513 of 1533: Casa Del Mar...
http://200.174.198.86:8888
Requesting URL: https://www.tripadvisor.com/Hotel_Review-g297946-d12711169-Reviews-Casa_Del_Mar-La_Marsa_Tunis_Governorate.html
Progress saved after hotel 1513

Processing hotel 1514 of 1533: Hammam Sousse Apartment...
http://200.174.198.86:8888
Requesting URL: https://www.tripadvisor.com/Hotel_Review-g1189197-d15766033-Reviews-Hammam_Sousse_Apartment-Hammam_Sousse_Sousse_Governorate.html
Progress saved after hotel 1514

Processing hotel 1515 of 1533: Dessole Garden Beach Resort...
http://200.174.198.86:8888
Requesting URL: https://www.tripadvisor.com/Hotel_Review-g297949-d25361749-Reviews-Dessole_Garden_Beach_Resort-Monastir_Monastir_Governorate.html
Progress saved after hotel 1515

Processing hotel 1516 of 1533: Pieds Dans L'eau A Kerkennah...
http://200.174.198.86:8888
Requesting URL: https://www.tripadvisor.com/Hotel_Review-g11775155-d16865