# In [None]:
import json
import os
from bs4 import BeautifulSoup

def _parse_rating(review_div):
    """Return the rating found in *review_div* as a float, or None if absent.

    The rating is read from the text of the ``<title>`` element nested in the
    rating container; everything before the first ``' of '`` is converted with
    ``float`` (presumably text such as ``'4.0 of 5 bubbles'`` -- confirm
    against real markup). A ``ValueError`` from ``float`` is left to the
    caller so the offending text shows up in its error message.
    """
    rating_tag = review_div.find('div', class_='nKWJn u')
    if rating_tag is None:
        return None
    title_tag = rating_tag.find('title')
    if title_tag is None:
        # Container present but no <title>: treat as a missing rating instead
        # of failing with AttributeError on ``None.text``.
        return None
    return float(title_tag.text.split(' of ')[0])


def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or None when *tag* is None."""
    return tag.text.strip() if tag is not None else None


def extract_reviews_from_html(html_content):
    """Extract complete reviews from a pasted HTML fragment.

    Every ``<div class="JVaPo Gi kQjeB">`` is treated as one review. A review
    is kept only when its rating, visit date, title and text are all present;
    incomplete reviews are reported on stdout and skipped.

    Args:
        html_content: HTML markup as a string.

    Returns:
        A list of dicts with the keys ``review_rating`` (float),
        ``date_visited``, ``review_title`` and ``review_text`` (strings).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    reviews = []

    for review_div in soup.find_all('div', class_='JVaPo Gi kQjeB'):
        try:
            # 1. Rating
            rating = _parse_rating(review_div)

            # 2. Date visited
            date_visited = _stripped_text(
                review_div.find('div', class_='biGQs _P fiohW fOtGX')
            )

            # 3. Review title: prefer the container tagged with
            #    data-test-target, fall back to the class-only container.
            title_container = review_div.find(
                'div', class_='_c', attrs={'data-test-target': 'review-title'}
            )
            if title_container is None:
                title_container = review_div.find('div', class_='biGQs _P fiohW ngXxk')
            title_tag = (
                title_container.find('a', class_='BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS')
                if title_container is not None
                else None
            )
            review_title = _stripped_text(title_tag)

            # 4. Review text
            review_text = _stripped_text(review_div.find('span', class_='_d _c'))

            # The rating is checked with ``is None`` so that a parsed 0.0 is
            # not mistaken for a missing value; the text fields still count
            # as missing when they are empty strings.
            missing_fields = []
            if rating is None:
                missing_fields.append("rating")
            if not date_visited:
                missing_fields.append("date")
            if not review_title:
                missing_fields.append("title")
            if not review_text:
                missing_fields.append("text")

            if missing_fields:
                print(f"⚠️ Skipped incomplete review (missing: {', '.join(missing_fields)})")
                continue

            reviews.append({
                'review_rating': rating,
                'date_visited': date_visited,
                'review_title': review_title,
                'review_text': review_text
            })

        except Exception as e:
            # Deliberate best-effort: one malformed review (e.g. an
            # unparsable rating) must not abort the rest of the page.
            print(f"⚠️ Error processing review: {str(e)}")
            continue

    return reviews

def get_html_input():
    """Read multi-line HTML from standard input.

    Lines are collected until a line equal to ``END`` (ignoring case and
    surrounding whitespace) is entered or the input stream ends.

    Returns:
        The collected lines joined with newlines; the terminator line is
        not included.
    """
    print("\nPaste HTML content (type 'END' on a new line when finished):")
    collected = []
    try:
        current = input()
        while current.strip().upper() != 'END':
            collected.append(current)
            current = input()
    except EOFError:
        # End of input counts as an implicit terminator.
        pass
    return '\n'.join(collected)

def save_progress(output_file, data):
    """Write *data* to *output_file* as UTF-8 JSON without clobbering it on failure.

    The JSON is first written to ``<output_file>.tmp`` and then moved over the
    destination with ``os.replace``. If serialisation or writing fails, the
    previously saved file is left untouched instead of being truncated.

    Args:
        output_file: Destination path of the JSON file.
        data: JSON-serialisable object to store.

    Returns:
        True when the file was written, False otherwise (the error is printed).
    """
    temp_file = f"{output_file}.tmp"
    try:
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the finished file in only after the dump fully succeeded.
        os.replace(temp_file, output_file)
        return True
    except Exception as e:
        # Callers rely on the boolean result, so every failure is reported
        # here rather than propagated.
        print(f"Error saving file: {e}")
        try:
            os.remove(temp_file)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass
        return False

def _load_saved_hotels(output_file):
    """Return the hotels already stored in *output_file*, or [] if unavailable.

    A missing file yields an empty list silently. An unreadable or invalid
    JSON file also yields an empty list, but a warning is printed so the user
    knows that earlier progress could not be loaded.
    """
    if not os.path.exists(output_file):
        return []
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"⚠️ Could not load existing progress from {output_file}: {e}")
        return []


def process_hotels(hotels, output_file):
    """Interactively collect reviews for each hotel and persist the results.

    Hotels whose ``hotel_number`` already appears in *output_file* are
    skipped. For every other hotel the user repeatedly chooses between
    pasting HTML (reviews are extracted and accumulated), saving the hotel,
    saving it without requiring reviews, or exiting. Progress is written to
    *output_file* whenever a hotel is saved or the user exits.

    Args:
        hotels: Iterable of dicts with the keys ``name``, ``link`` and
            ``hotel_number``.
        output_file: Path of the JSON file used to load and store progress.

    Returns:
        The list of saved hotel dicts (previously stored plus newly added).
    """
    saved_hotels = _load_saved_hotels(output_file)
    processed_numbers = {h['hotel_number'] for h in saved_hotels}

    for hotel in hotels:
        if hotel['hotel_number'] in processed_numbers:
            print(f"\n⏩ Skipping already processed hotel #{hotel['hotel_number']}")
            continue

        current_hotel = {
            'name': hotel['name'],
            'link': hotel['link'],
            'hotel_number': hotel['hotel_number'],
            'reviews': []
        }
        # True once current_hotel has been appended to saved_hotels, so that
        # retrying after a failed save does not append it a second time.
        staged = False

        print("\n" + "="*50)
        print(f"🏨 Hotel #{hotel['hotel_number']}: {hotel['name']}")
        print(f"🔗 Link: {hotel['link']}")
        print("="*50)

        while True:
            print("\nOptions:")
            print("1. Paste HTML to extract reviews")
            print("2. Finish and save this hotel")
            print("3. Skip this hotel (save without reviews)")
            print("4. Exit and save progress")
            choice = input("Your choice (1-4): ").strip()

            if choice == '1':
                html_content = get_html_input()

                if not html_content.strip():
                    print("❗ No HTML content provided")
                    continue

                reviews = extract_reviews_from_html(html_content)

                if reviews:
                    current_hotel['reviews'].extend(reviews)
                    print(f"\n✅ Extracted {len(reviews)} reviews!")
                    print(f"Sample review: {reviews[0]['review_title'][:50]}...")
                else:
                    print("\n❗ No reviews found in the HTML")
                    print("Please verify you're copying the correct section containing reviews")

            elif choice == '2' or choice == '3':
                if choice == '2' and not current_hotel['reviews']:
                    print("\n⚠️ No reviews added to this hotel. Use option 3 to skip instead?")
                    continue

                if not staged:
                    saved_hotels.append(current_hotel)
                    staged = True

                if save_progress(output_file, saved_hotels):
                    if choice == '2':
                        print(f"\n💾 Saved hotel #{hotel['hotel_number']} with {len(current_hotel['reviews'])} reviews")
                    else:
                        print(f"\n💾 Saved hotel #{hotel['hotel_number']} (skipped)")
                    break
                # Stay in the menu so the user can retry or exit.
                print("❌ Failed to save progress!")

            elif choice == '4':
                if save_progress(output_file, saved_hotels):
                    print("\n💾 Progress saved. Exiting...")
                    return saved_hotels
                else:
                    print("❌ Failed to save progress!")
                    continue

            else:
                print("❗ Invalid choice. Please enter 1-4")

        # Reached only after a successful save: remember the number so a
        # repeated entry later in `hotels` is skipped as well.
        processed_numbers.add(hotel['hotel_number'])

    return saved_hotels

if __name__ == "__main__":
    # Script entry point: load the hotel list, run the interactive review
    # collection, and report how many hotels ended up in the output file.
    input_file = 'hotels_details_no_reviews.json'
    output_file = 'hotels_reviews_only.json'

    print(f"\nLoading hotels from {input_file}...")
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            hotels = json.load(f)
    except Exception as e:
        print(f"Error loading input file: {e}")
        # The bare exit() helper is injected by the `site` module for
        # interactive use and would report success (status 0); raise
        # SystemExit directly with a failure status instead.
        raise SystemExit(1)

    print(f"Starting review extraction. Progress will be saved to {output_file}")
    result = process_hotels(hotels, output_file)

    print(f"\n🎉 Completed! Processed {len(result)} hotels")
    print(f"Final data saved to {output_file}")