In [None]:
import os
import time
import requests
import re

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager

# City whose snapshots are scraped; passed to scrape_robust() by the
# __main__ guard and matched case-insensitively against the page's <h3> text.
TARGET_CITY = "Amsterdam"
# Root output directory (relative to the working directory). Files are stored
# under OUTPUT_ROOT/<city>/<snapshot date>/.
OUTPUT_ROOT = "../../raw"
# File names to download from each snapshot table. Despite the "ENDINGS" name,
# scrape_robust() compares these against the full basename of each link.
TARGET_FILES_ENDINGS = ["listings.csv.gz", "calendar.csv.gz", "reviews.csv.gz", "neighbourhoods.csv"]

def download_file(url, folder_path, timeout=60, chunk_size=1024 * 1024):
    """Download *url* into *folder_path*, keeping the URL's last path segment as the file name.

    The body is streamed to a temporary ``<name>.part`` file and only renamed
    to its final name once the transfer has completed, so an interrupted
    download never leaves a truncated file that looks valid.

    Args:
        url: Address of the file to fetch.
        folder_path: Existing directory the file is written to.
        timeout: Seconds ``requests`` waits for the connection and for each
            read before giving up (not a limit on the total transfer time).
        chunk_size: Number of bytes written to disk per iteration.

    Returns:
        True if the file was saved, False if the request or the write failed
        (the error is printed, not raised).
    """
    filename = url.split("/")[-1]
    save_path = os.path.join(folder_path, filename)
    tmp_path = save_path + ".part"

    # Browser-like User-Agent sent with the request.
    headers = {'User-Agent':  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                             "AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/128.0.0.0 Safari/537.36"}
    try:
        print(f"Downloading: {filename}")
        # stream=True avoids holding the whole archive in memory; the context
        # manager releases the connection even when an error occurs.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        # Publish the file under its real name only after a complete transfer.
        os.replace(tmp_path, save_path)
        return True
    except (requests.RequestException, OSError) as e:
        print(f" Error: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False

def _expand_city_section(driver, city_name):
    """Scroll to the city's <h3> heading and click the "show" link that follows it.

    Best effort: any failure is printed and swallowed so the caller can still
    parse whatever the page currently shows.
    """
    # Case-insensitive match of the city name against the <h3> text.
    city_xpath = (
        "//h3[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
        "'abcdefghijklmnopqrstuvwxyz'), "
        f"'{city_name.lower()}')]"
    )

    try:
        # 1. Wait for the city heading to be present in the DOM.
        city_header = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, city_xpath))
        )

        # Bring the heading into view before interacting with its section.
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", city_header)
        time.sleep(1)
        print(f"Đã thấy title của {city_name}")

        # 2. The first <a> after the heading whose text contains "show".
        show_btn_xpath = f"{city_xpath}/following::a[contains(text(), 'show')][1]"
        show_button = driver.find_element(By.XPATH, show_btn_xpath)

        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", show_button)
        print("  Đã CLICK nút Show! Đang chờ 5s...")
        time.sleep(5)

    except Exception as e:  # deliberate: expanding the section is optional
        print(f"Lỗi không liên quan đến xử lý nút show: {e}")


def _download_city_snapshots(target_h3, city_name, page_url):
    """Download the target files of every snapshot table that follows *target_h3*.

    Walks the parsed document from the city heading until the next <h3>.
    Each <h4> provides the snapshot date for the tables after it; tables seen
    before any date has been parsed are skipped.

    Args:
        target_h3: The parsed <h3> element of the city.
        city_name: Used as the sub-directory name under OUTPUT_ROOT.
        page_url: URL of the scraped page, used to resolve relative links.

    Returns:
        Number of downloads that were not reported as failed.
    """
    from urllib.parse import urljoin

    current_date_str = None
    processed_count = 0
    found_snapshots = set()

    for element in target_h3.next_elements:
        tag_name = getattr(element, "name", None)

        # Another city's heading marks the end of this city's section.
        if tag_name == 'h3':
            break

        # Remember the date of the snapshot currently being processed.
        if tag_name == 'h4':
            raw_date = element.get_text(strip=True)
            match = re.search(r"(\d{1,2}\s+[A-Za-z]+\,?\s+\d{4})", raw_date)
            if match:
                current_date_str = match.group(1)
            continue

        if tag_name != 'table' or not current_date_str:
            continue

        # Resolve hrefs against the page URL so relative links work too, and
        # keep only the files we are interested in.
        file_urls = [
            f_url
            for f_url in (urljoin(page_url, link['href'])
                          for link in element.find_all('a', href=True))
            if os.path.basename(f_url) in TARGET_FILES_ENDINGS
        ]
        if not file_urls:
            continue

        if current_date_str not in found_snapshots:
            print(f"  Snapshot: {current_date_str}")
            found_snapshots.add(current_date_str)

        folder_path = os.path.join(OUTPUT_ROOT, city_name, current_date_str)
        os.makedirs(folder_path, exist_ok=True)

        for f_url in file_urls:
            # Only an explicit False from download_file is treated as a
            # failed download; any other result counts as processed.
            if download_file(f_url, folder_path) is not False:
                processed_count += 1

    return processed_count


def scrape_robust(city_name):
    """Open the Inside Airbnb data page in Firefox and download *city_name*'s snapshot files.

    Steps: load the page, expand the city's section via its "show" link,
    parse the resulting HTML, then download every file listed in
    TARGET_FILES_ENDINGS into OUTPUT_ROOT/<city_name>/<snapshot date>/.

    Errors are printed rather than raised, and the browser is always closed.

    Args:
        city_name: City to look for; matched case-insensitively against the
            text of the page's <h3> headings.
    """
    # Headless mode is intentionally not enabled: per the original author's
    # note, with headless on not all snapshots get loaded (cause unknown).
    options = Options()
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)
    print(" Đang mở Firefox ")
    try:
        url = "http://insideairbnb.com/get-the-data/"
        driver.get(url)
        print("Mở web thành công đang đợi tải")

        _expand_city_section(driver, city_name)

        # 3. Parse the page as currently rendered and locate the city heading.
        print(" Đang quét HTML...")
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        wanted = city_name.lower()
        target_h3 = next(
            (h3 for h3 in soup.find_all('h3')
             # Collapse runs of whitespace before comparing.
             if wanted in " ".join(h3.get_text().split()).lower()),
            None,
        )
        if target_h3 is None:
            print("Vẫn không thấy H3 trong HTML sau khi tải.")
            return

        processed_count = _download_city_snapshots(target_h3, city_name, driver.current_url)
        print(f"\nHOÀN TẤT! {processed_count} files.")

    except Exception as e:  # top-level boundary: report and fall through to cleanup
        print(f"Lỗi tổng: {e}")
    finally:
        driver.quit()
        print("Đã đóng Firefox")

# Entry point: scrape the configured city when the file is executed directly
# (this condition also holds when the cell is run in a notebook).
if __name__ == "__main__":
    scrape_robust(TARGET_CITY)

 Đang mở Firefox 
Mở web thành công đang đợi tải
Đã thấy title của Amsterdam
  Đã CLICK nút Show! Đang chờ 5s...
 Đang quét HTML...
  Snapshot: 11 September 2025
Downloading: listings.csv.gz
Downloading: calendar.csv.gz
Downloading: reviews.csv.gz
Downloading: neighbourhoods.csv
  Snapshot: 09 June 2025
Downloading: listings.csv.gz
Downloading: calendar.csv.gz
Downloading: reviews.csv.gz
Downloading: neighbourhoods.csv
  Snapshot: 02 March 2025
Downloading: listings.csv.gz
Downloading: calendar.csv.gz
Downloading: reviews.csv.gz
Downloading: neighbourhoods.csv
  Snapshot: 07 December 2024
Downloading: listings.csv.gz
Downloading: calendar.csv.gz
Downloading: reviews.csv.gz
Downloading: neighbourhoods.csv

HOÀN TẤT! 16 files.
Đã đóng Firefox
