In [3]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import os, json
import re
import logging
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import quote

In [4]:
def load_progress(mode: int) -> int:
    """
    mode == 1 -> STATE_FILE_TOP 사용
    mode == 0 -> STATE_FILE_BOTTOM 사용
    파일이 없거나 형식이 올바르지 않으면 0 반환
    """
    state_file = STATE_FILE_TOP if mode == 1 else STATE_FILE_BOTTOM
    if not os.path.exists(state_file):
        return 0
    try:
        with open(state_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return data.get('last_idx', 0)
    except (json.JSONDecodeError, IOError):
        return 0
    
def save_progress(idx: int, mode: int):
    state_file = STATE_FILE_TOP if mode == 1 else STATE_FILE_BOTTOM
    with open(state_file, 'w', encoding='utf-8') as f:
        json.dump({'last_idx': idx}, f, ensure_ascii=False)

In [5]:
# — 설정 및 환경 준비 —
csv_input = 'C:/VSCode/EDA/data/combined_tourist_data.csv'
csv_output_top = 'C:/VSCode/EDA/data/naver_map_reviews_top_100-150.csv'
csv_output_bottom = 'C:/VSCode/EDA/data/naver_map_reviews_bottom_200.csv'

# 출력 파일이 없으면 헤더부터 생성
if not os.path.exists(csv_output_top):
    pd.DataFrame(columns=['place','text','total_reviews'])\
      .to_csv(csv_output_top, index=False, encoding='utf-8-sig')
if not os.path.exists(csv_output_bottom):
    pd.DataFrame(columns=['place','text','total_reviews'])\
      .to_csv(csv_output_bottom, index=False, encoding='utf-8-sig')

# 크롬 드라이버 설정
chrome_driver_path = r'C:/VSCode/EDA/code/chromedriver-win64/chromedriver.exe'  # 실제 경로로 수정
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('window-size=1380,900')
# options.add_argument('--headless')  # 화면 표시 없이 실행하려면 주석 해제

service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)

df = pd.read_csv(csv_input, encoding='utf-8')
visitor_map = dict(zip(df['관광지'], df['2024']))

In [6]:
df_clean = (
    df
    .dropna(subset=['2024'])          # 결측치 제거
    .loc[lambda d: d['2024'] > 0]     # 0인 값 제거
)
df_desc = df_clean.sort_values('2024', ascending=False).reset_index(drop=True)
df_100_150 = df_desc.iloc[99:150].reset_index(drop=True)
df_asc = df_clean.sort_values('2024', ascending=True).reset_index(drop=True)
df_100_200 = df_asc.iloc[200:600].reset_index(drop=True)
top_100    = df_clean.nlargest(100, '2024').reset_index(drop=True)
bottom_100 = df_clean.nsmallest(100, '2024').reset_index(drop=True)
print("상위 100~150번째 shape:", df_100_150.shape)
print("하위 100~200번째 shape:", df_100_200.shape)

상위 100~150번째 shape: (51, 9)
하위 100~200번째 shape: (400, 9)


In [8]:

bottom_tourist_spots = df_100_200['관광지'].dropna().unique().tolist()

print(f"하위 {len(bottom_tourist_spots)}개의 관광지명을 불러왔습니다.")

하위 400개의 관광지명을 불러왔습니다.


In [9]:
middle_top = df_100_150['관광지'].dropna().unique().tolist()
middle_bottom = df_100_200['관광지'].dropna().unique().tolist()

In [10]:

STATE_FILE_BOTTOM = 'progress_middle_bottom_1.json'

In [13]:
start_idx = load_progress(0) + 1
print(f"▶ 재시작 인덱스: {start_idx} ({middle_bottom[start_idx]} 부터 처리)")

▶ 재시작 인덱스: 37 (희움일본군위안부역사관 부터 처리)


In [14]:
MAX_REVIEWS = 100  
for idx in range(start_idx, len(middle_bottom)):
    place_name = middle_bottom[idx]
    main_handle = driver.current_window_handle
    try:
        driver.get(f"https://map.naver.com/v5/search/{place_name}")
        time.sleep(3)
        for handle in driver.window_handles:
            if handle != main_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
        driver.find_element(By.ID, "searchIframe") 
        try:
            WebDriverWait(driver, 10).until(
                EC.frame_to_be_available_and_switch_to_it((By.ID, "searchIframe"))
            )
            first_span = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "span.YwYLL"))
            )
        except TimeoutException:
            print(f"   ⚠️ 검색 결과 없음: {place_name} → 건너뜁니다.")
            continue
        first_click = first_span.find_element(By.XPATH, "./ancestor::a[1]")

        driver.execute_script("arguments[0].click();", first_click)
                
        driver.switch_to.default_content()

        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="entryIframe"]')))
        iframe = driver.find_element(By.XPATH, '//*[@id="entryIframe"]')
        driver.switch_to.frame(iframe)

        review_tab = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//a[contains(text(),'리뷰')]")
        ))
        tab_text = review_tab.text        
        m = re.search(r'([\d,]+)', tab_text)
        total_reviews = int(m.group(1).replace(',', '')) if m else 0
        # JS로 클릭해서 안정성↑
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(2)

        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        time.sleep(1)

        # sort_btn = WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.XPATH, "//a[normalize-space(text())='최신순']"))
        # )

        sort_btn = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[normalize-space(text())='최신순']"))
        )

        # 2) 뷰포트 중앙에 위치시켜 본래 클릭 시도
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", sort_btn)
        time.sleep(0.5)

        try:
            # 일반 클릭
            sort_btn.click()
        except ElementClickInterceptedException:
            # 가려져 있다면 화면을 1/4만큼 위로 스크롤
            driver.execute_script("window.scrollBy(0, -window.innerHeight * 0.25);")
            time.sleep(0.5)
            # JS 클릭으로 재시도
            driver.execute_script("arguments[0].click();", sort_btn)


        # WebDriverWait(driver, 10).until(
        #         EC.element_to_be_clickable((By.XPATH, "//a[text()='최신순']"))
        #     ).click()
        # time.sleep(2)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep
        for _ in range(25):
            # 1) 현재 로드된 리뷰 요소 개수 체크
            elems = driver.find_elements(By.XPATH, "//div[@class='pui__vn15t2']/a")
            valid_count = sum(
                1 for el in elems 
                if (txt:=el.text.strip()) and txt != "더보기"
            )
            if valid_count >= MAX_REVIEWS:
                break   # 충분히 로드했으니 클릭 중단

            try:
                more_btn = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//span[normalize-space(text())='더보기']/ancestor::a[1]")
                    )
                )
                driver.execute_script(
                    "arguments[0].scrollIntoView({block:'center'});", more_btn
                )
                time.sleep(0.5)
                driver.execute_script("arguments[0].click();", more_btn)
                time.sleep(1)  # 클릭 후 로딩 대기
            except TimeoutException:
                break  # 더보기 버튼이 더 이상 없으면 중단

        # 리뷰 요소 수집
        try:
            review_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//div[@class='pui__vn15t2']/a")
                )
            )
        except TimeoutException:
            print(f"   ⚠️ 리뷰 없음: {place_name} → 건너뜁니다.")
            continue
        reviews = []
        for el in review_elements:
            text = el.text.strip()
            if text and text != "더보기":
                reviews.append(text)
                if len(reviews) >= MAX_REVIEWS:
                    break  # 100개 모이면 중단
        print(f"▶ {place_name} 리뷰 수집: {len(reviews)}개 (목표 {MAX_REVIEWS}개)")
        visitors_count = visitor_map.get(place_name, None)
        if reviews:
            records = [
                {
                    'place': place_name,
                    'text': text,
                    'total_reviews': total_reviews,  # 방금 구한 총 리뷰 수
                    'visitors_2024':  visitors_count,
                }
                for text in reviews
            ]
            df_out = pd.DataFrame(records)
            df_out.to_csv(
                csv_output_bottom,
                mode='a',            # append 모드
                header=False,        # 이미 헤더가 있으므로 False
                index=False,
                encoding='utf-8-sig'
            )

        print(f"   ✔️ 완료: {len(reviews)}개의 리뷰 저장됨")
    except Exception as e:
        save_progress(idx, 0)  
        print(f"   ⚠ 에러 발생: {place_name} ({e}), idx={idx} 저장 후 다음 관광지로 이동합니다.")
        for handle in driver.window_handles:
            if handle != main_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
        continue


   ⚠ 에러 발생: 희움일본군위안부역사관 (Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="searchIframe"]"}
  (Session info: chrome=138.0.7204.169); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
	GetHandleVerifier [0x0x7ff71676e935+77845]
	GetHandleVerifier [0x0x7ff71676e990+77936]
	(No symbol) [0x0x7ff716529cda]
	(No symbol) [0x0x7ff7165806aa]
	(No symbol) [0x0x7ff71658095c]
	(No symbol) [0x0x7ff7165d3d07]
	(No symbol) [0x0x7ff7165a890f]
	(No symbol) [0x0x7ff7165d0b07]
	(No symbol) [0x0x7ff7165a86a3]
	(No symbol) [0x0x7ff716571791]
	(No symbol) [0x0x7ff716572523]
	GetHandleVerifier [0x0x7ff716a4684d+3059501]
	GetHandleVerifier [0x0x7ff716a40c0d+3035885]
	GetHandleVerifier [0x0x7ff716a60400+3164896]
	GetHandleVerifier [0x0x7ff716788c3e+185118]
	GetHandleVerifier [0x0x7ff71679054f+216111]
	GetHandleVerifier [0x0x7ff7167772e4+113092]
	GetHandleVerifier [