In [None]:

import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from IPython.display import display



# Load variables from the .env file into the process environment.
load_dotenv()

# Read settings from environment variables.
service_key = os.getenv("SERVICE_KEY")
file_path = os.getenv("File_path")
if not file_path:
    # Fail early with a clear message instead of passing None/"" to read_csv.
    raise ValueError("환경 변수 'File_path' 가 설정되어 있지 않습니다 (.env 파일을 확인하세요).")

# Load the list of places to search for.
place_df = pd.read_csv(file_path)
display(place_df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display()/print() only adds a stray "None" and a duplicated summary.
place_df.info()

# Headless Chrome configuration.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(), options=options)

# Shared explicit wait (10 second timeout) and the accumulator for scraped rows;
# both are read by search_place_and_scrape().
wait = WebDriverWait(driver, 10)
combined_data = []

def _attr_or_none(parent, css, attr="title"):
    """Return attribute `attr` of the first element under `parent` matching `css`.

    Returns None when the lookup fails (for example, the element is absent).
    """
    try:
        return parent.find_element(By.CSS_SELECTOR, css).get_attribute(attr)
    except Exception:
        return None


def _text_or_none(parent, css):
    """Return the text of the first element under `parent` matching `css`, or None on failure."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css).text
    except Exception:
        return None


def _restore_main_window(main_window):
    """Close every browser window except `main_window` and switch back to it (best effort)."""
    if main_window is None:
        return
    try:
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)
    except Exception as cleanup_error:
        print(f"[창 정리 실패] 브라우저 창 복구 중 오류: {cleanup_error}")


def search_place_and_scrape(query):
    """Search map.kakao.com for `query` and scrape the first result plus its reviews.

    Uses the module-level `driver` and `wait`. On success, one row per review
    (or a single row with empty review fields when no review is found) is
    appended to the module-level `combined_data` list.

    Args:
        query: Search keyword typed into the map search box.

    Returns:
        True when the place was scraped, False when any step raised an
        exception (the error is printed, not re-raised).
    """
    main_window = None
    # Rows are staged locally and committed only on success, so a failed call
    # followed by a retry with another keyword cannot leave partial/duplicate rows.
    rows = []
    try:
        main_window = driver.current_window_handle
        driver.get("https://map.kakao.com/")
        time.sleep(2)
        search_box = driver.find_element(By.ID, "search.keyword.query")
        search_box.clear()
        search_box.send_keys(query)
        search_box.send_keys(Keys.RETURN)
        time.sleep(3)

        # Click the sort-by-popularity button (optional; ignored if unavailable).
        # Raw strings keep the CSS escape "\." without an invalid Python escape.
        try:
            sort_btn = driver.find_element(
                By.CSS_SELECTOR, r"#info\.search\.place\.sort > li:nth-child(2) > a"
            )
            sort_btn.click()
            time.sleep(2)
        except Exception:
            pass

        # First search result; a timeout here propagates to the outer handler.
        first_item = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, r"#info\.search\.place\.list > li:nth-child(1)")
            )
        )

        name = _attr_or_none(first_item, "a[data-id='name']")
        score = _attr_or_none(first_item, "em[data-id='scoreNum']")
        addr = _attr_or_none(first_item, "p[data-id='address']")

        print(f"[장소 수집] 이름: {name}, 평점: {score}, 주소: {addr}")

        # Open the detail page and move to the most recently opened window.
        detail_btn = first_item.find_element(By.CSS_SELECTOR, "a.moreview")
        driver.execute_script("arguments[0].click();", detail_btn)
        time.sleep(2)
        driver.switch_to.window(driver.window_handles[-1])

        # Review tab (optional).
        try:
            comment_tab = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[href='#comment']"))
            )
            comment_tab.click()
            time.sleep(2)
        except Exception:
            pass

        # Keep clicking "more" while the button exists and is displayed.
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            try:
                more_btn = driver.find_element(By.CSS_SELECTOR, "a.btn_more")
                if not more_btn.is_displayed():
                    break
                driver.execute_script("arguments[0].click();", more_btn)
                time.sleep(2)
            except Exception:
                break

        # Reviews
        reviews = driver.find_elements(By.CSS_SELECTOR, "ul.list_review > li")
        if reviews:
            print(f"[리뷰 수집] 총 {len(reviews)}개의 리뷰를 찾음.")
            for idx, r in enumerate(reviews, start=1):
                # Expand a truncated review body when an expand control is present.
                try:
                    btn_more = r.find_elements(By.CSS_SELECTOR, "span.btn_more")
                    if btn_more:
                        driver.execute_script("arguments[0].click();", btn_more[0])
                        time.sleep(1)
                except Exception:
                    pass

                # The rating is the number of star elements marked "on".
                try:
                    rating = len(
                        r.find_elements(By.CSS_SELECTOR, "span.wrap_grade > span.figure_star.on")
                    )
                except Exception:
                    rating = None
                reviewer = _text_or_none(r, "span.name_user")
                date = _text_or_none(r, "span.txt_date")
                content = _text_or_none(r, "p.desc_review")

                print(f"[{idx}] 리뷰 - 닉네임: {reviewer}, 별점: {rating}, 날짜: {date}, 내용: {content}")

                rows.append({
                    "검색어": query,
                    "장소명": name,
                    "평점": score,
                    "주소": addr,
                    "리뷰닉네임": reviewer,
                    "별점": rating,
                    "날짜": date,
                    "리뷰내용": content
                })
        else:
            print("[리뷰 없음] 리뷰가 없으므로 장소 정보만 저장.")
            rows.append({
                "검색어": query,
                "장소명": name,
                "평점": score,
                "주소": addr,
                "리뷰닉네임": None,
                "별점": None,
                "날짜": None,
                "리뷰내용": None
            })

        combined_data.extend(rows)
        return True

    except Exception as e:
        print(f"[오류 발생] 검색어 '{query}' 로 검색 실패: {e}")
        return False

    finally:
        # Always close any detail window and return to the search window, even
        # on failure. Only extra windows are closed, so the session survives
        # when the detail click did not open a new window.
        _restore_main_window(main_window)

# Run the crawl over the first MAX_ROWS rows of place_df only.
MAX_ROWS = 502


def _as_keyword(value):
    """Return `value` as a stripped search keyword, or "" when it is missing (None/NaN)."""
    if value is None or pd.isna(value):
        # str(NaN) would otherwise become the literal keyword "nan".
        return ""
    return str(value).strip()


try:
    for _, row in place_df.iloc[:MAX_ROWS].iterrows():
        keyword = _as_keyword(row.get("title"))
        alt_keyword = _as_keyword(row.get("addr1"))

        # Skip the search entirely when there is no title to search for.
        success = bool(keyword) and search_place_and_scrape(keyword)
        if not success and alt_keyword:
            # Fall back to searching by address.
            print(f"주소로 재검색: {alt_keyword}")
            search_place_and_scrape(alt_keyword)
finally:
    # Release the browser even if the loop raises or is interrupted.
    driver.quit()

result_df = pd.DataFrame(combined_data)

Unnamed: 0,addr1,addr2,areacode,booktour,cat1,cat2,cat3,contentid,contenttypeid,createdtime,...,firstimage2,cpyrhtDivCd,mapx,mapy,mlevel,modifiedtime,sigungucode,tel,title,zipcode
0,서울특별시 종로구 북촌로 41,,1,,A05,A0502,A05020100,1968879,39,20141100000000.0,...,http://tong.visitkorea.or.kr/cms/resource/77/1...,Type3,126.984692,37.580603,6.0,20221200000000.0,23,02-747-5535,백년토종삼계탕,3055.0
1,서울특별시 종로구 서순라길 55,,1,,A05,A0502,A05020900,1968826,39,20141100000000.0,...,http://tong.visitkorea.or.kr/cms/resource/93/3...,Type3,126.992992,37.572951,6.0,20241200000000.0,23,02-3672-1599,순라길 예 & 비비,3134.0
2,서울특별시 중구 세종대로21길 49,,1,,A05,A0502,A05020300,1966366,39,20141100000000.0,...,http://tong.visitkorea.or.kr/cms/resource/18/1...,Type3,126.975605,37.568773,6.0,20250100000000.0,24,02-736-1001,오양회참치,4519.0
3,서울특별시 종로구 종로36나길 9 (종로5가),,1,,A05,A0502,A05020100,1965691,39,20141100000000.0,...,,,127.004444,37.570268,6.0,20241200000000.0,23,02-2266-8125,더 우리곱창,3197.0
4,서울특별시 종로구 종로40가길 5,,1,,A05,A0502,A05020100,1965675,39,20141100000000.0,...,,,127.006368,37.570263,6.0,20241200000000.0,23,02-2279-0996,호남집,3197.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   addr1          999 non-null    object 
 1   addr2          381 non-null    object 
 2   areacode       999 non-null    int64  
 3   booktour       232 non-null    float64
 4   cat1           999 non-null    object 
 5   cat2           999 non-null    object 
 6   cat3           999 non-null    object 
 7   contentid      999 non-null    int64  
 8   contenttypeid  999 non-null    int64  
 9   createdtime    999 non-null    float64
 10  firstimage     635 non-null    object 
 11  firstimage2    635 non-null    object 
 12  cpyrhtDivCd    635 non-null    object 
 13  mapx           999 non-null    float64
 14  mapy           999 non-null    float64
 15  mlevel         995 non-null    float64
 16  modifiedtime   999 non-null    float64
 17  sigungucode    999 non-null    int64  
 18  tel       

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   addr1          999 non-null    object 
 1   addr2          381 non-null    object 
 2   areacode       999 non-null    int64  
 3   booktour       232 non-null    float64
 4   cat1           999 non-null    object 
 5   cat2           999 non-null    object 
 6   cat3           999 non-null    object 
 7   contentid      999 non-null    int64  
 8   contenttypeid  999 non-null    int64  
 9   createdtime    999 non-null    float64
 10  firstimage     635 non-null    object 
 11  firstimage2    635 non-null    object 
 12  cpyrhtDivCd    635 non-null    object 
 13  mapx           999 non-null    float64
 14  mapy           999 non-null    float64
 15  mlevel         995 non-null    float64
 16  modifiedtime   999 non-null    float64
 17  sigungucode    999 non-null    int64  
 18  tel       

In [None]:

# (Revised) Save as CSV instead of Excel. If saving fails, re-run only this cell.
output_path = "./data/ZB_TourAPI_area_based_seoul_4_reviewCrawling.csv"
# to_csv does not create missing directories; make sure ./data exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# utf-8-sig writes a BOM at the start of the file.
result_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"장소+리뷰 데이터 저장 완료: {os.path.basename(output_path)}")
