# Kakao Ratings

In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

# Set up ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Open Kakao Map URL
url = "https://map.kakao.com/?from=total&nil_suggest=btn&tab=place&q=%EB%8C%80%EC%A0%84+%EC%9C%A0%EC%84%B1%EA%B5%AC+%EB%B4%89%EB%AA%85%EB%8F%99+%EC%9D%8C%EC%8B%9D%EC%A0%90"
driver.get(url)
time.sleep(5)  # Wait for the page to load

# Global variables
data = []
max_results = 2500  # Limit for results
current_page = 1

def remove_dimmed_layer():
    """Remove the dimmed layer that blocks clicks."""
    try:
        driver.execute_script("document.getElementById('dimmedLayer').remove();")
        print("Dimmed layer removed.")
    except:
        pass  # No overlay to remove

def scroll_and_collect():
    """Scroll to load more places and collect restaurant data."""
    scroll_count = 15
    for i in range(scroll_count):
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(2)
        print(f"Scrolling... {i + 1}/{scroll_count}")

    # Collect PlaceItem data
    places = driver.find_elements(By.CLASS_NAME, "PlaceItem.clickArea")
    print(f"Total places found on this page: {len(places)}")
    
    for place in places:
        try:
            title = place.find_element(By.CLASS_NAME, "link_name").get_attribute("title")
            rating = place.find_element(By.CSS_SELECTOR, "div[data-id='rating'] em.num").text
            print(f"Restaurant: {title}, Rating: {rating}")
            data.append({"Title": title, "Rating": rating})
        except Exception:
            print("Error extracting place data. Skipping...")
            continue

def navigate_pages():
    """Navigate through pages and collect data."""
    global current_page

    while len(data) < max_results:  # Stop if we reach max results
        try:
            print(f"--- Starting Page {current_page} ---")
            scroll_and_collect()

            # Stop if we reach the maximum results
            if len(data) >= max_results:
                break

            # Calculate local page number (1-5)
            page_number = current_page % 5 if current_page % 5 != 0 else 5

            # Click on the page button (1–5)
            remove_dimmed_layer()
            page_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, f"info.search.page.no{page_number}"))
            )
            page_button.click()
            print(f"Clicked on Page {current_page} (Local Page {page_number})")
            time.sleep(3)  # Allow time for the page to load

            # If it's Page 5, click the "Next" button
            if page_number == 5:
                remove_dimmed_layer()
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "info.search.page.next"))
                )
                next_button.click()
                print(f"Clicked 'Next' button after Page {current_page}")
                time.sleep(5)  # Allow time for pages to reload

            current_page += 1

        except Exception as e:
            print(f"No more pages or error occurred: {e}")
            break

# Run the scraping and navigation
navigate_pages()

# Save collected data to CSV
df = pd.DataFrame(data)
output_path = r"C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Data saved to {output_path}")

# Close browser
driver.quit()
print("Browser closed successfully.")
print(f"Total restaurants collected: {len(data)}")

--- Starting Page 1 ---
Scrolling... 1/15
Scrolling... 2/15
Scrolling... 3/15
Scrolling... 4/15
Scrolling... 5/15
Scrolling... 6/15
Scrolling... 7/15
Scrolling... 8/15
Scrolling... 9/15
Scrolling... 10/15
Scrolling... 11/15
Scrolling... 12/15
Scrolling... 13/15
Scrolling... 14/15
Scrolling... 15/15
Total places found on this page: 15
Restaurant: 원조태평소국밥 본관, Rating: 4.3
Restaurant: 온천손칼국수쭈꾸미, Rating: 3.8
Restaurant: 상무초밥 유성점, Rating: 3.5
Restaurant: 원조태평소국밥 유성점, Rating: 3.9
Restaurant: 워낭명가, Rating: 3.8
Restaurant: 바르미 샤브샤브n칼국수 봉명점, Rating: 3.9
Restaurant: 구들마루, Rating: 3.5
Restaurant: 일당감자탕, Rating: 3.5
Restaurant: 띠울석갈비 유성직영점, Rating: 2.6
Restaurant: 백마강참숯민물장어 유성점, Rating: 4.1
Restaurant: 유람 대전봉명점, Rating: 
Restaurant: 대손관 본점, Rating: 3.6
Restaurant: 아케이드커피, Rating: 3.0
Restaurant: 르뺑99-1, Rating: 3.3
Restaurant: 촌놈들연탄구이 본점, Rating: 4.2
Dimmed layer removed.
Clicked on Page 1 (Local Page 1)
--- Starting Page 2 ---
Scrolling... 1/15
Scrolling... 2/15
Scrolling... 3/15
Scrolling... 4/15

# Naver Images and Reviews

### Page 1

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Target URL
URL = "https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%B4%89%EB%AA%85%EB%8F%99%20%EC%9D%8C%EC%8B%9D%EC%A0%90"

# Load previously scraped data
data_file = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"
if os.path.exists(data_file):
    with open(data_file, "r", encoding="utf-8") as json_file:
        all_restaurants_data = json.load(json_file)
else:
    all_restaurants_data = []

# Extract names of already scraped restaurants
processed_restaurant_names = {restaurant["restaurant_name"] for restaurant in all_restaurants_data}

try:
    # Open the page
    driver.get(URL)
    time.sleep(10)  # Allow the page to load fully

    wait = WebDriverWait(driver, 15)

    def scrape_reviews_and_images(restaurant_name):
        """Scrape reviews and images from the side panel iframe."""
        restaurant_data = {
            "restaurant_name": restaurant_name,
            "reviews": [],
            "images": []
        }
        try:
            # Click the "리뷰" tab
            review_tab = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '리뷰') and @class='veBoZ']")))
            print("Clicking the 리뷰 tab...")
            review_tab.click()
            time.sleep(5)  # Allow time for the review tab to load

            # Scroll to load all reviews and images
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollBy(0, 1000);")
                time.sleep(5)  # Wait for content to load
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:  # Stop if no new content is loaded
                    break
                last_height = new_height

            # Retrieve reviews
            reviews = driver.find_elements(By.CSS_SELECTOR, "a[data-pui-click-code='rvshowmore']")
            for review in reviews:
                review_text = review.text.strip()
                if review_text and review_text != "더보기":  # Exclude "더보기"
                    restaurant_data["reviews"].append(review_text)

            # Retrieve images
            images = driver.find_elements(By.CSS_SELECTOR, "img.K0PDV")
            for img in images:
                img_url = img.get_attribute("src")
                if img_url and "/common/" not in img_url and "/emoji/" not in img_url:  # Exclude unwanted links
                    restaurant_data["images"].append(img_url)

        except Exception as e:
            print(f"Error scraping reviews and images: {e}")
        return restaurant_data

    while True:
        try:
            # Switch to the search iframe
            iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
            driver.switch_to.frame(iframe)

            # Scroll to load all restaurants on the current page
            scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
            last_height = 0
            while True:
                driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
                time.sleep(10)  # Allow time for content to load
                new_height = driver.execute_script("return arguments[0].scrollHeight;", scrollable_element)
                if new_height == last_height:
                    break
                last_height = new_height

            # Find all restaurant elements
            restaurants = driver.find_elements(By.CLASS_NAME, "TYaxT")
            print(f"\nFound {len(restaurants)} restaurants on this page.")

            # Process each restaurant
            for idx, restaurant in enumerate(restaurants, start=1):
                try:
                    restaurant_name = restaurant.text
                    if restaurant_name in processed_restaurant_names:
                        print(f"Skipping already processed restaurant: {restaurant_name}")
                        continue

                    print(f"Processing restaurant {idx}: {restaurant_name}")
                    driver.execute_script("arguments[0].click();", restaurant)
                    time.sleep(5)  # Allow the side panel to load

                    # Switch to the side panel iframe
                    driver.switch_to.default_content()
                    entry_iframe = wait.until(EC.presence_of_element_located((By.ID, "entryIframe")))
                    driver.switch_to.frame(entry_iframe)

                    # Scrape reviews and images
                    restaurant_data = scrape_reviews_and_images(restaurant_name)
                    all_restaurants_data.append(restaurant_data)
                    processed_restaurant_names.add(restaurant_name)

                    # Save progress after each restaurant
                    with open(data_file, "w", encoding="utf-8") as json_file:
                        json.dump(all_restaurants_data, json_file, ensure_ascii=False, indent=4)

                    # Return to the search iframe
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

                except Exception as e:
                    print(f"Error processing restaurant {idx}: {e}")
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

            # Locate and click the "Next Page" button
            driver.switch_to.default_content()
            next_page_button = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
            )
            if next_page_button.get_attribute("aria-disabled") == "true":
                print("Next Page button is disabled. No more pages to navigate.")
                break
            print("Navigating to the next page...")
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(10)  # Wait for the next page to load

        except Exception as e:
            print(f"Error occurred during page navigation: {e}")
            break

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser
    # driver.quit()
    print("Scraping complete.")


Found 76 restaurants on this page.
Skipping already processed restaurant: 노마드
Skipping already processed restaurant: 호맥 봉명점
Skipping already processed restaurant: 태평소국밥
Skipping already processed restaurant: 황우마을
Skipping already processed restaurant: 애월장흥한우
Skipping already processed restaurant: 야키토리잔잔 대전봉명점
Skipping already processed restaurant: 박봉명생갈비
Skipping already processed restaurant: 경성삼겹살
Skipping already processed restaurant: 부산갈매기 대전봉명점
Skipping already processed restaurant: 회뜨는총각 봉명점
Skipping already processed restaurant: 모미지 대전봉명본점
Skipping already processed restaurant: 파도수산 봉명점
Skipping already processed restaurant: 낙원갈비집 대전유성점
Skipping already processed restaurant: 온천칼국수
Skipping already processed restaurant: 하루토 이자카야
Skipping already processed restaurant: 육화담길
Skipping already processed restaurant: 홍콩구락부 대전봉명본점
Skipping already processed restaurant: 르뺑99-1
Skipping already processed restaurant: 컴히얼 대전봉명점
Skipping already processed restaurant: 일호황소곱창
Skipping already p

### Page 2

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Target URL
URL = "https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%B4%89%EB%AA%85%EB%8F%99%20%EC%9D%8C%EC%8B%9D%EC%A0%90"

# Load previously scraped data
data_file = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"
if os.path.exists(data_file):
    with open(data_file, "r", encoding="utf-8") as json_file:
        all_restaurants_data = json.load(json_file)
else:
    all_restaurants_data = []

# Extract names of already scraped restaurants
processed_restaurant_names = {restaurant["restaurant_name"] for restaurant in all_restaurants_data}


def scroll_to_load_all(driver, scrollable_element):
    """Scroll through the given element until all content is loaded."""
    last_height = 0
    while True:
        driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
        time.sleep(10)  # Allow time for content to load
        new_height = driver.execute_script("return arguments[0].scrollHeight;", scrollable_element)
        if new_height == last_height:
            print("Reached the end of the scrollable content.")
            break
        last_height = new_height


def scrape_reviews_and_images(restaurant_name):
    """Scrape reviews and images from the side panel iframe."""
    restaurant_data = {
        "restaurant_name": restaurant_name,
        "reviews": [],
        "images": []
    }
    try:
        # Click the "리뷰" tab
        review_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '리뷰') and @class='veBoZ']"))
        )
        print("Clicking the 리뷰 tab...")
        review_tab.click()
        time.sleep(5)

        # Scroll to load all reviews and images
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(5)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Retrieve reviews
        reviews = driver.find_elements(By.CSS_SELECTOR, "a[data-pui-click-code='rvshowmore']")
        for review in reviews:
            review_text = review.text.strip()
            if review_text and review_text != "더보기":
                restaurant_data["reviews"].append(review_text)

        # Retrieve images
        images = driver.find_elements(By.CSS_SELECTOR, "img.K0PDV")
        for img in images:
            img_url = img.get_attribute("src")
            if img_url and "/common/" not in img_url and "/emoji/" not in img_url:
                restaurant_data["images"].append(img_url)

    except Exception as e:
        print(f"Error scraping reviews and images: {e}")
    return restaurant_data


try:
    # Open the page
    driver.get(URL)
    time.sleep(10)  # Allow the page to load fully

    wait = WebDriverWait(driver, 15)

    # Skip the first page
    print("Skipping the first page...")
    iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
    driver.switch_to.frame(iframe)

    next_page_button = wait.until(
        EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
    )
    driver.execute_script("arguments[0].click();", next_page_button)
    time.sleep(5)  # Wait for the second page to load
    driver.switch_to.default_content()

    while True:
        try:
            # Switch to the search iframe
            iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
            driver.switch_to.frame(iframe)

            # Scroll to load all restaurants on the current page
            scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
            scroll_to_load_all(driver, scrollable_element)

            # Find all restaurant elements
            restaurants = driver.find_elements(By.CLASS_NAME, "TYaxT")
            print(f"\nFound {len(restaurants)} restaurants on this page.")

            # Process each restaurant
            for idx, restaurant in enumerate(restaurants, start=1):
                try:
                    restaurant_name = restaurant.text
                    if restaurant_name in processed_restaurant_names:
                        print(f"Skipping already processed restaurant: {restaurant_name}")
                        continue

                    print(f"Processing restaurant {idx}: {restaurant_name}")
                    driver.execute_script("arguments[0].click();", restaurant)
                    time.sleep(5)

                    # Switch to the side panel iframe
                    driver.switch_to.default_content()
                    entry_iframe = wait.until(EC.presence_of_element_located((By.ID, "entryIframe")))
                    driver.switch_to.frame(entry_iframe)

                    # Scrape reviews and images
                    restaurant_data = scrape_reviews_and_images(restaurant_name)
                    all_restaurants_data.append(restaurant_data)
                    processed_restaurant_names.add(restaurant_name)

                    # Save progress after each restaurant
                    with open(data_file, "w", encoding="utf-8") as json_file:
                        json.dump(all_restaurants_data, json_file, ensure_ascii=False, indent=4)

                    # Return to the search iframe
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

                except Exception as e:
                    print(f"Error processing restaurant {idx}: {e}")
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

            # Locate and click the "Next Page" button
            driver.switch_to.default_content()
            next_page_button = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
            )
            if next_page_button.get_attribute("aria-disabled") == "true":
                print("Next Page button is disabled. No more pages to navigate.")
                break
            print("Navigating to the next page...")
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(10)

        except Exception as e:
            print(f"Error occurred during page navigation: {e}")
            break

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    print("Scraping complete.")


Skipping the first page...
Reached the end of the scrollable content.

Found 76 restaurants on this page.
Processing restaurant 1: 어서오시게 대전유성점
Clicking the 리뷰 tab...
Processing restaurant 2: 진수곱창 대전봉명점
Clicking the 리뷰 tab...
Processing restaurant 3: 숯토리 대전봉명점
Clicking the 리뷰 tab...
Processing restaurant 4: 갈마골 초가김치
Clicking the 리뷰 tab...
Processing restaurant 5: 뼈먹자 얼큰한 뼈해장국
Clicking the 리뷰 tab...
Skipping already processed restaurant: 어선재
Processing restaurant 7: 더스크래치 뷔페 호텔인터시티
Clicking the 리뷰 tab...
Processing restaurant 8: 하루팡
Clicking the 리뷰 tab...
Processing restaurant 9: 윅스커피
Clicking the 리뷰 tab...
Processing restaurant 10: 김희선제육짜글이
Clicking the 리뷰 tab...
Processing restaurant 11: 퐁당 유성점
Clicking the 리뷰 tab...
Processing restaurant 12: 타마
Clicking the 리뷰 tab...
Processing restaurant 13: 청주해장국 유성본점
Clicking the 리뷰 tab...
Processing restaurant 14: 라비프
Clicking the 리뷰 tab...
Processing restaurant 15: 금회세꼬시
Clicking the 리뷰 tab...
Processing restaurant 16: 바다한상
Clicking the 리뷰 tab...

### Page 3

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Target URL
URL = "https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%B4%89%EB%AA%85%EB%8F%99%20%EC%9D%8C%EC%8B%9D%EC%A0%90"

# Load previously scraped data
data_file = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"
if os.path.exists(data_file):
    with open(data_file, "r", encoding="utf-8") as json_file:
        all_restaurants_data = json.load(json_file)
else:
    all_restaurants_data = []

# Extract names of already scraped restaurants
processed_restaurant_names = {restaurant["restaurant_name"] for restaurant in all_restaurants_data}


def scroll_to_load_all(driver, scrollable_element):
    """Scroll through the given element until all content is loaded."""
    last_height = 0
    while True:
        driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
        time.sleep(10)  # Allow time for content to load
        new_height = driver.execute_script("return arguments[0].scrollHeight;", scrollable_element)
        if new_height == last_height:
            print("Reached the end of the scrollable content.")
            break
        last_height = new_height


def scrape_reviews_and_images(restaurant_name):
    """Scrape reviews and images from the side panel iframe."""
    restaurant_data = {
        "restaurant_name": restaurant_name,
        "reviews": [],
        "images": []
    }
    try:
        # Click the "리뷰" tab
        review_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '리뷰') and @class='veBoZ']"))
        )
        print("Clicking the 리뷰 tab...")
        review_tab.click()
        time.sleep(5)

        # Scroll to load all reviews and images
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(5)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Retrieve reviews
        reviews = driver.find_elements(By.CSS_SELECTOR, "a[data-pui-click-code='rvshowmore']")
        for review in reviews:
            review_text = review.text.strip()
            if review_text and review_text != "더보기":
                restaurant_data["reviews"].append(review_text)

        # Retrieve images
        images = driver.find_elements(By.CSS_SELECTOR, "img.K0PDV")
        for img in images:
            img_url = img.get_attribute("src")
            if img_url and "/common/" not in img_url and "/emoji/" not in img_url:
                restaurant_data["images"].append(img_url)

    except Exception as e:
        print(f"Error scraping reviews and images: {e}")
    return restaurant_data


try:
    # Open the page
    driver.get(URL)
    time.sleep(10)  # Allow the page to load fully

    wait = WebDriverWait(driver, 15)

    # Skip to page 3
    print("Navigating to page 3...")
    iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
    driver.switch_to.frame(iframe)

    for _ in range(2):  # Click the "Next Page" button twice to go to page 3
        next_page_button = wait.until(
            EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
        )
        if next_page_button.get_attribute("aria-disabled") == "true":
            print("Next Page button is disabled. Cannot navigate further.")
            break
        driver.execute_script("arguments[0].click();", next_page_button)
        time.sleep(5)  # Wait for the page to load

    driver.switch_to.default_content()  # Return to the main context

    while True:
        try:
            # Switch to the search iframe
            iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
            driver.switch_to.frame(iframe)

            # Scroll to load all restaurants on the current page
            scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
            scroll_to_load_all(driver, scrollable_element)

            # Find all restaurant elements
            restaurants = driver.find_elements(By.CLASS_NAME, "TYaxT")
            print(f"\nFound {len(restaurants)} restaurants on this page.")

            # Process each restaurant
            for idx, restaurant in enumerate(restaurants, start=1):
                try:
                    restaurant_name = restaurant.text
                    if restaurant_name in processed_restaurant_names:
                        print(f"Skipping already processed restaurant: {restaurant_name}")
                        continue

                    print(f"Processing restaurant {idx}: {restaurant_name}")
                    driver.execute_script("arguments[0].click();", restaurant)
                    time.sleep(5)

                    # Switch to the side panel iframe
                    driver.switch_to.default_content()
                    entry_iframe = wait.until(EC.presence_of_element_located((By.ID, "entryIframe")))
                    driver.switch_to.frame(entry_iframe)

                    # Scrape reviews and images
                    restaurant_data = scrape_reviews_and_images(restaurant_name)
                    all_restaurants_data.append(restaurant_data)
                    processed_restaurant_names.add(restaurant_name)

                    # Save progress after each restaurant
                    with open(data_file, "w", encoding="utf-8") as json_file:
                        json.dump(all_restaurants_data, json_file, ensure_ascii=False, indent=4)

                    # Return to the search iframe
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

                except Exception as e:
                    print(f"Error processing restaurant {idx}: {e}")
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

            # Locate and click the "Next Page" button
            driver.switch_to.default_content()
            next_page_button = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
            )
            if next_page_button.get_attribute("aria-disabled") == "true":
                print("Next Page button is disabled. No more pages to navigate.")
                break
            print("Navigating to the next page...")
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(10)

        except Exception as e:
            print(f"Error occurred during page navigation: {e}")
            break

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    print("Scraping complete.")

Navigating to page 3...
Reached the end of the scrollable content.

Found 70 restaurants on this page.
Processing restaurant 1: 달빛참치대전유성점
Error processing restaurant 1: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6F7E16CC5+28821]
	(No symbol) [0x00007FF6F7D83850]
	(No symbol) [0x00007FF6F7C2578A]
	(No symbol) [0x00007FF6F7C791BE]
	(No symbol) [0x00007FF6F7C794AC]
	(No symbol) [0x00007FF6F7CC2647]
	(No symbol) [0x00007FF6F7C9F33F]
	(No symbol) [0x00007FF6F7CBF412]
	(No symbol) [0x00007FF6F7C9F0A3]
	(No symbol) [0x00007FF6F7C6A778]
	(No symbol) [0x00007FF6F7C6B8E1]
	GetHandleVerifier [0x00007FF6F814FCCD+3408029]
	GetHandleVerifier [0x00007FF6F816743F+3504143]
	GetHandleVerifier [0x00007FF6F815B61D+3455469]
	GetHandleVerifier [0x00007FF6F7EDBDCB+835995]
	(No symbol) [0x00007FF6F7D8EB6F]
	(No symbol) [0x00007FF6F7D8A824]
	(No symbol) [0x00007FF6F7D8A9BD]
	(No symbol) [0x00007FF6F7D7A1A9]
	BaseThreadInitThunk [0x00007FFAAAEA259D+29]
	RtlUserThreadStart [0x00007FFAAC34AF38+40]

Proces

### Page 4

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Target URL
URL = "https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%B4%89%EB%AA%85%EB%8F%99%20%EC%9D%8C%EC%8B%9D%EC%A0%90"

# Load previously scraped data
data_file = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"
if os.path.exists(data_file):
    with open(data_file, "r", encoding="utf-8") as json_file:
        all_restaurants_data = json.load(json_file)
else:
    all_restaurants_data = []

# Extract names of already scraped restaurants
processed_restaurant_names = {restaurant["restaurant_name"] for restaurant in all_restaurants_data}


def scroll_to_load_all(driver, scrollable_element):
    """Scroll through the given element until all content is loaded."""
    last_height = 0
    while True:
        driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
        time.sleep(10)  # Allow time for content to load
        new_height = driver.execute_script("return arguments[0].scrollHeight;", scrollable_element)
        if new_height == last_height:
            print("Reached the end of the scrollable content.")
            break
        last_height = new_height


def scrape_reviews_and_images(restaurant_name):
    """Scrape reviews and images from the side panel iframe."""
    restaurant_data = {
        "restaurant_name": restaurant_name,
        "reviews": [],
        "images": []
    }
    try:
        # Click the "리뷰" tab
        review_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '리뷰') and @class='veBoZ']"))
        )
        print("Clicking the 리뷰 tab...")
        review_tab.click()
        time.sleep(5)

        # Scroll to load all reviews and images
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(5)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Retrieve reviews
        reviews = driver.find_elements(By.CSS_SELECTOR, "a[data-pui-click-code='rvshowmore']")
        for review in reviews:
            review_text = review.text.strip()
            if review_text and review_text != "더보기":
                restaurant_data["reviews"].append(review_text)

        # Retrieve images
        images = driver.find_elements(By.CSS_SELECTOR, "img.K0PDV")
        for img in images:
            img_url = img.get_attribute("src")
            if img_url and "/common/" not in img_url and "/emoji/" not in img_url:
                restaurant_data["images"].append(img_url)

    except Exception as e:
        print(f"Error scraping reviews and images: {e}")
    return restaurant_data


try:
    # Open the page
    driver.get(URL)
    time.sleep(10)  # Allow the page to load fully

    wait = WebDriverWait(driver, 15)

    # Skip to page 3
    print("Navigating to page 4...")
    iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
    driver.switch_to.frame(iframe)

    for _ in range(3):  # Click the "Next Page" button three times to go to page 4
        next_page_button = wait.until(
            EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
        )
        if next_page_button.get_attribute("aria-disabled") == "true":
            print("Next Page button is disabled. Cannot navigate further.")
            break
        driver.execute_script("arguments[0].click();", next_page_button)
        time.sleep(5)  # Wait for the page to load

    driver.switch_to.default_content()  # Return to the main context

    while True:
        try:
            # Switch to the search iframe
            iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
            driver.switch_to.frame(iframe)

            # Scroll to load all restaurants on the current page
            scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
            scroll_to_load_all(driver, scrollable_element)

            # Find all restaurant elements
            restaurants = driver.find_elements(By.CLASS_NAME, "TYaxT")
            print(f"\nFound {len(restaurants)} restaurants on this page.")

            # Process each restaurant
            for idx, restaurant in enumerate(restaurants, start=1):
                try:
                    restaurant_name = restaurant.text
                    if restaurant_name in processed_restaurant_names:
                        print(f"Skipping already processed restaurant: {restaurant_name}")
                        continue

                    print(f"Processing restaurant {idx}: {restaurant_name}")
                    driver.execute_script("arguments[0].click();", restaurant)
                    time.sleep(5)

                    # Switch to the side panel iframe
                    driver.switch_to.default_content()
                    entry_iframe = wait.until(EC.presence_of_element_located((By.ID, "entryIframe")))
                    driver.switch_to.frame(entry_iframe)

                    # Scrape reviews and images
                    restaurant_data = scrape_reviews_and_images(restaurant_name)
                    all_restaurants_data.append(restaurant_data)
                    processed_restaurant_names.add(restaurant_name)

                    # Save progress after each restaurant
                    with open(data_file, "w", encoding="utf-8") as json_file:
                        json.dump(all_restaurants_data, json_file, ensure_ascii=False, indent=4)

                    # Return to the search iframe
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

                except Exception as e:
                    print(f"Error processing restaurant {idx}: {e}")
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

            # Locate and click the "Next Page" button
            driver.switch_to.default_content()
            next_page_button = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
            )
            if next_page_button.get_attribute("aria-disabled") == "true":
                print("Next Page button is disabled. No more pages to navigate.")
                break
            print("Navigating to the next page...")
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(10)

        except Exception as e:
            print(f"Error occurred during page navigation: {e}")
            break

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    print("Scraping complete.")

Navigating to page 4...
Reached the end of the scrollable content.

Found 70 restaurants on this page.
Skipping already processed restaurant: 고깃리88번지 대전봉명점
Skipping already processed restaurant: 할리스 대전도안DT점
Processing restaurant 3: 전통콩나물밥
Clicking the 리뷰 tab...
Processing restaurant 4: 오복대구탕
Clicking the 리뷰 tab...
Processing restaurant 5: 주가든
Clicking the 리뷰 tab...
Processing restaurant 6: 빈야드10
Clicking the 리뷰 tab...
Processing restaurant 7: 블렌딩바 봉명
Clicking the 리뷰 tab...
Processing restaurant 8: 파이룸
Clicking the 리뷰 tab...
Processing restaurant 9: 허성준보쌈 유성점
Clicking the 리뷰 tab...
Processing restaurant 10: 곱창고 대전봉명동점
Clicking the 리뷰 tab...
Processing restaurant 11: 빵장수단팥빵 대전유성점
Clicking the 리뷰 tab...
Processing restaurant 12: 미진축산 대전봉명점
Clicking the 리뷰 tab...
Processing restaurant 13: 설도 유성점
Clicking the 리뷰 tab...
Processing restaurant 14: 야키토리 코코데
Clicking the 리뷰 tab...
Processing restaurant 15: 스타벅스 대전도안DT점
Clicking the 리뷰 tab...
Processing restaurant 16: 용용선생 대전봉명점
Clicking the 리뷰 t

### Page 5

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os

# ChromeDriver path
chrome_driver_path = r"C:\Users\jeong\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options)

# Target URL
URL = "https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%B4%89%EB%AA%85%EB%8F%99%20%EC%9D%8C%EC%8B%9D%EC%A0%90"

# Load previously scraped data
data_file = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"
if os.path.exists(data_file):
    with open(data_file, "r", encoding="utf-8") as json_file:
        all_restaurants_data = json.load(json_file)
else:
    all_restaurants_data = []

# Extract names of already scraped restaurants
processed_restaurant_names = {restaurant["restaurant_name"] for restaurant in all_restaurants_data}


def scroll_to_load_all(driver, scrollable_element):
    """Scroll through the given element until all content is loaded."""
    last_height = 0
    while True:
        driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
        time.sleep(10)  # Allow time for content to load
        new_height = driver.execute_script("return arguments[0].scrollHeight;", scrollable_element)
        if new_height == last_height:
            print("Reached the end of the scrollable content.")
            break
        last_height = new_height


def scrape_reviews_and_images(restaurant_name):
    """Scrape reviews and images from the side panel iframe."""
    restaurant_data = {
        "restaurant_name": restaurant_name,
        "reviews": [],
        "images": []
    }
    try:
        # Click the "리뷰" tab
        review_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '리뷰') and @class='veBoZ']"))
        )
        print("Clicking the 리뷰 tab...")
        review_tab.click()
        time.sleep(5)

        # Scroll to load all reviews and images
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(5)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Retrieve reviews
        reviews = driver.find_elements(By.CSS_SELECTOR, "a[data-pui-click-code='rvshowmore']")
        for review in reviews:
            review_text = review.text.strip()
            if review_text and review_text != "더보기":
                restaurant_data["reviews"].append(review_text)

        # Retrieve images
        images = driver.find_elements(By.CSS_SELECTOR, "img.K0PDV")
        for img in images:
            img_url = img.get_attribute("src")
            if img_url and "/common/" not in img_url and "/emoji/" not in img_url:
                restaurant_data["images"].append(img_url)

    except Exception as e:
        print(f"Error scraping reviews and images: {e}")
    return restaurant_data


try:
    # Open the page
    driver.get(URL)
    time.sleep(10)  # Allow the page to load fully

    wait = WebDriverWait(driver, 15)

    # Skip to page 3
    print("Navigating to page 5...")
    iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
    driver.switch_to.frame(iframe)

    for _ in range(4):  # Click the "Next Page" button three times to go to page 4
        next_page_button = wait.until(
            EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
        )
        if next_page_button.get_attribute("aria-disabled") == "true":
            print("Next Page button is disabled. Cannot navigate further.")
            break
        driver.execute_script("arguments[0].click();", next_page_button)
        time.sleep(5)  # Wait for the page to load

    driver.switch_to.default_content()  # Return to the main context

    while True:
        try:
            # Switch to the search iframe
            iframe = wait.until(EC.presence_of_element_located((By.ID, "searchIframe")))
            driver.switch_to.frame(iframe)

            # Scroll to load all restaurants on the current page
            scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
            scroll_to_load_all(driver, scrollable_element)

            # Find all restaurant elements
            restaurants = driver.find_elements(By.CLASS_NAME, "TYaxT")
            print(f"\nFound {len(restaurants)} restaurants on this page.")

            # Process each restaurant
            for idx, restaurant in enumerate(restaurants, start=1):
                try:
                    restaurant_name = restaurant.text
                    if restaurant_name in processed_restaurant_names:
                        print(f"Skipping already processed restaurant: {restaurant_name}")
                        continue

                    print(f"Processing restaurant {idx}: {restaurant_name}")
                    driver.execute_script("arguments[0].click();", restaurant)
                    time.sleep(5)

                    # Switch to the side panel iframe
                    driver.switch_to.default_content()
                    entry_iframe = wait.until(EC.presence_of_element_located((By.ID, "entryIframe")))
                    driver.switch_to.frame(entry_iframe)

                    # Scrape reviews and images
                    restaurant_data = scrape_reviews_and_images(restaurant_name)
                    all_restaurants_data.append(restaurant_data)
                    processed_restaurant_names.add(restaurant_name)

                    # Save progress after each restaurant
                    with open(data_file, "w", encoding="utf-8") as json_file:
                        json.dump(all_restaurants_data, json_file, ensure_ascii=False, indent=4)

                    # Return to the search iframe
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

                except Exception as e:
                    print(f"Error processing restaurant {idx}: {e}")
                    driver.switch_to.default_content()
                    driver.switch_to.frame("searchIframe")

            # Locate and click the "Next Page" button
            driver.switch_to.default_content()
            next_page_button = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[span[text()='다음페이지']]"))
            )
            if next_page_button.get_attribute("aria-disabled") == "true":
                print("Next Page button is disabled. No more pages to navigate.")
                break
            print("Navigating to the next page...")
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(10)

        except Exception as e:
            print(f"Error occurred during page navigation: {e}")
            break

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    print("Scraping complete.")

Navigating to page 5...
Reached the end of the scrollable content.

Found 20 restaurants on this page.
Skipping already processed restaurant: 옥뜨
Skipping already processed restaurant: 관저명태촌 봉명점
Processing restaurant 3: 진소우 유성봉명점
Clicking the 리뷰 tab...
Processing restaurant 4: 델리바이애슐리 NC대전유성점
Clicking the 리뷰 tab...
Skipping already processed restaurant: 대동집 대전 봉명점
Processing restaurant 6: 덕희하이볼클럽 대전봉명점
Clicking the 리뷰 tab...
Processing restaurant 7: 정원맥주
Clicking the 리뷰 tab...
Processing restaurant 8: 온천돌구이
Clicking the 리뷰 tab...
Processing restaurant 9: 유성닭발 봉명점
Clicking the 리뷰 tab...
Processing restaurant 10: 엔제리너스 대전유성D/I점
Clicking the 리뷰 tab...
Processing restaurant 11: 엄마식당
Clicking the 리뷰 tab...
Processing restaurant 12: 마실커피24
Clicking the 리뷰 tab...
Processing restaurant 13: 써브웨이 대전유성터미널점
Clicking the 리뷰 tab...
Processing restaurant 14: 노군꼬치 대전봉명점
Clicking the 리뷰 tab...
Processing restaurant 15: 육미당
Clicking the 리뷰 tab...
Processing restaurant 16: 쵸피
Clicking the 리뷰 tab...
Proces

# Matching kakao data ~ naver data

In [7]:
import pandas as pd
import json

# Paths to your files
csv_file_path = r"C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv"
json_file_path = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"

# Load the CSV file
print("Loading CSV file...")
csv_data = pd.read_csv(csv_file_path)

# Corrected column name for restaurant titles
restaurant_column = 'Title'

# Load the JSON file
print("Loading JSON file...")
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Extract 'restaurant_name' from JSON
restaurant_names = [entry['restaurant_name'] for entry in json_data if 'restaurant_name' in entry]

# Compare titles in CSV with restaurant names in JSON
print("Comparing CSV 'Title' with JSON 'restaurant_name'...")
matches = csv_data[csv_data[restaurant_column].isin(restaurant_names)]

# Output the matching entries
output_file = r"C:\Users\jeong\Downloads\matching_restaurants.csv"
matches.to_csv(output_file, index=False)
print(f"Matching entries saved to {output_file}")


Loading CSV file...
Loading JSON file...
Comparing CSV 'Title' with JSON 'restaurant_name'...
Matching entries saved to C:\Users\jeong\Downloads\matching_restaurants.csv


In [14]:
import pandas as pd
import json
from rapidfuzz import process, fuzz

# Paths to your files
csv_file_path = r"C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv"
json_file_path = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"

# Load the CSV file with proper encoding
print("Loading CSV file...")
csv_data = pd.read_csv(csv_file_path, encoding='utf-8')

# Load the JSON file
print("Loading JSON file...")
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Extract restaurant names from JSON
json_names = [entry['restaurant_name'] for entry in json_data if 'restaurant_name' in entry]

# Preserve original names
csv_data['kakao_name'] = csv_data['Title']  # Preserve original Kakao names
json_original_names = json_names.copy()  # Preserve original Naver names

# Preprocess names for comparison
def preprocess_name(name):
    if isinstance(name, str):
        return ''.join(e for e in name if e.isalnum()).lower()  # Remove special characters and convert to lowercase
    return ""

print("Preprocessing data for comparison...")
csv_data['Normalized_Title'] = csv_data['kakao_name'].apply(preprocess_name)
normalized_json_names = [preprocess_name(name) for name in json_original_names]

# Match names using fuzzy matching
print("Matching names with fuzzy logic...")
matches = []
for idx, row in csv_data.iterrows():
    best_match, score, _ = process.extractOne(
        row['Normalized_Title'], normalized_json_names, scorer=fuzz.ratio
    )
    if score > 80:  # Threshold for a good match
        matches.append({
            "kakao_name": row['kakao_name'],
            "naver_name": json_original_names[normalized_json_names.index(best_match)],
            "Match Score": score
        })

# Convert matches to a DataFrame
matches_df = pd.DataFrame(matches)

# Remove duplicates based on 'kakao_name' and 'naver_name'
matches_df = matches_df.drop_duplicates(subset=['kakao_name', 'naver_name'])

# Save matches to a CSV file
output_file = r"C:\Users\jeong\Downloads\fuzzy_matched_restaurants.csv"
matches_df.to_csv(output_file, index=False, encoding='utf-8-sig')  # Ensure correct encoding for Korean characters
print(f"Matched entries saved to {output_file}")


Loading CSV file...
Loading JSON file...
Preprocessing data for comparison...
Matching names with fuzzy logic...
Matched entries saved to C:\Users\jeong\Downloads\fuzzy_matched_restaurants.csv


### combining into one now. restaurant name, rating (from kakao dataset)(can get by matching with kakao name from fuzzy_matched_restaurants), reviews & images from naver dataset (can get by matching with naver name from fuzzy_matches_restaurants.)

In [15]:
import pandas as pd

# Paths to the files
csv_files = [
    r"C:\Users\jeong\Downloads\fuzzy_matched_restaurants.csv",
    r"C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv"
]

# Inspect each CSV
for csv_file in csv_files:
    print(f"Inspecting file: {csv_file}")
    csv_data = pd.read_csv(csv_file, encoding='utf-8')
    print("Columns:", csv_data.columns)
    print()


Inspecting file: C:\Users\jeong\Downloads\fuzzy_matched_restaurants.csv
Columns: Index(['kakao_name', 'naver_name', 'Match Score'], dtype='object')

Inspecting file: C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv
Columns: Index(['Title', 'Rating'], dtype='object')



In [16]:
import json

# Path to the JSON file
json_file_path = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"

# Load and inspect the JSON structure
print(f"Inspecting file: {json_file_path}")
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Print keys for the first entry in the JSON
if isinstance(json_data, list) and len(json_data) > 0:
    print("Keys in JSON:", json_data[0].keys())
else:
    print("JSON structure is not a list or is empty.")


Inspecting file: C:\Users\jeong\Downloads\scraped_restaurant_data.json
Keys in JSON: dict_keys(['restaurant_name', 'reviews', 'images'])


In [17]:
import pandas as pd
import json

# File paths
fuzzy_matches_path = r"C:\Users\jeong\Downloads\fuzzy_matched_restaurants.csv"
kakao_ratings_path = r"C:\Users\jeong\Downloads\kakao_map_restaurant_ratings.csv"
naver_data_path = r"C:\Users\jeong\Downloads\scraped_restaurant_data.json"

# Load fuzzy matched data
print("Loading fuzzy matched data...")
fuzzy_matches = pd.read_csv(fuzzy_matches_path, encoding='utf-8')

# Load Kakao ratings data
print("Loading Kakao ratings data...")
kakao_data = pd.read_csv(kakao_ratings_path, encoding='utf-8')

# Load Naver data
print("Loading Naver data...")
with open(naver_data_path, 'r', encoding='utf-8') as file:
    naver_data = json.load(file)

# Convert Naver JSON to DataFrame
naver_df = pd.DataFrame(naver_data)

# Merge Kakao ratings with fuzzy matches
print("Merging Kakao ratings...")
fuzzy_matches = fuzzy_matches.merge(
    kakao_data, left_on='kakao_name', right_on='Title', how='left'
)

# Merge Naver reviews and images with fuzzy matches
print("Merging Naver reviews and images...")
fuzzy_matches = fuzzy_matches.merge(
    naver_df[['restaurant_name', 'reviews', 'images']],
    left_on='naver_name',
    right_on='restaurant_name',
    how='left'
)

# Select and rename final columns
final_data = fuzzy_matches[['kakao_name', 'Rating', 'reviews', 'images']].rename(
    columns={
        'kakao_name': 'restaurant_name',
        'Rating': 'rating'
    }
)

# Convert to JSON format
final_json = final_data.to_dict(orient='records')

# Save the final JSON
output_json_file = r"C:\Users\jeong\Downloads\combined_restaurant_data.json"
with open(output_json_file, 'w', encoding='utf-8') as json_file:
    json.dump(final_json, json_file, ensure_ascii=False, indent=4)
print(f"Combined data saved to {output_json_file}")

Loading fuzzy matched data...
Loading Kakao ratings data...
Loading Naver data...
Merging Kakao ratings...
Merging Naver reviews and images...
Combined data saved to C:\Users\jeong\Downloads\combined_restaurant_data.json


In [18]:
import json
from collections import Counter

# Path to the JSON file
input_json_file = r"C:\Users\jeong\Downloads\combined_restaurant_data.json"
output_json_file = r"C:\Users\jeong\Downloads\updated_combined_restaurant_data.json"

# Load the JSON data
print("Loading JSON data...")
with open(input_json_file, 'r', encoding='utf-8') as file:
    restaurant_data = json.load(file)

# Count duplicates
restaurant_names = [entry["restaurant_name"] for entry in restaurant_data]
duplicates = Counter(restaurant_names)  # Count occurrences of each restaurant name
duplicate_count = sum(1 for count in duplicates.values() if count > 1)

print(f"Number of duplicate entries: {duplicate_count}")
print(f"Total unique restaurants: {len(duplicates)}")

# Remove duplicates by keeping the first occurrence
unique_data = {entry["restaurant_name"]: entry for entry in restaurant_data}.values()

# Save the updated data
print("Saving updated JSON file...")
with open(output_json_file, 'w', encoding='utf-8') as file:
    json.dump(list(unique_data), file, ensure_ascii=False, indent=4)

print(f"Updated JSON file saved to {output_json_file}")

Loading JSON data...
Number of duplicate entries: 41
Total unique restaurants: 166
Saving updated JSON file...
Updated JSON file saved to C:\Users\jeong\Downloads\updated_combined_restaurant_data.json
