In [3]:
import time
import random
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import pandas as pd

In [4]:
def setup_driver(timeout=10):
    """Start a Chrome WebDriver session and build an explicit wait bound to it.

    Args:
        timeout: Seconds passed to ``WebDriverWait`` as its timeout.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        A ``(driver, wait)`` tuple: the ``webdriver.Chrome`` instance and the
        ``WebDriverWait`` created for that same driver.
    """
    # The Service is built from whatever ChromeDriverManager().install() returns.
    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service)
    wait = WebDriverWait(driver, timeout)
    return driver, wait

# Collect the recipe links found on one listing page
def get_recipe_links(driver, page_url):
    """Open ``page_url`` and return the href of every recipe link on it.

    Args:
        driver: Selenium WebDriver used to load the page.
        page_url: URL of the listing page to open.

    Returns:
        A list with the ``href`` of each element whose class is
        ``common_sp_link``, in the order the driver returns them. Elements
        whose ``href`` is missing or empty are skipped.
    """
    driver.get(page_url)  # navigate to the listing page
    # Pause for a random 1.5-3 s after navigation
    # (presumably to let the page settle and to throttle requests).
    time.sleep(random.uniform(1.5, 3))
    anchors = driver.find_elements(By.CLASS_NAME, "common_sp_link")
    # Read each href exactly once: every get_attribute call is a separate
    # WebDriver command, so reading it twice doubles the work per link.
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    return [href for href in hrefs if href]


In [5]:
# Extract recipe data, including the cooking steps
def _collect_summary(driver):
    """Return the stripped text of every <span> inside the summary-info blocks."""
    texts = []
    for block in driver.find_elements(By.CLASS_NAME, "view2_summary_info"):
        texts.extend(span.text.strip() for span in block.find_elements(By.TAG_NAME, "span"))
    return texts


def _collect_ingredients(driver):
    """Return one "name: quantity" string per ingredient list item, in page order."""
    entries = []
    for item in driver.find_elements(By.CSS_SELECTOR, "#divConfirmedMaterialArea li"):
        # A missing name is not handled here on purpose: it propagates to the
        # caller's handler, exactly as before.
        name = item.find_element(By.CLASS_NAME, "ingre_list_name").text.strip()
        try:
            quantity = item.find_element(By.CLASS_NAME, "ingre_list_ea").text.strip()
        except Exception:
            quantity = ""  # no quantity element: keep the ingredient with a blank amount
        # A list (not a dict keyed by name) so that an ingredient name that
        # appears more than once is not overwritten by the later occurrence.
        entries.append(f"{name}: {quantity}")
    return entries


def _collect_tools(driver):
    """Return the cooking-tool names, or an empty list if the section cannot be read."""
    try:
        # NOTE(review): positional XPath; presumably tied to the current page
        # layout -- confirm it still targets the tools section.
        section = driver.find_element(By.XPATH, '//*[@id="contents_area_full"]/div[6]/div[4]')
        return [el.text.strip() for el in section.find_elements(By.CLASS_NAME, "ingre_list_name")]
    except Exception:
        return []


def _collect_steps(driver):
    """Return the cooking steps in order, read from elements stepdescr1, stepdescr2, ..."""
    steps = []
    while True:
        try:
            step_element = driver.find_element(By.ID, f"stepdescr{len(steps) + 1}")
            main_text = step_element.text.split("\n")[0].strip()

            # Extra notes: text of <p> children that contain no <a>, each
            # wrapped in parentheses. Blank paragraphs are skipped so no
            # empty "()" ends up in the output.
            notes = []
            for paragraph in step_element.find_elements(By.TAG_NAME, "p"):
                if paragraph.find_elements(By.TAG_NAME, "a"):
                    continue
                note = paragraph.text.strip()
                if note:
                    notes.append(f"({note})")

            # Joining the parts avoids a trailing space when there are no notes.
            steps.append(" ".join([main_text] + notes))
        except Exception:
            # The first step id that cannot be read ends the sequence.
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate instead of silently ending the loop.
            break
    return steps


def extract_recipe_data_as_dataframe(driver, wait, href):
    """Load one recipe page and return its data as a single-row DataFrame.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        wait: Explicit wait used to block until an element with class
            ``view2_summary_info`` is present.
        href: URL of the recipe page.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Title``,
        ``Summary Info`` (comma-joined string), ``Ingredients``, ``Tools`` and
        ``Recipe Steps`` (each a list of strings), or ``None`` if any
        unhandled error occurs; the error is printed together with ``href``.
    """
    try:
        driver.get(href)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "view2_summary_info")))
        time.sleep(random.uniform(1.5, 3))

        data = {
            "Title": driver.title.strip(),
            "Summary Info": ", ".join(_collect_summary(driver)),
            "Ingredients": _collect_ingredients(driver),
            "Tools": _collect_tools(driver),
            "Recipe Steps": _collect_steps(driver),
        }

        # Wrapping the dict in a list yields exactly one row; the list-valued
        # fields are stored as single cells.
        return pd.DataFrame([data])

    except Exception as e:
        print(f"Error processing {href}: {e}")
        return None

In [7]:
def main(start_page=53, end_page=53, output_path="recipes.csv"):
    """Scrape recipe listing pages and save every extracted recipe to a CSV file.

    Args:
        start_page: First listing page to scrape (inclusive). Defaults to 53.
        end_page: Last listing page to scrape (inclusive). Defaults to 53.
        output_path: Destination CSV path. Defaults to ``"recipes.csv"``.

    The browser is always closed, even if scraping raises. The CSV is written
    only after the loop finishes without an exception; recipes whose
    extraction returned ``None`` are skipped.
    """
    driver, wait = setup_driver()

    # Collect the one-row frames and concatenate once at the end, instead of
    # re-copying a growing DataFrame on every recipe.
    recipe_frames = []

    try:
        for page in range(start_page, end_page + 1):
            page_url = f"https://www.10000recipe.com/recipe/list.html?order=reco&page={page}"
            print(f"Processing page: {page}")

            for href in get_recipe_links(driver, page_url):
                print(f"Processing recipe: {href}")
                recipe_data = extract_recipe_data_as_dataframe(driver, wait, href)
                if recipe_data is not None:
                    recipe_frames.append(recipe_data)

    finally:
        driver.quit()

    # pd.concat raises on an empty list, so fall back to an empty frame.
    if recipe_frames:
        all_recipes = pd.concat(recipe_frames, ignore_index=True)
    else:
        all_recipes = pd.DataFrame()

    # Save the results
    all_recipes.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"Data saved to {output_path}")


In [8]:
# Entry-point guard: call main() only when this code runs as the main module.
# The captured output that follows shows this cell did invoke main().
if __name__ == "__main__":
    main()

Processing page: 53
Processing recipe: https://www.10000recipe.com/recipe/6875498
Processing recipe: https://www.10000recipe.com/recipe/6922869
Processing recipe: https://www.10000recipe.com/recipe/6931396
Processing recipe: https://www.10000recipe.com/recipe/6830520
Processing recipe: https://www.10000recipe.com/recipe/6904454
Processing recipe: https://www.10000recipe.com/recipe/6909684
Processing recipe: https://www.10000recipe.com/recipe/6923036
Processing recipe: https://www.10000recipe.com/recipe/6904901
Processing recipe: https://www.10000recipe.com/recipe/6865434
Processing recipe: https://www.10000recipe.com/recipe/6936118
Processing recipe: https://www.10000recipe.com/recipe/6877054
Processing recipe: https://www.10000recipe.com/recipe/6918274
Processing recipe: https://www.10000recipe.com/recipe/6928699
Processing recipe: https://www.10000recipe.com/recipe/6837197
Processing recipe: https://www.10000recipe.com/recipe/6978403
Processing recipe: https://www.10000recipe.com/rec