# In [None]:
import os
import time
import subprocess
from urllib.parse import urljoin
import html2text
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import yaml


class ColumnCrawler:
    """Crawl column pages through a remote-debugging Chrome and save them as Markdown.

    The crawler launches Chrome with a remote-debugging port, attaches a
    Selenium driver to it, and then either processes the single URL given as
    ``column`` or every link found on the category listing page.

    Args:
        config: Mapping with a ``"settings"`` sub-mapping providing
            ``category_id``, ``url_category``, ``filetype``, ``base_url`` and
            ``base_path``.
        column: URL of a single page to save. When ``None``, all links found
            on ``{base_url}/{url_category}`` are processed instead.
    """

    # Chrome binary that is launched for remote debugging (macOS application path).
    CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    # Port shared by the launched Chrome and the Selenium "debuggerAddress".
    DEBUGGING_PORT = "9222"
    # Seconds to wait for <body> to be present after navigation.
    PAGE_LOAD_TIMEOUT = 10
    # Seconds to wait after <body> appears, before the DOM snapshot is taken.
    RENDER_WAIT_SECONDS = 10
    # Pause between two consecutive pages.
    REQUEST_INTERVAL_SECONDS = 10
    # Title returned by parse_page_text when no article body is found.
    NOT_FOUND_TITLE = "본문을 찾을 수 없습니다."

    # Exact ``class`` attribute values used to locate elements in the page.
    LINK_CLASS = "group/link mr-auto flex w-full min-w-0 flex-col gap-1 visited:text-fuchsia-900 xl:flex-row xl:gap-0"
    TITLE_CLASS = "flex w-full items-center gap-1.5 break-words text-base font-medium"
    ARTICLE_CLASS = "prose prose-hr:my-6 max-w-none overflow-x-auto whitespace-break-spaces prose-sm"

    def __init__(self, config, column=None):
        self.category_id = config["settings"]["category_id"]
        self.url_category = config["settings"]["url_category"]
        self.driver = None  # created by setup_selenium_driver()
        self.each_column = column
        self.filetype = config["settings"]["filetype"]
        self.base_url = config["settings"]["base_url"]
        self.base_path = config["settings"]["base_path"]
        # Output directory: <base_path>/<category_id>/<filetype>
        self.folder_path = os.path.join(self.base_path, self.category_id, self.filetype)

    def start_chrome_debugging(self):
        """Launch Chrome with the remote-debugging port open and pause briefly."""
        subprocess.Popen(
            [self.CHROME_PATH, f"--remote-debugging-port={self.DEBUGGING_PORT}"]
        )
        # Fixed pause before Selenium attaches to the freshly started browser.
        time.sleep(2)

    def setup_selenium_driver(self):
        """Attach a Selenium Chrome driver to the debugging port and store it on ``self.driver``."""
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "debuggerAddress", f"127.0.0.1:{self.DEBUGGING_PORT}"
        )
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def get_column_links(self):
        """Return the absolute URLs of all column links on the category page.

        Returns:
            A list of ``base_url + href`` strings, one per matching ``<a>``
            element that has an ``href`` attribute.
        """
        self.driver.get(f"{self.base_url}/{self.url_category}")
        WebDriverWait(self.driver, self.PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        links = soup.find_all("a", class_=self.LINK_CLASS)
        return [f"{self.base_url}{link['href']}" for link in links if link.get("href")]

    def parse_page_text(self, url):
        """Load ``url`` and convert its article body to Markdown.

        Args:
            url: Address of the page to load.

        Returns:
            A ``(title, markdown)`` tuple. When no article body is found the
            tuple is ``(NOT_FOUND_TITLE, "")``. When the article exists but no
            title text is found, the last path segment of ``url`` is used as
            the title (or ``"untitled"`` if that is empty too).
        """
        self.driver.get(url)
        WebDriverWait(self.driver, self.PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Wait BEFORE reading page_source: the parsed soup is a static copy,
        # so sleeping after the snapshot cannot pick up late-rendered content.
        time.sleep(self.RENDER_WAIT_SECONDS)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        title = soup.find("h3", self.TITLE_CLASS)
        article = soup.find("div", self.ARTICLE_CLASS)

        if article is None:
            return self.NOT_FOUND_TITLE, ""

        # Make image sources absolute so they still resolve from the saved file.
        for img in article.find_all("img"):
            if img.get("src"):
                img["src"] = urljoin(url, img["src"])

        def custom_image_formatter(tag):
            src = tag.get("src", "")
            alt = tag.get("alt", "")
            return f"![{alt}]({src})\n\n"

        h = html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = False
        h.protect_links = True
        h.images_to_alt = False
        h.images_as_html = False
        h.body_width = 0
        # NOTE(review): `custom_tag_formatter` does not look like a documented
        # html2text option, so this assignment may have no effect — confirm.
        h.custom_tag_formatter = custom_image_formatter
        markdown_content = h.handle(str(article))

        # A missing <h3> used to raise AttributeError here; fall back to the URL.
        title_text = title.get_text(strip=True) if title is not None else ""
        if not title_text:
            title_text = url.rstrip("/").rsplit("/", 1)[-1] or "untitled"

        return title_text, markdown_content

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with path separators and NUL replaced by ``_``."""
        forbidden = {os.sep, "\0"}
        if os.altsep:
            forbidden.add(os.altsep)
        cleaned = "".join("_" if ch in forbidden else ch for ch in title)
        return cleaned if cleaned.strip() else "untitled"

    def save_markdown_file(self, title, markdown_content):
        """Write ``markdown_content`` under a ``# title`` heading into ``folder_path``.

        Args:
            title: Heading text; also used (with path separators replaced) as
                the file name stem.
            markdown_content: Markdown body written after the heading.

        Returns:
            Path of the written ``.md`` file.
        """
        # The raw title may contain "/" which would point outside folder_path
        # or into a non-existent sub-directory.
        md_filename = f"{self._safe_filename(title)}.md"
        md_path = os.path.join(self.folder_path, md_filename)

        os.makedirs(self.folder_path, exist_ok=True)

        with open(md_path, "w", encoding="utf-8") as md_file:
            md_file.write(f"# {title}\n\n")
            md_file.write(markdown_content)

        print()

        return md_path

    def _crawl_one(self, url):
        """Parse one page, save it if any content was extracted, then pause."""
        title, markdown_content = self.parse_page_text(url)
        if markdown_content:
            self.save_markdown_file(title, markdown_content)
            print(f"{title}이 저장되었습니다")
        else:
            # Nothing was extracted: report it instead of writing a file named
            # after the "not found" message.
            print(f"{title} ({url})")
        time.sleep(self.REQUEST_INTERVAL_SECONDS)

    def crawl_and_save(self):
        """Start Chrome, attach the driver, and save the configured page(s).

        Processes only ``each_column`` when it was given, otherwise every link
        returned by ``get_column_links``. The driver is always quit afterwards,
        even when a page raises.
        """
        self.start_chrome_debugging()
        self.setup_selenium_driver()

        try:
            if self.each_column is not None:
                urls = [self.each_column]
            else:
                urls = self.get_column_links()

            for url in urls:
                self._crawl_one(url)
        finally:
            self.driver.quit()


def load_yaml(file_path):
    """Read the UTF-8 YAML file at ``file_path`` and return its parsed content.

    Parsing uses ``yaml.safe_load``.
    """
    with open(file_path, mode="r", encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed


config = load_yaml("../config/crawler.yaml")

# The crawl mode is selected by the config file alone:
#   * settings.column set to a URL  -> save only that single column as Markdown.
#   * settings.column missing/empty -> save every column linked from the
#     category listing page.
# A missing key used to raise KeyError, so switching modes required editing code.
column = config["settings"].get("column") or None

crawler = ColumnCrawler(config, column)
crawler.crawl_and_save()