In [1]:
import csv
import re
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(
    # The file name carries a YYMMDD-HHMM timestamp taken when this runs.
    # "%M" is the minute; "%m" is the month, which would repeat the month
    # number in the slot meant for minutes.
    filename=f"sofurn_2024_{datetime.today().strftime('%y%m%d-%H%M')}.log",
    encoding="utf-8",
    level=logging.DEBUG,
)

## 크롤링 방법
사이트는 참가 업체마다 상세 페이지를 하나씩 만들고, 페이지 id를 1씩 증가시켰다.
2023년의 마지막 id가 283이었고, 올해(2024년) 참가 업체의 마지막 id가 329였으므로 284~329 범위를 수집한다.

In [24]:
# Common prefix of every detail page; a page id is appended to it.
URL = "https://www.sofurn.or.kr/community/reference/id/"
# Page ids to crawl, kept as strings: 284 through 329 inclusive.
RANGE_2024 = list(map(str, range(284, 330)))

In [4]:
def get_content(url: str) -> "BeautifulSoup | None":
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed document when the server answers with HTTP 200.
        ``None`` when the request itself fails (connection error, timeout)
        or when any other status code comes back; the failure is written
        to the module logger in both cases.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Treat transport failures like HTTP failures: log and return None.
        logger.debug(f"[DEBUG] Get content error: {exc}\nProblem URL: {url}")
        return None

    if response.status_code != 200:
        logger.debug(
            f"[DEBUG] Get content error: {response.status_code}\nProblem URL: {url}"
        )
        return None

    return BeautifulSoup(response.text, "html.parser")

In [5]:
# Fetch a single page (id 289) as a sample to work with.
test_url = f"{URL}289"
soup = get_content(test_url)

In [6]:
# Header cell text of the first "table notice-table" div on the sample page.
notice_div = soup.find_all("div", class_="table notice-table")[0]
a = notice_div.table.thead.tr.td.text

In [7]:
# Echo the extracted header text as the cell output.
a

'2024 소펀&라이프쇼 브랜드 : 투투 시스템 샤워기'

In [17]:
# Label-anchored patterns applied to the page's concatenated text.
# NOTE(review): get_text() joins text nodes without a separator, so the
# "[^\n]*" in the address pattern runs to the next literal newline and can
# swallow text that follows the address. Confirm against the page markup
# before tightening it.
_FIELD_PATTERNS = {
    "category": re.compile(r"참가품목\s*:\s*([^\d\s&]+&?[^\d\s&]+)주요제품"),
    "product": re.compile(r"주요제품\s*:\s*([^\n]*)대표전화"),
    "phone": re.compile(r"대표전화\s*:\s*(\d*-\d*-\d*)"),
    "address": re.compile(r"주\s*소\s*:\s*([^\n]*)"),
}


def _extract_title(soup: BeautifulSoup) -> str:
    """Return the text after the first colon in the notice table's header cell."""
    node = soup.find("div", class_="table notice-table")
    # Walk div -> table -> thead -> tr -> td, stopping at the first gap.
    for tag_name in ("table", "thead", "tr", "td"):
        if node is None:
            return ""
        node = node.find(tag_name)
    if node is None:
        return ""

    header_text = node.text
    # Split on the first colon only, so a name containing ":" stays whole.
    _, colon, brand = header_text.partition(":")
    return (brand if colon else header_text).strip()


def _link_href(soup: BeautifulSoup, label: str) -> str:
    """Return the href of the <a> whose text is exactly *label*, or ""."""
    link = soup.find("a", string=label)
    if link is None:
        return ""
    return link.get("href", "")


def parse_content(soup: BeautifulSoup) -> dict:
    """Extract the exhibitor fields from one parsed detail page.

    Args:
        soup: Parsed HTML of a detail page.

    Returns:
        A dict with the keys ``title``, ``category``, ``product``, ``phone``,
        ``address``, ``url`` and ``sns``. A field that cannot be found is
        returned as an empty string. If the header cell contains no colon,
        ``title`` is the whole header text.
    """
    html_text = soup.get_text()

    record = {"title": _extract_title(soup)}
    for field, pattern in _FIELD_PATTERNS.items():
        match = pattern.search(html_text)
        record[field] = match.group(1).strip() if match else ""

    # Homepage and SNS come from <a> tags, located by their link text.
    # The label must be passed as string=...; a bare second positional
    # argument is interpreted by BeautifulSoup as a CSS class filter.
    record["url"] = _link_href(soup, "홈페이지")
    record["sns"] = _link_href(soup, "SNS 바로가기")
    return record

In [18]:
# Parse the sample page held in `soup` into a dict of fields.
d = parse_content(soup)

In [19]:
# Echo the parsed fields as the cell output.
d

{'title': '투투 시스템 샤워기',
 'category': '헬스&뷰티',
 'product': '이온수 샤워기',
 'phone': '032-670-8878',
 'address': '경기 부천시 도약로261,대우테크노파크 B동608호기능성 샤워기 제조업체 투투입니다-욕실,주방수전 전문 제조업체. 다양한 기능을 가진 기능성 샤워기입니다.홈페이지',
 'url': 'http://www.twotwo2.com',
 'sns': ''}

In [11]:
# Print each parsed field on its own line as "name:value".
for field_name, field_value in d.items():
    print(field_name, field_value, sep=":")

title:2024 소펀&라이프쇼 브랜드 : 투투 시스템 샤워기
category:헬스&뷰티
product:이온수 샤워기
phone:032-670-8878
address:경기 부천시 도약로261,대우테크노파크 B동608호기능성 샤워기 제조업체 투투입니다-욕실,주방수전 전문 제조업체. 다양한 기능을 가진 기능성 샤워기입니다.홈페이지
url:


In [20]:
def build_csv(url: str, filename: Path):
    """Crawl every page id in ``RANGE_2024`` and append one CSV row per page.

    A header row is written first when *filename* does not exist yet; an
    existing file is appended to without a new header.

    Args:
        url: Base URL to which each page id is appended.
        filename: Path of the CSV file to create or extend.

    Pages for which ``get_content`` returns ``None`` are skipped. The ``id``
    column is the position of the page id within ``RANGE_2024``.
    """
    header = ["id", "company", "category", "product", "phone", "address", "url", "sns"]
    fields = ("title", "category", "product", "phone", "address", "url", "sns")

    # Decide about the header before opening, since "a" mode creates the file.
    needs_header = not filename.exists()

    # csv.writer quotes and escapes embedded quotes, commas and newlines,
    # which hand-built f-string rows do not. newline="" is required by the
    # csv module; "\n" keeps the line ending used for previously written rows.
    with open(filename, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if needs_header:
            writer.writerow(header)

        for i, idx in enumerate(RANGE_2024):
            soup = get_content(f"{url}{idx}")
            if soup is None:
                # get_content returns None for a page it could not fetch.
                continue

            parsed_dict = parse_content(soup)
            writer.writerow([i, *(parsed_dict.get(key, "") for key in fields)])

In [21]:
# Absolute path of the output CSV inside the current working directory.
result_file_path = Path.cwd() / "parsed_sofurn.csv"

In [23]:
# Run the crawl: fetch every page id in RANGE_2024 and append rows to the CSV.
build_csv(url=URL, filename=result_file_path)