In [51]:
import pandas as pd
import requests
import re
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup

In [11]:
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(filename=f"sofurn_2024_{datetime.today().strftime('%y%m%d-%H%m')}.log", encoding="utf-8", level=logging.DEBUG)

In [15]:
URL = "https://www.sofurn.or.kr/community/reference/id/"
RANGE_2024 = [str(i) for i in range(284,329)]

In [12]:
def get_content(url: str) -> BeautifulSoup:
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")
    else:
        logger.debug(f"[DEBUG] Get content error: {response.status_code}\nProblem URL: {url}")
        

In [24]:
test_url = "".join([URL, "289"])
soup = get_content(test_url)

In [46]:
for idx, elem in enumerate(soup.find_all("table", attrs={"role": "presentation", "class": "stb-one-col"})[1].tbody.tr.td.div.find_all("span")):
    print(f"{idx}: {elem.text}")

0: 참가품목 : 헬스&뷰티
1: 헬스&뷰티


In [84]:
def parse_content(soup: BeautifulSoup) -> dict:
    title = soup.find_all("td", class_="title")[0].text if soup.find_all("td", class_="title") else None

    category_pattern = r"참가품목\s*:\s*([^\d\s&]+&?[^\d\s&]+)주요제품"
    product_pattern = r"주요제품\s*:\s*([^\n]*)대표전화"
    phone_pattern = r"대표전화\s*:\s*(\d*-\d*-\d*)"
    address_pattern = r"주\s*소\s*:\s*([^\n]*)"

    html_text = soup.get_text()

    category_match = re.search(category_pattern, html_text)
    product_match = re.search(product_pattern, html_text)
    phone_match = re.search(phone_pattern, html_text)
    address_match = re.search(address_pattern, html_text)

    category = category_match.group(1).strip() if category_match else ""
    product = product_match.group(1).strip() if product_match else ""
    phone = phone_match.group(1).strip() if phone_match else ""
    address = address_match.group(1).strip() if address_match else ""

    return {
        "title", title
        "category": category,
        "product": product,
        "phone": phone,
        "address": address
    }

In [85]:
d = parse_content(soup)

In [86]:
d

{'category': '헬스&뷰티',
 'product': '이온수 샤워기',
 'phone': '032-670-8878',
 'address': '경기 부천시 도약로261,대우테크노파크 B동608호기능성 샤워기 제조업체 투투입니다-욕실,주방수전 전문 제조업체. 다양한 기능을 가진 기능성 샤워기입니다.홈페이지'}

In [88]:
for k,v in d.items():
    print(f"{k}:{v}")

category:헬스&뷰티
product:이온수 샤워기
phone:032-670-8878
address:경기 부천시 도약로261,대우테크노파크 B동608호기능성 샤워기 제조업체 투투입니다-욕실,주방수전 전문 제조업체. 다양한 기능을 가진 기능성 샤워기입니다.홈페이지


In [55]:
test_file = Path(".").absolute() / "__init__.py"

In [56]:
print(test_file)

/home/ljin/Projects/expo_scraping/expo_scraping/__init__.py


In [57]:
test_file.exists()

True

In [98]:
def build_csv(url: str, filename: Path):
    if not filename.exists():
        with open(str(filename), "w") as f:
            f.write("id,company,category,product,phone,address\n")
    
    for i, idx in enumerate(RANGE_2024):
        _url = "".join([url, idx])
        soup = get_content(_url)
        
        parsed_dict = parse_content(soup)
        with open(str(filename), "a") as f2:
            f2.write(f'{i},\"{parsed_dict.get("company")}\",\"{parsed_dict.get("category")}\",\"{parsed_dict.get("product")}\",{parsed_dict.get("phone")},\"{parsed_dict.get("address")}\"\n')
    

In [99]:
result_file_path = Path(".").absolute() / "parsed_sofurn.csv"

In [100]:
build_csv(url=URL, filename=result_file_path)