In [140]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os

# -----------------------------
# 기본 설정
# -----------------------------
BASE_LIST = "https://www.aitimes.com/news/articleList.html"
BASE_ORIGIN = "https://www.aitimes.com"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0",
    "Accept-Language": "ko-KR,ko;q=0.9"
}

# -----------------------------
# 목록 페이지 HTML 가져오기
# -----------------------------
def get_list_html(page=1):
    params = {"view_type": "sm", "page": page}
    try:
        r = requests.get(BASE_LIST, headers=HEADERS, params=params, timeout=10)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f"[WARN] 페이지 {page} 요청 실패: {e}")
        return ""

# -----------------------------
# 기사 목록 파싱
# -----------------------------
def parse_list(html):
    soup = BeautifulSoup(html, "html.parser")
    articles = []

    for a in soup.select('a[href*="/news/articleView.html"]'):
        url = BASE_ORIGIN + a.get("href")
        title = a.get_text(strip=True)
        if not title:
            continue

        # 요약, 날짜 선택자 시도
        summary_node = a.find_next("p")
        summary = summary_node.get_text(strip=True) if summary_node else ""

        date_node = a.find_next(class_="date")
        date = date_node.get_text(strip=True) if date_node else ""

        articles.append({
            "title": title,
            "url": url,
            "summary": summary,
            "date": date
        })

    return articles

# -----------------------------
# 기사 본문 가져오기 (선택)
# -----------------------------
def get_article_body(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        content_div = soup.find("div", class_="article-view-content")
        if content_div:
            return content_div.get_text("\n", strip=True)
        return ""
    except Exception as e:
        print(f"[WARN] 본문 요청 실패: {url} → {e}")
        return ""

# -----------------------------
# 전체 뉴스 수집
# -----------------------------
def crawl_news(pages=3, get_body=False, delay=0.5):
    all_articles = []
    for page in range(1, pages+1):
        html = get_list_html(page)
        if not html:
            continue
        articles = parse_list(html)

        if get_body:
            for a in articles:
                a["content"] = get_article_body(a["url"])
                time.sleep(delay)

        all_articles.extend(articles)
        time.sleep(delay)

    return all_articles

# -----------------------------
# JSON 저장
# -----------------------------
def save_json(data, filename="aitimes_news.json"):
    os.makedirs("results", exist_ok=True)
    filepath = f"results/{filename}"
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"[OK] 저장 완료: {filepath} ({len(data)}건)")

# -----------------------------
# 메인 실행
# -----------------------------
if __name__ == "__main__":
    pages_to_crawl = 5
    collect_body = False  # True면 본문도 수집

    articles = crawl_news(pages=pages_to_crawl, get_body=collect_body)
    save_json(articles)


[OK] 저장 완료: results/aitimes_news.json (170건)


In [None]:
import json
import time
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from dateutil import parser as dateparser
from datetime import datetime

BASE_URL = "https://www.aitimes.com"
PAGING_URL = f"{BASE_URL}/ajaxArticlePaging.php"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/123.0.0.0 Safari/537.36"
}

TIMEOUT = 15
REQUEST_DELAY = 0.8


def fetch(url: str, params: Optional[Dict] = None) -> Optional[str]:
    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=TIMEOUT)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        print(f"[WARN] Request failed: {url} | {e}")
        return None


def normalize_date(text: Optional[str]) -> Optional[str]:
    if not text:
        return None
    try:
        dt = dateparser.parse(text)
        if dt:
            return dt.isoformat()
    except Exception:
        pass
    for sep in ["-", ".", "/"]:
        parts = text.replace("년", sep).replace("월", sep).replace("일", "").split(sep)
        nums = [p.strip() for p in parts if p.strip().isdigit()]
        if len(nums) >= 3:
            try:
                y, m, d = int(nums[0]), int(nums[1]), int(nums[2])
                return datetime(y, m, d).isoformat()
            except Exception:
                pass
    return None


def parse_list(html: str) -> List[Dict]:
    """
    ajaxArticlePaging 응답 HTML에서 기사 목록 파싱.
    실제 클래스/구조는 Network → Response → Elements에서 확인 후 선택자 조정.
    """
    # lxml이 없다면 html.parser로 자동 폴백하는 방식 권장
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")

    # 예시: 카드 래퍼 선택자 (실제 구조에 맞게 바꾸세요)
    items = soup.select(".list-block-item, .article-list .item, li.type2, .news-list .item")
    if not items:
        # 대체 선택자 시도
        items = soup.select("li, article")

    results = []
    for it in items:
        # 제목/URL
        a = it.select_one("a")
        title = a.get_text(strip=True) if a else None
        href = a.get("href") if a else None
        url = None
        if href:
            url = href if href.startswith("http") else BASE_URL + href

        # 요약
        summary_el = it.select_one(".lead, .summary, p.lead, .desc")
        summary = summary_el.get_text(strip=True) if summary_el else None

        # 날짜(목록 상 노출 시)
        date_el = it.select_one(".date, time, .info .date")
        raw_date = date_el.get_text(strip=True) if date_el else None
        date_iso = normalize_date(raw_date)

        if title and url:
            results.append({
                "title": title,
                "url": url,
                "summary": summary,
                "date": date_iso,
                "raw_date": raw_date,
            })
    return results


def parse_article(html: str) -> Dict:
    """
    상세 페이지에서 본문/제목/날짜/기자 파싱.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")

    # 제목
    title_el = soup.select_one("#articleTitle, .article-title, h1.title, h1")
    title_detail = title_el.get_text(strip=True) if title_el else None

    # 본문
    body = soup.select_one("#articleBody, #article-view-content-div, .article-body, .content, .news-body")
    content = None
    if body:
        for tag in body(["script", "style"]):
            tag.decompose()
        content = body.get_text("\n", strip=True)

    # 날짜
    date_el = soup.select_one(".date, .news-date, time, .info .date")
    raw_date_detail = date_el.get_text(strip=True) if date_el else None
    date_detail = normalize_date(raw_date_detail)

    # 기자
    author_el = soup.select_one(".author, .reporter, .byline .name")
    author = author_el.get_text(strip=True) if author_el else None

    return {
        "title_detail": title_detail,
        "content": content,
        "date_detail": date_detail,
        "raw_date_detail": raw_date_detail,
        "author": author,
    }


def crawl_list_pages(start_page: int = 1, end_page: int = 5,
                     list_per_page: int = 20,
                     view_type: str = "smBlock",
                     extra_params: Optional[Dict] = None,
                     collect_detail: bool = False) -> List[Dict]:
    """
    ajaxArticlePaging.php?page=N 형태로 여러 페이지 수집.
    Network 탭에서 발견한 파라미터(total, list_per_page, view_type, ixno 등 반영).
    """
    data: List[Dict] = []
    extra_params = extra_params or {}

    for page in tqdm(range(start_page, end_page + 1), desc="Crawling pages"):
        params = {
            "page": page,
            "list_per_page": list_per_page,
            "view_type": view_type,
            # "total": 1439,  # 참고용(필수 아닐 수 있음)
            # "ixno": 0,      # 네트워크 탭에서 보이면 반영
        }
        params.update(extra_params)

        html = fetch(PAGING_URL, params=params)
        if not html:
            continue

        items = parse_list(html)
        if not items:
            print(f"[INFO] No items found on page {page}, stopping.")
            break

        if collect_detail:
            for it in items:
                url = it.get("url")
                if not url:
                    continue
                time.sleep(REQUEST_DELAY)
                art_html = fetch(url)
                if not art_html:
                    continue
                detail = parse_article(art_html)
                it.update(detail)

        data.extend(items)
        time.sleep(REQUEST_DELAY)
    return data


def save_json(data: List[Dict], path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"[INFO] Saved {len(data)} records to {path}")


def make_stats(data: List[Dict]) -> Dict:
    def pick_date(rec):
        return rec.get("date_detail") or rec.get("date")

    dates = [pick_date(d) for d in data if pick_date(d)]
    by_day: Dict[str, int] = {}
    for iso in dates:
        day = iso[:10]
        by_day[day] = by_day.get(day, 0) + 1

    date_min = min(dates) if dates else None
    date_max = max(dates) if dates else None

    return {
        "total_articles": len(data),
        "date_min": date_min,
        "date_max": date_max,
        "by_day": by_day
    }


def main():
    # 섹션/카테고리 필터가 있다면 Network 탭에서 파라미터 확인 후 반영
    extra_params = {
        # 예시: "sc_section_code": "S1N1"
    }

    # 목록만 수집
    data = crawl_list_pages(start_page=1, end_page=5,
                            list_per_page=20,
                            view_type="smBlock",
                            extra_params=extra_params,
                            collect_detail=False)

    # 상세 본문까지 수집하려면 collect_detail=True로
    # data = crawl_list_pages(start_page=1, end_page=3, list_per_page=20,
    #                         view_type="smBlock", extra_params=extra_params,
    #                         collect_detail=True)

    save_json(data, "aitimes_articles.json")
    stats = make_stats(data)
    print(json.dumps(stats, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()


Crawling pages:  20%|█████████████▌                                                      | 1/5 [00:01<00:06,  1.75s/it]

[WARN] Request failed: https://www.aitimes.com/ajaxArticlePaging.php | 404 Client Error: Not Found for url: https://www.aitimes.com/ajaxArticlePaging.php?page=1&list_per_page=20&view_type=smBlock


Crawling pages:  40%|███████████████████████████▏                                        | 2/5 [00:02<00:04,  1.38s/it]

[WARN] Request failed: https://www.aitimes.com/ajaxArticlePaging.php | 404 Client Error: Not Found for url: https://www.aitimes.com/ajaxArticlePaging.php?page=2&list_per_page=20&view_type=smBlock


Crawling pages:  60%|████████████████████████████████████████▊                           | 3/5 [00:03<00:02,  1.23s/it]

[WARN] Request failed: https://www.aitimes.com/ajaxArticlePaging.php | 404 Client Error: Not Found for url: https://www.aitimes.com/ajaxArticlePaging.php?page=3&list_per_page=20&view_type=smBlock


Crawling pages:  80%|██████████████████████████████████████████████████████▍             | 4/5 [00:04<00:01,  1.03s/it]

[WARN] Request failed: https://www.aitimes.com/ajaxArticlePaging.php | 404 Client Error: Not Found for url: https://www.aitimes.com/ajaxArticlePaging.php?page=4&list_per_page=20&view_type=smBlock


Crawling pages: 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.07s/it]

[WARN] Request failed: https://www.aitimes.com/ajaxArticlePaging.php | 404 Client Error: Not Found for url: https://www.aitimes.com/ajaxArticlePaging.php?page=5&list_per_page=20&view_type=smBlock
[INFO] Saved 0 records to aitimes_articles.json
{
  "total_articles": 0,
  "date_min": null,
  "date_max": null,
  "by_day": {}
}





: 