In [2]:
## 규원 05.09 중앙일보 클롱링 csv파일 저장 
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import csv
from tqdm import tqdm  # tqdm 추가
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed 
import time

## 2024년부터 오늘 날짜 까지 클롤링실시 
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 5)
#end_date = datetime.now()

limit = 5

## 날짜 형식을 바꿔주는 함수 
def date_range(start, end):
    for n in range(int((end - start).days) + 1):
        yield start + timedelta(n)

# input : 해당 날짜의 url , output : 해당 날짜에 생성된 기사의 url List 
def fetch_news_articles(url):
    try:
        response = requests.get(url,timeout = limit)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        articles_url = []
        for article in soup.find_all('li', class_='card'):
            title_link = article.find('a')
            if title_link and title_link.has_attr('href'):
                articles_url.append(title_link['href'])
                #print(title_link['href'])
        return articles_url
    except requests.exceptions.Timeout:
        print("Timeout occurred while fetching articles from", url)
        return []
    except requests.exceptions.RequestException as e:
        print("Error occurred while fetching articles:", e)
        return []

## input : 기사의 url , output : 해당 기사에서 추출한 url, 제목, 내용의 튜플리스트
def fetch_articles_with_details(url):
    try:
        response = requests.get(url,timeout = limit)
        response.raise_for_status()  # HTTP 에러가 있을 경우 예외를 발생시킵니다.

        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []

        for article in soup.find_all('article'):
            # 제목 추출
            headline = article.find('h1', class_='headline')
            title = headline.get_text(strip=True) if headline else "No title"
            # 본문 내용 추출
            body = article.find('div', class_='article_body fs3')
            text = []
            if body:
                for p in body.find_all('p', attrs={"data-divno": True}):
                    text.append(p.get_text(strip=True))
            text_content = ' '.join(text)

            #print(url+"진행중")
            articles.append([url, title, text_content])

        return articles
    except requests.exceptions.Timeout:
        print("Timeout occurred while fetching articles from", url)
        return []
    except requests.exceptions.RequestException as e:
        print("Error occurred while fetching articles:", e)
        return []

# input : fetch_news_articles() 에 날짜를 for문으로 넣어서 사용
# output : 지정한 날짜의 모든 뉴스기사의 url list 
# 해당 작업은 그렇게 오래걸리지않음 5개월치 1분걸림 
def get_news_links():
    ## 시작 요일부터 하루씩 올려가며, 데이터 크롤링 
    links_url = []
    for single_date in date_range(start_date, end_date):
        formatted_date = single_date.strftime("%Y/%m/%d")
        # 날짜를 하루씩 증가시키면서, url 오픈 24월1월1일~ 
        url = f'https://www.joongang.co.kr/sitemap/index/{formatted_date}'
        # 해당 날짜에서의 기사 url list를 추출 
        links = fetch_news_articles(url)
        print(f"Date: {formatted_date}")
        for link in links:
            links_url.append(link)
    ## 모든 크롤링한 모든 링크의 url 전달, 1월1일부터의 ~ 모든 뉴스기사의 링크 
    return links_url


# 크롤링하고 결과를 바로 CSV 파일에 저장하는 함수
def joongang_crawler():
    # 결과를 저장할 폴더 생성
    result_dir = 'result'
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    # CSV 파일 경로 설정
    csv_file_path = os.path.join(result_dir, 'joongang_news(test).csv')
    
    # CSV 파일을 열고 데이터를 저장합니다.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # CSV 파일의 헤더를 작성합니다.
        writer.writerow(['URL', 'Title', 'Content'])
            
        # 멀티쓰레드로 
        with ThreadPoolExecutor(max_workers=16) as executor:
            future_to_url = {}
            
            # 모든 날짜에 대해 URL을 생성하고 크롤링을 실행
            for single_date in date_range(start_date, end_date):
                formatted_date = single_date.strftime("%Y/%m/%d")
                url = f'https://www.joongang.co.kr/sitemap/index/{formatted_date}'
                
                # 해당 날짜의 기사 links 리스트 추출
                links = fetch_news_articles(url)
                #print(f"Date: {formatted_date} - Fetched {len(links)} articles")

                # 각 링크에 대해 비동기 작업을 예약합니다.
                for link in links:
                    future = executor.submit(fetch_articles_with_details, link)
                    future_to_url[future] = link

            # 완료된 작업을 순서대로 처리합니다.
            for future in as_completed(future_to_url):
                articles = future.result()
                if articles:
                    for article in articles:
                        writer.writerow(article)
                print(f"Completed: {future_to_url[future]}")
    
        
# 데이터가 예시로 준비되어 있다고 가정하고, 이 함수를 사용하여 저장합니다.
example_data = [
    ('https://example.com/article1', 'Title 1', 'Content of article 1...'),
    ('https://example.com/article2', 'Title 2', 'Content of article 2...'),
    ('https://example.com/article3', 'Title 3', 'Content of article 3...'),
    ('https://example.com/gitACtionTest', 'Title 3', 'Content of article 3...'),
    ('https://example.com/article3', 'Action 3', 'Content of article 3...')
]

# # CSV 파일 저장
if __name__ == "__main__":
    start_time = time.time()  # 코드 실행 시작 시간 기록
    joongang_crawler()
    end_time = time.time()  # 코드 실행 종료 시간 기록
    print(f"Execution time: {end_time - start_time} seconds")  # 실행 시간 출력

Completed: https://www.joongang.co.kr/article/25249818
Completed: https://www.joongang.co.kr/article/25218840
Completed: https://www.joongang.co.kr/article/25218823
Completed: https://www.joongang.co.kr/article/25218831
Completed: https://www.joongang.co.kr/article/25218861
Completed: https://www.joongang.co.kr/article/25218814
Completed: https://www.joongang.co.kr/article/25218847
Completed: https://www.joongang.co.kr/article/25218854
Completed: https://www.joongang.co.kr/article/25218817
Completed: https://www.joongang.co.kr/article/25218808
Completed: https://www.joongang.co.kr/article/25218822
Completed: https://www.joongang.co.kr/article/25218813
Completed: https://www.joongang.co.kr/article/25218811
Completed: https://www.joongang.co.kr/article/25218830
Completed: https://www.joongang.co.kr/article/25218839
Completed: https://www.joongang.co.kr/article/25218860
Completed: https://www.joongang.co.kr/article/25218846
Completed: https://www.joongang.co.kr/article/25218867
Completed:

Completed: https://www.joongang.co.kr/article/25218960
Completed: https://www.joongang.co.kr/article/25218962
Completed: https://www.joongang.co.kr/article/25218966
Completed: https://www.joongang.co.kr/article/25218964
Completed: https://www.joongang.co.kr/article/25218967
Completed: https://www.joongang.co.kr/article/25218970
Completed: https://www.joongang.co.kr/article/25218969
Completed: https://www.joongang.co.kr/article/25218973
Completed: https://www.joongang.co.kr/article/25249818
Completed: https://www.joongang.co.kr/article/25218974
Completed: https://www.joongang.co.kr/article/25218972
Completed: https://www.joongang.co.kr/article/25218968
Completed: https://www.joongang.co.kr/article/25218971
Completed: https://www.joongang.co.kr/article/25249891
Completed: https://www.joongang.co.kr/article/25218976
Completed: https://www.joongang.co.kr/article/25218975
Completed: https://www.joongang.co.kr/article/25249874
Completed: https://www.joongang.co.kr/article/25249940
Completed:

Completed: https://www.joongang.co.kr/article/25219111
Completed: https://www.joongang.co.kr/article/25219102
Completed: https://www.joongang.co.kr/article/25219112
Completed: https://www.joongang.co.kr/article/25219115
Completed: https://www.joongang.co.kr/article/25219114
Completed: https://www.joongang.co.kr/article/25219113
Completed: https://www.joongang.co.kr/article/25219117
Completed: https://www.joongang.co.kr/article/25219118
Completed: https://www.joongang.co.kr/article/25219116
Completed: https://www.joongang.co.kr/article/25219119
Completed: https://www.joongang.co.kr/article/25219121
Completed: https://www.joongang.co.kr/article/25219123
Completed: https://www.joongang.co.kr/article/25219122
Completed: https://www.joongang.co.kr/article/25219126
Completed: https://www.joongang.co.kr/article/25219130
Completed: https://www.joongang.co.kr/article/25219128
Completed: https://www.joongang.co.kr/article/25219124
Completed: https://www.joongang.co.kr/article/25219125
Completed:

Completed: https://www.joongang.co.kr/article/25219259
Completed: https://www.joongang.co.kr/article/25219260
Completed: https://www.joongang.co.kr/article/25219258
Completed: https://www.joongang.co.kr/article/25219264
Completed: https://www.joongang.co.kr/article/25219265
Completed: https://www.joongang.co.kr/article/25219263
Completed: https://www.joongang.co.kr/article/25219266
Completed: https://www.joongang.co.kr/article/25219262
Completed: https://www.joongang.co.kr/article/25219267
Completed: https://www.joongang.co.kr/article/25219269
Completed: https://www.joongang.co.kr/article/25219268
Completed: https://www.joongang.co.kr/article/25219261
Completed: https://www.joongang.co.kr/article/25219212
Completed: https://www.joongang.co.kr/article/25219272
Completed: https://www.joongang.co.kr/article/25219257
Completed: https://www.joongang.co.kr/article/25219270
Completed: https://www.joongang.co.kr/article/25219271
Completed: https://www.joongang.co.kr/article/25219274
Completed:

Completed: https://www.joongang.co.kr/article/25219407
Completed: https://www.joongang.co.kr/article/25219415
Completed: https://www.joongang.co.kr/article/25219412
Completed: https://www.joongang.co.kr/article/25219408
Completed: https://www.joongang.co.kr/article/25219409
Completed: https://www.joongang.co.kr/article/25219411
Completed: https://www.joongang.co.kr/article/25219414
Completed: https://www.joongang.co.kr/article/25219413
Completed: https://www.joongang.co.kr/article/25219410
Completed: https://www.joongang.co.kr/article/25219416
Completed: https://www.joongang.co.kr/article/25219421
Completed: https://www.joongang.co.kr/article/25219417
Completed: https://www.joongang.co.kr/article/25219420
Completed: https://www.joongang.co.kr/article/25219422
Completed: https://www.joongang.co.kr/article/25219418
Completed: https://www.joongang.co.kr/article/25219424
Completed: https://www.joongang.co.kr/article/25219423
Completed: https://www.joongang.co.kr/article/25219419
Completed:

Completed: https://www.joongang.co.kr/article/25219554
Completed: https://www.joongang.co.kr/article/25219552
Completed: https://www.joongang.co.kr/article/25219557
Completed: https://www.joongang.co.kr/article/25219558
Completed: https://www.joongang.co.kr/article/25219553
Completed: https://www.joongang.co.kr/article/25219556
Completed: https://www.joongang.co.kr/article/25219562
Completed: https://www.joongang.co.kr/article/25219561
Completed: https://www.joongang.co.kr/article/25219555
Completed: https://www.joongang.co.kr/article/25219564
Completed: https://www.joongang.co.kr/article/25219565
Completed: https://www.joongang.co.kr/article/25219563
Completed: https://www.joongang.co.kr/article/25219570
Completed: https://www.joongang.co.kr/article/25219559
Completed: https://www.joongang.co.kr/article/25219567
Completed: https://www.joongang.co.kr/article/25219568
Completed: https://www.joongang.co.kr/article/25219560
Completed: https://www.joongang.co.kr/article/25219573
Completed:

Completed: https://www.joongang.co.kr/article/25219705
Completed: https://www.joongang.co.kr/article/25219711
Completed: https://www.joongang.co.kr/article/25219709
Completed: https://www.joongang.co.kr/article/25219710
Completed: https://www.joongang.co.kr/article/25219712
Completed: https://www.joongang.co.kr/article/25219713
Completed: https://www.joongang.co.kr/article/25219703
Completed: https://www.joongang.co.kr/article/25219715
Completed: https://www.joongang.co.kr/article/25219708
Completed: https://www.joongang.co.kr/article/25219706
Completed: https://www.joongang.co.kr/article/25219714
Completed: https://www.joongang.co.kr/article/25219717
Completed: https://www.joongang.co.kr/article/25219716
Completed: https://www.joongang.co.kr/article/25219719
Completed: https://www.joongang.co.kr/article/25219707
Completed: https://www.joongang.co.kr/article/25219720
Completed: https://www.joongang.co.kr/article/25219721
Completed: https://www.joongang.co.kr/article/25219718
Completed:

Completed: https://www.joongang.co.kr/article/25219853
Completed: https://www.joongang.co.kr/article/25219858
Completed: https://www.joongang.co.kr/article/25219855
Completed: https://www.joongang.co.kr/article/25219856
Completed: https://www.joongang.co.kr/article/25219859
Completed: https://www.joongang.co.kr/article/25219860
Completed: https://www.joongang.co.kr/article/25219854
Completed: https://www.joongang.co.kr/article/25219842
Completed: https://www.joongang.co.kr/article/25219862
Completed: https://www.joongang.co.kr/article/25219852
Completed: https://www.joongang.co.kr/article/25219863
Completed: https://www.joongang.co.kr/article/25219851
Completed: https://www.joongang.co.kr/article/25219864
Completed: https://www.joongang.co.kr/article/25219866
Completed: https://www.joongang.co.kr/article/25219865
Completed: https://www.joongang.co.kr/article/25219867
Completed: https://www.joongang.co.kr/article/25219868
Completed: https://www.joongang.co.kr/article/25219861
Completed:

Completed: https://www.joongang.co.kr/article/25220002
Completed: https://www.joongang.co.kr/article/25220000
Completed: https://www.joongang.co.kr/article/25220004
Completed: https://www.joongang.co.kr/article/25220003
Completed: https://www.joongang.co.kr/article/25219997
Completed: https://www.joongang.co.kr/article/25220008
Completed: https://www.joongang.co.kr/article/25220006
Completed: https://www.joongang.co.kr/article/25220007
Completed: https://www.joongang.co.kr/article/25220009
Completed: https://www.joongang.co.kr/article/25220011
Completed: https://www.joongang.co.kr/article/25220017
Completed: https://www.joongang.co.kr/article/25220012
Completed: https://www.joongang.co.kr/article/25220014
Completed: https://www.joongang.co.kr/article/25220013
Completed: https://www.joongang.co.kr/article/25220015
Completed: https://www.joongang.co.kr/article/25220018
Completed: https://www.joongang.co.kr/article/25220005
Completed: https://www.joongang.co.kr/article/25220010
Completed: