In [80]:
## Yonhap News (yna.co.kr) crawler
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import csv
from tqdm import tqdm  # tqdm 추가
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed 
import time
import json

## Crawl from 2024-01-01 up to (almost) today. Yonhap alone yields more than
## 10,000 articles for just two days, so the full range is very large.
start_date = datetime(2024, 1, 1)
#end_date = datetime(2024, 1, 2)
# end_date = datetime.now()  
end_date = datetime.now() - timedelta(days=12) ## Yonhap is somewhat slow to update, so leave a gap before today

# Timeout passed to requests.get when fetching an individual article page.
limit = 5

## Generator that yields every date from start to end (inclusive), one day apart
def date_range(start, end):
    """Yield ``start`` and each following day up to and including ``end``.

    Only whole days of the difference ``end - start`` are counted, so every
    yielded value keeps the time-of-day of ``start``. Nothing is yielded when
    ``end`` lies a full day or more before ``start``.
    """
    whole_days = int((end - start).days)
    offset = 0
    while offset <= whole_days:
        yield start + timedelta(days=offset)
        offset += 1

def fetch_news_articles(url):
    """Return the article links listed on one sitemap page.

    Downloads ``url``, locates the first ``<ul class="list">`` element and
    collects the ``href`` of every anchor inside it.

    Args:
        url: Address of the sitemap page to download.

    Returns:
        A list of ``href`` strings in document order. The list is empty when
        the page contains no ``<ul class="list">`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out
            (10 second timeout) or the server answers with an HTTP error
            status.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    story = soup.find('ul', class_='list')
    if story is None:
        return []
    # href=True skips anchors that carry no href attribute; indexing such an
    # anchor with a['href'] would raise KeyError and abort the whole page.
    return [a['href'] for a in story.find_all('a', href=True)]

#제목 기사 url 추출함수
def fetch_articles_with_details(url):
    """Download one article page and extract its title and body text.

    The page is expected to contain a ``<script id="contentJsonData">``
    element. The JSON object that starts at the second ``{`` of that script
    is decoded, and its ``TITLE`` and ``BODY`` entries are read.

    Args:
        url: Address of the article page.

    Returns:
        A list holding a single ``[url, title, body]`` row, with newlines in
        the body replaced by spaces. An empty list is returned (and a message
        printed) when the download fails or the page cannot be parsed.
    """
    try:
        response = requests.get(url, timeout=limit)
        response.raise_for_status()  # Raise for HTTP error statuses.
    except requests.exceptions.Timeout:
        print("Timeout occurred while fetching articles from", url)
        return []
    except requests.exceptions.RequestException as e:
        print("Error occurred while fetching articles:", e)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # A page without the script element would otherwise raise AttributeError
    # on ``None.text`` and escape to the caller.
    script_tag = soup.find('script', id='contentJsonData')
    if script_tag is None:
        print(f"Error parsing JSON data for URL {url}: contentJsonData script not found")
        return []
    script = script_tag.text

    # The wanted object starts at the second opening brace of the script.
    first_brace_index = script.find('{')
    start = script.find('{', first_brace_index + 1)
    if start == -1:
        print(f"Error parsing JSON data for URL {url}: no JSON object found")
        return []

    articles = []
    try:
        # raw_decode stops at the brace that really closes the object, so
        # nested objects or a '}' inside the text do not truncate the JSON.
        json_data, _ = json.JSONDecoder().raw_decode(script, start)

        title = json_data['TITLE']
        text_content = json_data['BODY'].replace('\n', ' ')
        articles.append([url, title, text_content])
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
        # TypeError/AttributeError cover a BODY value that is not a string.
        print(f"Error parsing JSON data for URL {url}: {e}")

    return articles

    
    
# 크롤링하고 결과를 바로 CSV 파일에 저장하는 함수
def hap_crawler():
    """Crawl the sitemap pages for every date in range and write a CSV file.

    For each date between ``start_date`` and ``end_date`` the sitemap pages
    ``-1`` and ``-2`` are fetched with ``fetch_news_articles``; every link
    found is handed to a thread pool running ``fetch_articles_with_details``.
    Rows are written to ``result/yeonhap_news.csv`` as the downloads finish,
    so the row order is not the submission order.

    A sitemap page that cannot be downloaded, or an article whose worker
    raises, is reported on stdout and skipped so the crawl continues.
    """
    # Create the output folder; exist_ok avoids a separate existence check.
    result_dir = 'result'
    os.makedirs(result_dir, exist_ok=True)

    csv_file_path = os.path.join(result_dir, 'yeonhap_news.csv')

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['URL', 'Title', 'Content'])

        with ThreadPoolExecutor(max_workers=16) as executor:
            future_to_url = {}

            # Build the sitemap URLs for every date and schedule the articles.
            for single_date in date_range(start_date, end_date):
                formatted_date = single_date.strftime("%Y/%m/%d")
                for i in range(1, 3):  # Two sitemap pages per day, as an example.
                    url = f'https://www.yna.co.kr/sitemap/articles/{formatted_date}-{i}.htm'

                    # One failing sitemap page must not abort the entire run.
                    try:
                        links = fetch_news_articles(url)
                    except requests.exceptions.RequestException as e:
                        print(f"Skipping sitemap page {url}: {e}")
                        continue

                    for link in links:
                        future = executor.submit(fetch_articles_with_details, link)
                        future_to_url[future] = link

            # Write rows in completion order.
            for future in as_completed(future_to_url):
                try:
                    articles = future.result()
                except Exception as e:
                    # Worker boundary: report the failure and keep the rows
                    # gathered from the other articles.
                    print(f"Error processing {future_to_url[future]}: {e}")
                    continue
                if articles:
                    writer.writerows(articles)
                
# # CSV 파일 저장
# 2일하는데,5
if __name__ == "__main__":
    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments during the long crawl.
    start_time = time.perf_counter()  # Record when the run starts.
    hap_crawler()
    end_time = time.perf_counter()  # Record when the run ends.
    print(f"Execution time: {end_time - start_time} seconds")  # Report elapsed time.


https://www.yna.co.kr/view/AKR20231231014500003
https://www.yna.co.kr/view/AKR20231229068400030
https://www.yna.co.kr/view/AKR20231229141700530
https://www.yna.co.kr/view/AKR20240101000200051
https://www.yna.co.kr/view/MYH20240101000100641https://www.yna.co.kr/view/AKR20240101000100011

https://www.yna.co.kr/view/AKR20231229107400704
https://www.yna.co.kr/view/AKR20240101001100079
https://www.yna.co.kr/view/AKR20240101000600071
https://www.yna.co.kr/view/AKR20231230039500017
https://www.yna.co.kr/view/AKR20240101000700004
https://www.yna.co.kr/view/AKR20231229135800001
https://www.yna.co.kr/view/MYH20231229011400704
https://www.yna.co.kr/view/MYH20240101000200641
https://www.yna.co.kr/view/AKR20240101000500060
https://www.yna.co.kr/view/AKR20240101001400082
https://www.yna.co.kr/view/AKR20240101001000072
https://www.yna.co.kr/view/AKR20240101001900071
https://www.yna.co.kr/view/AKR20231231027152007
https://www.yna.co.kr/view/AKR20231231039251088https://www.yna.co.kr/view/AKR20240101001

https://www.yna.co.kr/view/AKR20240101015400051
https://www.yna.co.kr/view/AKR20240101015100009https://www.yna.co.kr/view/AKR20231229126900530

https://www.yna.co.kr/view/AKR20240101014000004
https://www.yna.co.kr/view/AKR20231229086900064
https://www.yna.co.kr/view/AKR20240101014700504
https://www.yna.co.kr/view/AKR20231229116300057
https://www.yna.co.kr/view/AKR20240101016300005
https://www.yna.co.kr/view/AKR20231229091800061https://www.yna.co.kr/view/AKR20240101016400005

https://www.yna.co.kr/view/AKR20240101016500007
https://www.yna.co.kr/view/MYH20240101000500641
https://www.yna.co.kr/view/MYH20240101000400641
https://www.yna.co.kr/view/MYH20240101004700641
https://www.yna.co.kr/view/AKR20240101017800009
https://www.yna.co.kr/view/AKR20240101017000003
https://www.yna.co.kr/view/MYH20240101004800641
https://www.yna.co.kr/view/AKR20240101017200054
https://www.yna.co.kr/view/AKR20240101015600001
https://www.yna.co.kr/view/AKR20240101018200061
https://www.yna.co.kr/view/AKR2024010101

https://www.yna.co.kr/view/MYH20240101006500641
https://www.yna.co.kr/view/AKR20240101029500001
https://www.yna.co.kr/view/AKR20240101027700073
https://www.yna.co.kr/view/MYH20240101006400641
https://www.yna.co.kr/view/AKR20240101030000074
https://www.yna.co.kr/view/AKR20240101028800001
https://www.yna.co.kr/view/MYH20240101004500641
https://www.yna.co.kr/view/MYH20240101006600641
https://www.yna.co.kr/view/AKR20240101030100084
https://www.yna.co.kr/view/AKR20240101029800003
https://www.yna.co.kr/view/AKR20240101029900009
https://www.yna.co.kr/view/MYH20240101007200704https://www.yna.co.kr/view/AKR20240101030400704

https://www.yna.co.kr/view/MYH20240101006200641
https://www.yna.co.kr/view/MYH20240101007300641https://www.yna.co.kr/view/MYH20240101006800641

https://www.yna.co.kr/view/MYH20240101007400641
https://www.yna.co.kr/view/MYH20240101007100641
https://www.yna.co.kr/view/MYH20240101006900641
https://www.yna.co.kr/view/MYH20240101006700641
https://www.yna.co.kr/view/MYH2024010100

https://www.yna.co.kr/view/MYH20240101015500032
https://www.yna.co.kr/view/MYH20240101015400641https://www.yna.co.kr/view/MYH20240101015100641

https://www.yna.co.kr/view/MYH20240101015600641
https://www.yna.co.kr/view/AKR20240101039500007
https://www.yna.co.kr/view/AKR20240101045000504
https://www.yna.co.kr/view/AKR20240101045200063
https://www.yna.co.kr/view/AKR20240101045100062
https://www.yna.co.kr/view/AKR20240101045400009
https://www.yna.co.kr/view/AKR20240101039400007
https://www.yna.co.kr/view/AKR20240101045700052
https://www.yna.co.kr/view/MYH20240101015900641
https://www.yna.co.kr/view/MYH20240101016000641
https://www.yna.co.kr/view/MYH20240101016200641
https://www.yna.co.kr/view/MYH20240101015800641
https://www.yna.co.kr/view/AKR20240101045800063
https://www.yna.co.kr/view/MYH20240101015700641
https://www.yna.co.kr/view/MYH20240101016300641
https://www.yna.co.kr/view/AKR20231230044451002
https://www.yna.co.kr/view/AKR20240101030851073
https://www.yna.co.kr/view/AKR2024010104

https://www.yna.co.kr/view/MYH20240102002300032
https://www.yna.co.kr/view/MYH20240102002400641https://www.yna.co.kr/view/AKR20240102010100505

https://www.yna.co.kr/view/MYH20240102002600641
https://www.yna.co.kr/view/AKR20240102012800007
https://www.yna.co.kr/view/AKR20231227031500054
https://www.yna.co.kr/view/AKR20231229090900054https://www.yna.co.kr/view/AKR20240102014800051

https://www.yna.co.kr/view/AKR20231227127800054
https://www.yna.co.kr/view/AKR20231226063000054
https://www.yna.co.kr/view/AKR20240102009000505
https://www.yna.co.kr/view/AKR20231227116500054
https://www.yna.co.kr/view/AKR20231215114800004https://www.yna.co.kr/view/AKR20231229085500065https://www.yna.co.kr/view/MYH20240102002800641


https://www.yna.co.kr/view/AKR20240101040200063
https://www.yna.co.kr/view/AKR20240101053351057
https://www.yna.co.kr/view/AKR20240102014700005
https://www.yna.co.kr/view/AKR20240102017500505https://www.yna.co.kr/view/AKR20231227094400054

https://www.yna.co.kr/view/MYH2024010200

https://www.yna.co.kr/view/AKR20240102039200001https://www.yna.co.kr/view/AKR20240102044000007
https://www.yna.co.kr/view/AKR20240102045900007https://www.yna.co.kr/view/AKR20240102041000054
https://www.yna.co.kr/view/AKR20240102018600003

https://www.yna.co.kr/view/AKR20240102044100052

https://www.yna.co.kr/view/AKR20240102046200052
https://www.yna.co.kr/view/AKR20240102042400002
https://www.yna.co.kr/view/AKR20240102045700030
https://www.yna.co.kr/view/AKR20240102044800002
https://www.yna.co.kr/view/AKR20240102046400002
https://www.yna.co.kr/view/AKR20240102046500002
https://www.yna.co.kr/view/AKR20240102048100005https://www.yna.co.kr/view/AKR20240102040500003

https://www.yna.co.kr/view/AKR20240102024200009
https://www.yna.co.kr/view/AKR20240102047700062
https://www.yna.co.kr/view/AKR20240102044900017https://www.yna.co.kr/view/AKR20240102047300064
https://www.yna.co.kr/view/AKR20240102049100505

https://www.yna.co.kr/view/AKR20240102006151073https://www.yna.co.kr/view/AKR20240102042

https://www.yna.co.kr/view/AKR20240102064500053
https://www.yna.co.kr/view/AKR20240102067500062
https://www.yna.co.kr/view/AKR20240102057800004
https://www.yna.co.kr/view/AKR20240102068600063
https://www.yna.co.kr/view/AKR20240102067900007
https://www.yna.co.kr/view/AKR20240102059700530
https://www.yna.co.kr/view/AKR20240102068900061
https://www.yna.co.kr/view/AKR20240102069100062
https://www.yna.co.kr/view/AKR20240102069300007
https://www.yna.co.kr/view/AKR20240102054953001
https://www.yna.co.kr/view/AKR20240102065200001
https://www.yna.co.kr/view/AKR20240102069500061
https://www.yna.co.kr/view/AKR20240102060400004
https://www.yna.co.kr/view/AKR20240102070500062
https://www.yna.co.kr/view/AKR20240102067000504https://www.yna.co.kr/view/AKR20240102070700001

https://www.yna.co.kr/view/AKR20240102067200055
https://www.yna.co.kr/view/AKR20240102067600005
https://www.yna.co.kr/view/AKR20240102068200053
https://www.yna.co.kr/view/AKR20240102070000017https://www.yna.co.kr/view/AKR20240102071

https://www.yna.co.kr/view/MYH20240102008000641https://www.yna.co.kr/view/MYH20240102008100641https://www.yna.co.kr/view/AKR20240102085700004


https://www.yna.co.kr/view/MYH20240102007900641
https://www.yna.co.kr/view/AKR20240102085800009
https://www.yna.co.kr/view/AKR20240102082300001
https://www.yna.co.kr/view/AKR20240102087100505https://www.yna.co.kr/view/AKR20240102085300056https://www.yna.co.kr/view/MYH20240102008200641


https://www.yna.co.kr/view/AKR20240102085600061
https://www.yna.co.kr/view/MYH20240102008300641
https://www.yna.co.kr/view/AKR20240102085900073
https://www.yna.co.kr/view/AKR20240102082900504
https://www.yna.co.kr/view/AKR20240102083800504https://www.yna.co.kr/view/AKR20240102083200017
https://www.yna.co.kr/view/AKR20240102087200062

https://www.yna.co.kr/view/AKR20240102088300060
https://www.yna.co.kr/view/AKR20240102089300007
https://www.yna.co.kr/view/MYH20240102008500641
https://www.yna.co.kr/view/AKR20240102086400061
https://www.yna.co.kr/view/AKR2024010208

https://www.yna.co.kr/view/AKR20240102113600007
https://www.yna.co.kr/view/AKR20240102106651004
https://www.yna.co.kr/view/MYH20240102011600641https://www.yna.co.kr/view/AKR20240102112600053

https://www.yna.co.kr/view/AKR20240102114200064
https://www.yna.co.kr/view/MYH20240102011500641
https://www.yna.co.kr/view/AKR20240102113000052
https://www.yna.co.kr/view/AKR20240102106700004
https://www.yna.co.kr/view/AKR20240102114000004
https://www.yna.co.kr/view/AKR20240102114300057https://www.yna.co.kr/view/MYH20240102011700641

https://www.yna.co.kr/view/AKR20240102114400051https://www.yna.co.kr/view/AKR20240102108900065

https://www.yna.co.kr/view/AKR20240102113900052https://www.yna.co.kr/view/AKR20240102099651504

https://www.yna.co.kr/view/AKR20240102111700017
https://www.yna.co.kr/view/AKR20240102111900002
https://www.yna.co.kr/view/AKR20240102108800003
https://www.yna.co.kr/view/AKR20240102110200009
https://www.yna.co.kr/view/AKR20240102112300017
https://www.yna.co.kr/view/AKR2024010211

https://www.yna.co.kr/view/MYH20240102014200641
https://www.yna.co.kr/view/MYH20240102014100704
https://www.yna.co.kr/view/AKR20240102135600002https://www.yna.co.kr/view/AKR20240102133300083

https://www.yna.co.kr/view/AKR20240102133900504
https://www.yna.co.kr/view/AKR20240102135200063
https://www.yna.co.kr/view/AKR20240102096551001https://www.yna.co.kr/view/AKR20240102121851030

https://www.yna.co.kr/view/AKR20240102134800061https://www.yna.co.kr/view/AKR20240102134600009https://www.yna.co.kr/view/AKR20240102136500057
https://www.yna.co.kr/view/AKR20240102132500004


https://www.yna.co.kr/view/AKR20240102133400062
https://www.yna.co.kr/view/AKR20240102056251001
https://www.yna.co.kr/view/AKR20240102078751073
https://www.yna.co.kr/view/AKR20240102131400064
https://www.yna.co.kr/view/AKR20240102127300022https://www.yna.co.kr/view/AKR20240102133700051
https://www.yna.co.kr/view/AKR20240102130300064

https://www.yna.co.kr/view/AKR20240102136600704
https://www.yna.co.kr/view/MYH2024010201

https://www.yna.co.kr/view/MYH20240102020200641
https://www.yna.co.kr/view/MYH20240102020300641
https://www.yna.co.kr/view/MYH20240102020500641
https://www.yna.co.kr/view/MYH20240102020600032
https://www.yna.co.kr/view/AKR20240102156400109
https://www.yna.co.kr/view/MYH20240102020400641
https://www.yna.co.kr/view/AKR20240102155900007
https://www.yna.co.kr/view/MYH20240102020900032https://www.yna.co.kr/view/AKR20240102150251001
https://www.yna.co.kr/view/AKR20240102152800085
https://www.yna.co.kr/view/MYH20240102020800641
https://www.yna.co.kr/view/MYH20240102020700641

https://www.yna.co.kr/view/AKR20240102156900053https://www.yna.co.kr/view/MYH20240102021000641

https://www.yna.co.kr/view/AKR20240102157400007
https://www.yna.co.kr/view/MYH20240102021200641
https://www.yna.co.kr/view/MYH20240102021300641
https://www.yna.co.kr/view/MYH20240102021100641
https://www.yna.co.kr/view/GYH20240102001300044
https://www.yna.co.kr/view/AKR20240102146651099
https://www.yna.co.kr/view/AKR2024010214