# In [3]:
#!/usr/bin/env python
# -*- coding: utf-8, euc-kr -*-
#!pip install KoreaNewsCrawler

import os
import platform
import calendar
import requests
import re
from time import sleep
from bs4 import BeautifulSoup
from multiprocessing import Process
from korea_news_crawler.exceptions import *
from korea_news_crawler.articleparser import ArticleParser
from korea_news_crawler.articlecrawler import ArticleCrawler
from korea_news_crawler.writer import Writer
import urllib.parse as urlparse
from urllib.parse import parse_qs


class ArticleCrawler(object):
    """Crawl Naver News economy sub-section articles over a range of months.

    Typical use: ``set_category(...)``, ``set_date_range(...)``, ``start()``.
    ``start()`` launches one process per selected category; each process runs
    :meth:`crawling`.

    NOTE(review): this class has the same name as the ``ArticleCrawler``
    imported from ``korea_news_crawler.articlecrawler`` at the top of the
    file and therefore shadows it.
    """

    def __init__(self):
        # Sub-section name -> Naver ``sid2`` id. crawling() always requests
        # ``sid1=101``, so only sub-sections of that section are listed here.
        self.categories = {'금융': 259, '증권': 258, '산업_재계': 261, '중기_벤처': 771, '부동산': 260, '글로벌경제': 262, '생활경제': 310,
                           '경제 일반': 263}

        # Category names chosen through set_category().
        self.selected_categories = []

        # Inclusive crawl period, filled in by set_date_range().
        self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0}
        self.user_operating_system = str(platform.system())

    def set_category(self, *args):
        """Select the categories to crawl.

        Args:
            *args: Category names; each must be a key of ``self.categories``.

        Raises:
            InvalidCategory: If any name is not a known category. Nothing is
                selected in that case.
        """
        for key in args:
            if self.categories.get(key) is None:
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, start_month, end_year, end_month):
        """Validate and store the inclusive crawl period (whole months).

        Args:
            start_year: First year of the period.
            start_month: First month (1-12) within ``start_year``.
            end_year: Last year of the period.
            end_month: Last month (1-12) within ``end_year``.

        Raises:
            InvalidYear: If ``start_year`` is after ``end_year``.
            InvalidMonth: If either month is outside 1-12.
            OverbalanceMonth: If both years are equal and ``start_month`` is
                after ``end_month``.
        """
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if start_month < 1 or start_month > 12:
            raise InvalidMonth(start_month)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        if start_year == end_year and start_month > end_month:
            raise OverbalanceMonth(start_month, end_month)

        # Assign by explicit key instead of relying on dict iteration order.
        self.date['start_year'] = start_year
        self.date['start_month'] = start_month
        self.date['end_year'] = end_year
        self.date['end_month'] = end_month
        print(self.date)

    @staticmethod
    def _month_bounds(year, start_year, end_year, start_month, end_month):
        """Return the (first_month, last_month) to crawl within ``year``."""
        first_month = start_month if year == start_year else 1
        last_month = end_month if year == end_year else 12
        return first_month, last_month

    @staticmethod
    def make_news_page_url(category_name, category_url, start_year, end_year, start_month, end_month):
        """Build every list-page URL for each day in the requested period.

        Args:
            category_name: Name used in progress output and ``page_count`` keys.
            category_url: URL prefix to which ``YYYYMMDD`` is appended; it must
                end in the ``date=`` query parameter, because the date is read
                back from the resulting URL's query string.
            start_year, end_year, start_month, end_month: Inclusive period.

        Returns:
            A tuple ``(made_urls, page_count)``: ``made_urls`` is the list of
            ``<category_url><date>&page=<n>`` URLs, and ``page_count`` maps
            ``"[<category_name>][<date>]"`` to that day's total page count.
        """
        made_urls = []
        page_count = {}
        for year in range(start_year, end_year + 1):
            first_month, last_month = ArticleCrawler._month_bounds(
                year, start_year, end_year, start_month, end_month)

            for month in range(first_month, last_month + 1):
                days_in_month = calendar.monthrange(year, month)[1]
                for month_day in range(1, days_in_month + 1):
                    # One list URL per day; month and day are zero-padded.
                    url = category_url + f"{year}{month:02d}{month_day:02d}"

                    # Per the original author's note: requesting page=10000
                    # (a page that does not exist) makes Naver redirect to the
                    # last page, which is how the total page count is found.
                    totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")

                    # Read the date back from the URL so the key is built the
                    # same way crawling() rebuilds it for its lookup.
                    current_date = parse_qs(urlparse.urlparse(url).query)['date'][0]

                    print("[", category_name, "][", current_date, "] >>>>>> ", totalpage)

                    page_count["[" + category_name + "][" + current_date + "]"] = totalpage
                    made_urls.extend(url + "&page=" + str(page) for page in range(1, totalpage + 1))

        return made_urls, page_count

    @staticmethod
    def get_url_data(url, max_tries=5, timeout=30):
        """GET ``url``, retrying on request errors.

        Args:
            url: Address to fetch.
            max_tries: Maximum number of attempts.
            timeout: Seconds to wait for the server on each attempt, so a
                stalled connection cannot block the crawl indefinitely.

        Returns:
            The ``requests`` response of the first attempt that does not raise.

        Raises:
            ResponseTimeout: If every attempt raised a request error.
        """
        remaining_tries = int(max_tries)
        while remaining_tries > 0:
            try:
                return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
            # ``requests.exceptions`` is a module and cannot be used in an
            # ``except`` clause; RequestException is the library's base error.
            except requests.exceptions.RequestException:
                sleep(1)
            remaining_tries -= 1
        raise ResponseTimeout()

    @staticmethod
    def _collect_post_urls(document):
        """Return the article URLs linked from one parsed list page."""
        # html - newsflash_body - type06_headline, type06
        posts = document.select('.newsflash_body .type06_headline li dl')
        posts.extend(document.select('.newsflash_body .type06 li dl'))
        # Entries without an anchor or without an href cannot be fetched.
        return [post.a.get('href') for post in posts
                if post.a is not None and post.a.get('href')]

    @staticmethod
    def _extract_article(document_content, raw_html):
        """Extract (time, company, headline, body) from an article page.

        Returns None when any of the four fields is missing or empty, which
        means the article should be skipped.
        """
        # Headline. The original call passed a third positional dict, which
        # BeautifulSoup treats as the ``recursive`` flag (truthy), so it
        # never filtered on class; it is omitted here.
        tag_headline = document_content.find_all('h3', {'id': 'articleTitle'})
        if not tag_headline:
            return None
        text_headline = ArticleParser.clear_headline(str(tag_headline[0].find_all(text=True)))
        if not text_headline:
            return None

        # Article body.
        tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
        if not tag_content:
            return None
        text_sentence = ArticleParser.clear_content(str(tag_content[0].find_all(text=True)))
        if not text_sentence:
            return None

        # Press company.
        tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
        if not tag_company:
            return None
        text_company = str(tag_company[0].get('content'))
        if not text_company:
            return None

        # Article timestamp, taken from the raw HTML.
        found_times = re.findall('<span class="t11">(.*)</span>', raw_html)
        if not found_times:
            return None

        return found_times[0], text_company, text_headline, text_sentence

    def _crawl_list_page(self, url, category_name, writer):
        """Fetch one list page and write a row for every usable article on it."""
        request = self.get_url_data(url)
        document = BeautifulSoup(request.content, 'html.parser')

        for content_url in self._collect_post_urls(document):
            # Pause between article requests.
            sleep(0.2)

            request_content = self.get_url_data(content_url)
            try:
                document_content = BeautifulSoup(request_content.content, 'html.parser')
                article = self._extract_article(document_content, request_content.text)
                if article is None:
                    continue
                article_time, text_company, text_headline, text_sentence = article
                writer.write_row([article_time, category_name, text_company, text_headline, text_sentence,
                                  content_url])
            except Exception as ex:
                # Best effort: one bad article (the original comment mentions
                # UnicodeEncodeError) must not stop the crawl, but say so
                # instead of dropping it silently.
                print("[" + category_name + "] skipped", content_url, "-", repr(ex))

    def crawling(self, category_name):
        """Crawl every article of ``category_name`` within ``self.date``.

        Runs inside a worker process started by :meth:`start`. A ``Writer`` is
        opened for each list page and closed once that page has been handled.

        Args:
            category_name: A key of ``self.categories``.
        """
        # PID of this worker process.
        print(category_name + " PID: " + str(os.getpid()))

        # List-page URL prefix; the date (YYYYMMDD) is appended to it.
        url_format = f'http://news.naver.com/main/list.nhn?mode=LS2D&mid=sec&sid1=101&sid2={self.categories.get(category_name)}&date='
        # Collect list pages from start_year/start_month to end_year/end_month.
        target_urls, page_count = self.make_news_page_url(category_name, url_format, self.date['start_year'],
                                                          self.date['end_year'], self.date['start_month'],
                                                          self.date['end_month'])

        sleep(5)

        print("[" + category_name + "]", "**************** Urls are generated ****************")
        print("[" + category_name + "]", "**************** The crawler starts ****************")

        sleep(5)

        total_urls = len(target_urls)
        for url in target_urls:
            query = parse_qs(urlparse.urlparse(url).query)
            current_date = query['date'][0]
            page_num = query['page'][0]

            page_total = page_count["[" + category_name + "][" + current_date + "]"]

            # Progress report.
            print("[" + category_name + "][" + current_date + "]", page_num, "/", page_total, url)

            # NOTE(review): total_page receives the number of list URLs in the
            # whole run, not this date's page count - confirm against Writer.
            writer = Writer(category='Article', article_category=category_name, date=current_date, page_num=page_num,
                            total_page=total_urls)
            try:
                self._crawl_list_page(url, category_name, writer)
            finally:
                # Close every writer, including when the page fails.
                writer.close()

    def start(self):
        """Start one crawling process per selected category.

        The processes are started but not joined here.
        """
        for category_name in self.selected_categories:
            proc = Process(target=self.crawling, args=(category_name,))
            proc.start()


if __name__ == "__main__":
    # Categories to crawl, one worker process each. To crawl a subset, list
    # fewer names here, e.g. ('산업_재계', '증권').
    target_categories = ('금융', '증권', '산업_재계', '중기_벤처', '부동산', '글로벌경제', '생활경제', '경제 일반')

    Crawler = ArticleCrawler()
    Crawler.set_category(*target_categories)
    # Crawl July 2020 through September 2020.
    Crawler.set_date_range(start_year=2020, start_month=7, end_year=2020, end_month=9)
    Crawler.start()


# --- Notebook cell output ---
# Collecting KoreaNewsCrawler
#   Downloading KoreaNewsCrawler-1.50-py3-none-any.whl (11 kB)
# Installing collected packages: KoreaNewsCrawler
# Successfully installed KoreaNewsCrawler-1.50
# {'start_year': 2020, 'start_month': 7, 'end_year': 2020, 'end_month': 9}
