In [None]:
import pandas as pd
import requests
import numpy as np
import time
import re
from bs4 import BeautifulSoup
from datetime import datetime
from urllib import parse

from IPython.display import clear_output # 주피터 노트북으로 실행할때 clear
from os import system, name # cmd 로 실행할때 clear

def clear():
    """Clear the terminal screen when the script runs from a command line.

    Runs ``cls`` on Windows (``os.name == 'nt'``) and ``clear`` on every
    other platform, so the function is no longer a silent no-op outside
    Windows.  The shell command's exit status is ignored.
    """
    system('cls' if name == 'nt' else 'clear')

        
# Directory that holds the input company list ('crp_list.xls') and the
# crawler's output / progress file ('hk_news_url.csv').
filepath = './data/'

def _search_window(end_date):
    """Return the ``(start, end)`` search dates as ``'YYYY.MM.DD'`` strings.

    A null delisting date selects the fixed window 2016.12.31 - 2019.12.31;
    otherwise the window is the three years ending on the delisting date.
    """
    if pd.isnull(end_date):
        return '2016.12.31', '2019.12.31'

    end = pd.to_datetime(end_date)
    try:
        start = end.replace(year=end.year - 3)
    except ValueError:
        # Feb 29 has no counterpart three years earlier; fall back to Feb 28.
        start = end.replace(year=end.year - 3, day=28)
    return start.strftime('%Y.%m.%d'), end.strftime('%Y.%m.%d')


def _search_url(keyword, start_date, end_date, page):
    """Build the Hankyung news-search URL for one result page."""
    # Percent-encode the company name so characters such as '&' or '#'
    # cannot corrupt the query string.
    query = parse.quote(str(keyword), safe='')
    return (
        f'https://search.hankyung.com/apps.frm/search.news?query={query}'
        '&sort=DATE%2FASC%2CRANK%2FDESC&period=DATE&area=ALL'
        f'&mediaid_clust=HKPAPER&sdate={start_date}&edate={end_date}'
        f'&exact=&include=&except=&page={page}'
    )


def _fetch_soup(url, timeout=10):
    """Download *url* and return it parsed with the lxml parser."""
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=timeout)
    return BeautifulSoup(res.text, 'lxml')


def _fetch_page_with_retry(url, attempts=3, delay=0.5):
    """Fetch a result page, retrying on network errors.

    Returns the parsed page, or ``None`` once every attempt has failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            return _fetch_soup(url)
        except requests.RequestException as e:
            print('page url 접근 실패 : ', e)
            print('url : ', url)
            if attempt < attempts:
                print('재시도')
                time.sleep(delay)
    return None


def _parse_articles(soup, url, keyword, journal, enternum):
    """Extract one output row per article listed on a result page.

    If an item lacks one of the expected tags, parsing of this page stops;
    rows collected before the malformed item are still returned.
    """
    rows = []
    try:
        for tag in soup.select('ul.article li'):
            title = tag.select_one('em.tit').text.strip()
            link = tag.select_one('div.txt_wrap a').get('href')
            time_txt = tag.select_one('p.info span.date_time').text
            rows.append([keyword, title, link, journal, time_txt, enternum])
    except AttributeError as e:
        # select_one() returned None for a missing element.
        print('news_list 가져오기 실패 : ', e)
        print('url : ', url)
        print('다음페이지로')
    return rows


def _crawl_company(keyword, enternum, journal, start_date, end_date):
    """Collect the search-result rows for one company.

    Returns:
        A list of rows.  It holds a single placeholder row (NaN title, URL
        and date) when the search reports zero pages or the last-page link
        cannot be read.  ``None`` is returned when the first result page
        could not be downloaded, so that nothing is recorded for the
        company.
    """
    empty_row = [keyword, np.nan, np.nan, journal, np.nan, enternum]

    url = _search_url(keyword, start_date, end_date, 1)
    try:
        first_soup = _fetch_soup(url)
    except requests.RequestException as e:
        print('url 접근 실패 : ', e)
        print('url : ', url)
        return None

    # NOTE(review): a result set without an 'a.last' paging link is recorded
    # as a placeholder row -- confirm single-page results do expose the link.
    try:
        last_link = first_soup.select_one('div.paging a.last')
        last_page_num = int(last_link.get('href').split('=')[-1])
    except (AttributeError, ValueError) as e:
        print('last page num Error : ', e)
        return [empty_row]

    if last_page_num == 0:
        # Progress output
        print('{} {} / {}'.format(keyword, 1, last_page_num))
        return [empty_row]

    rows = []
    for page in range(1, last_page_num + 1):
        # Progress output
        print('{} {} / {}'.format(keyword, page, last_page_num))

        if page == 1:
            # The first page is already downloaded; do not request it twice.
            soup = first_soup
        else:
            url = _search_url(keyword, start_date, end_date, page)
            soup = _fetch_page_with_retry(url)
            if soup is None:
                continue

        rows.extend(_parse_articles(soup, url, keyword, journal, enternum))
    return rows


def hk_news_crawler():
    """Crawl Hankyung (한국경제) news search results for each listed company.

    Reads the company list from ``filepath + 'crp_list.xls'`` and the rows
    collected so far from ``filepath + 'hk_news_url.csv'`` (created with a
    header row when it is missing or empty).  Every company whose stock
    code is not yet present in the CSV is searched on search.hankyung.com
    within its date window, and one row per article (company name, title,
    URL, press name, upload date, stock code) is appended to the CSV.
    Companies with no usable result get a single placeholder row so they
    are not searched again; companies whose first request fails get no row
    and are therefore picked up again on the next run.
    """
    header_list = ['기업명', '기사 제목', '기사 URL', '언론사명', '기사 업로드 날짜', '종목코드']
    csv_path = filepath + 'hk_news_url.csv'
    journal = '한국경제'

    # read_excel has no ``encoding`` parameter (current pandas raises
    # TypeError for it), so it is not passed.
    enter_df = pd.read_excel(
        filepath + 'crp_list.xls',
        dtype={'종목코드': object, '상장폐지일': object},
    )
    enter_df = enter_df[['종목코드', '기업명', '상장폐지일']]

    try:
        total_df = pd.read_csv(csv_path, encoding='utf-8', dtype={'종목코드': object})
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing/empty file is (re)initialised; any other read error
        # propagates instead of overwriting previously collected rows.
        total_df = pd.DataFrame(columns=header_list)
        total_df.to_csv(csv_path, encoding='utf-8', index=False)

    # Keep only the companies whose stock code is not in the CSV yet.
    pending_codes = np.setdiff1d(enter_df['종목코드'], total_df['종목코드'])
    search_df = enter_df.set_index('종목코드').loc[pending_codes].reset_index()

    for idx, (_, row) in enumerate(search_df.iterrows()):
        clear_output(wait=True)

        print('남은 기업 수 : ', len(search_df) - idx)

        start_date, end_date = _search_window(row['상장폐지일'])
        hk_list = _crawl_company(
            row['기업명'], row['종목코드'], journal, start_date, end_date
        )

        if hk_list:
            # Save to file (append without repeating the header).
            hk_df = pd.DataFrame(hk_list, columns=header_list)
            try:
                hk_df.to_csv(csv_path, mode='a', encoding='utf-8',
                             index=False, header=False)
            except OSError as e:
                # e.g. the CSV is open in another program; the company stays
                # unrecorded and is retried on the next run.
                print('파일 저장 실패 : ', e)

        time.sleep(0.5)

In [None]:
# Start the crawl when this cell is executed.
hk_news_crawler()