# Data Acquisition
---

In [1]:
from bs4 import BeautifulSoup as bs
import datetime
import pandas as pd
import pickle
import re
import requests
# from selenium import webdriver
import sqlite3
import time as time_module

In [2]:
# Press identifiers mapped to their Korean display names.
press_dict = {
    'daum': '다음',
    'naver': '네이버',
    'seoulilbo': '서울일보',
    'dtoday': '일간투데이',
    'asiailbo': '아시아일보',
    'labortoday': '매일노동뉴스',
    'm-i': '매일일보',
    'ekn': '에너지경제',
    'busan': '부산일보',
    'imaeil': '매일신문',
    'kookje': '국제신문',
    'yeongnam': '영남일보',
}

# Press identifiers in declaration order (dicts keep insertion order).
press_list = list(press_dict)

# Press (outlet) names associated with the Daum portal.
# NOTE(review): presumably the outlets whose articles Daum carries; this list
# is not referenced by any other cell visible in this notebook.
daum_press_list = ['EBS', 'IT동아', 'JTBC', 'KBS', 'KTV',
                   'MBC', 'MBN', 'SBS', 'SBS CNBC', 'YTN',
                   'ZDNet Korea', 'bnt뉴스', '게임동아', '게임톡', '경향신문',
                   '국민일보', '기자협회보', '노컷뉴스', '뉴스1', '뉴시스',
                   '데일리안', '동아사이언스', '동아일보', '디지털타임스', '로이터',
                   '매경게임진', '매일경제', '머니S', '머니투데이', '문화일보',
                   '미디어오늘', '서울경제', '서울신문', '세계일보', '아시아경제',
                   '아이뉴스24', '연합뉴스', '연합뉴스TV', '오마이뉴스', '오토타임즈',
                   '이데일리', '전자신문', '조선비즈', '조선일보', '중앙일보',
                   '채널A', '코리아헤럴드', '쿠키뉴스', '파이낸셜뉴스', '포토친구',
                   '프레시안', '한겨레', '한국경제', '한국경제TV', '한국일보',
                   '헤럴드경제']
# Press (outlet) names associated with the Naver portal.
# NOTE(review): same assumption as above; also unused in the visible cells.
naver_press_list = ['JTBC', 'KBS', 'MBC', 'MBN', 'SBS',
                    'SBS CNBC', 'TV조선', 'YTN', 'ZDNet Korea', '강원일보',
                    '경향신문', '국민일보', '기자협회보', '노컷뉴스', '뉴스1',
                    '뉴시스', '데일리안', '동아사이언스', '동아일보', '디지털데일리',
                    '디지털타임스', '로이터', '매일경제', '매일신문', '머니S',
                    '머니투데이', '문화일보', '미디어오늘', '부산일보', '블로터',
                    '서울경제', '서울신문', '세계일보', '아시아경제', '아이뉴스24',
                    '여성신문', '연합뉴스', '연합뉴스TV', '오마이뉴스', '이데일리',
                    '일다', '전자신문', '조선비즈', '조선일보', '조세일보',
                    '중앙일보', '참세상', '채널A', '코리아헤럴드', '코메디닷컴',
                    '파이낸셜뉴스', '프레시안', '한겨레', '한국경제', '한국경제TV',
                    '한국일보', '헤럴드경제', '헬스조선']

# Section identifiers mapped to the Korean labels stored with each article.
section_dict = {
    'society': '사회',
    'politics': '정치',
    'economic': '경제',
    'culture': '문화',
    'digital': 'IT',
    'global': '세계',
}

# Section identifiers in the order they are scraped.
section_list = list(section_dict)

# Section listing-page URLs, keyed as base_urls[press][section].
# An empty string marks a section for which no listing URL is configured;
# the scraper functions in this notebook skip such entries.
# NOTE: 'naver' is listed in press_list but has no entry in this table.
base_urls = {'daum':
             {'society':'http://media.daum.net/breakingnews/society',
              'politics':'http://media.daum.net/breakingnews/politics',
              'economic':'http://media.daum.net/breakingnews/economic',
              'culture':'http://media.daum.net/breakingnews/culture',
              'digital':'http://media.daum.net/breakingnews/digital',
              'global':'http://media.daum.net/breakingnews/foreign'
             },
             'seoulilbo':
             {'society':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N10&view_type=sm',
              'politics':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N8&view_type=sm',
              'economic':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N9&view_type=sm',
              'culture':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N11&view_type=sm',
              'digital':'',
              'global':''
             },
             'dtoday':
             {'society':'',
              'politics':'http://www.dtoday.co.kr/news/articleList.html?sc_section_code=S1N1&view_type=sm',
              'economic':'http://www.dtoday.co.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'culture':'',
              'digital':'',
              'global':''
             },
             'asiailbo':
             {'society':'http://www.asiailbo.co.kr/etnews/?cid=21030000',
              'politics':'http://www.asiailbo.co.kr/etnews/?cid=21010000',
              'economic':'http://www.asiailbo.co.kr/etnews/?cid=21020000',
              'culture':'http://www.asiailbo.co.kr/etnews/?cid=21040000',
              'digital':'',
              'global':''
             },
             'labortoday':
             {'society':'http://www.labortoday.co.kr/news/articleList.html?sc_section_code=S1N3&view_type=sm',
              'politics':'http://www.labortoday.co.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'economic':'',    # politics, economy
              'culture':'',
              'digital':'',
              'global':''
             },
             'm-i':
             {'society':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N3&view_type=sm',
              'politics':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N1&view_type=tm',
              'economic':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'culture':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N22&view_type=tm',
              'digital':'',
              'global':''
             },
             'ekn':
             {'society':'http://www.ekn.kr/news/section_list_all.html?sec_no=25',
              'politics':'',    # politics, society
              'economic':'http://www.ekn.kr/news/section_list_all.html?sec_no=130',
              'culture':'',
              'digital':'',
              'global':''
             },
             'busan':
             {'society':'http://news20.busan.com/news/social.jsp',
              'politics':'http://news20.busan.com/news/politics.jsp',
              'economic':'http://news20.busan.com/EconomyAndOcean/econocean.jsp',
              'culture':'http://news20.busan.com/news/culture.jsp',
              'digital':'',
              'global':''
             },
             'imaeil':
             {'society':'http://news.imaeil.com/SocietyAll/',
              'politics':'http://news.imaeil.com/PoliticsAll/',
              'economic':'http://news.imaeil.com/EconomyAll/',
              'culture':'http://news.imaeil.com/CultureAll/',
              'digital':'',
              'global':'http://news.imaeil.com/InternationalAll/'
             },
             'kookje':
             {'society':'http://www.kookje.co.kr/sub.htm?code=0300&vHeadTitle=%BB%E7%C8%B8',
              'politics':'http://www.kookje.co.kr/sub.htm?code=0100&vHeadTitle=%C1%A4%C4%A1',
              'economic':'http://www.kookje.co.kr/sub.htm?code=0200&vHeadTitle=%B0%E6%C1%A6',
              'culture':'http://www.kookje.co.kr/sub.htm?code=0500&vHeadTitle=%B9%AE%C8%AD',
              'digital':'http://www.kookje.co.kr/sub.htm?code=0800&vHeadTitle=IT%B0%FA%C7%D0',
              'global':'http://www.kookje.co.kr/sub.htm?code=0400&vHeadTitle=%B1%B9%C1%A6'
             },
             'yeongnam':
             {'society':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=04',
              'politics':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=02',
              'economic':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=03',
              'culture':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=08',
              'digital':'',    # education, science
              'global':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=06'
             }
            }

# Browser-like User-Agent header sent with every HTTP request in this notebook.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    )
}

In [3]:
# Named regular expressions used while cleaning article text.
# 'email' matches an e-mail address (local part, '@', dotted domain).
reg_ex = dict(
    email=r'[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',
)

## Create DB
---

In [4]:
# Open the SQLite database that stores scraped Daum articles.
# The './db' directory is expected to exist before this cell runs.
conn = sqlite3.connect('./db/daum.db')

cur = conn.cursor()

# IF NOT EXISTS keeps this cell safe to re-run while letting genuine failures
# (bad schema, locked or read-only database, ...) surface instead of being
# silently swallowed by a blanket `except: pass`.
cur.execute("CREATE TABLE IF NOT EXISTS daum(a_ids TEXT primary key, \
            dates DATE, times TIME, titles TEXT, contents TEXT, \
            press TEXT, authors TEXT, sections TEXT, urls TEXT)")

conn.commit()

## Scrape URLs
---

In [5]:
def get_period(date_end_time_delta, day_period):
    """Return the first and last date of a scraping window.

    Args:
        date_end_time_delta: Number of days before today on which the
            window ends (0 = today, 1 = yesterday, ...).
        day_period: Length of the window in days, end date included.

    Returns:
        tuple: ``(date_start, date_end)`` as ``datetime.date`` objects.
    """
    today = datetime.date.today()
    last_day = today - datetime.timedelta(days=date_end_time_delta)
    first_day = last_day - datetime.timedelta(days=day_period - 1)
    return first_day, last_day

In [10]:
def scrap_daum_news_urls(date_end_time_delta=1, day_period=7, page_limit=10000, max_retries=10):
    """Collect Daum breaking-news article URLs per section and per day.

    For every section in ``section_list`` that has a non-empty Daum URL in
    ``base_urls``, and for each of the ``day_period`` days ending
    ``date_end_time_delta`` days before today, the listing pages are walked
    from page 1 until a page yields no article links or ``page_limit`` is
    reached.  The URLs gathered for one section/day are pickled to
    ``db/urls/daum/daum_<section>_<YYYYMMDD>.pkl``.

    Args:
        date_end_time_delta: Days before today on which the window ends.
        day_period: Number of days in the window (end date included).
        page_limit: Highest listing page requested per section/day.
        max_retries: How many times a failed page request is retried
            before the error is re-raised.

    Raises:
        requests.exceptions.RequestException: If a listing page still
            fails after ``max_retries`` retries.
    """
    import os  # local import: only needed to create the output directory

    date_start, date_end = get_period(date_end_time_delta, day_period)

    press = 'daum'
    out_dir = 'db/urls/daum/'
    # open(..., 'wb') does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    # Section iteration
    for section in section_list:
        section_url = base_urls[press][section]
        if section_url == '':
            continue

        print(press, section_dict[section], section_url)
        print(date_start, ' - ', date_end)
        print('--------------------------------------------------\n')

        # Date iteration (newest day first)
        for time_delta in range(day_period):
            date = str(date_end - datetime.timedelta(time_delta))
            print(date)
            date_compact = date.replace('-', '')

            urls = []

            # Page iteration
            page = 1
            while True:
                if (page % 100) == 0:
                    print('{0:6,} / {1:6,}'.format(page, page_limit))

                req_url = section_url + '?page=' + str(page) + '&regDate=' + date_compact
                resp = _get_with_retry(req_url, max_retries)
                html = bs(resp.text, "lxml")

                page_urls = [a.get('href') for a in html.select('#mArticle .tit_thumb > .link_txt')]
                urls.extend(page_urls)

                if page == page_limit:
                    break
                elif not page_urls:
                    # A page without article links marks the end of the listing.
                    print('#', page, ': end of pages.')
                    break
                else:
                    page += 1

            with open(out_dir + press + '_' + section + '_' + date_compact + '.pkl', 'wb') as f:
                pickle.dump(urls, f)

        print('\n--------------------------------------------------\n\n')


def _get_with_retry(url, max_retries, timeout=1.5, pause=0.5):
    """GET ``url`` with the notebook's ``headers``, retrying on request errors.

    Only ``requests`` errors are retried, so ``KeyboardInterrupt`` and
    programming errors propagate immediately.  A short pause between
    attempts avoids hammering the server in a tight loop.  The last error is
    re-raised once ``max_retries`` retries have been used up.
    """
    attempt = 0
    while True:
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            attempt += 1
            if attempt > max_retries:
                raise
            time_module.sleep(pause)

In [11]:
# Collect Daum article URLs for the 14 days ending yesterday
# (date_end_time_delta=1, day_period=14).
scrap_daum_news_urls(1, 14)

daum 사회 http://media.daum.net/breakingnews/society
2018-08-10  -  2018-08-23
--------------------------------------------------

2018-08-23
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
   500 / 10,000
# 534 : end of pages.
2018-08-22
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
   500 / 10,000
# 524 : end of pages.
2018-08-21
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
# 480 : end of pages.
2018-08-20
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
# 434 : end of pages.
2018-08-19
   100 / 10,000
# 193 : end of pages.
2018-08-18
   100 / 10,000
# 134 : end of pages.
2018-08-17
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
# 412 : end of pages.
2018-08-16
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
# 485 : end of pages.
2018-08-15
   100 / 10,000
   200 / 10,000
# 283 : end of pages.
2018-08-14
   100 / 10,000
   200 / 10,000
   300 / 10,000
   400 / 10,000
# 476 : end of page

## [Debug] Scrape URLs
---

In [12]:
# Pick one pickled URL list (press / section / day) to inspect.
press, section, date = 'daum', 'politics', '20180823'

# NOTE: pickle.load is only safe on files this notebook wrote itself.
with open('db/urls/daum/' + '_'.join((press, section, date)) + '.pkl', 'rb') as f:
    urls = pickle.load(f)

type(urls)

list

In [13]:
# Preview the first ten entries of the `urls` list.
urls[:10]

['http://v.media.daum.net/v/20180823235953274',
 'http://v.media.daum.net/v/20180823235712261',
 'http://v.media.daum.net/v/20180823234956198',
 'http://v.media.daum.net/v/20180823234929195',
 'http://v.media.daum.net/v/20180823234832186',
 'http://v.media.daum.net/v/20180823225456524',
 'http://v.media.daum.net/v/20180823224023322',
 'http://v.media.daum.net/v/20180823223902308',
 'http://v.media.daum.net/v/20180823223627276',
 'http://v.media.daum.net/v/20180823223304232']

## Scrape News (work in progress)
---

In [5]:
def scrap_daum_news(date_end_delta = 1, day_period=7):
    """Scrape Daum news articles section by section and store them in SQLite.

    For every section in ``section_list`` with a non-empty Daum URL in
    ``base_urls``, listing pages ``?page=0`` .. ``?page=19`` are fetched to
    collect article URLs.  Each article is then downloaded and parsed, and
    one row is inserted into the ``daum`` table through the module-level
    ``cur`` / ``conn``.

    Scraping is best-effort.  A listing page or article is skipped, without
    aborting the run, when:

    * the HTTP request fails or times out,
    * the expected header, title or body element is missing,
    * no date or time can be parsed,
    * the title contains no Hangul, or
    * the row cannot be inserted (e.g. the article id is already stored).

    Args:
        date_end_delta: Accepted for interface compatibility; not yet used
            to restrict the scraped date range.
        day_period: Accepted for interface compatibility; not yet used.
    """
    press = 'daum'

    for section in section_list:
        section_url = base_urls[press][section]
        if section_url == '':
            continue

        print(press, section_dict[section], section_url)
        print('--------------------------------------------------\n')

        section_ko = section_dict[section]
        urls = []

        # Collect article URLs from the listing pages.
        for page in range(20):
            try:
                resp = requests.get(section_url + '?page=' + str(page), headers=headers, timeout=0.2)
            except requests.exceptions.RequestException as err:
                print('# listing page', page, 'skipped:', err)
                continue
            html = bs(resp.text, "lxml")
            urls.extend(a.get('href') for a in html.select('#mArticle .tit_thumb > .link_txt'))

        # Download, parse and store every article.
        for idx, url in enumerate(urls):
            if (idx % 100) == 0:
                print('{0:6,} / {1:6,}'.format(idx, len(urls)))

            try:
                resp = requests.get(url, headers=headers, timeout=1)
            except requests.exceptions.RequestException as err:
                print('# article skipped:', url, err)
                continue
            html = bs(resp.text, "lxml")

            # Article id: 'da_' plus the last 17 characters of the URL.
            a_id = 'da_' + url[-17:]

            # TODO: change date handling
            date_html = html.select('.head_view .info_view')
            title_html = html.select('.head_view .tit_view')
            if not date_html or not title_html:
                print('# article skipped (missing header):', url)
                continue

            info_text = date_html[0].text
            date_match = re.search(r'\d{2,4}[.-]?\d+[.-]?.\d+', info_text)
            time_match = re.search(r'\d+:\d+', info_text)
            if date_match is None or time_match is None:
                print('# article skipped (no date/time):', url)
                continue
            date_str = date_match.group(0)
            time_str = time_match.group(0)

            title = title_html[0].text

            # Keep only articles whose title contains Hangul.
            if re.search(r'[ㄱ-ㅎㅏ-ㅣ가-힣]+', title) is None:
                continue

            content_html = html.select('.news_view #harmonyContainer')
            if not content_html:
                print('# article skipped (missing body):', url)
                continue
            raw_content = content_html[0].text
            content = raw_content.strip()

            # content: drop everything from the reporter's e-mail address
            # onward, but only if more than 60% of the text would remain.
            email_match = re.search(reg_ex['email'], content)
            if email_match is not None:
                content_temp = content[:email_match.start()].strip()
                if (len(content_temp) / len(content)) > 0.6:
                    content = content_temp

            # Author: the word preceding '기자' (reporter), if present.
            author_match = re.search(r'\w+\s*기자', raw_content)
            if author_match is not None:
                author = author_match.group(0).replace(' 기자', '').replace('기자', '')
            else:
                author = None

            # Press name comes from the alt text of the header logo image.
            press_html = html.select('.head_view .thumb_g')
            press_ko = press_html[0].get('alt') if press_html else None

            data = (a_id, date_str, time_str, title, content, press_ko, author, section_ko, url)
            try:
                cur.execute("INSERT INTO daum(a_ids, dates, times, titles, contents, press, authors, sections, urls) \
                                    values(?,?,?,?,?,?,?,?,?)", data)
            except sqlite3.IntegrityError:
                # Primary-key clash: the article is already stored.
                continue
            except sqlite3.Error as err:
                print('# insert failed:', url, err)
                continue
            conn.commit()

        print('\n--------------------------------------------------\n\n')

In [6]:
# Run the article scraper with its default arguments.
scrap_daum_news()

daum 사회 http://media.daum.net/breakingnews/society
--------------------------------------------------

     0 /    300
   100 /    300
   200 /    300

--------------------------------------------------


daum 정치 http://media.daum.net/breakingnews/politics
--------------------------------------------------

     0 /    300
   100 /    300
   200 /    300

--------------------------------------------------


daum 경제 http://media.daum.net/breakingnews/economic
--------------------------------------------------

     0 /    300
   100 /    300
   200 /    300

--------------------------------------------------


daum 문화 http://media.daum.net/breakingnews/culture
--------------------------------------------------

     0 /    300
   100 /    300
   200 /    300

--------------------------------------------------


daum IT http://media.daum.net/breakingnews/digital
--------------------------------------------------

     0 /    300
   100 /    300
   200 /    300

--------------------------

In [7]:
# # On error, delete a specific section's data from the DB
# conn = sqlite3.connect('db/daum.db')
# cur = conn.cursor()

# cur.execute("DELETE FROM daum WHERE sections='IT'")

# conn.commit()