# PM2

September 21st, 2020 <br/>Hannah Kim

## Data Collection

Import necessary packages.

In [9]:
import requests
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import json
import time
import datetime
from selenium.common.exceptions import NoSuchElementException

Definition of the getNaverNewsLinks() function, which opens the input searchURL (the search for news on the doctors' strike) and returns the list of all Naver News links found there for the input news publisher.

In [2]:
def getNaverNewsLinks(searchURL, name):
    """Collect every Naver News link that a search lists for one publisher.

    A browser must be defined before running this function.

    Loads ``searchURL`` in the global ``browser``, toggles the publisher
    filter for ``name`` via togglePublisherPreference(), walks through all
    result pages, and toggles the filter again before returning.

    Args:
        searchURL: URL of the search-results page to open.
        name: Publisher label, passed unchanged to togglePublisherPreference().

    Returns:
        A list with the ``href`` of every result anchor whose text is
        '네이버뉴스', in the order the pages were visited.
    """
    browser.get(searchURL)

    # set news publisher
    togglePublisherPreference(name)

    allLinks = []
    try:
        while True:
            # The results container is looked up afresh for every page.
            mainArea = browser.find_element_by_css_selector('div.news.mynews.section._prs_nws')
            anchors = mainArea.find_elements_by_css_selector('ul li dl dd.txt_inline a')
            allLinks.extend(anchor.get_attribute('href') for anchor in anchors
                            if anchor.text == '네이버뉴스')

            nextPageButton = mainArea.find_elements_by_css_selector('div.paging a.next')
            if not nextPageButton:
                break  # no "next" link: this was the last page
            nextPageButton[0].click()
    finally:
        # reset news publisher, also when scraping fails part-way; presumably
        # the selection would otherwise carry over into the next search.
        togglePublisherPreference(name)

    return allLinks

Definition of the togglePublisherPreference() function, which toggles the selection of the news publisher with the input name in the news source preference settings. This is a helper function used in the getNaverNewsLinks() function.

In [3]:
def togglePublisherPreference(name):
    """Toggle the checkbox of one publisher in the news-source options box.

    A helper function for the getNaverNewsLinks() function. A browser must be
    defined beforehand; the search-results page is expected to be loaded in it.

    Args:
        name: Text of the publisher's label in the options box.

    Raises:
        RuntimeError: If no menu button labelled '언론사' or '출처선택' exists.
        ValueError: If no option in the box carries the label ``name``.
    """
    # The button is matched by either caption (presumably two variants of the
    # same menu entry); the first match wins.
    menuCaptions = ('언론사', '출처선택')
    preferenceButton = next(
        (x for x in browser.find_elements_by_css_selector('li.menu > a.m')
         if x.text in menuCaptions),
        None,
    )
    if preferenceButton is None:
        raise RuntimeError("Publisher preference menu button not found on the current page")
    preferenceButton.click() # open box containing news publisher preference options

    popUpBox = browser.find_element_by_css_selector('#_nx_option_media')
    for option in popUpBox.find_elements_by_css_selector('ul.viewlst li'):
        # find option with desired news publisher
        if option.find_element_by_css_selector('label').text == name:
            checkbox = option.find_element_by_css_selector('input')
            break
    else:
        # Fail loudly here instead of calling .click() on the option list.
        raise ValueError("No publisher option labelled {!r} in the preference box".format(name))

    checkbox.click()
    submitButton = popUpBox.find_element_by_css_selector('div.view_btn button._submit_btn')
    submitButton.click() # submit preference

Defines the browser, the search URL containing the query for the search engine, a list of central news publishers, and the dictionary linksByPublisher, which contains all relevant Naver News links for each publisher. The list valid_types holds the numeric codes of the article types that yield the relevant search results. Finally, prints the number of article links obtained for each news publisher.

In [4]:
browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')

valid_types = [1, 3] # 1 for photo, 3 for paper

publishers = ['경향신문', '국민일보', '내일신문', '동아일보', '문화일보', 
              '서울신문', '세계일보', '조선일보', '중앙일보', '한겨레', '한국일보']

linksByPublisher = dict()

# Search-results URL template; '{}' receives the article type (the `photo`
# query parameter). Adjacent string literals are used instead of a
# triple-quoted string so that the URL contains no embedded newlines.
# NOTE(review): `de=2020.10.20` disagrees with the `to20200920` bound inside
# `nso` -- confirm which end date is intended.
searchLink = (
    'https://search.naver.com/search.naver?where=news'
    '&query=%22%EC%9D%98%EC%82%AC%22%20%22%ED%8C%8C%EC%97%85%22&sm=tab_opt&sort=0'
    '&photo={}&field=0&reporter_article=&pd=3&ds=2020.07.23&de=2020.10.20&docid='
    '&nso=so%3Ar%2Cp%3Afrom20200723to20200920%2Ca%3Aall&mynews=1&refresh_start=0&related=0'
)

try:
    for publisher in publishers:
        # One search per article type; the links of all types are pooled per publisher.
        linksByPublisher[publisher] = []
        for article_type in valid_types:
            linksByPublisher[publisher].extend(getNaverNewsLinks(searchLink.format(article_type), publisher))
        print(publisher, len(linksByPublisher[publisher]))
finally:
    # quit() ends the whole driver session (close() only closes the window),
    # and the finally block releases the browser even if scraping fails.
    browser.quit()

경향신문 262
국민일보 250
내일신문 0
동아일보 173
문화일보 86
서울신문 4191
세계일보 398
조선일보 245
중앙일보 26678
한겨레 2637
한국일보 653


Definition for the getHtmlDataAndInformation() function, which returns a tuple consisting of 1) the raw HTML data of the page and 2) a dictionary containing information about the article accessed through the input link.

In [7]:
def _parseCount(text):
    """Convert a displayed counter such as '1,234' to an int; non-numeric text gives 0."""
    digits = text.replace(',', '')
    return int(digits) if digits.isdigit() else 0


def getHtmlDataAndInformation(link):
    """Load an article page and extract its raw HTML plus selected fields.

    A browser must be defined before running this function.

    Args:
        link: URL of the article page to open in the global ``browser``.

    Returns:
        A tuple ``(html, information)``. ``html`` is the browser's page source.
        ``information`` is a dict that, when every element is found, has the
        keys 'title', 'datetime' (a datetime.datetime), 'num_comments',
        'content', 'reactions' (dict with the keys 'good', 'warm', 'sad',
        'angry', 'want') and 'recommends'. If an element is missing, a message
        is printed and the dict holds only the fields extracted up to then.
    """
    browser.get(link)
    time.sleep(0.5) # wait for page to load

    html = browser.page_source
    information = dict()

    try:
        # title area
        titleArea = browser.find_element_by_css_selector('div.article_info')
        information['title'] = titleArea.find_element_by_css_selector('#articleTitle').text

        # convert text to datetime object; the Korean AM/PM markers are
        # replaced first so that the '%p' directive can parse them
        publishedTime = titleArea.find_element_by_css_selector('span.t11').text
        publishedTime = publishedTime.replace('오전', 'AM').replace('오후', 'PM')
        information['datetime'] = datetime.datetime.strptime(publishedTime, '%Y.%m.%d. %p %I:%M')

        information['num_comments'] = _parseCount(
            titleArea.find_element_by_css_selector('span.lo_txt').text)

        # get article content without links to irrelevant articles
        bodyText = browser.find_element_by_css_selector('#articleBodyContents').text
        information['content'] = '\n'.join(
            line.strip() for line in bodyText.split('\n') if not line.startswith('▶'))

        # bottom area
        bottomArea = browser.find_element_by_css_selector('#spiLayer')
        # reactions: parsed with the same fallback as comments and recommends,
        # so an empty or non-numeric counter becomes 0 instead of raising
        information['reactions'] = {
            reaction: _parseCount(
                bottomArea.find_element_by_css_selector('li.{} span._count'.format(reaction)).text)
            for reaction in ('good', 'warm', 'sad', 'angry', 'want')
        }
        # recommends
        information['recommends'] = _parseCount(
            bottomArea.find_element_by_css_selector('em.u_cnt._count').text)

    except NoSuchElementException:
        print("Failed to extract information from {}".format(link))

    return (html, information)

Stores raw HTML data for all articles for each news publisher in the dictionary allRawHtmlData.
Also stores article information for each article for each news publisher in the dictionary articleInfo.

In [10]:
browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')

allRawHtmlData = dict() # make json later
articleInfo = dict()

try:
    for publisher, newsLinks in linksByPublisher.items():
        # Both dictionaries are keyed by publisher, then by article link.
        allRawHtmlData[publisher] = dict()
        articleInfo[publisher] = dict()
        for newsLink in newsLinks:
            data, info = getHtmlDataAndInformation(newsLink)
            allRawHtmlData[publisher][newsLink] = data
            articleInfo[publisher][newsLink] = info
        print(publisher, len(newsLinks), "complete")
finally:
    # quit() ends the whole driver session (close() only closes the window),
    # and the finally block releases the browser even if a download fails.
    browser.quit()

경향신문 262 complete
Failed to extract information from https://sports.news.naver.com/news.nhn?oid=005&aid=0001357300
국민일보 250 complete
내일신문 0 complete
동아일보 173 complete
문화일보 86 complete
서울신문 4191 complete
세계일보 398 complete
조선일보 245 complete
중앙일보 26678 complete
한겨레 2637 complete
한국일보 653 complete


Stores variables for use in the other Jupyter Notebook.

In [18]:
# Persist these two variables with IPython's %store magic so that the other
# notebook can reload them (e.g. with `%store -r`).
%store publishers
%store articleInfo

Stored 'publishers' (list)
Stored 'articleInfo' (dict)
