# Web scraping

In [1]:
import requests

In [7]:
resp = requests.request('get','https://naver.com')

In [8]:
resp

<Response [200]>

In [15]:
resp = requests.get("https://search.naver.com", params={'query': '무역전쟁'})

In [16]:
resp

<Response [200]>

In [17]:
resp.text

'<script type="text/javascript">window.location.replace("http://www.naver.com/");</script>\n'

In [18]:
resp.content

b'<script type="text/javascript">window.location.replace("http://www.naver.com/");</script>\n'

In [19]:
resp.encoding

'UTF-8'

In [20]:
resp.headers

{'Date': 'Mon, 03 Jun 2019 01:44:34 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '99', 'Connection': 'keep-alive', 'Last-Modified': 'Thu, 30 May 2019 08:32:41 GMT', 'ETag': '"5a-58a16bdc0dc40-gzip"', 'Accept-Ranges': 'bytes', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Server': 'nxg'}

In [21]:
resp.url

'https://search.naver.com/?query=%EB%AC%B4%EC%97%AD%EC%A0%84%EC%9F%81'

In [22]:
resp.json

<bound method Response.json of <Response [200]>>

### json

In [24]:
json_url = "https://crix-api-cdn.upbit.com/v1/crix/candles/days?code=CRIX.UPBIT.KRW-BTC&count=200"
resp2 = requests.get(json_url)

In [28]:
json_content = resp2.json()
json_content

[{'code': 'CRIX.UPBIT.KRW-BTC',
  'candleDateTime': '2019-06-03T00:00:00+00:00',
  'candleDateTimeKst': '2019-06-03T09:00:00+09:00',
  'openingPrice': 10505000.0,
  'highPrice': 10509000.0,
  'lowPrice': 10386000.0,
  'tradePrice': 10450000.0,
  'candleAccTradeVolume': 1087.22658665,
  'candleAccTradePrice': 11356847991.76565,
  'timestamp': 1559526416537,
  'prevClosingPrice': 10505000.0,
  'change': 'FALL',
  'changePrice': 55000.0,
  'changeRate': 0.0052356021,
  'signedChangeRate': -0.0052356021,
  'signedChangePrice': -55000.0},
 {'code': 'CRIX.UPBIT.KRW-BTC',
  'candleDateTime': '2019-06-02T00:00:00+00:00',
  'candleDateTimeKst': '2019-06-02T09:00:00+09:00',
  'openingPrice': 10350000.0,
  'highPrice': 10599000.0,
  'lowPrice': 10340000.0,
  'tradePrice': 10505000.0,
  'candleAccTradeVolume': 6868.79564903,
  'candleAccTradePrice': 71775106042.42062,
  'timestamp': 1559520000126,
  'prevClosingPrice': 10350000.0,
  'change': 'RISE',
  'changePrice': 155000.0,
  'changeRate': 0.01

In [31]:
type(json_content)

list

In [32]:
type(json_content[0])

dict

## 네이버 키워드로 뉴스 수집하기

In [108]:
import requests
from bs4 import BeautifulSoup

NAVER_SEARCH_URL = "https://search.naver.com/search.naver"
query = "무역전쟁"

params = {
    'where': 'news',
    'query': query,
    'sm': 'tab_jum',
    'ie': 'utf8',
}

resp = requests.get(NAVER_SEARCH_URL, params)

soup = BeautifulSoup(resp.content, 'html.parser')

ul_contents = soup.find('ul', class_ = 'type01')

# 맨 앞에 빈값 지우기
new_ul_contents = []
for contents in ul_contents.contents:
    if not str(contents).strip():
        continue
    new_ul_contents.append(contents)

new_ul_contents

news_list = []

for li in new_ul_contents:
    news_dict = {}
    a_tag = li.find('dl').find('dt').find('a')

    news_dict['link'] = a_tag['href']
    news_dict['title'] = a_tag.text
    
    news_list.append(news_dict)
    
print(news_list)

[{'link': 'http://news.chosun.com/site/data/html_dir/2019/06/02/2019060200610.html?utm_source=naver&utm_medium=original&utm_campaign=news', 'title': '중국 또 무역전쟁 백서..."협상 좌절은 모두 美 정부 책임"'}, {'link': 'http://yna.kr/AKR20190603028500009?did=1195m', 'title': '"미중 무역전쟁 고조되면 9개월내 글로벌 경기침체 발생"'}, {'link': 'http://www.newsis.com/view/?id=NISX20190603_0000669509&cID=10101&pID=10100', 'title': '"中 백서, 美에 무역협상 재개의지 신호 보내" WSJ'}, {'link': 'http://www.yonhapnewstv.co.kr/MYH20190603002100038/?did=1825m', 'title': '미·중 관세전쟁 심화…한국 새우등 터지나'}, {'link': 'http://www.inews24.com/view/1183126', 'title': '[G2무역전쟁㊦] 패권경쟁 색깔 짙어…장기화시 미·중도 타격 불가피'}, {'link': 'http://www.segyefn.com/newsView/20190603001439?OutUrl=naver', 'title': '무역분쟁에 韓수출 OECD 회원국 중 가장 큰 타격'}, {'link': 'http://news.mt.co.kr/mtview.php?no=2019060214393713456', 'title': '"무역전쟁 두렵지 않다"…\'초강수\' 예고한 중국'}, {'link': 'http://news.wowtv.co.kr/NewsCenter/News/Read?articleId=A201906030020&t=NNv', 'title': '[월가브리핑] 中, 미중 무역전쟁 백서 발표 "협상결렬은 美 책임"'}, {'link': 

In [107]:
from pprint import pprint
pprint(news_list)

[{'link': 'http://news.chosun.com/site/data/html_dir/2019/06/02/2019060200610.html?utm_source=naver&utm_medium=original&utm_campaign=news',
  'title': '중국 또 무역전쟁 백서..."협상 좌절은 모두 美 정부 책임"'},
 {'link': 'http://yna.kr/AKR20190603028500009?did=1195m',
  'title': '"미중 무역전쟁 고조되면 9개월내 글로벌 경기침체 발생"'},
 {'link': 'http://www.dt.co.kr/contents.html?article_no=2019060302109957607005&ref=naver',
  'title': '"미중 무역전쟁 지속땐 3분기 내 글로벌 경기침체 발생"'},
 {'link': 'http://www.newsis.com/view/?id=NISX20190603_0000669509&cID=10101&pID=10100',
  'title': '"中 백서, 美에 무역협상 재개의지 신호 보내" WSJ'},
 {'link': 'http://weeklytrade.co.kr/news/view.html?section=1&category=136&item=&no=53378',
  'title': '모건 스탠리 "무역전쟁 계속되면 3분기내 경기 침체"'},
 {'link': 'http://www.yonhapnewstv.co.kr/MYH20190603002100038/?did=1825m',
  'title': '미·중 관세전쟁 심화…한국 새우등 터지나'},
 {'link': 'http://www.inews24.com/view/1183126',
  'title': '[G2무역전쟁㊦] 패권경쟁 색깔 짙어…장기화시 미·중도 타격 불가피'},
 {'link': 'http://www.segyefn.com/newsView/20190603001439?OutUrl=naver',
  'title'

In [34]:
from bs4 import BeautifulSoup

In [35]:
url = "https://www.naver.com"
resp = requests.get(url)

In [36]:
soup = BeautifulSoup(resp.content, 'html.parser')

In [38]:
print(soup.prettify())

<!DOCTYPE doctype html>
<html lang="ko">
 <head>
  <meta charset="utf-8"/>
  <meta content="origin" name="Referrer"/>
  <meta content="text/javascript" http-equiv="Content-Script-Type"/>
  <meta content="text/css" http-equiv="Content-Style-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=1100" name="viewport"/>
  <meta content="NAVER" name="apple-mobile-web-app-title">
   <meta content="index,nofollow" name="robots">
    <meta content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" name="description">
     <meta content="네이버" property="og:title"/>
     <meta content="https://www.naver.com/" property="og:url"/>
     <meta content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png" property="og:image"/>
     <meta content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" property="og:description">
      <meta content="summary" name="twitter:card"/>
      <meta content="" name="twitter:title"/>
      <meta content="https://www.naver.com/" name="twit

In [40]:
soup.title.string

'NAVER'

In [41]:
soup.title.parent

<meta content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" name="twitter:description">
<link href="/favicon.ico?1" rel="shortcut icon" type="image/x-icon"/>
<link href="https://pm.pstatic.net/css/main_v190523.css" rel="stylesheet" type="text/css"/>
<link href="https://pm.pstatic.net/css/webfont_v170623.css" rel="stylesheet" type="text/css"/>
<link href="https://ssl.pstatic.net/sstatic/search/pc/css/api_atcmp_181122.css" rel="stylesheet" type="text/css"/>
<script src="https://pm.pstatic.net/js/c/nlog_v181107.js" type="text/javascript"></script>
<script src="https://ssl.pstatic.net/tveta/libs/assets/js/common/min/probe.min.js" type="text/javascript"></script>
<script type="text/javascript">
var nsc = "navertop.v3";
document.domain = "naver.com";
var jindoAll = "";
var iframeLazyLoad = false;
if (!!!window.console) {window.console={};window.console["log"]=function(){}}
var isLogin = false; 
function refreshLcs(etc) {etc = etc ? etc : {};if(document.cookie.indexOf("nrefreshx=1") != -1) {etc["mrf"]="

In [43]:
soup.p

<p class="dsc"><em class="txt">동일한 시간대/연령/남녀별</em> 사용자 그룹의<br/>관심사에 맞춰 자동완성을 제공합니다.</p>

In [42]:
soup.p.attrs

{'class': ['dsc']}

### 실시간 scraping

In [45]:
resp = requests.get("https://search.naver.com/search.naver")
resp

<Response [200]>

In [46]:
soup = BeautifulSoup(resp.content,'html.parser')

In [52]:
soup.find('div') #find_all / find

<div id="nxtt_div" style="display:none;position:absolute;border-width:0;z-index:11000"></div>

In [55]:
len(soup.find_all(['a','p'])) #a,p 태그 동시에 찾기

299

In [57]:
len(soup.find_all(['a','p'], limit = 10)) #limitation in numbers

10

In [59]:
len(soup.find_all("div", recursive = False))

0

In [61]:
realtime_search = soup.find(class_ = 'realtime_srch')
realtime_search

bs4.element.Tag

In [62]:
inb_tag = soup.find(attrs = {'href': '#inb'})
inb_tag

### Naver 크롤링 실습

In [156]:
import requests
from bs4 import BeautifulSoup

NAVER_SEARCH_URL = "https://search.naver.com/search.naver"
query = "무역전쟁"

params = {
    'where': 'news',
    'query': query,
    'sm': 'tab_jum',
    'ie': 'utf8',
}

resp = requests.get(NAVER_SEARCH_URL, params)

soup = BeautifulSoup(resp.content, 'html.parser')

ul_contents = soup.find('ul', class_ = 'type01')

In [112]:
new_ul_contents = []
for contents in ul_contents.contents:
    if not str(contents).strip():
        continue
    new_ul_contents.append(contents)

new_ul_contents

[<li id="sp_nws1"> <div class="thumb"><a class="sp_thmb thmb80" href="http://news.chosun.com/site/data/html_dir/2019/06/02/2019060200610.html?utm_source=naver&amp;utm_medium=original&amp;utm_campaign=news" onclick="return goOtherCR(this, 'a=nws*h.img&amp;r=1&amp;i=880000ED_000000000000000003450805&amp;g=023.0003450805&amp;u='+urlencode(this.href));" target="_blank"><img alt="중국 또 무역전쟁 백서" class="" height="80" onerror="this.parentNode.style.display='none';" src="https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F023%2F2019%2F06%2F02%2F3450805.jpg&amp;type=ofullfill80_80_q75_re2" width="80"/><span class="thmb_v"></span></a></div> <dl> <dt><a class="_sp_each_title" href="http://news.chosun.com/site/data/html_dir/2019/06/02/2019060200610.html?utm_source=naver&amp;utm_medium=original&amp;utm_campaign=news" onclick="return goOtherCR(this, 'a=nws*h.tit&amp;r=1&amp;i=880000ED_000000000000000003450805&amp;g=023.0003450805&amp;u='+urlencode(this.href));" 

In [176]:
news_list = []
for li in new_ul_contents:
    news_dict = {}
    news_dict2 = {}
    a_tag = li.find('dl').find('dt').find('a')
    
    tag1 = li.find('dl').find('dd')
#     print(tag1.span.text)
    

    news_dict['link'] = a_tag['href']
    
    testing = requests.get(a_tag['href'])
    soup2 = BeautifulSoup(testing.content, 'html.parser')
    print(soup2.text)
#     news_dict['title'] = a_tag.text
    
#     news_dict['source'] = tag1.span.text
    
#     tag2 = li.find('dl').find('dd').findNext('dd')
    
#     news_list.append(news_dict)
#     news_list.append(tag2.text)
    
    
print(news_list)








중국 또 무역전쟁 백서..."협상 좌절은 모두 美 정부 책임" - 조선닷컴 - 국제 > 중국




























 






    // CSS reload
    var uniqueNum = new Date().getTime(),
        cssURL = 'http://news.chosun.com/dhtm18/css/art/cs_art_2018.css?ver=' + uniqueNum,
        cssElement=document.createElement('link')
    cssElement.setAttribute('rel','stylesheet')
    cssElement.setAttribute('type','text/css')
    cssElement.setAttribute('href', cssURL);
    document.getElementsByTagName("head")[0].appendChild(cssElement);

    var _sf_startpt=(new Date()).getTime();








 
	var ArtID = "2019060200610",
	    CatID = "43",
	    OrgID = "",
	    ArtTitle = "중국 또 무역전쟁 백서...\"협상 좌절은 모두 美 정부 책임\"",
	    VoiceURL = "https://s3-ap-northeast-2.amazonaws.com/chosun-polly-mpeg/2019060200610.mp3";

	if ( ArtTitle.length > 45 ){
        var ArtTitleMin = ( ArtTitle.substring(0,45) + "..." );
    } else {
		var ArtTitleMin = ArtTitle;
	};

	var TempDate="입력 : 2019.06.02 13:17";
    var scrabYN = 'Y',
        arrReferrer







모건 스탠리 '무역전쟁 계속되면 3분기내 경기 침체' :: 한국무역신문, 주간무역, 한국무역의 길잡이 한국무역신문







 var domain = "http://"+document.domain+"/"












function down_key(evt) {		//[추가] 키보드 상하에 따라 색과 검색어에 선택된 색 value 입력 - 랭크업 류혜미 2013.01.16
	if($('t_search_list_0').value == undefined || $('t_search_list_0').value == "") return;
	var event = evt || event;
	var num,max_num=-1;
	for(var i=0; i<6; i++) {		// 색칠된 list의 num 결정
		if($('t_search_list_'+i).style.backgroundColor == "#d4d4d4")  num=i;
		if(($('t_search_list_'+i).value != undefined) && ($('t_search_list_'+i).value != '')) max_num =max_num+1;
	}

	if(num==undefined) {num= -1;}	// 첫번째 list에 num 임의로 주기
	if(event.keyCode == 40) {
		if(num=='-1') $('skey_hidden').value = $('skey').value;
		if(($('t_search_list_'+max_num).style.backgroundColor == "#d4d4d4") && num==max_num) {	// 마지막list에서는 '↓' 눌렀을 경우 초기화
			$('t_search_bar').style.display = 'none';
			skey_clear();
		}
		if(num!='5') {
			$('skey').value = $('t_search_list_'+(num+1)).valu















미·중 관세전쟁 심화…한국 새우등 터지나 | 연합뉴스TV :: 대한민국 뉴스의 시작. 채널 23













var themeSettings = {
		infoToggle: 90,
ajaxVideoForListLargeView: true
	},
	ajaxurl = 'http://www.yonhapnewstv.co.kr/wp-admin/admin-ajax.php',
	ajaxerror = "Something\'s error. Please try again later!";




















/* <![CDATA[ */
var ai1ec_event = {"language":"ko"};
/* ]]> */



/* <![CDATA[ */
var DavesWordPressLiveSearchConfig = {"resultsDirection":"down","showThumbs":"false","showExcerpt":"false","showMoreResultsLink":"true","minCharsToSearch":"1","xOffset":"0","yOffset":"0","blogURL":"http:\/\/www.yonhapnewstv.co.kr","ajaxURL":"http:\/\/www.yonhapnewstv.co.kr\/wp-admin\/admin-ajax.php","viewMoreText":"\ub354 \ub9ce\uc740 \uacb0\uacfc \ubcf4\uae30","outdatedJQuery":"Dave's WordPress Live Search requires jQuery 1.2.6 or higher. WordPress ships with current jQuery versions. But if you are seeing this message, it's likely that another plugin is including an earlier version."};
/* ]]> */















/* static proxy setting */

window.addEventListener("error", function (e, url) {

    var eleArray =["IMG","SCRIPT", "LINK"];
    var resourceMap = {
        "IMG": "Picture file",
        "SCRIPT": "JavaScript file",
        "LINK": "CSS file"
    };
    
    var ele = e.target;

    if(eleArray.indexOf(ele.tagName) != -1){
        try{
        var url = ele.tagName == "LINK" ? ele.href: ele.src;
        var parser = document.createElement('a');
            parser.href =url;
            switch(parser.hostname){

                case "www.inews24.com" :
                case "www.joynews24.com":

                    return false;
                    break;

            }

            
            var head = document.head;

            switch(ele.tagName){
                case "LINK" :
                    ele.href="http://www.inews24.com"+parser.pathname+parser.search;              
                    break;
                case "IMG" :
                case "SCRIPT" :
              




"무역전쟁 두렵지 않다"…'초강수' 예고한 중국 - 머니투데이 뉴스































































	$(document).ready(function() {
        var subjectTop = $('.view .view_top .subject').offset().top + $('.view .view_top .subject').outerHeight();

        //스크롤시 프로그래시브바 이벤트
        $(window).scroll(function() {
            // 추가 190411 성수 // 
            var docHeight = $(document).height(),
            windowHeight = $(window).height(),
            scrollPercent;
            // //추가 190411 성수 // 
            scrollPercent = $(window).scrollTop() / (docHeight - windowHeight) * 100;
            $('.progressbar').width(scrollPercent + '%');
        });

		//스크롤시 고정헤더 이벤트
		$(window).scroll(function() {
			if (subjectTop < $(window).scrollTop()) {
				$('body').addClass('fix_header');
			} else {
				$('body').removeClass('fix_header');
			}
			$('#mt_at_fixheader .gnbbox .amvbox').hide();
		});
		/*엠티리포트&평창 랜덤*/
		var cookie = $.cookie('rdmView');
		var rdmV



















[월가브리핑] 中, 미중 무역전쟁 백서 발표 ＂협상결렬은 美 책임＂













        /********************************************************************************
         * Facebook(https://developers.facebook.com/)
         ********************************************************************************/
        var FacebookAppId     = "1934458713480006";
        var FacebookAPIVer    = "v2.10";
        var FacebookUserToken = "EAAbfYURzG0YBAIcybLPzotI7oy1OYG6Dz0ncwwaMcxruO3lMfFSST78qkGnY6N2k12wJFyXrzJ5uV40bE851FjPFZAZCFVUECSjzY3BUZBdZAWN1qSrNV2Q3WF2yc1JsAzOBAKtXNlIZAwowP7h22VIqt2zUL1BBhjTkS2al4r41SKYWYCxZCQoVZAq26FaZBWoZD";

        window.fbAsyncInit = function () {
            FB.init({
                appId: FacebookAppId,
                autoLogAppEvents: true,
                xfbml: true,
                version: FacebookAPIVer
            });
        };

        (function (d, s, id) {
            var js, fjs = d.getElementsByTagName(s)[0];
            if (d.get

In [155]:
from pprint import pprint
pprint(news_list)

[{'link': 'http://news.chosun.com/site/data/html_dir/2019/06/02/2019060200610.html?utm_source=naver&utm_medium=original&utm_campaign=news',
  'source': '조선일보언론사 선정',
  'title': '중국 또 무역전쟁 백서..."협상 좌절은 모두 美 정부 책임"'},
 '상무부 부부장 기자회견 오사카 G20 트럼프⋅시진핑 회동 가능성에 "전할 게 없다" 중국이 2일 미⋅중 무역전쟁에 대한 두 번째 백서를 '
 '펴냈다. 중국 국무원 신문판공실은 일요일인 이날 기자회견을 갖고 ‘중⋅미...',
 {'link': 'http://yna.kr/AKR20190603028500009?did=1195m',
  'source': '연합뉴스',
  'title': '"미중 무역전쟁 고조되면 9개월내 글로벌 경기침체 발생"'},
 '블룸버그 통신, 미국 CNBC방송 등에 따르면 모건스탠리의 수석 이코노미스트인 체탄 아히야는 2일(현지시간) 보고서를 통해 투자자들이 '
 '미중 무역전쟁의 위험성을 과소평가한다면서 이같이 지적했다. 아히야는...',
 {'link': 'http://www.newsis.com/view/?id=NISX20190603_0000669509&cID=10101&pID=10100',
  'source': '뉴시스',
  'title': '"中 백서, 美에 무역협상 재개의지 신호 보내" WSJ'},
 "중국이 '중-미 경제무역 협의에 대한 중국의 입장'이란 제목의 백서에서 미국발 무역전쟁을 강하게 비판하기는 했지만, 미국과 협상을 갖는 "
 '것이 최우선 사안임을 나타냈다고 월스트리트저널(WSJ)이 2일(현지시간)...',
 {'link': 'http://weeklytrade.co.kr/news/view.html?section=1&category=136&item=&no=53378',
  'source': '한국무역신문',
  'title': '모

### pagination

In [None]:
import requests
from bs4 import BeautifulSoup

NAVER_SEARCH_URL = "https://search.naver.com/search.naver"
query = "무역전쟁"

params = {
    'where': 'news',
    'query': query,
    'sm': 'tab_jum',
    'ie': 'utf8',
}

resp = requests.get(NAVER_SEARCH_URL, params)

soup = BeautifulSoup(resp.content, 'html.parser')

ul_contents = soup.find('ul', class_ = 'type01')

[1, 11, 21, 31, 41, 51, 61, 71, 81, 91]

### webbrowser practice

In [65]:
import webbrowser

urls = ["www.naver.com","www.google.com"]

for url in urls:
    webbrowser.open_new(url)

In [66]:
search_words = ['python web scraping', 'python webbrowser']

google_url = "www.google.com/#q="
for search_word in search_words:
    webbrowser.open_new(google_url + search_word)

In [69]:
url = "https://www.alexa.com/topsites/countries/KR"

html_website_ranking = requests.get(url).text
soup_website_ranking = BeautifulSoup(html_website_ranking, "lxml")

website_ranking = soup_website_ranking.select('p a')

In [74]:
website_ranking[1:6]

[<a href="/siteinfo/naver.com">Naver.com</a>,
 <a href="/siteinfo/google.com">Google.com</a>,
 <a href="/siteinfo/youtube.com">Youtube.com</a>,
 <a href="/siteinfo/daum.net">Daum.net</a>,
 <a href="/siteinfo/google.co.kr">Google.co.kr</a>]

In [76]:
website_ranking[1].get_text()

'Naver.com'

In [79]:
website_ranking_address = [website_ranking_element.get_text() for website_ranking_element in website_ranking]
website_ranking_address[1:6]

['Naver.com', 'Google.com', 'Youtube.com', 'Daum.net', 'Google.co.kr']

In [165]:
from bs4 import BeautifulSoup as BS
import ssl, urllib 
import traceback 
base_url = 'https://search.naver.com/search.naver'
values = { 'where': 'news', 
          'sm': 'tab_jum', 
          'ie': 'utf8', 
          'query': '무역전쟁',
        }  

query_string = urllib.parse.urlencode(values) 
context = ssl._create_unverified_context() 
try: 
    res = urllib.request.urlopen(base_url + '?' + query_string, context=context) 
except: 
    traceback.print_exc() 
html_data = BS(res.read(), 'html.parser')


Traceback (most recent call last):
  File "<ipython-input-169-bfd1066a6d30>", line 7, in <module>
    if span:
NameError: name 'span' is not defined
