# 간단한 소켓 통신

In [1]:
import socket
def main():
    """Fetch google.com over a raw TCP socket and print the start of the reply.

    Opens an IPv4 stream socket to port 80, sends a hand-written HTTP GET
    request, and prints up to 4096 bytes of whatever the server sends back.
    """
    # The context manager closes the socket even if connect/send/recv raises;
    # previously the socket was never closed.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(('google.com', 80))
        request = 'GET http://google.com HTTP/1.1\n\n'.encode()
        # send() may transmit only part of the buffer; sendall() keeps writing
        # until every byte is sent or an error is raised.
        s.sendall(request)
        # A single recv() returns at most 4096 bytes, so a multi-byte character
        # can be cut at the boundary; substitute undecodable bytes instead of
        # raising UnicodeDecodeError.
        print(s.recv(4096).decode(errors='replace'))
    
main()

HTTP/1.1 301 Moved Permanently
Location: http://www.google.com/
Content-Type: text/html; charset=UTF-8
Date: Wed, 01 Sep 2021 15:15:31 GMT
Expires: Fri, 01 Oct 2021 15:15:31 GMT
Cache-Control: public, max-age=2592000
Server: gws
Content-Length: 219
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN

<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
<A HREF="http://www.google.com/">here</A>.
</BODY></HTML>



* socket.AF_INET: 소켓이 사용할 주소 체계(address family)를 지정한다. AF_INET은 IPv4를 의미한다.
* socket.SOCK_STREAM: 소켓유형. 일반적으로 SOCK_STREAM, SOCK_DGRAM을 많이 사용한다.
    - SOCK_STREAM: TCP프로토콜 전송방식
    - SOCK_DGRAM : UDP프로토콜 전송방식
* `'GET http://google.com HTTP/1.1\n\n'`: 해당 URL의 리소스를 요청하는 HTTP GET 요청 메시지. 서버는 응답 header와 본문(body)을 함께 돌려준다.
* s.recv(buffersize): 소켓에서 최대 buffersize 바이트까지 데이터를 수신한다.

Socket은 TCP/IP 프로토콜 위에서 프로그램끼리 데이터를 주고받기 위한 통신 끝점(endpoint)이다.

## TCP/IP
* TCP : 응용프로그램간 통신의 프로토콜
* IP : 컴퓨터와 컴퓨터간 통신 프로토콜

### Application layer에서 사용되는 프로토콜
* HTTP
* HTTPS
* FTP
* DNS
* SSH

[Internet Protocol suite](https://en.wikipedia.org/wiki/Internet_protocol_suite)

## HTTP

* GET : read, 정보를 요청
* POST : create, 정보 생성/변형 요청
* HEAD : 헤더 정보만 요청
* PUT : update, 정보 수정 요청
* DELETE : delete, 정보 삭제 요청
* OPTIONS : 사용가능한 메소드 확인

# API
데이터 제공해주는 api 홈페이지
* [Google API](https://developers.google.com/apis-explorer)
* [KaKao Dev](https://developers.kakao.com/)
* [Twitter](https://developer.twitter.com/en/docs/api-reference-index)
* [Papago](https://developers.naver.com/docs/papago/)

# 웹 크롤러 만들기

In [2]:
import urllib
def download(url):
    """Open *url* and return the response object produced by ``urlopen``.

    The body is not read here; the caller reads from (and should close) the
    returned object.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on some other module having
    # already imported it.
    import urllib.request

    return urllib.request.urlopen(url)

다운로드한 응답에서 \<html>과 \</html> 사이에 있는 내용을 파싱해서 필요한 값을 가져와야 한다.

In [4]:
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url):
    """Download *url* and return the raw response body as ``bytes``.

    Returns ``None`` (after printing the failure reason) when the request
    fails with one of the urllib errors caught below.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly here.
    import urllib.request

    try:
        # The ``with`` block closes the response once the body has been read;
        # previously the response object was left open.
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error', e.reason)
        html = None
    return html

download('https://www.google.com')

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2021/get-vaccinated-wear-a-mask-save-lives-august-31-copy-6753651837109312-law.gif" itemprop="image"><meta content="Get Vaccinated. Wear a Mask. Save Lives." property="twitter:title"><meta content="Get Vaccinated. Wear a Mask. Save Lives. #GoogleDoodle" property="twitter:description"><meta content="Get Vaccinated. Wear a Mask. Save Lives. #GoogleDoodle" property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content="https://www.google.com/logos/doodles/2021/get-vaccinated-wear-a-mask-

웹 크롤링할 때 많이 사용하는 패키지
* BeautifulSoup
* Requests

In [5]:
import requests
# Fetch the Google front page. The timeout bounds how long the call may wait
# on an unresponsive server; without it requests can block indefinitely.
url = 'http://www.google.com'
response = requests.get(url, timeout=10)
# Leaving the object as the last expression makes the notebook show its repr.
response

<Response [200]>

In [7]:
def download2(url, timeout=10):
    """Download *url* with ``requests`` and return the decoded body text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The response body as ``str``, or ``None`` (after printing a message)
        when the connection fails or times out.
    """
    try:
        # Without a timeout requests can block forever on a silent server.
        response = requests.get(url, timeout=timeout)
        html = response.text
    except (requests.ConnectionError, requests.Timeout):
        # A read timeout is not a ConnectionError, so Timeout is listed
        # explicitly now that a timeout is in effect.
        print('Connection error')
        html = None
    return html

download2('https://www.google.com')

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2021/get-vaccinated-wear-a-mask-save-lives-august-31-copy-6753651837109312-law.gif" itemprop="image"><meta content="Get Vaccinated. Wear a Mask. Save Lives." property="twitter:title"><meta content="Get Vaccinated. Wear a Mask. Save Lives. #GoogleDoodle" property="twitter:description"><meta content="Get Vaccinated. Wear a Mask. Save Lives. #GoogleDoodle" property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content="https://www.google.com/logos/doodles/2021/get-vaccinated-wear-a-mask-s

### Beautiful Soup
HTML 문서를 파싱해서 tag 단위로 쉽게 탐색하고 검색할 수 있게 해준다.

In [9]:
from bs4 import BeautifulSoup

# Download the page; the timeout keeps an unresponsive server from hanging
# the notebook. Note that `html` holds the Response object, not the markup.
html = requests.get('http://www.google.com', timeout=10)
# Parse the decoded body with the 'html.parser' backend.
soup = BeautifulSoup(html.text, 'html.parser')

In [13]:
# Attribute-style navigation: the <body> tag inside <html>.
soup.html.body

<body bgcolor="#fff"><script nonce="AwIxfrk5bv2vQWJdNJJSTw==">(function(){var src='/images/nav_logo229.png';var iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><div id="mngb"><div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com/imghp?hl=en&amp;tab=wi">Images</a> <a class="gb1" href="http://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=US&amp;tab=w1">YouTube</a> <a class="gb1" href="https://news.google.com/?tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u>

In [14]:
# One step further: the first <div> found under <body>.
soup.html.body.div

<div id="mngb"><div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com/imghp?hl=en&amp;tab=wi">Images</a> <a class="gb1" href="http://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=US&amp;tab=w1">YouTube</a> <a class="gb1" href="https://news.google.com/?tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u> »</a></nobr></div><div id="guser" width="100%"><nobr><span class="gbi" id="gbn"></span><span class="gbf" id="gbf"></span><span id="gbe"></span><a class="gb4" href="http://www.google.com/history/optout?hl=en">Web History</a> | <a class="gb4" href="/preferences?hl=en">Settings</a> | <a class="gb4" href="https://accounts.goo

In [16]:
# Find every tag whose name is in the given collection. A one-element set
# behaves like the plain string 'span'; a collection is only needed when
# searching for several tag names at once.
soup.findAll({'span'})

[<span class="gbi" id="gbn"></span>,
 <span class="gbf" id="gbf"></span>,
 <span id="gbe"></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span></span>,
 <span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" id="tsuid1" name="btnI" type="submit" value="I'm Feeling Lucky"/><script nonce="AwIxfrk5bv2vQWJdNJJSTw==">(function(){var id='tsuid1';document.getElementById(id).onclick = function(){if (this.form.q.value){this.checked = 1;if (this.form.iflsig)this.form.iflsig.disabled = false;}
 else top.location='/doodles/';};})();</script><input name="iflsig" type="hidden" value="ALs-wAMAAAAAYS-yRY6VF8vPAUuNr6Aju9aJjW5U37v-"/></span></span>,
 <span class="lsbb"><input class="lsb" id="tsuid1" name="btnI" type="submit" value="I'm Feeling Lucky"/><script nonce="AwIxfrk5bv2vQWJdNJJSTw==">(function(){var id='tsuid1';document.getElementByI

In [17]:
# The same search with the tag name passed as a plain string.
soup.findAll('span')

[<span class="gbi" id="gbn"></span>,
 <span class="gbf" id="gbf"></span>,
 <span id="gbe"></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span></span>,
 <span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" id="tsuid1" name="btnI" type="submit" value="I'm Feeling Lucky"/><script nonce="AwIxfrk5bv2vQWJdNJJSTw==">(function(){var id='tsuid1';document.getElementById(id).onclick = function(){if (this.form.q.value){this.checked = 1;if (this.form.iflsig)this.form.iflsig.disabled = false;}
 else top.location='/doodles/';};})();</script><input name="iflsig" type="hidden" value="ALs-wAMAAAAAYS-yRY6VF8vPAUuNr6Aju9aJjW5U37v-"/></span></span>,
 <span class="lsbb"><input class="lsb" id="tsuid1" name="btnI" type="submit" value="I'm Feeling Lucky"/><script nonce="AwIxfrk5bv2vQWJdNJJSTw==">(function(){var id='tsuid1';document.getElementByI

In [24]:
# Tiny document with two distinct tag names, <p> and <pn>, used to compare
# the set form and the string form of the findAll argument.
test_html = """
<html>
    <p>test</p>
    <pn>test</pn>
</html>
"""

test_soup = BeautifulSoup(test_html,'html.parser')
# Both calls return only the <p> tag; <pn> is not matched by either form.
test_soup.findAll({'p'}), test_soup.findAll('p')

([<p>test</p>], [<p>test</p>])

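두 결과는 똑같다. findAll은 태그 이름을 문자열뿐 아니라 리스트·집합 같은 묶음으로도 받을 수 있고, 묶음 안의 이름 중 하나와 일치하는 태그를 모두 찾는다. 원소가 하나뿐인 `{'p'}`는 `'p'`와 동일하게 동작하며, `{'p', 'pn'}`처럼 여러 태그를 한 번에 찾을 때에만 차이가 생긴다.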

In [26]:
# Country name: span class=stock_item
# Price: span class=stock_price

# The timeout keeps an unresponsive server from hanging the notebook.
html = requests.get('https://m.stock.naver.com/marketindex/index.nhn', timeout=10)
soup = BeautifulSoup(html.text, 'html.parser')

In [47]:
# Collect every <span>/<strong> carrying either class, in document order.
country = soup.find_all(['span', 'strong'], class_=['stock_item', 'stock_price'])
country_list = []
money_list = []
for content in country:
    content_text = content.string
    # Route each tag by its own CSS class instead of by even/odd position:
    # positional pairing silently shifts every later row if one tag is
    # missing, whereas this keeps names and prices in the right list.
    if 'stock_item' in content.get('class', []):
        country_list.append(content_text)
    else:
        # Every matched tag has one of the two classes, so this is a price.
        money_list.append(content_text)

In [48]:
import pandas as pd
# Combine the two parallel lists into a two-column table and preview the
# first rows.
total_df = pd.DataFrame(data=dict(country=country_list, stock=money_list))
total_df.head()

Unnamed: 0,country,stock
0,미국 USD,1159.5
1,유럽 EUR,1370.59
2,일본 JPY,1050.89
3,중국 CNY,179.47
4,국제금,1815.0


# 서울시 산과 공원 생태관광 정보 크롤링

In [51]:
# Install the Selenium bindings, refresh the apt package index, then install
# the chromium-chromedriver package (-y answers the confirmation prompt).
!pip install selenium
!apt-get update
!apt install chromium-chromedriver -y

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease                
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:4 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [26.7 kB]
Get:5 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,295 kB]
Get:6 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]    
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates/multiverse amd64 Packages [34.4 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,200 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [2,731 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates/restricted amd64 Packages [575 kB]
Get:11 http://security.ubuntu.com/ubuntu bionic-security/r

7[24;0f[42m[30mProgress: [ 25%][49m[39m [##############............................................] 8Unpacking libfile-basedir-perl (0.07-1) ...
7[24;0f[42m[30mProgress: [ 26%][49m[39m [###############...........................................] 87[24;0f[42m[30mProgress: [ 28%][49m[39m [################..........................................] 8Selecting previously unselected package libfile-desktopentry-perl.
Preparing to unpack .../07-libfile-desktopentry-perl_0.22-1_all.deb ...
7[24;0f[42m[30mProgress: [ 29%][49m[39m [################..........................................] 8Unpacking libfile-desktopentry-perl (0.22-1) ...
7[24;0f[42m[30mProgress: [ 30%][49m[39m [#################.........................................] 87[24;0f[42m[30mProgress: [ 32%][49m[39m [##################........................................] 8Selecting previously unselected package libfile-mimeinfo-perl.
Preparing to unpack .../08-libfile-mimeinfo-perl_0.

7[24;0f[42m[30mProgress: [ 97%][49m[39m [########################################################..] 87[24;0f[42m[30mProgress: [ 99%][49m[39m [#########################################################.] 8Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Processing triggers for hicolor-icon-theme (0.17-2) ...
Processing triggers for mime-support (3.60ubuntu1) ...
Processing triggers for desktop-file-utils (0.23-1ubuntu3.18.04.2) ...
Processing triggers for libc-bin (2.27-3ubuntu1.4) ...

7[0;24r8[1A[J

In [52]:
# Dataset page for the Seoul mountains-and-parks eco-tourism data.
crawling_path='http://data.seoul.go.kr/dataList/OA-12962/S/1/datasetView.do'
# The timeout keeps an unresponsive server from hanging the notebook.
html = requests.get(crawling_path, timeout=10)
soup = BeautifulSoup(html.text, 'html.parser')

In [55]:
# btnCsv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Headless Chrome with the sandbox and /dev/shm usage disabled.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# The driver path is left at its default: older Selenium releases already
# default to 'chromedriver', and newer ones no longer accept the path as the
# first positional argument.
driver = webdriver.Chrome(options=chrome_options)

try:
    # Start the web driver session and navigate to the target URL.
    driver.get(crawling_path)
    time.sleep(5)       # Give the page five seconds to finish loading.

    # Click the CSV download button as a user would. find_element(By...) is
    # used because the find_element_by_* helpers were removed in Selenium 4.
    driver.find_element(By.CSS_SELECTOR, "#btnCsv").click()
    time.sleep(3)       # Give the download three seconds to complete.
finally:
    # Always close the browser, even if navigation or the click fails, so no
    # headless Chrome process is left running.
    driver.quit()

In [60]:
import glob
# Build the search pattern under $HOME, then list the downloaded CSV files
# whose names start with the dataset's Korean title prefix.
home_dir = os.getenv('HOME')
pattern = '{}/aiffel/Git_Project/AIFFEL/week10/node/서울시*.csv'.format(home_dir)
files = glob.glob(pattern)
print(files)

['/aiffel/aiffel/Git_Project/AIFFEL/week10/node/서울시 산과공원 생태관광 정보 (한국어).csv']


In [62]:
# Load the first matching CSV, decoding it as CP949, and preview the first
# rows.
csv_path = files[0]
seoul_m_p_df = pd.read_csv(csv_path, encoding='CP949')
seoul_m_p_df.head()

Unnamed: 0,키,명칭,대분류,주소,행정 시,행정 구,행정 동,대표전화,면적,지정일,교?안내
0,BE_IW14-0020,진관내동 생태경관보전지역,생태탐방,은평구 진관동 282-1번지 일대(북한산국립공원 북한산성 입구 주변 습지 ),서울특별시,은평구,진관동,02-2115-7550~5 02-350-1397,16639㎡,2002년 12월 30일,지하철 3호선 구파발역 1번 출구에서 704번 34번 버스를 타고 북한산성 입구에서...
1,BE_IW14-0109,안산공원,산과공원,서울특별시 서대문구 홍제동 산33번지 일대,서울특별시,서대문구,홍제1동,02-330-1395,,,
2,BE_IW14-0110,여의도공원,산과공원,서울특별시 영등포구 여의공원로68(여의도동 2번지),서울특별시,영등포구,여의동,02-761-4079,,,
3,BE_IW14-0111,염창공원,산과공원,서울 강서구 염창동 산24-1,서울특별시,강서구,염창동,02-2600-4186,,,
4,BE_IW14-0112,영등포공원,산과공원,서울특별시 영등포구 영등포동 582-3번지,서울특별시,영등포구,영등포본동,02-2670-3715~7,,,
