In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# 한글이 깨지는 문제
font_name = fm.FontProperties(fname="C:\\Windows\\Fonts\\malgun.ttf").get_name()
plt.rc("font", family=font_name)

# 한글 적용 후 기호가 미출력
import matplotlib as mlp
mlp.rcParams["axes.unicode_minus"] = False

# 1. XML

- 특징
    - 메타 언어 : GML > SGML > XML
    - 데이터를 위한 언어 : 태그 이름을 마음대로 만들 수 있음
        - HTML : <font color:red>홍길동</font>
        - XML : <name>홍길동</name> 
    - DBMS : 대량 데이터 저장 가능, 삽입삭제수정조회 가능
    - 데이터 표준화 : 이기종 시스템 간의 정보 교환, 웹 서비스, 유비쿼터스, 사물인터넷 등등...
    
- 구성 요소
    - dtd : xml 설계도
    - xml schema : xml 설계도, 복잡함
    - xml
    - xsl, xslt : xml에 디자인 설정, css 같음
    
- https://docs.python.org/3/library/xml.etree.elementtree.html

In [2]:
import xml.etree.ElementTree as et

##### XML api 사용 방법
# 1. xml이 파일로 존재하는 경우 : parse()
# 2. xml이 메모리에 로드된 경우 : fromstring()

## (1) XML 데이터 불러오기

In [4]:
##### xml이 파일로 존재하는 경우

tree1 = et.parse("data/users.xml") # 트리 형태로 저장되어 있음
tree1

<xml.etree.ElementTree.ElementTree at 0x20e5f288730>

In [6]:
##### xml이 이미 메모리에 로드되어 있는 경우

xmlstr = """<?xml version="1.0" encoding="utf-8" ?>
<users>
    <user grade="gold">
            <name>Kim Cheol Soo</name>
            <age>25</age>
            <birthday>19940215</birthday>
    </user>
    <user grade="diamond">
            <name>Kim Yoo Mee</name>
            <age>21</age>
            <birthday>19980417</birthday>
    </user>
</users>
"""

tree2 = et.fromstring(xmlstr)
tree2
# Element : <></> 태그는 하나의 Element이며 노드가 된다.

<Element 'users' at 0x0000020E5FA64B80>

## (2) XML 데이터 다루기

### 1) 태그명 검색

In [9]:
print(tree1.find("user"))
print(tree1.find("user[1]"))
print(tree1.find("user[2]"))

<Element 'user' at 0x0000020E5F9F34A0>
<Element 'user' at 0x0000020E5F9F34A0>
<Element 'user' at 0x0000020E5F9F3630>


In [21]:
data = tree1.find("user[2]")
print(type(data))
dir(data)

print(data.tag)
print(data.attrib) # 속성
print(data.get("grade"))

print("----------------------------------------------")

username = data.find("name")
print(username.tag)
print(username.attrib) # 속성

print(username.text) # text노드(name의 자식 노드이자 단말테그)

<class 'xml.etree.ElementTree.Element'>
user
{'grade': 'diamond'}
diamond
----------------------------------------------
name
{}
Kim Yoo Mee


### 2) 태그 조건으로 검색

In [29]:
# data = tree1.find("./user[@grade]") # grade gold
# data = tree1.find("./user[@grade][2]")
data = tree1.find("./user[@grade = 'diamond']")

print(data)
print(data.attrib) # dict 형식
print(data.keys()) # dict의 key값만
print(data.items()) # dict를 튜플로

<Element 'user' at 0x0000020E5F9F3630>
{'grade': 'diamond'}
['grade']
[('grade', 'diamond')]


### 3) 여러 개의 태그 한 번에 가져오기

In [32]:
users = tree1.findall("./user")
for i in users:
    print(i.attrib)
    print(i.find("name").text)

{'grade': 'gold'}
Kim Cheol Soo
{'grade': 'diamond'}
Kim Yoo Mee


# 2 JSON

In [33]:
import json

# dumps() : data 저장
# loads() : data 불러올 때

In [47]:
j1 = {"name":"홍길동", "birth":"0101", "age":20}
print(type(j1))
print(j1)

print("----------------------------------------------------")

j2 = json.dumps(j1)
print(type(j2)) # JSON은 네트워크에 최적화된 문자열 형식
print(j2) # encoding : 홍길동 > \ud64d\uae38\ub3d9

print("----------------------------------------------------")

j3 = json.dumps((1,2,3))
print(type(j3))
print(j3) # JSON에는 소괄호 없음 중괄호 혹은 대괄호(여러개일 때 묶어줌)

print("----------------------------------------------------")

j4 = json.loads(j2)
print(type(j4))
print(j4)

<class 'dict'>
{'name': '홍길동', 'birth': '0101', 'age': 20}
----------------------------------------------------
<class 'str'>
{"name": "\ud64d\uae38\ub3d9", "birth": "0101", "age": 20}
----------------------------------------------------
<class 'str'>
[1, 2, 3]
----------------------------------------------------
<class 'dict'>
{'name': '홍길동', 'birth': '0101', 'age': 20}


In [51]:
obj = """
{
	"id": "0001",
	"type": "donut",
	"name": "Cake",
	"ppu": 0.55,
	"batters":
		{
			"batter":
				[
					{ "id": "1001", "type": "Regular" },
					{ "id": "1002", "type": "Chocolate" },
					{ "id": "1003", "type": "Blueberry" },
					{ "id": "1004", "type": "Devil's Food" }
				]
		},
	"topping":
		[
			{ "id": "5001", "type": "None" },
			{ "id": "5002", "type": "Glazed" },
			{ "id": "5005", "type": "Sugar" },
			{ "id": "5007", "type": "Powdered Sugar" },
			{ "id": "5006", "type": "Chocolate with Sprinkles" },
			{ "id": "5003", "type": "Chocolate" },
			{ "id": "5004", "type": "Maple" }
		]
}
"""

print(type(obj))

<class 'str'>


In [60]:
# id가 0001인 값 추출
data = json.loads(obj)
print(data["id"])

# id가 1001인 값 추출
print(data["batters"]["batter"][0]["id"]) # {}는 key값으로 []는 인덱스로 접근하기

# Chocolate토핑 추출
print(data["topping"][5]["type"])

0001
1001
Chocolate


# 3. BeautifulSoup

## (1) 웹 소스 가져오기

In [2]:
import urllib.request as req
from urllib.error import HTTPError, URLError

In [68]:
google = req.urlopen("https://google.com")
html = google.read()
# b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head>
# b는 현재 데이터는 모두 1바이트로 표시되었다는 뜻

print(type(html))

<class 'bytes'>


In [74]:
try:
    google = req.urlopen("https://googladsfae.com") # URLError :  <urlopen error [Errno 11001] getaddrinfo failed>
#     google = req.urlopen("https://google.com/a.jsp") # HTTPError :  HTTP Error 404: Not Found
except HTTPError as e: 
    print("HTTPError : ", e)
except URLError as e:
    print("URLError : ", e)
else:
    html = google.read()

URLError :  <urlopen error [Errno 11001] getaddrinfo failed>


In [75]:
##### 이미지 가져오기

# 1. urlretrieve 사용하기 : 이미지를 가공할 필요가 없을 때

req.urlretrieve("https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png", 
                "data/google.png") #가져오고자 하는 이미지 경로, #내 컴퓨터에 저장할 경로

('data/google.png', <http.client.HTTPMessage at 0x20e61f86d30>)

In [82]:
# 2. urlopen 사용하기 : 이미지를 가공해야 할 때

img = req.urlopen("https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png")
img_data = img.read()
img_data # 이미지가 숫자로 인코딩되어 불러와짐

f = open("data/google2.png", "wb") # 1바이트씩 저장시킬 때 wb
f.write(img_data)
f.close()

## (2) BeautifulSoup 사용법

- pip install beautifulsoup4

In [5]:
from bs4 import BeautifulSoup

In [13]:
page = open("data/test_first.html")
soup = BeautifulSoup(page, "html.parser") # 문자열을 tree 구조로 재구성
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by Netsong7
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://netsong7.synology.me" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>


### 1) 원하는 위치에 접근하는 방법(순차적인 방식)

In [36]:
list(soup.children)
list(soup.children)[0]
list(soup.children)[1]
list(soup.children)[2]

<html><head>
<title>Very Simple HTML Code by Netsong7</title>
</head><body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [29]:
html = list(soup.children)[2]
body = list(html.children)[1]
list(body.children)

['\n',
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>,
 '\n',
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 '\n',
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>,
 '\n']

### 2) 태그명으로 접근하는 방법

In [37]:
soup.head
soup.body
soup.body.div
soup.body.div.p
soup.p # soup.body.div.p 와 같은 결과

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>

### 3) 함수로 접근하는 방법
- find(), find_all()

In [39]:
soup.find("p") # 한 개만 찾아줌

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>

In [41]:
soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [42]:
soup.find("p", class_="outer-text") # 해당 태그에 클래스 속성이 있다면

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

In [43]:
soup.find("p", id="second")

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

### 4) 다양한 순차 접근 방식

In [55]:
soup.head.next_sibling # head 태그의 바로 다음 형제 태그에 접근
soup.body.previous_sibling # body 태그의 이전 형제 태그에 접근
soup.body.div.next_siblings # 하나가 아니므로 객체로 만들어짐
list(soup.body.div.next_siblings)
soup.body.div.parent # div의 부모, parent

<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>

### 5) Text Node 가져오기

- get_text()
- text

In [57]:
soup.html.get_text() # 모든 문자를 가져옴

'\nVery Simple HTML Code by Netsong7\n\n\n\n                Happy PinkWink.\n                PinkWink\n\n\n                Happy Data Science.\n                Python\n\n\n\n\n                Data Science is funny.\n            \n\n\n\n                All I need is Love.\n            \n\n\n'

In [58]:
soup.head.get_text()

'\nVery Simple HTML Code by Netsong7\n'

In [59]:
# 해당 텍스트를 가진 태그에 가깝게 접근하기
soup.title.get_text()

'Very Simple HTML Code by Netsong7'

In [60]:
soup.div.get_text()

'\n\n                Happy PinkWink.\n                PinkWink\n\n\n                Happy Data Science.\n                Python\n\n'

In [61]:
soup.p.get_text() # 첫번째 p태그

'\n                Happy PinkWink.\n                PinkWink\n'

In [66]:
for p in soup.find_all("p"): print(p.get_text())

lst = [p.get_text() for p in soup.find_all("p")]
print(lst)


                Happy PinkWink.
                PinkWink


                Happy Data Science.
                Python



                Data Science is funny.
            



                All I need is Love.
            

['\n                Happy PinkWink.\n                PinkWink\n', '\n                Happy Data Science.\n                Python\n', '\n\n                Data Science is funny.\n            \n', '\n\n                All I need is Love.\n            \n']


### 6) 속성에 접근하기

In [76]:
# a = soup.find("a")
# a

# a["href"]

soup.a["href"]

for n in soup.find_all("a"):
    print(n["href"])

http://netsong7.synology.me
https://www.python.org


## (3) 실습

### 1) 네이버에서 환율 정보 가져오기

- https://finance.naver.com/marketindex/


In [79]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [86]:
url = "https://finance.naver.com/marketindex/"
page = urlopen(url)
page

# 가져온 웹 소스를 트리 구조로 변환
soup = BeautifulSoup(page, "html.parser")
soup.prettify

<bound method Tag.prettify of 
<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market"></script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20220615142521/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"></script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20220615142521/js/jindo.1.5.3.element-text-patch.js" type="text/javascript"></script>
<div id="container" style="padding-bottom:0px;">
<div class="market_include">
<div class="market_data">
<div class="market1">
<div class="title">
<h2 class="h_market1"><span>환전 고시 환율</span></h2>
</div>
<!-- data -->
<div class="data">
<ul class="data_lst" id="exchangeList">
<li class="on">
<a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
<h3 class="h_lst"><span class="blind">미국 USD</span></h3>
<div class="head_info point_dn">
<span class="value">1,290.

In [92]:
# span 태그로 접근
span = soup.find_all("span", class_="value")
print(span)
print("미 환율 : ", span[0].get_text())

[<span class="value">1,290.40</span>, <span class="value">955.75</span>, <span class="value">1,359.24</span>, <span class="value">193.20</span>, <span class="value">134.9600</span>, <span class="value">1.0536</span>, <span class="value">1.2256</span>, <span class="value">104.4800</span>, <span class="value">107.99</span>, <span class="value">2114.43</span>, <span class="value">1835.6</span>, <span class="value">76346.99</span>]
미 환율 :  1,290.40


In [106]:
# div 태그로 접근
div = soup.find_all("div", class_="head_info point_dn")
print(div[0].get_text())


1,290.40
원
 2.60
하락



In [102]:
div[0]

<div class="head_info point_dn">
<span class="value">1,290.40</span>
<span class="txt_krw"><span class="blind">원</span></span>
<span class="change"> 2.60</span>
<span class="blind">하락</span>
</div>

In [110]:
# css 문법 이용

span = soup.select_one("div.head_info > span.value") # select로 시작하는 함수는 css 사용하는 것
# div 중 class가 head_info인 것(class 접근 기호 . 사용), 그 중 span의 값을 가져옴(자식에 접근 > 사용)

print(span.get_text())

1,290.40


## 2) 스크래핑 연습

In [112]:
##### 웹 소스 가져오기

url = "http://www.pythonscraping.com/pages/warandpeace.html"
page = urlopen(url)

soup = BeautifulSoup(page, "html.parser")

In [127]:
##### 녹색 단어만 골라오기

# lst = []
# green = soup.find_all("span", class_="green")
# for i in green:
#     lst.append(i.get_text())
    
# lst

green = soup.find_all("span", class_="green")
for i in green:
    print(i.get_text())
    
print("----------------------------------------")

green = soup.find_all("span", {"class" : "green"})
for i in green:
    print(i.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna
----------------------------------------
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Ann

In [129]:
green = soup.select("div#text > span.green")
for i in green:
    print(i.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [136]:
##### 제목(h숫자)

# soup.h1
titles = soup.find_all(["h1", "h2"])
print([t.get_text() for t in titles])

['War and Peace', 'Chapter 1']


In [139]:
##### 웹 소스 가져오기 
url = "http://www.pythonscraping.com/pages/page3.html"
page = urlopen(url)
soup = BeautifulSoup(page, "html.parser")

In [172]:
##### 테이블 행 수집

# original
tables = soup.find_all("tr", class_="gift")
tables

# class 속성이 없다면?
tables = soup.find_all("tr", {"id":["gift1", "gift2", "gift3", "gift4", "gift5"]})
tables

# tr의 id가 너무 많다면?
tables = soup.select("table#giftList > tr")[1:]
tables
for t in tables:
    print(t.get_text())
    
# tables = []
# for i in range(1, 6):
#     tables.append(soup.find_all("tr", id="gift{}".format(i)))

# tables


Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00




Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!

$10,000.52




Fish Painting

If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!

$10,005.00




Dead Parrot

This is an ex-parrot! Or maybe he's only resting?

$0.50




Mystery Box

If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!

$1.50





In [177]:
tb = list(soup.find("table", {"id":"giftList"}).tr.next_siblings)
tb

for i in tb:
    print(i.get_text()) # \n도 포함되기
    
########################################################################

tb = soup.find("table", {"id":"giftList"}).tr.next_siblings

for i in tb:
    if(type(i) == bs4.element.Tag): # 태그의 자료 타입 : bs4.element.Tag
        print(i.get_text()) # 태그일 때만 출력




Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00






Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!

$10,000.52






Fish Painting

If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!

$10,005.00






Dead Parrot

This is an ex-parrot! Or maybe he's only resting?

$0.50






Mystery Box

If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!

$1.50






Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00




Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls

In [179]:
tb = soup.find("table", {"id":"giftList"}).tr.next_siblings

for i in tb:
    if(type(i) == bs4.element.Tag): # 태그의 자료 타입 : bs4.element.Tag
        print(i.get_text()) # 태그일 때만 출력
        print(list(i.children)[3].img["src"]) # 자식 태그 중 네번째 태그가 img 태그


Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00



../img/gifts/img1.jpg

Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!

$10,000.52



../img/gifts/img2.jpg

Fish Painting

If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!

$10,005.00



../img/gifts/img3.jpg

Dead Parrot

This is an ex-parrot! Or maybe he's only resting?

$0.50



../img/gifts/img4.jpg

Mystery Box

If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!

$1.50



../img/gifts/img6.jpg


#### $15.00 찾기

In [188]:
list(soup.select("tr#gift1 > td"))[2].get_text()

'\n$15.00\n'

In [190]:
list(soup.find("tr", id="gift1"))[2].get_text()

'\n$15.00\n'

In [199]:
list(soup.tr.next_siblings)[1].td.next_sibling.next_sibling.get_text()

'\n$15.00\n'

In [202]:
soup.select("td")[2].get_text().replace("\n", "")

'$15.00'

In [207]:
price = soup.find("img", src="../img/gifts/img1.jpg").parent.previous_sibling
print(price.get_text())


$15.00



In [217]:
soup.find("table", id="giftList").td.next_sibling.next_sibling.get_text().replace("\n", "")

'$15.00'