In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample markup shared by the BeautifulSoup examples that follow.
# Note that it contains no <body> element and never closes <html>,
# so the parser has to repair the structure when it builds the tree.
html_doc = """
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (and emits GuessedAtParserWarning), so the repaired
# tree can differ from one machine to another. "lxml" wraps the bare <p> tags
# in a <body>, which is what the recorded output shows, and it is the same
# package that the "xml" parser used later in this notebook relies on.
soup = BeautifulSoup(html_doc, "lxml")

# prettify() renders the tree with one tag or string per line, indented by depth.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [4]:
# Attribute access on the soup returns the first tag with that name.
soup.title
# <title>The Dormouse's story</title>

<title>The Dormouse's story</title>

In [5]:
# .name is the tag's name as a plain string.
soup.title.name
# 'title'

'title'

In [6]:
# .string is the text held directly inside the tag.
soup.title.string
# "The Dormouse's story"

"The Dormouse's story"

In [7]:
# Only the first <p> is returned, even though the document contains three.
soup.p
# <p class="title"><b>The Dormouse's story</b></p>

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# Tag attributes are read with dict-style indexing.
# The `class` attribute comes back as a list of class names, not a single string.
soup.p['class']
# ['title']

['title']

In [9]:
# .parent steps up to the enclosing tag; here <title> sits inside <head>.
soup.title.parent.name
# 'head'

'head'

In [10]:
# First <a> tag in the document (the link with id="link1").
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [11]:
# find_all() returns every matching tag, in document order, as a list.
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [12]:
# find() with a keyword argument matches on that attribute; here it selects the tag whose id is "link3".
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [13]:
# Walk over every anchor tag and print the URL it points to.
for anchor in soup.find_all('a'):
    href = anchor.get('href')  # .get() yields None rather than raising if href is absent
    print(href)
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


XML 문서 파싱 및 처리

In [14]:
from bs4 import BeautifulSoup

In [15]:
# Read the book catalogue as UTF-8 text and parse it with the XML parser.
# "html.parser" applies HTML rules (it lower-cases tag names, for instance),
# so it is the wrong tool for an XML document; "xml" is also what the
# weather-feed cell further down in this notebook uses.
# NOTE(review): the "xml" parser is case-sensitive about tag names; the
# find_all() lookups that follow assume book.xml uses lower-case tags -- confirm.
with open("book.xml", "r", encoding="utf-8") as book_xml:
    xml = book_xml.read()
soup = BeautifulSoup(xml, "xml")


In [16]:
# Gather every <title> element and echo its text content, one per line.
titles = soup.find_all('title')
for title in titles:
    title_text = title.get_text()
    print(title_text)
  

태백산맥
Inside XML
쉽게 배우는 영어


In [17]:
# Build one list of tags per field, in the same order as the names on the left.
titles, authors, publishers, prices = (
    soup.find_all(tag_name)
    for tag_name in ('title', 'author', 'publisher', 'price')
)

In [18]:
# Print one "title,author,price 원" line per book.
# zip() walks the three lists in step, replacing the manual index arithmetic.
# It stops at the shortest list instead of raising IndexError part-way
# through a line when the lists differ in length.
# NOTE(review): pairing by position assumes every book has exactly one
# title, author and price -- confirm against book.xml.
for title, author, price in zip(titles, authors, prices):
    print(title.get_text(), author.get_text(), sep=',', end=',')
    print(price.get_text(), "원 ")
     

태백산맥,조정래,25000 원 
Inside XML,홍질동,35000 원 
쉽게 배우는 영어,채규태,20000 원 


In [19]:
from bs4 import BeautifulSoup
import requests as req
    
# Weather-observation XML feed served from kma.go.kr.
basic_url = "http://www.kma.go.kr/XML/weather/sfc_web_map.xml"

# The timeout keeps this cell from hanging indefinitely if the server stalls.
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the XML parser in the next cell.
resp = req.get(basic_url, timeout=10)
resp.raise_for_status()

In [20]:
# Parse the response body (resp.content, not resp.text) with the "xml" parser.
# Note: this rebinds `soup`, replacing the tree built from book.xml above.
soup = BeautifulSoup(resp.content, "xml") 

In [21]:
# Echo the parsed feed to inspect its structure: a <weather> element holding one <local> per station.
# NOTE(review): this dumps the whole document; consider displaying only a few <local> elements.
soup

<?xml version="1.0" encoding="utf-8"?>
<current xmlns:="current">
<weather day="05" hour="00" month="06" year="2020">
<local desc="맑음" icon="01" stn_id="90" ta="24.1">속초</local>
<local desc="박무" icon="17" stn_id="93" ta="19.8">북춘천</local>
<local desc="구름많음" icon="03" stn_id="95" ta="18.2">철원</local>
<local desc="구름많음" icon="03" stn_id="98" ta="18.8">동두천</local>
<local desc="구름많음" icon="03" stn_id="99" ta="18.3">파주</local>
<local desc="맑음" icon="01" stn_id="100" ta="18.7">대관령</local>
<local desc="-" icon="" stn_id="101" ta="21.8">춘천</local>
<local desc="맑음" icon="01" rn_hr1="0.0" stn_id="102" ta="14.1">백령도</local>
<local desc="맑음" icon="01" stn_id="104" ta="23.0">북강릉</local>
<local desc="맑음" icon="01" stn_id="105" ta="25.7">강릉</local>
<local desc="맑음" icon="01" stn_id="106" ta="25.2">동해</local>
<local desc="흐림" icon="04" rn_hr1="0.2" stn_id="108" ta="19.7">서울</local>
<local desc="비" icon="08" rn_hr1="0.9" stn_id="112" ta="18.1">인천</local>
<local desc="구름많음" icon="03" stn_id="114" ta="22

In [22]:
# One tag per <local> element of the feed.
# NOTE(review): the name `locals` shadows Python's built-in locals(). The next
# cell reads this name, so both cells would have to be renamed together.
locals = soup.find_all('local')

In [23]:
# Print "name,ta,desc" for every station and collect the numeric `ta`
# readings in `temp` for the statistics computed afterwards.
temp = []
for station in locals:
    # .get() with a default keeps a station that lacks an attribute from
    # aborting the whole loop with KeyError.
    ta = station.get('ta', '')
    desc = station.get('desc', '')
    print(station.get_text(), ta, desc, sep=",")
    try:
        temp.append(float(ta))
    except ValueError:
        # The feed already uses placeholders such as desc="-" and icon="" for
        # some stations; a blank or non-numeric `ta` is left out of `temp`
        # rather than crashing the cell.
        continue

속초,24.1,맑음
북춘천,19.8,박무
철원,18.2,구름많음
동두천,18.8,구름많음
파주,18.3,구름많음
대관령,18.7,맑음
춘천,21.8,-
백령도,14.1,맑음
북강릉,23.0,맑음
강릉,25.7,맑음
동해,25.2,맑음
서울,19.7,흐림
인천,18.1,비
원주,22.6,구름많음
울릉도,25.8,맑음
수원,19.1,비
영월,23.6,구름많음
충주,23.1,구름많음
서산,18.4,흐림
울진,25.2,구름조금
청주,23.1,박무
대전,22.7,흐림
추풍령,22.0,맑음
안동,23.6,맑음
상주,23.7,맑음
포항,27.4,맑음
군산,21.1,구름많음
대구,27.0,맑음
전주,21.0,맑음
울산,25.1,맑음
창원,20.1,맑음
광주,21.9,맑음
부산,22.2,맑음
통영,19.4,맑음
목포,19.9,박무
여수,20.9,맑음
흑산도,15.0,안개
완도,19.2,맑음
고창,20.5,맑음
순천,18.8,맑음
홍성,18.9,비
제주,19.2,구름많음
고산,19.1,맑음
성산,17.9,맑음
서귀포,21.7,맑음
진주,19.9,맑음
강화,16.7,구름조금
양평,21.6,구름많음
이천,21.3,흐림
인제,18.9,맑음
홍천,22.4,구름조금
태백,21.3,맑음
정선군,22.5,구름조금
제천,22.6,구름많음
보은,22.4,흐림
천안,22.4,흐림
보령,18.5,흐림
부여,20.4,흐림
금산,22.5,구름많음
세종,21.7,구름많음
부안,20.6,맑음
임실,18.9,맑음
정읍,19.7,맑음
남원,21.5,맑음
장수,18.8,맑음
고창군,19.7,맑음
영광군,20.1,맑음
김해시,21.6,맑음
순창군,21.6,맑음
북창원,21.6,맑음
양산시,21.7,맑음
보성군,21.5,맑음
강진군,20.2,맑음
장흥,19.5,맑음
해남,18.8,구름조금
고흥,20.1,맑음
의령군,23.2,맑음
함양군,24.1,맑음
광양시,21.4,맑음
진도군,18.3,구름많음
봉화,18.0,맑음
영주,23.5,구름조금
문경,22.7,구름조금
청송군,18.4,맑음
영덕,26.7,맑음
의성,19.

In [24]:
import numpy as np

# Convert the collected `ta` readings into a NumPy array for the max/min lookups below.
ar = np.array(temp)

In [25]:
# Largest `ta` value among the stations (presumably air temperature in °C -- confirm against the feed's documentation).
ar.max()

27.4

In [26]:
# Smallest `ta` value among the stations (presumably air temperature in °C -- confirm against the feed's documentation).
ar.min()

14.1