In [2]:
from bs4 import BeautifulSoup

# HTMLをpythonの変数として格納
basehtml = '''
<h1>フルーツの一覧</h1>
<ul>
<li>りんご</li>
<li>みかん</li>
<li>バナナ</li>
</ul>
'''

# HTMLテキストを「html.parser」を使って解析
soup = BeautifulSoup(basehtml,"html.parser")

# 解析したテキスト「soup」からH1タグ情報を見つける
title = soup.find("h1")
title.name = "h2" # タグをh2要素に変更
title["class"] = "fruit" # fruitという名前のclassを付与

# H1情報を出力
print(title)

<h2 class="fruit">フルーツの一覧</h2>


In [6]:
# import urllib2ではなく下記の文言を使用
import urllib.request, urllib.error
from bs4 import BeautifulSoup

# アクセスするURL
url = "https://www.yahoo.co.jp/"
# URLを開く
html = urllib.request.urlopen(url)
# BeautifulSoupで開く
soup = BeautifulSoup(html, "html.parser")



title_tag = soup.title
# titleタグの出力。Python3ではprint(変数)と記述する
print(title_tag)


desc_tag = soup.find(attrs={'name' : 'description'})

desc_txt = desc_tag['content']

print(desc_txt)

<title>Yahoo! JAPAN</title>
あなたの毎日をアップデートする情報ポータル。検索、ニュース、天気、スポーツ、メール、ショッピング、オークションなど便利なサービスを展開しています。


In [8]:
import csv

# アクセスするURL
url = "https://mainichi.jp/"
# URLを開く
html = urllib.request.urlopen(url)
# BeautifulSoupで開く
soup = BeautifulSoup(html, "html.parser")

# HTMLからニュース一覧に使用しているaタグを絞りこんでいく
tag_mainbox = soup.select_one(".main-box")
tag_listA = tag_mainbox.select_one(".list-typeA")
news_tag = tag_listA.findAll("a")
# 配列の作成。
csvlist = []
for news_txt in news_tag:
    news_txt = news_txt.text
    csvlist.append(news_txt)

# CSVファイルを開く。ファイルがなければ新規作成する。
f = open("output.csv", "w")
writecsv = csv.writer(f, lineterminator='\n')

# 出力
writecsv.writerow(csvlist)

# CSVファイルを閉じる。
f.close()

In [10]:
# アクセスするURL
url = "https://mainichi.jp/"
# URLを開く
html = urllib.request.urlopen(url)
# BeautifulSoupで開く
soup = BeautifulSoup(html, "html.parser")

# HTMLからニュース一覧に使用しているaタグを絞りこんでいく
tag_mainbox = soup.select_one(".main-box")
tag_listA = tag_mainbox.select_one(".list-typeA")
news_tag = tag_listA.findAll("a")
# 配列の作成。表の見出し部分の情報を入力しておく。
csvlist = [["","ニュースリスト"]]
num = 0
for news_txt in news_tag:
    news_txt = news_txt.text
    csvlist.append([num, news_txt])
    num += 1

# CSVファイルを開く。ファイルがなければ新規作成する。
f = open("output.csv", "w")
writecsv = csv.writer(f, lineterminator='\n')

# 出力
writecsv.writerows(csvlist)

# CSVファイルを閉じる。
f.close()


In [12]:
url = 'https://scraping-for-beginner.herokuapp.com/ranking/'
html = urllib.request.urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

tag_mainbox = soup.select_one('.row')
tag_listA = tag_mainbox.select_one('.u_areaListRankingBox row')

quote_elms = soup.find_all('div', {'class': 'u_areaListRankingBox row'})
print(quote_elms)

[<div class="u_areaListRankingBox row">
<div class="u_title col s12">
<p><h2><span class="badge">1</span>観光地 1</h2></p>
</div>
<!-- 観光地イメージ -->
<div class="place_img col s12">
<img alt="" src="/static/assets/img/img1.JPG"/>
</div>
<!-- 総合評価 -->
<div class="u_rankBox col s12">
<span style="--rate: 94.0%;"></span><span class="evaluateNumber">4.7</span><br/>
</div>
<!-- 各カテゴリ評価 -->
<div class="u_categoryTipsItem col s12">
<dl>
<dt>楽しさ</dt>
<dd class="is_rank"><span class="evaluateNumber">4.6</span></dd>
<dd class="comment">とてもエンジョイした</dd>
</dl>
<dl>
<dt>人混みの多さ</dt>
<dd class="is_rank"><span class="evaluateNumber">4.5</span></dd>
<dd class="comment">非常に空いていた</dd>
</dl>
<dl>
<dt>景色</dt>
<dd class="is_rank"><span class="evaluateNumber">4.9</span></dd>
<dd class="comment">景色に魅了された</dd>
</dl>
<dl>
<dt>アクセス</dt>
<dd class="is_rank"><span class="evaluateNumber">4.2</span></dd>
<dd class="comment">船で2時間ほどであった</dd>
</dl>
</div>
<div class="divider">
</div>
</div>, <div class="u_areaListRankingBox 

In [23]:
# こちらで用意したHTML
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# BeautifulSoupの初期化
soup = BeautifulSoup(html_doc, 'html.parser') # BeautifulSoupの初期化

In [24]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [30]:
# <title>The Dormouse's story</title>を取得してください。
print(soup.title)
# The Dormouse's story を取得してください。
print(soup.title.string)

<title>The Dormouse's story</title>
The Dormouse's story


In [33]:
print(soup.find_all('a'))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [35]:
for atag in soup.find_all("a"):
    print(atag)
    print(atag.string)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie


In [39]:
tags = soup.find_all('a')
for tag in tags:
    print(tag)
    
for tag in tags:
    print(tag.string)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Elsie
Lacie
Tillie


In [40]:
soup.a.get('href')

'http://example.com/elsie'

In [41]:
for link in tags:
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [45]:
response = requests.get("https://review-of-my-life.blogspot.com/")
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [48]:
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [49]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [51]:
tags = soup.find_all("a")
for tag in tags:
    print(tag.string)

Elsie
Lacie
Tillie


In [52]:
soup.find('body')

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>