In [43]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [44]:
# Fetch the catalog page that is the scraping target.
url = "https://www.oreilly.co.jp/catalog/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()

In [45]:
# Parse the downloaded HTML into a BeautifulSoup tree so it can be queried,
# using the "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [46]:
# A CSS selector generated by Chrome's developer tools produces this error:
#   Only the following pseudo-classes are implemented: nth-of-type.
# so nth-child has been rewritten as nth-of-type in the selector below.
first_isbn_selector = "#bookTable > tbody > tr:nth-of-type(1) > td:nth-of-type(1)"
first_isbn_cells = soup.select(first_isbn_selector)
print(first_isbn_cells)               # the matched <td> element(s)
print(first_isbn_cells[0].string)     # text of the first match

[<td>4-87311-061-0</td>]
4-87311-061-0


In [47]:
# Collect the ISBN of each O'Reilly book in the catalog table into a list.
#
# n is the maximum number of ISBNs to take. 518 is the value recorded on
# 2018-09-23 (the original note names the title "Evolutionary Architecture").
# Set n = None to take every row currently in the table.
n = 518

# Select the table rows once and slice, instead of re-running a
# tr:nth-of-type(i) query over the whole document for every i. Slicing also
# means a table with fewer than n rows no longer raises IndexError.
# The first cell is located with row.find("td") per row rather than a single
# "tr > td:nth-of-type(1)" selector: this BeautifulSoup version only has
# limited pseudo-class support, so the per-row lookup is the safer form.
catalog_rows = soup.select("#bookTable > tbody > tr")
first_cells = [row.find("td") for row in catalog_rows]
first_cells = [cell for cell in first_cells if cell is not None][:n]

ISBN_list = []
for cell in first_cells:
    # get_text() also copes with cells whose text is wrapped in a child tag
    # (where .string can be None); hyphens are removed because the ISBN is
    # used to build a URL.
    isbn_str = cell.get_text(strip=True).replace("-", "")
    if isbn_str:
        ISBN_list.append(isbn_str)
print(ISBN_list)

['4873110610', '4873110637', '4873110653', '4873110785', '4873110793', '4873110831', '4873110904', '4873110939', '4873110963', '4873110971', '4873110998', '487311117X', '4873111382', '4873111471', '487311148X', '4873111501', '487311151X', '4873111544', '4873111641', '4873111714', '4873111870', '4873111951', '487311196X', '4873111978', '4873112044', '487311215X', '4873112168', '4873112222', '4873112265', '4873112281', '487311232X', '4873112354', '4873112370', '4873112389', '4873112451', '487311246X', '4873112494', '4873112516', '4873112524', '4873112532', '4873112540', '4873112559', '4873112567', '4873112656', '4873112664', '4873112672', '4873112699', '4873112702', '4873112710', '4873112729', '4873112737', '4873112753', '4873112796', '487311280X', '4873112818', '4873112826', '4873112834', '4873112842', '4873112850', '4873112869', '4873112877', '4873112885', '4873112893', '4873112907', '4873112915', '487311294X', '4873112982', '4873112990', '4873113008', '4873113024', '4873113032', '4873

In [59]:
# Save the cover image of each O'Reilly book in ISBN_list.
#
# For every ISBN the book page is fetched, the <img class="cover"> element is
# looked up, and the image it points to is written to
# ../datasets/original/<ISBN>.png. Books whose page or image cannot be
# fetched are reported and skipped so one bad entry does not abort the run.
base_url = "https://www.oreilly.co.jp/books/"
output_dir = '../datasets/original/'
os.makedirs(output_dir, exist_ok=True)  # open() below does not create directories

# Loop-local names (book_url, book_page, book_soup) are used on purpose: the
# catalog-level url / response / soup variables are left intact, so the
# earlier cells can still be re-run after this one.
with requests.Session() as session:  # one session reuses the connection
    for target in ISBN_list:
        book_url = base_url + target + "/"
        try:
            book_page = session.get(book_url, timeout=30)
            book_page.raise_for_status()
        except requests.RequestException as exc:
            print("ISBN: " + target + " skipped (page: " + str(exc) + ")")
            continue

        book_soup = BeautifulSoup(book_page.text, "html.parser")
        cover_tag = book_soup.find("img", class_="cover")
        img_url = cover_tag.get("src") if cover_tag is not None else None  # image URL
        if not img_url:
            print("ISBN: " + target + " skipped (no cover image found)")
            continue

        try:
            # urljoin gives the same result as plain concatenation for a
            # relative src, and also handles root-relative or absolute ones.
            img = session.get(urljoin(book_url, img_url), timeout=30)
            img.raise_for_status()  # never store an HTML error page as an image
        except requests.RequestException as exc:
            print("ISBN: " + target + " skipped (image: " + str(exc) + ")")
            continue

        # NOTE(review): the file is always named .png regardless of the format
        # the server returns; kept as-is so existing paths stay valid - confirm.
        with open(output_dir + target + '.png', 'wb') as f:
            f.write(img.content)
        print("ISBN: " + target + " saved")

ISBN: 4873110610saved
ISBN: 4873110637saved
ISBN: 4873110653saved
ISBN: 4873110785saved
ISBN: 4873110793saved
ISBN: 4873110831saved
ISBN: 4873110904saved
ISBN: 4873110939saved
ISBN: 4873110963saved
ISBN: 4873110971saved
ISBN: 4873110998saved
ISBN: 487311117Xsaved
ISBN: 4873111382saved
ISBN: 4873111471saved
ISBN: 487311148Xsaved
ISBN: 4873111501saved
ISBN: 487311151Xsaved
ISBN: 4873111544saved
ISBN: 4873111641saved
ISBN: 4873111714saved
ISBN: 4873111870saved
ISBN: 4873111951saved
ISBN: 487311196Xsaved
ISBN: 4873111978saved
ISBN: 4873112044saved
ISBN: 487311215Xsaved
ISBN: 4873112168saved
ISBN: 4873112222saved
ISBN: 4873112265saved
ISBN: 4873112281saved
ISBN: 487311232Xsaved
ISBN: 4873112354saved
ISBN: 4873112370saved
ISBN: 4873112389saved
ISBN: 4873112451saved
ISBN: 487311246Xsaved
ISBN: 4873112494saved
ISBN: 4873112516saved
ISBN: 4873112524saved
ISBN: 4873112532saved
ISBN: 4873112540saved
ISBN: 4873112559saved
ISBN: 4873112567saved
ISBN: 4873112656saved
ISBN: 4873112664saved
ISBN: 4873

ISBN: 9784873116600saved
ISBN: 9784873116648saved
ISBN: 9784873116655saved
ISBN: 9784873116662saved
ISBN: 9784873116679saved
ISBN: 9784873116686saved
ISBN: 9784873116693saved
ISBN: 9784873116709saved
ISBN: 9784873116716saved
ISBN: 9784873116730saved
ISBN: 9784873116747saved
ISBN: 9784873116754saved
ISBN: 9784873116761saved
ISBN: 9784873116778saved
ISBN: 9784873116785saved
ISBN: 9784873116792saved
ISBN: 9784873116808saved
ISBN: 9784873116815saved
ISBN: 9784873116822saved
ISBN: 9784873116839saved
ISBN: 9784873116846saved
ISBN: 9784873116853saved
ISBN: 9784873116860saved
ISBN: 9784873116877saved
ISBN: 9784873116884saved
ISBN: 9784873116891saved
ISBN: 9784873116921saved
ISBN: 9784873116938saved
ISBN: 9784873116945saved
ISBN: 9784873116952saved
ISBN: 9784873116969saved
ISBN: 9784873116976saved
ISBN: 9784873116983saved
ISBN: 9784873116990saved
ISBN: 9784873117003saved
ISBN: 9784873117010saved
ISBN: 9784873117027saved
ISBN: 9784873117034saved
ISBN: 9784873117041saved
ISBN: 9784873117058saved
