In [1]:
import asyncio
import nest_asyncio
# NOTE(review): nest_asyncio patches asyncio to permit nested event-loop use;
# presumably needed because the Jupyter kernel already runs a loop while later
# cells call asession.run() / loop.run_until_complete() -- confirm.
nest_asyncio.apply()

# 環境

In [2]:
%%bash
# Record the version of the `python` found on the shell's PATH.
python -V

Python 3.6.5


In [3]:
%%bash
# Record the installed requests-html version.
pip freeze | grep "requests-html"

requests-html==0.10.0


# 基本の使い方

In [4]:
from requests_html import HTMLSession

# `session` and `resp` are module-level names reused by later cells.
session = HTMLSession()

# Fetch the page; `resp.html` is what the following cells query.
resp = session.get("https://www.python.jp/")
resp.html.url  # last expression of the cell, so the notebook displays it

'https://www.python.jp/'

# 要素の取得

In [5]:
# find() looks up elements by CSS selector; first=True returns only the first match.
title = resp.html.find('h4.card-title', first=True)

title.text # => Pythonとは
title.attrs # => {'class': ('card-title',)}
title.find('a') # => [<Element 'a' href='pages/about.html'>]
# search() captures from the element's HTML markup, not from its text, so the
# result includes the tags that precede "Python". Only this last expression is
# displayed by the notebook.
title.search('{}とは')[0] # => <h4 class="card-title"><a href="pages/about.html">Python

'<h4 class="card-title"><a href="pages/about.html">Python'

In [6]:
# Same lookups as the previous cell, printed so that every value is shown
# (a cell only displays its last expression).
title = resp.html.find('h4.card-title', first=True)
print("title.text =>", title.text)
print("title.attrs =>", title.attrs)
print("title.find('a') =>", title.find('a'))
# The captured value contains the HTML markup before "とは", not just the text.
print("search result of '{}とは' =>", title.search('{}とは')[0])

title.text => Pythonとは
title.attrs => {'class': ('card-title',)}
title.find('a') => [<Element 'a' href='pages/about.html'>]
search result of '{}とは' => <h4 class="card-title"><a href="pages/about.html">Python


# 非同期処理の実行

In [11]:
from requests_html import AsyncHTMLSession
asession = AsyncHTMLSession()

async def get_pyconjp_2017():
    """Fetch the PyCon JP 2017 top page and return the response."""
    r = await asession.get("https://pycon.jp/2017/")
    return r

async def get_pyconjp_2018():
    """Fetch the PyCon JP 2018 top page and return the response."""
    r = await asession.get("https://pycon.jp/2018/")
    return r

async def get_pyconjp_2019():
    """Fetch the PyCon JP 2019 top page and return the response."""
    r = await asession.get("https://pycon.jp/2019/")
    return r

# run() receives the coroutine functions themselves (not called) and collects
# their responses.
results = asession.run(get_pyconjp_2017, get_pyconjp_2018, get_pyconjp_2019)

# The recorded output of this cell is not in argument order, so do not rely on
# the position of a response inside `results`.
# `result.html.url` is the final URL: in the recorded run the 2017 page
# resolves to https://pycon.jp/2017/ja/.
for result in results:
    print(result.html.url)

https://pycon.jp/2017/ja/
https://pycon.jp/2019/
https://pycon.jp/2018/


# JavaScriptの実行

In [14]:
from requests_html import AsyncHTMLSession
asession = AsyncHTMLSession()

async def exec_js():
    """Print the first <h2> of the PyCon JP 2019 page before and after rendering."""
    resp = await asession.get("https://pycon.jp/2019/")
    # In the recorded run the fetched HTML contains no <h2> yet.
    print("before:", resp.html.find("h2", first=True)) # => None
    # After arender() the <h2> is present in resp.html.
    await resp.html.arender()
    print("after:", resp.html.find("h2", first=True).text) # => Conference

loop = asyncio.get_event_loop() 
loop.run_until_complete(exec_js())

# before: None
# after: Conference

before: None
after: Conference


# その他便利機能

In [15]:
resp = session.get("https://www.python.jp/")
# .links is a set of link targets; the recorded output mixes absolute URLs with
# relative paths such as 'index.html'.
resp.html.links

{'http://www.google.com/calendar/ical/kj670le78ju5alcbt1khect5ks%40group.calendar.google.com/public/basic.ics',
 'http://www.sbcr.jp/products/4797395440.html',
 'https://discordapp.com/',
 'https://djangocongress.jp/',
 'https://docs.python.jp',
 'https://docs.python.org/ja/2.7/',
 'https://docs.python.org/ja/3/',
 'https://github.com/python-doc-ja/python-doc-ja',
 'https://rcos.nii.ac.jp/',
 'https://twitter.com/python_japan/',
 'https://www.nii.ac.jp/about/recruit/2018/0709-2.html',
 'https://www.nii.ac.jp/about/recruit/2018/1024.html',
 'https://www.python.org/dev/peps/pep-0545/',
 'https://www.python.org/downloads/release/python-2715/',
 'https://www.python.org/downloads/release/python-372/',
 'index.html',
 'index_15.html',
 'index_2.html',
 'index_3.html',
 'index_4.html',
 'index_5.html',
 'install/install.html',
 'jobboard/index.html',
 'jobboard/nii-2.html',
 'jobboard/nii.html',
 'news/20190227_docs_moved.html',
 'news/djangocongressjp2019.html',
 'news/index.html',
 'news/wa

In [8]:
from requests_html import HTML
# An HTML object is built here directly from a markup string; no session or
# HTTP request is involved in this cell.
doc = """<a href='https://pyconjp/2019'>"""
html = HTML(html=doc)
html.links

{'https://pyconjp/2019'}