Skip to content

Commit

Permalink
pyppeteer
Browse files Browse the repository at this point in the history
Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
  • Loading branch information
kennethreitz committed Feb 27, 2018
1 parent f026f82 commit 3f58d2b
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 37 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ parse = "*"
"bs4" = "*"
"pyqt5" = "*"
"w3lib" = "*"
pyppeteer = "*"


[dev-packages]
Expand Down
36 changes: 35 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 19 additions & 32 deletions requests_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import asyncio
from urllib.parse import urlparse, urlunparse

import pyppeteer
import requests
from pyquery import PyQuery

Expand All @@ -10,11 +12,7 @@
from parse import findall
from w3lib.encoding import html_to_unicode

try:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
except ImportError:
pass



DEFAULT_ENCODING = 'utf-8'
Expand Down Expand Up @@ -154,7 +152,8 @@ def gen():
try:
href = link.attrs['href'].strip()
if not href.startswith('#') and self.skip_anchors and href not in ['javascript:;']:
yield href
if href:
yield href
except KeyError:
pass

Expand Down Expand Up @@ -285,7 +284,7 @@ def request(self, *args, **kwargs):

class BrowserHTMLSession(HTMLSession):
"""A web-browser interpreted session (for JavaScript), powered by
PyQt5's QWebEngineView."""
`Pyppeteer <https://pypi.python.org/pypi/pyppeteer>`_."""

def __init__(self, *args, **kwargs):
super(BrowserHTMLSession, self).__init__(*args, **kwargs)
Expand All @@ -294,7 +293,7 @@ def request(self, *args, **kwargs):
# Convert Request object into HTTPRequest object.
r = super(BrowserHTMLSession, self).request(*args, **kwargs)

r._content = self.render(r.text).encode(DEFAULT_ENCODING)
r._content = self.render(r.url).encode(DEFAULT_ENCODING)
r.encoding = DEFAULT_ENCODING

return r
Expand All @@ -303,30 +302,18 @@ def request(self, *args, **kwargs):
def render(source_url):
"""Fully render HTML, JavaScript and all."""

if 'QApplication' not in globals():
raise RuntimeError('PyQt5 must be installed.')

class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication([])
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
# self.load(QUrl(url))
self.app.exec_()

def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self._callable)

def _callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()

return Render(source_url).html
async def _async_render(url):
browser = pyppeteer.launch()
page = await browser.newPage()
await page.goto(url)

content = await page.content()
return content

loop = asyncio.get_event_loop()
content = loop.run_until_complete(_async_render(source_url))

return content


# Backwards compatibility.
Expand Down
5 changes: 1 addition & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

# What packages are required for this module to be executed?
REQUIRED = [
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib'
'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib', 'pyppeteer'
]

# The rest you shouldn't have to touch too much :)
Expand Down Expand Up @@ -79,9 +79,6 @@ def run(self):
author_email=EMAIL,
url=URL,
python_requires='>=3.5.0',
extras_require={
'browser': ['PyQt5'],
},
# If your package is a single module, use this instead of 'packages':
py_modules=['requests_html'],

Expand Down

0 comments on commit 3f58d2b

Please sign in to comment.