Skip to content

Commit

Permalink
cleanups
Browse files Browse the repository at this point in the history
Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
  • Loading branch information
kennethreitz committed Feb 27, 2018
1 parent add71f6 commit c198b8c
Showing 1 changed file with 29 additions and 24 deletions.
53 changes: 29 additions & 24 deletions requests_html.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import asyncio
from urllib.parse import urlparse, urlunparse
from concurrent.futures._base import TimeoutError
from typing import List

import pyppeteer
import requests
from pyquery import PyQuery

from fake_useragent import UserAgent
from lxml import etree
from lxml.html import HtmlElement
from lxml.html.soupparser import fromstring
from parse import search as parse_search
from parse import findall
Expand All @@ -30,7 +33,7 @@ def __init__(self, *args, **kwargs):
self._html = None

@property
def html(self):
def html(self) -> str:
if self._html:
return self._html

Expand All @@ -49,7 +52,7 @@ def _from_response(cls, response):
class BaseParser:
"""A basic HTML/Element Parser, for Humans."""

def __init__(self, *, element, default_encoding=None, html=None, url):
def __init__(self, *, element, default_encoding: str = None, html: str = None, url: str):
self.element = element
self.url = url
self.skip_anchors = True
Expand All @@ -58,7 +61,7 @@ def __init__(self, *, element, default_encoding=None, html=None, url):
self._html = html

@property
def html(self):
def html(self) -> str:
"""Unicode representation of the HTML content."""
if self._html:
return self._html
Expand All @@ -71,7 +74,7 @@ def set_html(self, html):
self._html = html

@property
def encoding(self):
def encoding(self) -> str:
"""The encoding string to be used, extracted from the HTML and
:class:`HTMLResponse <HTMLResponse>` headers.
"""
Expand All @@ -85,25 +88,25 @@ def encoding(self):
return self._encoding if self._encoding else self.default_encoding

@property
def pq(self):
def pq(self) -> PyQuery:
"""PyQuery representation of the :class:`Element <Element>` or :class:`HTML <HTML>`."""
return PyQuery(self.element)

@property
def lxml(self):
def lxml(self) -> HtmlElement:
return fromstring(self.html)

@property
def text(self):
def text(self) -> str:
"""The text content of the :class:`Element <Element>` or :class:`HTML <HTML>`."""
return self.pq.text()

@property
def full_text(self):
def full_text(self) -> str:
"""The full text content (including links) of the :class:`Element <Element>` or :class:`HTML <HTML>`."""
return self.lxml.text_content()

def find(self, selector, first=False, _encoding=None):
def find(self, selector: str, first: bool = False, _encoding: str = None):
"""Given a jQuery selector, returns a list of :class:`Element <Element>` objects.
If ``first`` is ``True``, only returns the first :class:`Element <Element>` found."""
Expand All @@ -121,7 +124,7 @@ def gen():
else:
return c

def xpath(self, selector, first=False, _encoding=None):
def xpath(self, selector: str, first: bool = False, _encoding: str = None):
"""Given an XPath selector, returns a list of :class:`Element <Element>` objects.
If ``first`` is ``True``, only returns the first :class:`Element <Element>` found."""
Expand All @@ -134,18 +137,18 @@ def xpath(self, selector, first=False, _encoding=None):
else:
return c

def search(self, template):
def search(self, template: str):
"""Searches the :class:`Element <Element>` for the given parse template."""
return parse_search(template, self.html)

def search_all(self, template):
def search_all(self, template: str):
"""Searches the :class:`Element <Element>` (multiple times) for the given parse
template.
"""
return [r for r in findall(template, self.html)]

@property
def links(self):
def links(self) -> List[str]:
"""All found links on page, in as-is form."""
def gen():
for link in self.find('a'):
Expand All @@ -161,7 +164,7 @@ def gen():
return set(g for g in gen())

@property
def absolute_links(self):
def absolute_links(self) -> List[str]:
"""All found links on page, in absolute form."""
def gen():
for link in self.links:
Expand All @@ -183,7 +186,7 @@ def gen():
return set(g for g in gen())

@property
def base_url(self):
def base_url(self) -> str:
"""The base URL for the page. Supports the ``<base>`` tag."""

# Support for <base> tag.
Expand Down Expand Up @@ -214,7 +217,7 @@ def __repr__(self):
return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))

@property
def attrs(self):
def attrs(self) -> dict:
"""Returns a dictionary of the attributes of the :class:`Element <Element>`."""
attrs = {k: self.pq.attr[k].strip() for k in self.element.keys()}

Expand All @@ -236,16 +239,16 @@ def __init__(self, *, url, html, default_encoding=DEFAULT_ENCODING):
default_encoding=default_encoding
)

def __repr__(self):
def __repr__(self) -> str:
return "<HTML url={}>".format(repr(self.url))

def render(self, retries=8):
def render(self, retries: int = 8):
"""Loads the response in Chromium, and replaces HTML content
with an updated version, JavaScript executed.
"""
async def _async_render(url):
async def _async_render(url: str):
try:
browser = pyppeteer.launch()
browser = pyppeteer.launch(headless=True)
page = await browser.newPage()

# Load the given page (GET request, obviously.)
Expand All @@ -262,15 +265,17 @@ async def _async_render(url):
for i in range(retries):
if not content:
try:
content = loop.run_until_complete(_async_render(self.url))
content = loop.run_until_complete(_async_render(url=self.url))
except TimeoutError:
pass

html = HTML(url=self.url, html=content, default_encoding=DEFAULT_ENCODING)
self.__dict__.update(html.__dict__)

return self


def user_agent(style='chrome'):
def user_agent(style='chrome') -> str:
"""Returns a random user-agent, if not requested one of a specific
style. Defaults to a Chrome-style User-Agent.
"""
Expand All @@ -296,7 +301,7 @@ def __init__(self, mock_browser=True, *args, **kwargs):
self.hooks = {'response': self._handle_response}

@staticmethod
def _handle_response(response, **kwargs):
def _handle_response(response, **kwargs) -> requests.Response:
"""Requests HTTP Response handler. Attaches .html property to Response
objects.
"""
Expand All @@ -305,7 +310,7 @@ def _handle_response(response, **kwargs):

return response

def request(self, *args, **kwargs):
def request(self, *args, **kwargs) -> HTMLResponse:
# Convert Response object into HTMLResponse object.
r = super(HTMLSession, self).request(*args, **kwargs)
html_r = HTMLResponse._from_response(r)
Expand Down

0 comments on commit c198b8c

Please sign in to comment.