# Exploring the requests-html capabilities

In [None]:
# Official documentation of the requests-html package: https://requests-html.readthedocs.io/en/latest/

### Initial setup

In [1]:
# Loading the necessary packages
from requests_html import HTMLSession

In [2]:
# Open the HTMLSession that the requests below are sent through
session = HTMLSession()

In [3]:
# Submit a GET request for the page we are going to explore.
# A timeout (in seconds) is passed so the cell fails fast instead of
# hanging indefinitely if the server never responds.
r = session.get("https://en.wikipedia.org/wiki/Association_football", timeout=10)
# Display the HTTP status code of the response
r.status_code

200

In [4]:
# The HTML of the response is exposed as an HTML object through the '.html' attribute
# (it is an attribute, not a method, so it is accessed without parentheses)
r.html

<HTML url='https://en.wikipedia.org/wiki/Association_football'>

### Links

In [5]:
# We can extract all link addresses directly with '.links'
# The result mixes relative ('/wiki/...') and absolute ('https://...') URLs
urls = r.html.links
urls

{'/wiki/File:Yellow_card.svg',
 '/wiki/Flag_football',
 'https://id.loc.gov/authorities/subjects/sh85123840',
 '/wiki/List_of_association_football_rivalries',
 '/wiki/Help:Authority_control',
 '/wiki/2005_in_association_football',
 'https://www.fifa.com/worldcup/news/y=2015/m=12/news=2014-fifa-world-cuptm-reached-3-2-billion-viewers-one-billion-watched--2745519.html',
 '/wiki/Shrewsbury_School',
 'https://wikimediafoundation.org/',
 '/wiki/Association_football#Players,_equipment,_and_officials',
 '/wiki/Mixed-sex_sports',
 'https://stq.wikipedia.org/wiki/Foutbal',
 'http://www.heraldscotland.com/sport/football/no-longer-the-game-of-two-halves.19185657',
 'https://web.archive.org/web/20050314003412/http://www.fifa.com/en/marketing/newmedia/index/0%2C3509%2C10%2C00.html',
 '/wiki/Biribol',
 '/wiki/1918_in_association_football',
 '/wiki/Association_football_headgear',
 'https://gan.wikipedia.org/wiki/%E8%85%B3%E7%90%83',
 '/wiki/Eight-man_football',
 '/wiki/Exhibition_game',
 'https://en.

In [6]:
# Note that many of these are relative URLs (e.g. '/wiki/Flag_football'), mixed in with absolute ones

In [7]:
# To get every link as an absolute URL we can use '.absolute_links' instead of '.links'
full_path_urls = r.html.absolute_links
full_path_urls

{'https://en.wikipedia.org/wiki/List_of_men%27s_national_association_football_teams',
 'https://id.loc.gov/authorities/subjects/sh85123840',
 'https://en.wikipedia.org/wiki/Footbag_net',
 'https://www.fifa.com/worldcup/news/y=2015/m=12/news=2014-fifa-world-cuptm-reached-3-2-billion-viewers-one-billion-watched--2745519.html',
 'https://en.wikipedia.org/wiki/File:Ousmane_Demb%C3%A9l%C3%A9_World_Cup_Trophy.jpg',
 'https://en.wikipedia.org/wiki/Boules',
 'https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_country',
 'https://en.wikipedia.org/wiki/1932_in_association_football',
 'https://wikimediafoundation.org/',
 'https://en.wikipedia.org/wiki/Wikipedia:Featured_articles',
 'https://en.wikipedia.org/wiki/Throwback_uniform',
 'https://en.wikipedia.org/wiki/Template_talk:Team_sports',
 'https://en.wikipedia.org/wiki/Rabona',
 'https://stq.wikipedia.org/wiki/Foutbal',
 'http://www.heraldscotland.com/sport/football/no-longer-the-game-of-two-halves.19185657',
 'https://foun

In [8]:
# An important thing to note is that these links (from both '.links' and '.absolute_links')
# are returned in a SET, not a LIST: they are unordered, contain no duplicates and cannot be indexed
type(urls)

set

## Searching for elements

In [9]:
# A quick note: requests-html uses CSS selectors for searching
# We will cover them in the next section,
# but here is a more thorough reference on them: https://www.w3schools.com/cssref/css_selectors.asp

In [10]:
# We can search for elements with the find() method, similarly to Beautiful Soup
# By default it returns a list of ALL matches, i.e. it behaves like Beautiful Soup's find_all()

# find all 'a' tags
links = r.html.find("a")
links

[<Element 'a' id='top'>,
 <Element 'a' href='/wiki/Wikipedia:Featured_articles' title='This is a featured article. Click here for more information.'>,
 <Element 'a' href='/wiki/Wikipedia:Protection_policy#semi' title='This article is semi-protected.'>,
 <Element 'a' href='/wiki/File:Football_(soccer)_Part_One.ogg' title='Listen to this article'>,
 <Element 'a' class=('mw-jump-link',) href='#mw-head'>,
 <Element 'a' class=('mw-jump-link',) href='#p-search'>,
 <Element 'a' class=('mw-disambig',) href='/wiki/Soccer_(disambiguation)' title='Soccer (disambiguation)'>,
 <Element 'a' href='/wiki/Football' title='Football'>,
 <Element 'a' class=('image',) href='/wiki/File:Ronaldinho_and_Khedira.jpg'>,
 <Element 'a' href='/wiki/Ronaldinho' title='Ronaldinho'>,
 <Element 'a' href='/wiki/UEFA_Champions_League' title='UEFA Champions League'>,
 <Element 'a' href='/wiki/A.C._Milan' title='A.C. Milan'>,
 <Element 'a' class=('mw-redirect',) href='/wiki/Real_Madrid_C.F.' title='Real Madrid C.F.'>,
 <El

In [11]:
# Since find() gave us a list, a single element can be picked out by its index
links[4]

<Element 'a' class=('mw-jump-link',) href='#mw-head'>

In [12]:
# To get the raw HTML of an element, use its '.html' attribute (not a method, so no parentheses)
links[4].html

'<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>'

In [13]:
# Check the type of an element's raw HTML
type(links[4].html)

str

In [14]:
# To extract the text inside an element, use its '.text' attribute, just like in Beautiful Soup
links[4].text

'Jump to navigation'

In [15]:
# To obtain a dictionary of the element's attributes (attribute name -> value),
# use '.attrs' (exactly as in Beautiful Soup)
links[10].attrs

{'href': '/wiki/UEFA_Champions_League', 'title': 'UEFA Champions League'}

In [16]:
# This package offers a couple of ways to filter tags based on their text

# Keep only the 'a' tags whose text contains the string 'wikipedia'
# (the text is what is searched, not the 'href' attribute)
# Note: the match is not case-sensitive
r.html.find("a", containing="wikipedia")

[<Element 'a' href='//en.wikipedia.org/wiki/Wikipedia:Contact_us'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Wikipedia:About'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Find out about Wikipedia'>,
 <Element 'a' href='//shop.wikimedia.org' title='Visit the Wikipedia store'>,
 <Element 'a' href='https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en' title='Support us'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NDL_identifiers' title='Category:Wikipedia articles with NDL identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NARA_identifiers' title='Category:Wikipedia articles with NARA identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_LCCN_identifiers' title='Category:Wikipedia articles with LCCN identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_HDS_identifiers' title='Category:Wikipedia articles 

In [17]:
# Display only the text of each of those tags
[element.text for element in r.html.find("a", containing="wikipedia")]

['Contact Wikipedia',
 'About Wikipedia',
 'About Wikipedia',
 'Wikipedia store',
 'Donate to Wikipedia',
 'Wikipedia articles with NDL identifiers',
 'Wikipedia articles with NARA identifiers',
 'Wikipedia articles with LCCN identifiers',
 'Wikipedia articles with HDS identifiers',
 'Wikipedia articles with GND identifiers',
 'Wikipedia articles with BNF identifiers',
 'Wikipedia indefinitely semi-protected pages',
 'https://en.wikipedia.org/w/index.php?title=Association_football&oldid=934524737']

In [18]:
# To get only the first match (similarly to Beautiful Soup's .find()), pass first=True;
# a single element is then returned instead of a list
r.html.find("p", first=True)

<Element 'p' class=('mw-empty-elt',)>