In [2]:
# This script shows how to use the requests package to scrape a page
# from Wikipedia.
import requests
# Fetch the page. Always pass `timeout` in production code; without it
# the call may block indefinitely.
r = requests.get(
    'https://en.wikipedia.org/wiki/Natural_language_processing',
    timeout=3,
)


In [3]:
# The following code block is strictly speaking not necessary, but it
# helps you to better understand the response you got.
# `raise_for_status` raises an exception for HTTP error statuses, so
# the status code below is only displayed when no such error occurred.
r.raise_for_status()            # Ensure we notice bad responses.
r.status_code                   # Last expression: shown as cell output.


200

In [4]:
# Content type (media type and charset) reported in the response headers.
r.headers['content-type']


'text/html; charset=UTF-8'

In [5]:
# Character encoding Requests uses to decode the body into `r.text`.
r.encoding


'UTF-8'

In [None]:
# `print` gives nicer output for HTML than the raw string repr. Show only
# the first 1000 characters: the full page would flood the notebook.
print(r.text[:1000])



In [9]:
# To get the body as bytes in a specific encoding: r.text.encode('utf-8')


In [None]:
# `r.json()` parses the response body as JSON. This page is HTML, so the
# call raises; catch the error so the rest of the cell still runs.
try:
    r.json()
except ValueError as err:       # Requests' JSON errors subclass ValueError.
    print(f'Response is not JSON: {err}')

# Finally we write the response content to file. `r.content` is bytes,
# hence the binary mode.
with open('data-wikipedia-NLP.html', mode='wb') as fd:
    fd.write(r.content)

In [7]:
# This script shows how to download and parse a Wikipedia page using
# Requests and Beautiful Soup. We also use the `re` module for regular
# expressions.
import re
import requests
from bs4 import BeautifulSoup

In [10]:
# Download the page. Set `timeout` so the call cannot hang forever, and
# fail fast on HTTP errors instead of parsing an error page below.
r = requests.get(
    'https://en.wikipedia.org/wiki/Natural_language_processing',
    timeout=3)
r.raise_for_status()

In [None]:
# Parse the downloaded HTML with the `lxml` parser, then pretty-print
# the resulting tree to take a look at its structure.
s = BeautifulSoup(r.text, features='lxml')
print(s.prettify())


In [None]:
# Extract all the text from the webpage as a single string. THIS IS
# WHAT YOU NEED MOST OFTEN FOR NLP AND TEXT ANALYTICS, unless you need
# to extract only part of the webpage.
s.get_text()

In [14]:
# Ways to navigate the data structure. Note that a notebook cell only
# displays the value of its LAST expression, so only the final line
# below produces visible output.
s.title                 # The `<title>` tag itself.
s.title.name            # The tag's name.
s.title.string          # The text inside the tag.
s.title.parent.name     # Name of the tag that encloses `<title>`.

'head'

In [15]:
s.p               # First 'p' tag (shorthand for `s.find('p')`).

<p><b>Natural language processing</b> (<b>NLP</b>) is a subfield of <a href="/wiki/Linguistics" title="Linguistics">linguistics</a>, <a href="/wiki/Computer_science" title="Computer science">computer science</a>, and <a href="/wiki/Artificial_intelligence" title="Artificial intelligence">artificial intelligence</a> concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of <a href="/wiki/Natural_language" title="Natural language">natural language</a> data.  The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. 
</p>

In [16]:
# Text content of the first paragraph, with the markup stripped.
s.p.get_text()

'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.  The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. \n'

In [17]:
s.a               # First 'a' tag (shorthand for `s.find('a')`).

<a id="top"></a>

In [18]:
# The difference between the `find` and the `find_all` methods is that
# the former only returns the FIRST tag matching the given criteria,
# while the latter gets ALL of them. Here we look up the tag whose `id`
# attribute is 'footer'.
s.find(id='footer')

<footer class="mw-footer" id="footer" role="contentinfo">
<ul id="footer-info">
<li id="footer-info-lastmod"> This page was last edited on 26 February 2021, at 12:02<span class="anonymous-show"> (UTC)</span>.</li>
<li id="footer-info-copyright">Text is available under the <a href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License" rel="license">Creative Commons Attribution-ShareAlike License</a><a href="//creativecommons.org/licenses/by-sa/3.0/" rel="license" style="display:none;"></a>;
additional terms may apply.  By using this site, you agree to the <a href="//foundation.wikimedia.org/wiki/Terms_of_Use">Terms of Use</a> and <a href="//foundation.wikimedia.org/wiki/Privacy_policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a href="//www.wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
</ul>
<ul id="footer-places">
<li id="footer-places-privacy"><a class="extiw" href="htt

In [19]:
# Keyword arguments filter on attribute values; this returns the first
# tag whose `style` attribute is 'clear: both;'.
s.find(style='clear: both;')

<div style="clear: both;"></div>

In [20]:
# Extract all 'a' tags. The result is kept in `atags` so the following
# cells can index into it.
atags = s.find_all('a')           # Find all `<a ...>...</a>` tags.

In [22]:
atags[0]          # First 'a' tag found; the same element as `s.a`.

<a id="top"></a>

In [23]:
atags[3]          # Fourth 'a' tag; this one wraps an `<img>` element.


<a class="image" href="/wiki/File:Automated_online_assistant.png"><img alt="" class="thumbimage" data-file-height="501" data-file-width="400" decoding="async" height="251" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8b/Automated_online_assistant.png/200px-Automated_online_assistant.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8b/Automated_online_assistant.png/300px-Automated_online_assistant.png 1.5x, //upload.wikimedia.org/wikipedia/commons/8/8b/Automated_online_assistant.png 2x" width="200"/></a>

In [24]:
atags[3].name     # Just the tag's name, without attributes or contents.

'a'

In [25]:
# Get the actual `href` attribute (i.e. the URL). `get` returns `None`
# when the tag has no such attribute.
atags[3].get('href')

'/wiki/File:Automated_online_assistant.png'

In [13]:
# Collect the `href` attribute of the first ten `a` tags. `limit=10`
# stops the search early; drop it to loop over ALL `a` tags. Tags
# without an `href` contribute `None`.
{tag.get('href') for tag in s.find_all('a', limit=10)}

{'#cite_note-Kongthon-1',
 '#mw-head',
 '#searchInput',
 '/wiki/Automated_online_assistant',
 '/wiki/Computer_science',
 '/wiki/Customer_service',
 '/wiki/File:Automated_online_assistant.png',
 '/wiki/Linguistics',
 None}

In [29]:
# Find all tags whose names start with the letter 'b' (in this case
# 'b', 'bdi', 'body', and 'br').
{tag.name for tag in s.find_all(re.compile('^b'))}

{'b', 'bdi', 'body', 'br'}

In [30]:
# Find all tags whose name contains the letter 't'. The pattern is not
# anchored, so the 't' may appear anywhere in the name.
{tag.name for tag in s.find_all(re.compile('t'))}

{'annotation',
 'cite',
 'dt',
 'footer',
 'html',
 'input',
 'math',
 'meta',
 'mstyle',
 'noscript',
 'script',
 'semantics',
 'style',
 'table',
 'tbody',
 'td',
 'th',
 'title',
 'tr'}

In [31]:
# `find_all` also accepts a list of names; a tag matches when its name
# equals any item in that list.
{element.name for element in s.find_all(['a', 'body'])}

{'a', 'body'}

In [14]:
# `True` matches every tag in the document (but none of the text
# strings). `limit=10` stops after the first ten tags, so we only look
# at a small sample instead of building the full list and slicing it.
{tag.name for tag in s.find_all(True, limit=10)}


{'head', 'html', 'link', 'meta', 'script', 'title'}

In [33]:
# Use a CSS selector with `select` and collect the text of every
# matching tag into a list.
# NOTE(review): the positional `nth-child(...)` parts depend on the
# page's current layout and will presumably stop matching when the
# article is edited - confirm the selector still matches before relying
# on it.
[tag.get_text() for tag in s.select(
    "#mw-content-text > div.mw-parser-output > dl:nth-child(56) > dd:nth-child(12)")]


['Automatically translate text from one human language to another.  This is one of the most difficult problems, and is a member of a class of problems colloquially termed "AI-complete", i.e. requiring all of the different types of knowledge that humans possess (grammar, semantics, facts about the real world, etc.) to solve properly.']