In [6]:
# Section 11.2: Python WWW API (p. 379-386)

In [7]:
import urllib

In [8]:
from urllib.request import urlopen

In [9]:
# dir(urlopen)

In [10]:
response = urlopen('http://www.w3c.org/Consortium/facts.html')

In [11]:
type(response)

http.client.HTTPResponse

In [12]:
response.geturl()

'https://www.w3.org/Consortium/facts.html'

In [13]:
for field in response.getheaders():
    print(field)

('date', 'Wed, 01 Dec 2021 00:50:47 GMT')
('last-modified', 'Mon, 04 Jan 2021 13:50:31 GMT')
('etag', '"63c5-5b8135f9457c0"')
('accept-ranges', 'bytes')
('content-length', '25541')
('cache-control', 'max-age=21600')
('expires', 'Wed, 01 Dec 2021 06:50:47 GMT')
('vary', 'Accept-Encoding')
('content-type', 'text/html; charset=utf-8')
('x-backend', 'ssl-mirrors')
('strict-transport-security', 'max-age=15552000; includeSubdomains; preload')
('content-security-policy', 'upgrade-insecure-requests')
('connection', 'close')


In [14]:
html = response.read()

In [15]:
type(html)

bytes

In [16]:
# dir(html)

In [17]:
html.count(bytes('Web', 'utf-8'))

26

In [18]:
from urllib.request import urlopen
def getSource(url):
    'returns the content of resource specified by url as a string'
    response =  urlopen(url)
    html = response.read()
    return html.decode(errors='ignore')     # return codecs.encode(s, 'utf-8'))

In [19]:
# dir(html)
# no encode

In [20]:
# getSource('http://google.com')

In [21]:
# dir(html)

In [22]:
# practice problem 11.1, p.381
'''
write method news() that takes a URL of a new web site and a list of topic strings 
and computes the number of occurrences of each topic in the news
'''
from urllib.request import urlopen

def news(url, topics):
    response = urlopen(url)
    html = response.read()
    content = html.decode().lower()

    for topic in topics:
        n = content.count(topic)
        print(topic, n)

In [23]:
news('https://www.bbc.com/', ['economy', 'sport', 'COVID'])

economy 1
sport 51
COVID 0


In [24]:
# module html.parser
from html.parser import HTMLParser

infile = open('w3c.html')
content = infile.read()
infile.close()

parser = HTMLParser()
parser.feed(content)

In [25]:
from html.parser import HTMLParser

class LinkParser(HTMLParser):   # override
    '''
    HTML doc parser, prints values of href attributes in anchor start tags
    '''
    def handle_starttag(self, tag, attributes):
        ' print value of href attribute, if any'
        
        if tag == 'a':
            # search for href attribute and print value
            for attribute in attributes:
                if attribute[0] == 'href':
                    print(attribute[1])
                    #print(attribute)

In [26]:
infile = open('links.html')
content = infile.read()
infile.close()
linkparser = LinkParser()
linkparser.feed(content)

http://www.google.com
test.html
mailto:me@example.net


In [27]:
# practice problem 11.2, p.284-5
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    'HTML doc parser that prints tags indented by depth'
    def __init__(self):
        'initializes the parser and the initial indentation'
        HTMLParser.__init__(self)
        self.indent = 0
        
    def handle_starttag(self, tag, attrs):
        '''
        prints start tag with an indentation proportional
        to the depth of the tags element in the document
        '''
        if tag not in {'br', 'p'}:
            print('{}{} start'.format(self.indent*' ', tag))
            self.indent += 4
            
    def handle_endtag(self, tag):
        '''
        prints end tag with an indentation proportional
        to the dept of the tas element in the document
        '''
        if tag not in {'br', 'p'}:
            self.indent -= 4
            print('{}{} end'.format(self.indent*' ', tag))
            

In [28]:
infile = open("w3c.html")
content = infile.read()
infile.close()
myparser = MyHTMLParser()
myparser.feed(content)

html start
    head start
        title start
        title end
    head end
    body start
        h1 start
        h1 end
        h2 start
        h2 end
        ul start
            li start
            li end
            li start
            li end
        ul end
        a start
        a end
    body end
html end


In [29]:
# p. 384   urllib.parse
rsrce = urlopen('http://www.w3.org/Consortium/mission.html')
content = rsrce.read().decode()
linkparser = LinkParser()
linkparser.feed(content)

/
/standards/
/participate/
/Consortium/membership
/Consortium/
/Consortium/
/Consortium/facts.html
/Consortium/presskit.html
/Consortium/sponsor/
/Consortium/Recruitment/
/Consortium/contact.html
/Help/
#w3c_content_body
/
/Consortium/
#openstand
#principles
#vision
http://open-stand.org/principles/
http://open-stand.org/
/WAI/
/International/
/standards/webofdevices/
/Mobile/
/standards/agents/Overview.html
/standards/webdesign/
/standards/webarch/
/standards/xml/
/standards/semanticweb/
/standards/webofservices/
/standards/semanticweb/
/standards/xml/security
/standards/webofservices/security
/standards/webdesign/privacy
/
/standards/
/participate/
/Consortium/membership
/Consortium/
/Consortium/contact
/Help/
/Consortium/sup
/Consortium/siteindex
http://lists.w3.org/Archives/Public/site-comments/
http://twitter.com/W3C
http://www.csail.mit.edu/
http://www.ercim.org/
http://www.keio.ac.jp/
http://ev.buaa.edu.cn/
/Consortium/Legal/ipr-notice


In [32]:
from urllib.parse import urljoin
from html.parser import HTMLParser
class Collector(HTMLParser):
    'collects hyperlink URLs into a list'

    def __init__(self, url):
        'initializes parser, the url, and a list'
        HTMLParser.__init__(self)
        self.url = url
        self.links = []

        # Solution to Practice Problem 11.3        
        self.text = ''
        
    def handle_starttag(self, tag, attrs):
        'collects hyperlink URLs in their absolute format'
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    # construct absolute URL
                    absolute = urljoin(self.url, attr[1])
                    if absolute[:4] == 'http': # collect HTTP URLs
                        self.links.append(absolute)
                        
    # Solution to Practice Problem 11.3        
    def handle_data(self, data):
        'collects and concatenates text data'
        self.text += data

    def getLinks(self):
        'returns hyperlinks URLs in their absolute format'
        return self.links

    # Solution to Practice Problem 11.3
    def getData(self):
        'returns the concatenation of all text data'
        return self.text


In [33]:
# parser collects HTTP Hyperlinks
#url = 'http://www.w3.org/Consortium/mission.html'
url = 'http://classicshorts.com/stories/btw.html'
resource = urlopen(url)
content = resource.read().decode()
collector = Collector(url)
collector.feed(content)
for link in collector.getLinks():
    print(link)

https://youtu.be/q5ns9m2YIjI
http://en.wikipedia.org/wiki/Hongkong
http://en.wikipedia.org/wiki/Orient
http://dictionary.reference.com/browse/desultory
http://dictionary.reference.com/browse/indisposition
http://dictionary.reference.com/browse/aversion
http://dictionary.reference.com/browse/aristocratic
http://dictionary.reference.com/browse/occult
http://dictionary.reference.com/browse/renounce
http://dictionary.reference.com/browse/certitude
http://dictionary.reference.com/browse/destitute
http://dictionary.reference.com/browse/dismal
http://dictionary.reference.com/browse/rill
http://dictionary.reference.com/browse/scuttle
http://dictionary.reference.com/browse/inhospitality
http://dictionary.reference.com/browse/proffer
http://dictionary.reference.com/browse/melancholy
http://dictionary.reference.com/browse/prescience
http://dictionary.reference.com/browse/dispirit
http://dictionary.reference.com/browse/murk
http://dictionary.reference.com/browse/torrent
http://dictionary.reference

In [121]:
from urllib.request import urlopen
response = urlopen('https://pair.com')
read1 = response.read()
read2 = read1.decode()

In [122]:
type(read1)

bytes

In [123]:
type(read2)

str

In [125]:
read2

'<!DOCTYPE html>\n<html class="html" lang="en-US">\n<head>\n\t<meta charset="UTF-8"><link rel="preload" href="https://www.pair.com/wp-content/cache/fvm/min/1636482150-css614b240f0ba2be05cec067d887ce1ada8449313b88d402222242587232206.css" as="style" media="all" />\n<link rel="preload" href="https://www.pair.com/wp-content/cache/fvm/min/1636482150-cssff20a40cfd8e81a9bfe81032a906626137216a753f022f9194bfcfc48ce4d.css" as="style" media="all" />\n<link rel="preload" href="https://www.pair.com/wp-content/cache/fvm/min/1636482150-css8069b511a8e7fc41175dd55535f0e72f2cc622b1908cfe6abe791252aeee4.css" as="style" media="all" />\n<link rel="preload" href="https://www.pair.com/wp-content/cache/fvm/min/1636482150-cssa4a8b9c648d11ff7f62de5ee139948d08bd729dd237a3714b05872fbf9bb9.css" as="style" media="all" />\n<link rel="preload" href="https://www.pair.com/wp-content/cache/fvm/min/1636482150-cssd9ef8c81abb8d536e2f9844e9043e1af0798b8ad89973add774d84a802df4.css" as="style" media="all" />\n<link rel="prelo

In [126]:
from html.parser import HTMLParser
help(HTMLParser)

Help on class HTMLParser in module html.parser:

class HTMLParser(_markupbase.ParserBase)
 |  HTMLParser(*, convert_charrefs=True)
 |  
 |  Find tags and other markup and call handler functions.
 |  
 |  Usage:
 |      p = HTMLParser()
 |      p.feed(data)
 |      ...
 |      p.close()
 |  
 |  Start tags are handled by calling self.handle_starttag() or
 |  self.handle_startendtag(); end tags by self.handle_endtag().  The
 |  data between tags is passed from the parser to the derived class
 |  by calling self.handle_data() with the data as argument (the data
 |  may be split up in arbitrary chunks).  If convert_charrefs is
 |  True the character references are converted automatically to the
 |  corresponding Unicode character (and self.handle_data() is no
 |  longer split in chunks), otherwise they are passed by calling
 |  self.handle_entityref() or self.handle_charref() with the string
 |  containing respectively the named or numeric reference as the
 |  argument.
 |  
 |  Method res

In [128]:
from re import findall
print(findall('cl[ia][ia]ck', 'click clack cliack claick claack cliick cl ck clck'))

['cliack', 'claick', 'claack', 'cliick']


In [129]:
print(findall('cl[ia]ck', 'click clack cliack claick claack cliick cl ck clck'))

['click', 'clack']


In [130]:
print(findall('cl[ia]?[ia]?ck', 'click clack cliack claick claack cliick cl ck clck'))

['click', 'clack', 'cliack', 'claick', 'claack', 'cliick', 'clck']


In [137]:
templist = findall('b[ia][ia]?t', 'bt beet beeet bet bets b3t bat baat bite bait beit biat boat bot boot biit byte')
templist

['bat', 'baat', 'bit', 'bait', 'biat', 'biit']

In [138]:
templist = findall('b[^ia][^ia]t', 'bt beet beeet bet bets b3t bat baat bite bait biat beit boat bot boot biit byte')
templist

['beet', 'boot']

In [139]:
templist = findall('be+t|b[h-p]t', 'bt beet beeet bet bets b3t bat baat bite bait biat beit boat bot boot biit byte')

In [140]:
templist

['beet', 'beeet', 'bet', 'bet', 'bit', 'bot']

In [24]:
from html.parser import HTMLParser
class TheParser(HTMLParser):
    def handle_endtag(self, tag):
        if tag == 'title':
            print('end of title')

In [25]:
infile = open("w3c.html")
content = infile.read()
infile.close()
myparser = TheParser()

In [26]:
myparser.feed(content)

end of title


In [20]:
content

'<html>\n<head>\n<title>W3C Mission Summary</title>\n</head>\n<body>\n<h1>W3C Mission</h1>\n<p>\nThe W3C mission is to lead the World Wide Web to its full potential<br>\nby developing protocols and guidelines that ensure the long-term growth of the Web.\n</p>\n<h2>Principles</h2>\n<ul>\n<li>Web for All</li>\n<li>Web on Everything</li>\n</ul>\nSee the complete <a href="http://www.w3.org/Consortium/mission.html">W3C Mission document</a>.\n</body>\n</html>\n'

In [19]:
dir(TheParser)

['CDATA_CONTENT_ELEMENTS',
 '_HTMLParser__starttag_text',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_decl_otherchars',
 '_parse_doctype_attlist',
 '_parse_doctype_element',
 '_parse_doctype_entity',
 '_parse_doctype_notation',
 '_parse_doctype_subset',
 '_scan_name',
 'check_for_whole_start_tag',
 'clear_cdata_mode',
 'close',
 'error',
 'feed',
 'get_starttag_text',
 'getpos',
 'goahead',
 'handle_charref',
 'handle_comment',
 'handle_data',
 'handle_decl',
 'handle_endtag',
 'handle_entityref',
 'handle_pi',
 'handle_startendtag',
 'handle_starttag',
 'parse_bogus_comment',
 'parse_comment',
 'parse_declaration',
 'parse_endtag',
 'parse_html_declaration',
 'p