Urllib is a standard Python library, containing functions for
1. requesting data across the web, 
2. handling cookies, 
3. changing metadata, such as headers and your user agent.

Documentation for urllib is https://docs.python.org/3/library/urllib.html

The function named "urlopen" is used to open a remote object across a network and read it.
urlopen is a function of the urllib.request module.

In [2]:
#Import urlopen, the function used below to open the page at a given URL!
from urllib.request import urlopen

#Open the page, then read the whole response body and print it (the output is a bytes literal)!
html = urlopen('http://pythonscraping.com/pages/page1.html')
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [4]:
#The same code, but with an abstraction I added: the html file is treated as a resource that has an address, and urlopen is the method used to obtain the
#resource from that address!

from urllib.request import urlopen

def obtain_resource_with(address_or_id_of_the_resource : str, method_used_to_obtain_the_resource : callable):
    """Obtain a resource by handing its address (or id) to the given method.

    The method is called with the address as its only argument, and whatever
    it returns is passed back to the caller unchanged.
    """
    obtained_resource = method_used_to_obtain_the_resource(address_or_id_of_the_resource)
    return obtained_resource

address_of_the_resource = 'http://pythonscraping.com/pages/page1.html'
method_used_to_obtain_the_resource = urlopen

resource = obtain_resource_with(address_of_the_resource, method_used_to_obtain_the_resource)

#Read the body exactly once: a response is a stream, so every later read() returns b''!
content_of_the_resource = resource.read()
print(content_of_the_resource)

#Check the abstraction against a fresh direct request. Comparing two responses that
#were already read would compare b'' with b'' and could never fail!
with urlopen(address_of_the_resource) as directly_opened_resource:
    assert content_of_the_resource == directly_opened_resource.read()

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [20]:
#Get the HTML content of the page and create a BeautifulSoup object!
    #Find a name for the BeautifulSoup object and start an assignment expression with the name!
    #Start writing the constructor of the BeautifulSoup object!
    #Pass to the constructor the HTML text the object is based on!
    #Pass to the constructor the specification of the parser for BeautifulSoup to use!
    #Finish writing the constructor!

from urllib.request import urlopen
from bs4 import BeautifulSoup

#Download the page whose HTML is going to be parsed!
html = urlopen('http://www.pythonscraping.com/pages/page1.html')

#Build the BeautifulSoup object from the page text, using the 'html.parser' parser!
bs = BeautifulSoup(html.read(), 'html.parser')

#Accessing a tag name as an attribute gives the first instance of that tag!
print(bs.h1)

#Each of the longer paths is compared with bs.h1; True means they all reach an equal tag!
print(all(bs.h1 == same_tag for same_tag in (bs.html.body.h1, bs.body.h1, bs.html.h1)))

#A tag that does not occur in the document comes back as None!
print(bs.nonExistentTag)

#Going one level deeper reads an attribute of None, which raises AttributeError!
try:
    print(bs.nonExistentTag.someTag)
except AttributeError:
    print("\'NoneType\' object has no attribute \'someTag\'")

<h1>An Interesting Title</h1>
True
None
'NoneType' object has no attribute 'someTag'


  print(bs.nonExistentTag)
  print(bs.nonExistentTag.someTag)


In [10]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

#HTTPError has to be listed before URLError, because it is a subclass of URLError!
try:
    html = urlopen("https://pythonscrapingthisurldoesnotexist.com")
except HTTPError:
    #An HTTPError was raised, so the else branch below is not executed!
    print("The server returned an HTTP error")
except URLError:
    #A URLError was raised, so the else branch below is not executed either!
    print("The server could not be found!")
else:
    #Reached only when urlopen raised nothing!
    print(html.read())

The server could not be found!


In [None]:
#Check whether no server could be reached at all!

from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

#Try to open the website: on an HTTPError print the error itself, on a URLError print that the server could not be found!
try:
    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')
except HTTPError as http_error:
    print(http_error)
except URLError:
    print('The server could not be found!')
else:
    #No exception was raised by urlopen!
    print('It Worked!')

In [None]:
#Generalization
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

#Try to open the website; when an HTTPError or a URLError occurs, print an indicator of the error instead of raising!
def initialize_html_safely(address : str):
    """Open *address* with urlopen and return the response, or None on failure.

    An HTTPError is printed as-is; any other URLError is reported as
    'The server could not be found!'. In both cases None is returned.
    On success a confirmation is printed and the opened response is returned.
    """
    try:
        opened_response = urlopen(address)
    except HTTPError as http_error:
        print(http_error)
        return None
    except URLError:
        print('The server could not be found!')
        return None

    print('Html initialized safely!')
    return opened_response

In [None]:
#Guard against accessing a tag that the bs object does not have!


#Reading .anotherTag raises AttributeError when bs.nonExistingTag is itself None!
try:
    badContent = bs.nonExistingTag.anotherTag
except AttributeError:
    print('Tag was not found')
else:
    #The outer tag exists; the inner tag is still missing when the lookup gave None.
    #"is None" tests identity, so it cannot be affected by a custom == on the object!
    an_attempt_was_made_to_access_a_non_existent_tag = badContent is None
    if an_attempt_was_made_to_access_a_non_existent_tag:
        print('Tag was not found')
    else:
        print(badContent)

In [12]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    """Return the first <h1> tag inside <body> of the page at *url*, or None.

    None is returned when the page cannot be fetched (the server answers with
    an HTTP error, or no server can be reached at all) and when the parsed
    document has no <body>. None is also the result when <body> has no <h1>.
    """
    #Imported here so that the function does not depend on another cell's imports!
    from urllib.error import URLError

    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        #URLError covers the unreachable-server case, which HTTPError alone does not!
        return None
    try:
        #Both BeautifulSoup lines share one try statement!
        #The with statement closes the response as soon as its body has been read!
        with html:
            bsObj = BeautifulSoup(html.read(), "lxml")
        #If bsObj.body is None, reading .h1 from it raises AttributeError!
        title = bsObj.body.h1
    except AttributeError:
        return None
    return title


#getTitle gives None when no title could be obtained; None is tested by identity!
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title</h1>
