## Open a URL

In [11]:
# Load the module for interacting with URLs
import urllib.request

In [None]:
# Open the URL and print the raw response body.
# The with-statement closes the connection as soon as the body has been read,
# instead of leaving the response open for the lifetime of the kernel.
with urllib.request.urlopen('https://www.google.com') as x:
    print(x.read())

## Sending requests to a URL (interacting with search engines)

### *POST

In [15]:
# POSTs: need help from parse
import urllib.parse


In [None]:
## Target URL for the POST request
url = 'http://pythonprogramming.net'
## Search terms, stored as form fields in a dictionary
values = {'s':'basic','submit':'Search'}
## Encode the form fields in URL format, then to bytes:
## Request() only accepts a bytes payload, and supplying one makes the request a POST
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
## Build the request for the url specified
request = urllib.request.Request(url, data)
## Open the url and save the response body;
## the with-statement closes the connection once the body has been read
with urllib.request.urlopen(request) as response:
    respData = response.read()
print(respData)

### *GET 

In [19]:
# GET is the alternative to POST: the search parameters travel in the URL itself
url = 'http://www.example.com/example.cgi'
data = {
    'name': 'Somebody Here',
    'location': 'Northampton',
    'language': 'Python',
}
## Encode the parameters as a query string (spaces become '+', pairs joined by '&')
url_values = urllib.parse.urlencode(data)
print(url_values)
## Attach the query string to the base URL with a '?'
full_url = '?'.join((url, url_values))
print(full_url)

name=Somebody+Here&location=Northampton&language=Python
http://www.example.com/example.cgi?name=Somebody+Here&location=Northampton&language=Python


## Webpages hate requests from python...? 

* Servers can easily block requests that do not appear to come from a human using a browser.

In [17]:
# Send a plain urllib request to a search URL. The failure itself is the
# demonstration here, so the exception is caught and its message printed.
try:
    # The with-statement closes the response even if reading or printing fails
    with urllib.request.urlopen('http://www.google.com/search?q=test') as x:
        print(x.read())

except Exception as e:
    print(str(e))

HTTP Error 403: Forbidden


### Overcome the auto-filtering blockade?
* Provide a fake user-agent in headers when making the request

In [18]:
# Assign a "real" user string to headers
## Cheating. Pretending that you are not accessing the webpage through Python
try:
    url = 'https://www.google.com/search?q=test'
    ## Provide the fake user-agent in headers
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
    ## Request & open; the with-statement closes the connection after reading
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as resp:
        respData = resp.read()
        ## Charset declared in the Content-Type header; fall back to UTF-8 if none
        charset = resp.headers.get_content_charset() or 'utf-8'

    ## Save the page as real text. str(respData) would write the bytes repr
    ## ("b'...'" with escaped newlines) instead of the page content.
    ## errors='replace' keeps a stray undecodable byte from aborting the save.
    with open('withHeaders.txt', 'w', encoding='utf-8') as saveFile:
        saveFile.write(respData.decode(charset, errors='replace'))
except Exception as e:
    print(str(e))
    

## How to deal with URLErrors? 

* Errors including URLError, ValueError, and TypeError may occur and break the request

In [None]:
# Use try: ... except <type of Error>
from urllib.error import HTTPError, URLError

# Placeholder target -- replace with the URL you actually want to fetch
someurl = 'http://www.example.com/'
req = urllib.request.Request(someurl)
try:
    response = urllib.request.urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be handled first;
    # it also carries a `reason`, so checking hasattr(e, 'reason') before
    # `code` would never reach the HTTP-specific branch.
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...)
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    # everything is fine: report the status and release the connection
    print('Success: ', response.status)
    response.close()

## Treasure hunt for useful info in the responses 

* Parse through the search results with regular expressions

In [12]:
# Fetch http://python.org/ and read the response body into `html`;
# the with-statement closes the response when the block exits
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()