In [1]:
## Using requests and urllib

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import urllib
import requests

In [3]:
from urllib import request as req
import urllib.error as error
import urllib.parse as urlparse
import urllib.robotparser as robot


## urllib

In [4]:
## Loading URLs
# URL of the page used as the target for the urllib examples.
link = "https://en.wikipedia.org/wiki/List_of_most_popular_websites"

In [5]:
# Open the URL with an explicit timeout (seconds): without one, urlopen()
# can block indefinitely when the server does not answer.
response = req.urlopen(link, timeout=10)

In [6]:
# Show the class of the object stored in `response`.
type(response)

http.client.HTTPResponse

In [7]:
# Read the response body, then preview its first 100 elements.
response_value = response.read()
response_value[:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

In [9]:
# Status code and URL reported by the response object.
print(response.getcode())
print('\n',response.geturl())
# Number of headers, followed by the first entry of the header list.
print('\n',len(response.getheaders()), response.getheaders()[0][0:100])
# Look up a single header by name.
print('\n',response.getheader('date'))
print('\n',response.code)
# NOTE(review): `_method` has a leading underscore, i.e. it is a private
# attribute and not part of the documented interface — confirm it is safe to rely on.
print('\n',response._method)
# Decode the previously read body (decode() with no arguments uses UTF-8)
# and show its first character.
string = response_value.decode()
print('\n',string[0])      


200

 https://en.wikipedia.org/wiki/List_of_most_popular_websites

 24 ('date', 'Sat, 12 Nov 2022 09:51:40 GMT')

 Sat, 12 Nov 2022 09:51:40 GMT

 200

 GET

 <


In [10]:
## Handling URLError
# Any URLError raised while opening the URL is caught and printed instead of
# stopping the notebook. The host name is presumably misspelled on purpose
# to trigger the error.
try:
    req.urlopen("https://www.python.ogr")
except error.URLError as url_error:
    print(url_error)

<urlopen error [Errno -2] Name or service not known>


In [11]:
## Using urllib.parse
# Split the URL once and reuse the result for both prints.
split_result = urlparse.urlsplit(link)
print(split_result)
print('\n', split_result.netloc)
print('\n', urlparse.urlparse(link))

# Encode a dict as a query string, then convert that string to bytes.
data = {'param1': 'value1', 'param2': 'value2'}
query_string = urlparse.urlencode(data)
print('\n', query_string.encode('utf-8'))

# Percent-encoding helpers applied to the same URL.
print('\n', urlparse.quote(link))
print('\n', urlparse.unquote(link))
print('\n', urlparse.quote_plus(link))

# Resolve a relative reference against a base URL.
print('\n', urlparse.urljoin('http://localhost:8080/~cache/', 'data_file'))

SplitResult(scheme='https', netloc='en.wikipedia.org', path='/wiki/List_of_most_popular_websites', query='', fragment='')

 en.wikipedia.org

 ParseResult(scheme='https', netloc='en.wikipedia.org', path='/wiki/List_of_most_popular_websites', params='', query='', fragment='')

 b'param1=value1&param2=value2'

 https%3A//en.wikipedia.org/wiki/List_of_most_popular_websites

 https://en.wikipedia.org/wiki/List_of_most_popular_websites

 https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FList_of_most_popular_websites

 http://localhost:8080/~cache/data_file


In [12]:
## robotparser
# Passing the URL to the constructor is equivalent to calling set_url().
par = robot.RobotFileParser('https://www.samsclub.com/robots.txt')
par.read()  # download and parse the robots.txt file
print(par)

# Ask whether any user agent ('*') may fetch each page.
for page_url in (
    'https://www.samsclub.com/referal',   # allowed
    'https://www.samsclub.com/category',  # allowed
    'https://www.samsclub.com/friend',    # disallowed
):
    print(par.can_fetch('*', page_url))

User-agent: *
Disallow: /catalog/search/
Disallow: /s/
Disallow: %2ASCDC%3D1%2A
Disallow: /%2A%3Fbvroute
Disallow: /list/
Disallow: %2A/samsclub/0
Disallow: /catalog/product/
Disallow: /catalog/productreviews/
Disallow: /locator%3FreturnRoute
Disallow: /myinstantsavings/details/
Disallow: /sams/subscription/
Disallow: /account/
Allow: /sams/account/referal/
Allow: /sams/account/signin/login.jsp
Allow: /sams/account/signin/createSession.jsp
Allow: /sams/account/registration/registration.jsp
Allow: /sams/account/membership/renewMembership.jsp
Disallow: /managepreferences/
Disallow: /checkout/
Disallow: /cart/
Disallow: /search/
Disallow: /pharmacy/login/
Disallow: /cgi-bin/
Disallow: /friend
True
True
False


However, there are some limitations to using urllib.request. Connection-based delays
can occur while using functions like urlopen() and urlretrieve(). These functions
return raw data, which needs to be converted into the type required by the parser
before it can be used in the scraping process. 

*Deploying threads, or threading, is considered an effective technique when dealing
with HTTP requests and responses.*

# requests

#### Capabilities
* Short, simple, and readable functions and attributes 
* Access to various HTTP methods (GET, POST, and PUT, to name a few) 
* Gets rid of manual actions, like encoding form values 
* Processes query strings 
* Custom headers 
* Session and cookie processing 
* Deals with JSON requests and content 
* Proxy settings 
* Deploys encoding and compliance 
* API-based link headers 
* Raw socket response 
* Timeouts and more... 


In [13]:
# Fetch a page with requests and inspect the main attributes of the Response.
# The explicit timeout (seconds) keeps the cell from hanging forever if the
# server never answers; requests applies no timeout by default.
link="http://www.python-requests.org"
r = requests.get(link, timeout=10)

print('\n', r, type(r))
print('\n', r.url)
print('\n', r.status_code)
print('\n', r.history)
print('\n', r.headers)
print('\n', r.headers['Content-Type'])
print('\n', r.request.headers)
print('\n', r.encoding)
print('\n', r.content[0:100])


 <Response [200]> <class 'requests.models.Response'>

 http://www.python-requests.org/

 200

 []

 {'Date': 'Sat, 12 Nov 2022 21:53:19 GMT', 'Server': 'Apache', 'X-Powered-By': 'PHP/7.2.10-0ubuntu0.18.04.1', 'Set-Cookie': 'PHPSESSID=714d3321dbd5535f2b905dfda53f70a3; path=/, SERVERID=s5; path=/', 'Expires': 'Thu, 19 Nov 1981 08:52:00 GMT', 'Cache-Control': 'no-store, no-cache, must-revalidate', 'Pragma': 'no-cache', 'Content-Length': '2694', 'Content-Type': 'text/html; charset=UTF-8'}

 text/html; charset=UTF-8

 {'User-Agent': 'python-requests/2.28.1', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}

 UTF-8

 b'\n<html>\n<head>\n\t<title>Domain Suspension</title>\n\t<meta http-equiv="Content-Type" content="text/html'


In [14]:
# stream=True gives access to the raw socket response from the server.
# The explicit timeout (seconds) prevents the request from blocking indefinitely.
link="http://www.python-requests.org"
r = requests.get(link, stream=True, timeout=10)


In [15]:
# Show the type of the raw stream and its first 100 bytes.
print('\n', type(r.raw), '\n', r.raw.read(100))
# The body was only partially consumed, so close the response explicitly to
# release the underlying connection.
r.close()


 <class 'urllib3.response.HTTPResponse'> 
 b'\n<html>\n<head>\n\t<title>Domain Suspension</title>\n\t<meta http-equiv="Content-Type" content="text/html'


In [16]:
## Deal with json
link = "https://feeds.citibikenyc.com/stations/stations.json"

# Use an explicit timeout, and raise a clear HTTPError on a 4xx/5xx status
# instead of failing later inside the JSON decoder.
stations_response = requests.get(link, timeout=10)
stations_response.raise_for_status()
r = stations_response.json()
print(r.keys())

dict_keys(['executionTime', 'stationBeanList'])


#### Implementing HTTP methods: GET and POST

Generally, web-based interaction or communication between the web page and the
user or reader is achieved as follows: 

* The user or reader can access the web page to read or navigate through information that's presented to them 
* The user or reader can also submit certain information to the web page using the HTML form, such as by searching, logging in, user registration, password recovery, and so on 

In this section, we will be using the requests Python library to implement common
HTTP methods (GET and POST) that execute the HTTP-based communication scenario we
listed previously. 

* **GET**: A common way to request information. It is considered a safe method because the **resource state is not altered**. The GET parameters, also known as query strings, are visible in the URL. They are appended to the URL using `?` and are available as `key=value` pairs. 

* **POST**: These are known as secure requests that are made to a source. The requested **resource state can be altered**. Data that's posted or sent to the requested URL is not visible in the URL; instead, it's transferred to the request body. A request that's made using POST isn't cached or bookmarked and has no restrictions in terms of length. 



In [17]:
## request with parameters and headers
## GET
# `params` is appended to the URL as a query string; `headers` is sent with
# the request. The timeout (seconds) prevents the call from hanging if
# nothing answers on the local port.
link="http://localhost:8888/~cache"
queries= {'id':'123456','display':'yes'}
addedheaders={'user-agent':''}
r = requests.get(link, params=queries, headers=addedheaders, timeout=10)
print(r.url)

http://localhost:8888/~cache?id=123456&display=yes


In [18]:
## POST
pageUrl="http://httpbin.org/forms/post"
postUrl="http://httpbin.org/post"

# Form fields sent in the request body; the list value carries several
# entries under the same field name.
params = {'custname':'Mr. ABC', 'custtel':'', 'custemail':'abc@somedomain.com',
          'size':'small','topping':['cheese','mushroom'],
          'delivery':'13:00','comments':'None'}

headers={ 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Content-Type':'application/x-www-form-urlencoded','Referer':pageUrl}

# The explicit timeout (seconds) keeps the cell from blocking indefinitely;
# the JSON reply is parsed into a dict.
response = requests.post(postUrl,
                         data=params,
                         headers=headers,
                         timeout=10).json()
print(response)

{'args': {}, 'data': '', 'files': {}, 'form': {'comments': 'None', 'custemail': 'abc@somedomain.com', 'custname': 'Mr. ABC', 'custtel': '', 'delivery': '13:00', 'size': 'small', 'topping': ['cheese', 'mushroom']}, 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Content-Length': '130', 'Content-Type': 'application/x-www-form-urlencoded', 'Host': 'httpbin.org', 'Referer': 'http://httpbin.org/forms/post', 'User-Agent': 'python-requests/2.28.1', 'X-Amzn-Trace-Id': 'Root=1-63701605-724801a86d6fad350fc51e34'}, 'json': None, 'origin': '200.233.44.71', 'url': 'http://httpbin.org/post'}


## Other examples

In [19]:
# NOTE(review): `link` is not assigned in this cell; the request goes to
# whatever value an earlier cell left in it — TODO confirm the intended URL.
# The timeout (seconds) prevents the request from hanging indefinitely.
response_from_requests = requests.get(link, timeout=10)
response_from_requests

<Response [404]>

In [20]:
# Show the type of the response body exposed by `.content`.
type(response_from_requests.content)

bytes

In [21]:
## use decode to convert bytes to str
# decode() with no arguments uses UTF-8.
type(response_from_requests.content.decode())

str

In [22]:
# Preview the first 100 bytes of the body.
response_from_requests.content[:100]

b'<!DOCTYPE HTML>\n<html>\n\n<head>\n    <meta charset="utf-8">\n\n    <title>Jupyter Notebook</title>\n    <'

### Save the web content to local storage

In [23]:
link_2 = 'https://www.samsclub.com/robots.txt'
# With no file name given, urlretrieve() stores the download in a temporary
# file and returns a (path, headers) tuple.
req.urlretrieve(url=link_2)

('/tmp/tmpkkbl_p8h', <http.client.HTTPMessage at 0x7fa63ec3a1a0>)

In [24]:
link_3 = 'https://www.samsclub.com/robots.txt'
# Save the download under an explicit file name; the relative path places it
# in the current working directory.
req.urlretrieve(url=link_3, filename='file_to_save_content_link_3.txt')

('file_to_save_content_link_3.txt',
 <http.client.HTTPMessage at 0x7fa63ef03040>)

In [23]:
## Another method: download with requests and write the bytes to disk
link="https://www.samsclub.com/sitemap.xml"
# The explicit timeout (seconds) keeps the download from hanging indefinitely.
content = requests.get(link, timeout=10).content
# os.path.join inserts exactly one separator between directory and file name,
# and the with-block closes the file even if the write raises.
with open(os.path.join(os.getcwd(), "sitemap.xml"), "wb") as file:
    file.write(content)