In [1]:
import requests

url = 'http://www.webscrapingfordatascience.com/basichttp/'
r = requests.get(url)

# Which HTTP status code did we get back from the server?
print(r.status_code)
# What is the textual status code?
print(r.reason)
# What were the HTTP response headers?
print(r.headers)

# The request information is saved as a Python object in r.request:
print(r.request)
# What were the HTTP request headers?
print(r.request.headers)

# The HTTP response content:
print(r.text)

200
OK
{'Date': 'Fri, 09 Aug 2019 04:21:47 GMT', 'Server': 'Apache/2.4.18 (Ubuntu)', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '20', 'Connection': 'keep-alive'}
<PreparedRequest [GET]>
{'User-Agent': 'python-requests/2.21.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
Hello from the web!



## HTML and CSS

In [6]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php' + \
'?title=List_of_Game_of_Thrones_episodes&oldid=802553687'

r = requests.get(url)
#print(r.text)
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser' )

type(html_soup)

bs4.BeautifulSoup

In [10]:
# find(name, attrs, recursive, string, **keywords);
# find_all(name, attrs, recursive, string, limit, **keywords)

print( html_soup.find('h1'))
print(html_soup.find('', {'id': 'p-logo'}))

<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>
<div id="p-logo" role="banner"><a class="mw-wiki-logo" href="/wiki/Main_Page" title="Visit the main page"></a></div>


In [11]:
for found in html_soup.find_all(['h1', 'h2']):
    print(found)

<h1 class="firstHeading" id="firstHeading" lang="en">List of <i>Game of Thrones</i> episodes</h1>
<h2>Contents</h2>
<h2><span class="mw-headline" id="Series_overview">Series overview</span></h2>
<h2><span class="mw-headline" id="Episodes">Episodes</span></h2>
<h2><span class="mw-headline" id="Home_media_releases">Home media releases</span></h2>
<h2><span class="mw-headline" id="Ratings">Ratings</span></h2>
<h2><span class="mw-headline" id="References">References</span></h2>
<h2><span class="mw-headline" id="External_links">External links</span></h2>
<h2>Navigation menu</h2>


In [19]:
# We'll use a list to store our episode list
episodes = []

ep_tables = html_soup.find_all('table', class_="wikiepisodetable")

for table in ep_tables:
    headers = []
    rows = table.find_all('tr')
    # Start by fetching the header cells from the first row to determine
    # the field names
    for header in table.find('tr').find_all('th'):
        headers.append(header.text)
    # Then go through all the rows except the first one
    for row in table.find_all('tr')[1:]:
        values = []
        # And get the column cells, the first one being inside a th-tag
        for col in row.find_all(['th','td']):
            values.append(col.text)
        if values:
            episode_dict = {headers[i]: values[i] for i in range(len(values))}
            episodes.append(episode_dict)
            
# Show the results
for i,episode in enumerate(episodes):
    if i == 3:
        break
    print(episode)
    print("-------------")

{'No.overall': '1', 'No. inseason': '1', 'Title': '"Winter Is Coming"', 'Directed by': 'Tim Van Patten', 'Written by': 'David Benioff & D. B. Weiss', 'Original air date\u200a[20]': 'April\xa017,\xa02011\xa0(2011-04-17)', 'U.S. viewers(millions)': '2.22[21]'}
-------------
{'No.overall': '2', 'No. inseason': '2', 'Title': '"The Kingsroad"', 'Directed by': 'Tim Van Patten', 'Written by': 'David Benioff & D. B. Weiss', 'Original air date\u200a[20]': 'April\xa024,\xa02011\xa0(2011-04-24)', 'U.S. viewers(millions)': '2.20[22]'}
-------------
{'No.overall': '3', 'No. inseason': '3', 'Title': '"Lord Snow"', 'Directed by': 'Brian Kirk', 'Written by': 'David Benioff & D. B. Weiss', 'Original air date\u200a[20]': 'May\xa01,\xa02011\xa0(2011-05-01)', 'U.S. viewers(millions)': '2.44[23]'}
-------------


## Using POST

In [20]:
url = 'http://www.webscrapingfordatascience.com/postform2/'

# First perform a GET request
r = requests.get(url)

# Followed by a POST request
formdata = {
    'name': 'Seppe',
    'gender': 'M',
    'pizza': 'like',
    'haircolor': 'brown',
    'comments': ''
    }

r = requests.post(url, data=formdata)
print(r.text)

<html>
	<body>


<h2>Thanks for submitting your information</h2>

<p>Here's a dump of the form data that was submitted:</p>

<pre>array(5) {
  ["name"]=>
  string(5) "Seppe"
  ["gender"]=>
  string(1) "M"
  ["pizza"]=>
  string(4) "like"
  ["haircolor"]=>
  string(5) "brown"
  ["comments"]=>
  string(0) ""
}
</pre>


	</body>
</html>



In [22]:
import requests

url = 'http://www.webscrapingfordatascience.com/usercheck/'

r = requests.get(url)

print(r.text)
# Shows: It seems you are using a scraper

print(r.request.headers)

It seems you are using a scraper!
{'User-Agent': 'python-requests/2.21.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


In [24]:
my_headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + ' (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

r = requests.get(url, headers=my_headers)

print(r.text) # it's using another link
print(r.request.headers)

It seems you are using a scraper!
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36  (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


In [25]:

my_headers = {
  'Referer': 'http://www.webscrapingfordatascience.com/referercheck/'
}

r = requests.get(url, headers=my_headers)

print(r.text)

It seems you are using a scraper!


In [26]:
url = 'http://www.webscrapingfordatascience.com/referercheck/secret.php'

my_headers = {
  'Referer': 'http://www.webscrapingfordatascience.com/referercheck/'
}

r = requests.get(url, headers=my_headers)

print(r.text)

This is a totally secret page


In [27]:
url = 'http://www.webscrapingfordatascience.com/redirect/'
r = requests.get(url)

print(r.text)
print(r.headers)

Hello, there -- you've been redirected here from another page!

{'Date': 'Fri, 09 Aug 2019 04:39:56 GMT', 'Server': 'Apache/2.4.18 (Ubuntu)', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '63', 'Connection': 'keep-alive'}


In [28]:
url = 'http://www.webscrapingfordatascience.com/redirect/'
r = requests.get(url, allow_redirects=False)

print(r.text)
print(r.headers)

You will be redirected... bye bye!
{'Date': 'Fri, 09 Aug 2019 04:40:14 GMT', 'Server': 'Apache/2.4.18 (Ubuntu)', 'SECRET-CODE': '1234', 'Location': 'http://www.webscrapingfordatascience.com/redirect/destination.php', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '34', 'Connection': 'keep-alive'}


In [31]:
url = 'http://www.webscrapingfordatascience.com/authentication/'

r = requests.get(url, auth=('harsha', 'test'))

print(r.text)
print(r.request.headers)

Hello harsha.
You entered test as your password.
{'User-Agent': 'python-requests/2.21.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Authorization': 'Basic aGFyc2hhOnRlc3Q='}


## With Cookies

In [32]:
import requests

url = 'http://www.webscrapingfordatascience.com/cookielogin/secret.php'

r = requests.get(url)

print(r.text)

Hmm... it seems you are not logged in


In [36]:

# First perform a POST request
r = requests.post(url, data={'username': 'dummy', 'password': '1234'})

# Get the cookie value, either from
# r.headers or r.cookies print(r.cookies)
my_cookies = r.cookies

# r.cookies is a RequestsCookieJar object which can also
# be accessed like a dictionary. The following also works:
my_cookies['PHPSESSID'] = r.cookies.get('PHPSESSID')

# Now perform a GET request to the secret page using the cookies
r = requests.get(url + 'secret.php', cookies=my_cookies)

print(r.text)

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>404 Not Found</title>
</head><body>
<h1>Not Found</h1>
<p>The requested URL /cookielogin/secret.phpsecret.php was not found on this server.</p>
<hr>
<address>Apache/2.4.18 (Ubuntu) Server at www.webscrapingfordatascience.com Port 80</address>
</body></html>



In [37]:
url = 'http://www.webscrapingfordatascience.com/redirlogin/'

# First perform a POST request -- do not follow the redirect
r = requests.post(url, data={'username': 'dummy', 'password': '1234'},
                  allow_redirects=False)

# Get the cookie value, either from r.headers or r.cookies
print(r.cookies)

my_cookies = r.cookies

# Now perform a GET request manually to the secret page using the cookies
r = requests.get(url + 'secret.php', cookies=my_cookies)

print(r.text)

<RequestsCookieJar[<Cookie PHPSESSID=qbfo0qtp4e09g2bi75pi07hat7 for www.webscrapingfordatascience.com/>]>
This is a secret code: 1234


In [38]:

url = 'http://www.webscrapingfordatascience.com/trickylogin/'

# First perform a normal GET request to get the form
r = requests.post(url)

# Then perform the POST request -- do not follow the redirect
r = requests.post(url, params={'p': 'login'},
                  data={'username': 'dummy', 'password': '1234'},
                  allow_redirects=False)

# Set the cookies
my_cookies = r.cookies

# Now perform a GET request manually to the secret page using the cookies
r = requests.get(url, params={'p': 'protected'}, cookies=my_cookies)

print(r.text)

You should login first before accessing the protected page!
<br><br>
<form method="post" action="index.php?p=login">
	Username: <input type="text" name="username"><br>
	Password: <input type="password" name="password"><br>
	<input type="Submit" value="Access the secret page">
</form>



In [39]:
url = 'http://www.webscrapingfordatascience.com/trickylogin/'

# First perform a normal GET request to get the form
r = requests.post(url)

# Set the cookies
my_cookies = r.cookies
print(my_cookies)

# Then perform the POST request -- do not follow the redirect
# Use the cookies we got before
r = requests.post(url, params={'p': 'login'},
                  data={'username': 'dummy', 'password': '1234'},
                  allow_redirects=False,
                  cookies=my_cookies)

# We need to update our cookies again
# Note that the PHPSESSID value will have changed
my_cookies = r.cookies
print(my_cookies)

# Now perform a GET request manually to the secret page
# using the updated cookies
r = requests.get(url, params={'p': 'protected'}, cookies=my_cookies)

print(r.text)

<RequestsCookieJar[<Cookie PHPSESSID=vllf79hb909hh1tr744875pih2 for www.webscrapingfordatascience.com/>]>
<RequestsCookieJar[<Cookie PHPSESSID=v7vo03du4g8dr0vhicabjcef17 for www.webscrapingfordatascience.com/>]>
Here is your secret code: 3838.


## Using Session with requests

In [3]:
import requests

url = 'http://www.webscrapingfordatascience.com/trickylogin/'

my_session = requests.Session()

r = my_session.post(url)
r = my_session.post(url, params={'p': 'login'}, data={'username': 'harsha', 'password': '1234'})
r = my_session.get(url, params={'p': 'protected'})

print(r.text)

Here is your secret code: 3838.


In [5]:
# clean session
# my_session.cookies.clear()

my_session = requests.Session()
my_session.headers.update({'User-Agent': 'Chrome!'})

# All requests in this session will now use this User-Agent header:

r = my_session.post(url)
print(r.request.headers)

r = my_session.post(url, params={'p': 'login'},
                    data={'username': 'dummy', 'password': '1234'})
print(r.request.headers)

r = my_session.get(url, params={'p': 'protected'})
print(r.request.headers)

{'User-Agent': 'Chrome!', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Length': '0'}
{'User-Agent': 'Chrome!', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'PHPSESSID=o74av9nnmrri15nvc04jn2jfh7'}
{'User-Agent': 'Chrome!', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'PHPSESSID=de501j1hm14idq2cgk7fkbjm31'}
