## Chapter 13: Avoiding Scraping Traps
### Adjust Your Headers

In [1]:
import requests
from bs4 import BeautifulSoup
import json

session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
    'Accept': 'text.html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
url = 'https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending'
req = session.get(url, headers=headers)
bs = BeautifulSoup(req.text, 'lxml')
tbody = bs.find('table', {'class': 'table-striped'}).get_text()
print('Headers from whatismybrowser:\n{}'.format(tbody))

url = 'https://httpbin.org/headers'
r = requests.get(url)
# r=json.dumps(json.loads(r.text),indent=4)
rjson = json.dumps(r.json(), indent=4)
print('Original headers on requests:\n{}'.format(rjson))

Headers from whatismybrowser:


ACCEPT
text.html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8


ACCEPT_ENCODING
gzip, deflate


CONNECTION
keep-alive


HOST
www.whatismybrowser.com


USER_AGENT
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome


Original headers on requests:
{
    "headers": {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "close",
        "Host": "httpbin.org",
        "User-Agent": "python-requests/2.18.4"
    }
}


### Handling Cookies with Javascript

In [2]:
from selenium import webdriver
import json

driver = webdriver.PhantomJS()
driver.get('http://pythonscraping.com/')
driver.implicitly_wait(1)
print(json.dumps(driver.get_cookies(), indent=4))



[
    {
        "domain": "pythonscraping.com",
        "httponly": false,
        "name": "has_js",
        "path": "/",
        "secure": false,
        "value": "1"
    }
]


In [3]:
# Reuse cookies with selenium
from selenium import webdriver
import json

driver = webdriver.PhantomJS()
driver.get('http://pythonscraping.com/')
driver.implicitly_wait(1)

savedCookies = driver.get_cookies()
print(json.dumps(savedCookies, indent=4))

driver2 = webdriver.PhantomJS()
# tell selenium where the cookies belong to
driver2.get('http://pythonscraping.com/')
driver2.delete_all_cookies()
for cookie in savedCookies:
    if not cookie['domain'].startswith('.'):
        cookie['domain'] = '.{}'.format(cookie['domain'])
    driver2.add_cookie(cookie)

driver2 = webdriver.PhantomJS()
driver2.get('http://pythonscraping.com/')
print(json.dumps(driver2.get_cookies(), indent=4))



[
    {
        "domain": "pythonscraping.com",
        "httponly": false,
        "name": "has_js",
        "path": "/",
        "secure": false,
        "value": "1"
    }
]




[
    {
        "domain": "pythonscraping.com",
        "httponly": false,
        "name": "has_js",
        "path": "/",
        "secure": false,
        "value": "1"
    }
]


### Avoiding Honeypots

In [4]:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driver = webdriver.PhantomJS()
driver.get('http://pythonscraping.com/pages/itsatrap.html')
links = driver.find_elements_by_tag_name('a')
for link in links:
    if not link.is_displayed():
        print('The link %s is a trap' % link.get_attribute('href'))
fields = driver.find_elements_by_tag_name('input')
for field in fields:
    if not field.is_displayed():
        print('Do not change value of %s' % field.get_attribute('name'))



The link http://pythonscraping.com/dontgohere is a trap
Do not change value of phone
Do not change value of email
