# Raspagem de dados com Requests e BeautifulSoup (BS)

## Exemplo 1

In [7]:
import requests
# Fetch the IFPI home page with a plain HTTP GET request.
response = requests.get(url='http://www.ifpi.edu.br')

# Report, in order: the HTTP status code, the content type declared by the
# server, and the response body decoded as text.
print(response.status_code)
print(response.headers['content-type'])
print(response.text)


200
text/html;charset=utf-8
<!DOCTYPE html>
<html lang="pt-br" dir="ltr">

<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

    
        <base href="http://libra.ifpi.edu.br/home/"><!--[if lt IE 7]></base><![endif]-->
    

    
        
  
    
  <meta property="og:site_name" content="IFPI Instituto Federal do Piauí">
  <meta property="og:url" content="http://libra.ifpi.edu.br/home">
  <meta property="og:type" content="website">
  <meta property="og:locale" content="pt_BR">
  <meta property="og:title" content="Home">
  <meta property="og:description" content="">
  <meta property="og:image" content="http://libra.ifpi.edu.br/logo.png">
  
  
  
  <script type="application/javascript">
    (function() {
        var po = document.createElement('script');
        po.async = true;
        po.src = document.location.protocol + '//connect.facebook.net/pt_BR/all.js#xfbml=1';
        var head = document.getElementsByTagName('head')[0];
        head.appendChild(po)

## Exemplo 2

In [34]:
import requests
def download(url, num_retries=2):
    """Download a page and return its body as text.

    Args:
        url: Address to fetch with an HTTP GET request.
        num_retries: Accepted for interface compatibility only; this
            version never retries, so the value is not used.

    Returns:
        The response body decoded as text, or None when the request
        raised a ``requests.exceptions.RequestException``.
    """
    print('Downloading:', url)
    try:
        return requests.get(url).text
    except requests.exceptions.RequestException as e:
        # RequestException has no `reason` attribute, so reading
        # `e.reason` here would raise AttributeError and hide the real
        # failure. Print the exception itself instead.
        print('Download error:', e)
        return None
# Manual check: fetch Google's home page and print whatever download() returned.
page = download(url='http://www.google.com/')
print(page)

Downloading: http://www.google.com/
<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="pt-BR"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="nxY6YFG7dmFSGHf3n2hahw==">(function(){window.google={kEI:'y9ajW_CgK4OYwQSfm63oBQ',kEXPI:'0,1353747,57,5,46,1318,588,583,434,281,1125,1071,154,591,140,142,183,64,1011,97,301,360,234,55,74,97,2337558,173,32,329294,1294,12383,4855,32692,15247,867,317,11846,5281,1953,9287,365,1216,813,1289,1262,1052,3191,1241,1201,260,5107,575,1119,2,205,339,1,7,27,727,2431,1362,283,1429,1114,767,731,2095,1294,10,1533,34,222,552,359,318,556,882,134,282,2,4461,525,22,604,2,1318,447,728,3,773,814,659,283,3736,69,535,515,334,10,120,1110,234,386,8,1003,81,7,2,26,463,90,530,29,981,37,361,16,245,206,13,57,229,339,1216,90,9,431,437,390,859,119,1061,571,497,46,78,1459,245,8,304,318,558,404,8,2,198

## Exemplo 3

In [33]:
import requests
def download(url, num_retries=2):
    """Download a page, retrying when the server answers with a 5xx status.

    Args:
        url: Address to fetch with an HTTP GET request.
        num_retries: How many additional attempts to make after a 5xx
            response. Zero or a negative value disables retries.

    Returns:
        The response body decoded as text. For HTTP error responses
        (status >= 400) the error body is still returned after being
        printed. Returns None when the request raised a
        ``requests.exceptions.RequestException``.
    """
    print('Downloading:', url)
    page = None
    try:
        response = requests.get(url)
        page = response.text
        if response.status_code >= 400:
            print('Download error:', response.text)
            # Retry only on 5xx statuses; 4xx responses are not retried.
            # `> 0` (rather than plain truthiness) stops a negative
            # count from recursing without bound.
            if num_retries > 0 and 500 <= response.status_code < 600:
                return download(url, num_retries - 1)
    except requests.exceptions.RequestException as e:
        # RequestException has no `reason` attribute, so reading
        # `e.reason` here would raise AttributeError and hide the real
        # failure. Print the exception itself instead.
        print('Download error:', e)
    return page

# Manual check: print whatever download() returns for this site.
page = download(url='https://www.oantagonista.com/')
print(page)


Downloading: https://www.oantagonista.com/
Download error: <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css" type="text/css" media="screen,projection" />
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" type="text/css" media="screen,projection" /><![e

## Exemplo 4 - Parsing de HTML com RE (expressões regulares)

In [20]:
import re
#from crawler import download
url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
page = download(url)
# Collect the contents of every <td class="w2p_fw"> cell; the area is taken
# as the second match (index 1), so this depends on the order of the cells.
cells = re.findall(r'<td class="w2p_fw">(.*?)</td>', page)
area = cells[1]
# Alternative that anchors on the row id and tolerates attribute spacing and
# either quote style:
# area = re.findall(r'''<tr id="places_area__row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', page)[0]
print(area)

Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
244,820 square kilometres


## Exemplo 5 - Parsing com BS

In [25]:
from bs4 import BeautifulSoup
url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
html = download(url)
soup = BeautifulSoup(html, 'html.parser')
# Locate the area row by its id, then the value cell inside it by CSS class.
# Keyword filters (`id=`, `class_=`) are equivalent to the `attrs` dict form.
tr = soup.find(id='places_area__row')
td = tr.find(class_='w2p_fw')
area = td.text
print(area)


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
244,820 square kilometres


## Exemplo 6 - Parsing de HTML com html.parser

In [8]:
from bs4 import BeautifulSoup
from pprint import pprint
# Deliberately malformed markup: unquoted attribute value, unclosed <li> tags.
broken_html = '<ul class=country><li>Area<li>Population</ul>'
soup = BeautifulSoup(markup=broken_html, features='html.parser')
# With html.parser the second <li> ends up nested inside the first one
# (see the captured output below).
fixed_html = soup.prettify()
pprint(fixed_html)


('<ul class="country">\n'
 ' <li>\n'
 '  Area\n'
 '  <li>\n'
 '   Population\n'
 '  </li>\n'
 ' </li>\n'
 '</ul>')


## Exemplo 7 - Parsing de HTML com html5lib

In [11]:
from bs4 import BeautifulSoup
from pprint import pprint
# Deliberately malformed markup: unquoted attribute value, unclosed <li> tags.
broken_html = '<ul class=country><li>Area<li>Population</ul>'
soup = BeautifulSoup(markup=broken_html, features='html5lib')
# With html5lib the two <li> elements come out as siblings and the fragment
# is wrapped in <html>/<head>/<body> (see the captured output below).
fixed_html = soup.prettify()
pprint(fixed_html)

('<html>\n'
 ' <head>\n'
 ' </head>\n'
 ' <body>\n'
 '  <ul class="country">\n'
 '   <li>\n'
 '    Area\n'
 '   </li>\n'
 '   <li>\n'
 '    Population\n'
 '   </li>\n'
 '  </ul>\n'
 ' </body>\n'
 '</html>')
