# Web scraping
Used when the data you need is not available as a ready-made dataset or through an API.

Python libraries:
- pandas
- NumPy
- Requests
- BeautifulSoup

In [2]:
import requests

## Downloading a page

In [5]:
# Download the example page and keep the raw response body.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
content = response.content
# Call print as a function: this form runs on both Python 2 and Python 3 and
# matches the print(...) calls used in the other cells of this notebook.
print(content)

<!DOCTYPE html>
<html>
    <head>
        <title>A simple example page</title>
    </head>
    <body>
        <p>Here is some simple content for this page.</p>
    </body>
</html>


## Parsing a page
The BeautifulSoup library provides an HTML parser.

In [7]:
from bs4 import BeautifulSoup

# Build the parse tree from the page content downloaded earlier.
parser = BeautifulSoup(content, 'html.parser')

# The parser object is the root of the document, so we walk down from it.
# Child tags are reached by using the tag name as an attribute:
# root -> body -> p for the paragraph ...
body = parser.body
p = body.p

# ... and root -> head -> title for the page title.
head = parser.head
title = head.title
# .text is a property holding the text inside a tag.
title_text = title.text

# Show the paragraph tag itself, then just its inner text.
print(p)
print(p.text)

# Show the title tag itself, then just its inner text.
print(title)
print(title_text)

<p>Here is some simple content for this page.</p>
Here is some simple content for this page.
<title>A simple example page</title>
A simple example page


### Find_all
Finds all occurrences of a tag and returns them as a list.

In [9]:
parser = BeautifulSoup(content, 'html.parser')

# Get a list of all occurrences of the body tag in the document.
body = parser.find_all("body")

# Get the list of paragraph tags inside the first body element.
p = body[0].find_all("p")

# Print the text of the first paragraph.
print(p[0].text)

# Same approach for the title: head -> title -> text.
head = parser.find_all('head')
title = head[0].find_all('title')
title_text = title[0].text
# print(...) as a function works on Python 2 and 3 and matches the call above.
print(title_text)

Here is some simple content for this page.
A simple example page


### Element IDs

![element ID](id.png)

- `div`: a tag that divides the page into sections

In [12]:
# Get the page content and set up a new parser.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_ids.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Pass in the id attribute to only get elements with a certain id.
first_paragraph = parser.find_all("p", id="first")[0]
print(first_paragraph.text)

# Look up the paragraph with id "second" once and reuse the element for its
# text, rather than running the same query against the document twice.
second_paragraph = parser.find_all('p', id='second')[0]
second_paragraph_text = second_paragraph.text
# print(...) as a function works on Python 2 and 3 and matches the call above.
print(second_paragraph_text)


                First paragraph.
            


                Second paragraph.
            



### Element Classes
![classes](class.png)

- A class isn't globally unique: several elements can share the same class
- Classes are used to group related elements

In [19]:
# Get the website that contains classes.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Find all the paragraph tags with the class inner-text once, then index into
# the resulting list instead of repeating the same query for each element.
# (class_ has a trailing underscore because `class` is a Python keyword.)
inner_paragraphs = parser.find_all("p", class_="inner-text")

# First inner paragraph.
first_inner_paragraph = inner_paragraphs[0]
print(first_inner_paragraph.text)

# Second inner paragraph.
second_inner_paragraph_text = inner_paragraphs[1].text
# print(...) as a function works on Python 2 and 3 and matches the call above.
print(second_inner_paragraph_text)

# First paragraph with the class outer-text.
first_outer_paragraph = parser.find_all('p', class_='outer-text')[0]
first_outer_paragraph_text = first_outer_paragraph.text
print(first_outer_paragraph_text)


                First paragraph.
            

                Second paragraph.
            


                First outer paragraph.
            



## CSS selectors
![selector](selector.png)
- class and element ID can be specified together
- multiple classes can be assigned

In [31]:
# Get the website that contains classes and ids.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Select all the elements with the first-item class (".name" selects by class).
first_items = parser.select(".first-item")

# Print the text of the first element with the first-item class.
print(first_items[0].text)

# Store the text of the first element with the outer-text class in
# first_outer_text, then print it.
first_outer = parser.select('.outer-text')
first_outer_text = first_outer[0].text
print(first_outer_text)

# Print the text of the element with the ID 'second' ("#name" selects by id).
second_text = parser.select("#second")[0].text
# print(...) as a function works on Python 2 and 3 and matches the calls above.
print(second_text)


                First paragraph.
            


                First outer paragraph.
            



                First outer paragraph.
            



### Nested CSS selectors
http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html

In [36]:
# Get the super bowl box score data.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Find the number of turnovers committed by the Seahawks, in two steps:
# select the element with id "turnovers", then select the td cells inside it.
# As indexed in this cell, td[1] is the Seahawks value and td[2] the Patriots value.
turnovers = parser.select("#turnovers")[0]
seahawks_turnovers = turnovers.select("td")[1]
seahawks_turnovers_count = seahawks_turnovers.text
print(seahawks_turnovers_count)


# Nested CSS selector: "#total-plays td" matches every td located inside the
# element with id "total-plays", doing the two steps above in a single query.
patriots_total_plays_count = parser.select('#total-plays td')[2].text
seahawks_total_yards_count = parser.select('#total-yards td')[1].text

# print(...) as a function works on Python 2 and 3 and matches the call above.
print(seahawks_total_yards_count)

1
396
