In [1]:
# Download a simple sample page.
# requests applies no timeout by default, so pass one explicitly to keep the
# cell from hanging forever if the server never answers.
import requests

page = requests.get('http://dataquestio.github.io/web-scraping-pages/simple.html', timeout=10)
page

<Response [200]>

In [2]:
# Status code of the Response object obtained above (200 means the request succeeded)
page.status_code

200

In [3]:
# Raw HTML body of the response; `.content` is bytes (note the b'' prefix in the output)
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [4]:
# Parse the downloaded bytes with BeautifulSoup, using the 'html.parser' backend
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
# Pretty-print the parsed document, one tag or string per line
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [6]:
# Top-level nodes of the parsed document, materialized as a list so they can be
# displayed and indexed: the doctype, a newline string and the <html> tag
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [7]:
# Type of each top-level node; iterating `.children` directly gives the same
# nodes as the list built in the previous cell
[type(node) for node in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [8]:
# Select the <html> tag: it is the third top-level node (index 2), after the
# doctype and a newline string. Then list its direct children.
html = list(soup.children)[2]
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [9]:
# Dive into the body: index 3 of the <html> children is the <body> tag
# (index 1 is <head>; indices 0, 2 and 4 are newline strings)
body = list(html.children)[3]

# Getting the p tag by finding the children of the body tag
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [10]:
# Isolate the <p> tag: it sits at index 1, between two newline strings
p = list(body.children)[1]

# Extract all of the text inside the tag
p.get_text()

'Here is some simple content for this page.'

In [11]:
# Finding all instances of a tag at once, without walking the tree by hand.
# NOTE(review): `soup` was already built from this same `page.content` in an
# earlier cell, so the re-parse below looks redundant.
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [12]:
# find_all returns a list-like result, so index the first match before
# extracting the text inside the p tag
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [13]:
# For finding the first instance of a tag
soup.find('p') # This returns a single Tag (bs4.element.Tag), not a list; None if nothing matches

<p>Here is some simple content for this page.</p>

In [14]:
# Now we go for searching for tags by class and id.
# Download and parse a second sample page whose tags carry classes and ids.
# The explicit timeout keeps the cell from hanging if the server never responds
# (requests applies no timeout by default).

page = requests.get('http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html', timeout=10)
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [15]:
# Search by class: every p tag whose class list includes 'outer-text'
# (`class_` has a trailing underscore because `class` is a Python keyword)
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [16]:
# Same search without a tag name: any tag that has the class 'outer-text'
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [17]:
# Search by id: every element whose id attribute equals 'first'
soup.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [18]:
# Using CSS selectors to find all the p tags in the page that are inside of a div
# ('div p' is a descendant selector, so the p tags outside the div are not matched)
soup.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [19]:
# Downloading the web page containing the forecast.
# The timeout avoids hanging forever on an unresponsive server, and
# raise_for_status() stops here with a clear HTTP error instead of letting an
# error page be parsed further down.
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.WwZNQe6FPDc', timeout=10)
page.raise_for_status()

# Creating a BeautifulSoup object to parse the page
soup = BeautifulSoup(page.content, 'html.parser')

# Finding the div with id 'seven-day-forecast' and assigning it to a variable
seven_day = soup.find(id='seven-day-forecast')
if seven_day is None:
    # find() returns None when nothing matches; fail with an explicit message
    # rather than an AttributeError on the next line.
    raise ValueError("No element with id 'seven-day-forecast' found in the downloaded page")

# Finding each individual forecast item
forecast_items = seven_day.find_all(class_='tombstone-container')

print(forecast_items)

[<div class="tombstone-container">
<p class="period-name">Today<br/><br/></p>
<p><img alt="Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. "/></p><p class="short-desc">Mostly Sunny</p><p class="temp temp-high">High: 59 °F</p></div>, <div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: A 20 percent chance of showers after 11pm.  Mostly cloudy, with a low around 53. West southwest wind 10 to 18 mph, with gusts as high as 24 mph.  New precipitation amounts of less than a tenth of an inch possible. " class="forecast-icon" src="newimages/medium/nshra20.png" title="Tonight: A 20 percent chance of showers after 11pm.  Mostly cloudy, with a low around 53. West southwest wind 10 to 18 mph, with gusts as high as 24 mph.  New precipitation amounts o

In [20]:
# Extracting the first forecast item. In the recorded run this is the 'Today'
# period, so the variable name `tonight` is misleading.
# NOTE(review): consider a neutral name such as `first_item`; the name is reused
# in the following cells, so they would have to be renamed together.
tonight = forecast_items[0]
print (tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 59 °F
 </p>
</div>


In [21]:
# Observing the data-types of the objects built so far, one per line
print(*(type(obj) for obj in (page, soup, seven_day, forecast_items, tonight)), sep='\n')

<class 'requests.models.Response'>
<class 'bs4.BeautifulSoup'>
<class 'bs4.element.Tag'>
<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>


In [22]:
# Pull the three text fields out of the selected forecast item by class name.

# Period name
period = tonight.find(class_='period-name').get_text()
print(period)

# Short description
short_desc = tonight.find(class_='short-desc').get_text()
print(short_desc)

# Temperature label
temp = tonight.find(class_='temp').get_text()
print(temp)

Today
Mostly Sunny
High: 59 °F


In [23]:
# Extracting the 'title' attribute from the 'img' tag
img = tonight.find('img')
desc = img['title'] # A Tag exposes its HTML attributes through dictionary-style indexing

print (desc)

Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. 


In [24]:
# Extracting the period name of every forecast item on the page.
# NOTE(review): get_text() joins a tag's text pieces with no separator, which
# is presumably why names come out as 'FridayNight' / 'MemorialDay'.
# get_text(separator=' ', strip=True) would likely keep the words apart —
# confirm before changing, since the values feed the DataFrame built later.

period_tags = seven_day.select('.tombstone-container .period-name')
periods = [pt.get_text() for pt in period_tags]
periods

['Today',
 'Tonight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight',
 'MemorialDay']

In [25]:
# Short description of every forecast item, in page order
short_descs = [
    tag.get_text()
    for tag in seven_day.select('.tombstone-container .short-desc')
]
print(short_descs)

['Mostly Sunny', 'Slight ChanceShowers', 'ChanceShowers', 'Mostly Cloudy', 'Partly Sunny', 'Partly Cloudy', 'Sunny', 'Partly Cloudy', 'Sunny']


In [26]:
# Temperature label ('High: ..' / 'Low: ..') of every forecast item, in page order
temps = [
    tag.get_text()
    for tag in seven_day.select('.tombstone-container .temp')
]
print(temps)

['High: 59 °F', 'Low: 53 °F', 'High: 60 °F', 'Low: 52 °F', 'High: 61 °F', 'Low: 52 °F', 'High: 62 °F', 'Low: 53 °F', 'High: 64 °F']


In [27]:
# Full weather description of every forecast item, read from the 'title'
# attribute of its img tag
descs = [
    img_tag['title']
    for img_tag in seven_day.select('.tombstone-container img')
]
print(descs)

['Today: Mostly sunny, with a high near 59. West wind 9 to 16 mph, with gusts as high as 22 mph. ', 'Tonight: A 20 percent chance of showers after 11pm.  Mostly cloudy, with a low around 53. West southwest wind 10 to 18 mph, with gusts as high as 24 mph.  New precipitation amounts of less than a tenth of an inch possible. ', 'Friday: A 30 percent chance of showers, mainly after 11am.  Mostly cloudy, with a high near 60. West southwest wind 8 to 16 mph, with gusts as high as 21 mph.  New precipitation amounts of less than a tenth of an inch possible. ', 'Friday Night: Mostly cloudy, with a low around 52. West southwest wind 11 to 15 mph, with gusts as high as 20 mph. ', 'Saturday: Partly sunny, with a high near 61. West wind 13 to 20 mph, with gusts as high as 25 mph. ', 'Saturday Night: Partly cloudy, with a low around 52.', 'Sunday: Sunny, with a high near 62.', 'Sunday Night: Partly cloudy, with a low around 53.', 'Memorial Day: Sunny, with a high near 64.']


In [28]:
# Assemble the four parallel lists into one pandas DataFrame, one row per
# forecast period

import pandas as pd

weather = pd.DataFrame({
    'period': periods,
    'short_desc': short_descs,
    'temp': temps,
    'desc': descs,
})
weather

Unnamed: 0,desc,period,short_desc,temp
0,"Today: Mostly sunny, with a high near 59. West...",Today,Mostly Sunny,High: 59 °F
1,Tonight: A 20 percent chance of showers after ...,Tonight,Slight ChanceShowers,Low: 53 °F
2,"Friday: A 30 percent chance of showers, mainly...",Friday,ChanceShowers,High: 60 °F
3,"Friday Night: Mostly cloudy, with a low around...",FridayNight,Mostly Cloudy,Low: 52 °F
4,"Saturday: Partly sunny, with a high near 61. W...",Saturday,Partly Sunny,High: 61 °F
5,"Saturday Night: Partly cloudy, with a low arou...",SaturdayNight,Partly Cloudy,Low: 52 °F
6,"Sunday: Sunny, with a high near 62.",Sunday,Sunny,High: 62 °F
7,"Sunday Night: Partly cloudy, with a low around...",SundayNight,Partly Cloudy,Low: 53 °F
8,"Memorial Day: Sunny, with a high near 64.",MemorialDay,Sunny,High: 64 °F


In [34]:
# Using a regex to pull out the numeric temperature values.
# The pattern is a raw string so that \d reaches the regex engine unchanged;
# in a normal string literal '\d' is an invalid escape sequence and triggers a
# warning on recent Python versions.

temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
# The extracted values are strings (object dtype); store them as integers
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    59
1    53
2    60
3    52
4    61
5    52
6    62
7    53
8    64
Name: temp_num, dtype: object

In [35]:
# Summary of the frame: column names, non-null counts and dtypes
# (temp_num is stored as an integer column, the rest as object)
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
desc          9 non-null object
period        9 non-null object
short_desc    9 non-null object
temp          9 non-null object
temp_num      9 non-null int32
dtypes: int32(1), object(4)
memory usage: 404.0+ bytes


In [36]:
# Finding the mean of all the temperatures
# (this averages the daytime highs and the night-time lows together)
weather['temp_num'].mean()

57.333333333333336

In [37]:
# Determining those rows that describe night-time conditions: their
# temperature label contains "Low" rather than "High"
is_night = weather["temp"].str.contains("Low")
# Keep the boolean flag on the frame as a new column as well
weather["is_night"] = is_night
is_night

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
Name: temp, dtype: bool

In [38]:
# Boolean-mask selection: keep only the rows flagged as night-time
weather[is_night]

Unnamed: 0,desc,period,short_desc,temp,temp_num,is_night
1,Tonight: A 20 percent chance of showers after ...,Tonight,Slight ChanceShowers,Low: 53 °F,53,True
3,"Friday Night: Mostly cloudy, with a low around...",FridayNight,Mostly Cloudy,Low: 52 °F,52,True
5,"Saturday Night: Partly cloudy, with a low arou...",SaturdayNight,Partly Cloudy,Low: 52 °F,52,True
7,"Sunday Night: Partly cloudy, with a low around...",SundayNight,Partly Cloudy,Low: 53 °F,53,True
