In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the example page.
# - timeout: without one, requests waits indefinitely on a stalled connection,
#   which would hang this cell (and any Restart & Run All).
# - raise_for_status(): stop here on a 4xx/5xx response instead of letting the
#   following cells parse an error page as if it were the real content.
response = requests.get('https://liannewriting.github.io/scraping_example.html', timeout=10)
response.raise_for_status()
print('Visited URL: {}'.format(response.url))
print(response.status_code)

Visited URL: https://liannewriting.github.io/scraping_example.html
200


In [3]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the
# 'html.parser' backend, then show the type of the resulting object.
soup = BeautifulSoup(markup=response.text, features='html.parser')

type(soup)

bs4.BeautifulSoup

In [4]:
# Render the parsed document as indented HTML and print it for inspection.
pretty_html = soup.prettify()
print(pretty_html)

<html>
 <head>
  <title>
   My Example Website
  </title>
  <style>
   .data-container {
              background-color: blue;
              color: white;
              text-align: center;
              margin: 10px;
              padding: 10px;
            }
            
            table {    
              width: 75%;
              border-collapse: collapse;
              text-align: center;
            }
            td {
              border: 1px solid black;
            }
  </style>
 </head>
 <body>
  <div class="data-container">
   Apple
  </div>
  <div class="data-container">
   Orange
  </div>
  <div class="data-container">
   Peach
  </div>
  <div class="data-container">
   Pear
  </div>
  <div class="data-container">
   Avocado
  </div>
  <div class="data-container">
   Strawberry
  </div>
  <div class="data-container">
   Grape
  </div>
  <div class="data-container">
   Blueberry
  </div>
  <div class="data-container">
   Blackberry
  </div>
  <table id="data-table">
   <tr>

## Scraping the data from the div tags

In [5]:
# Collect every <div> whose CSS class is "data-container".
data_containers = soup.find_all('div', attrs={'class': 'data-container'})
data_containers

[<div class="data-container">Apple</div>,
 <div class="data-container">Orange</div>,
 <div class="data-container">Peach</div>,
 <div class="data-container">Pear</div>,
 <div class="data-container">Avocado</div>,
 <div class="data-container">Strawberry</div>,
 <div class="data-container">Grape</div>,
 <div class="data-container">Blueberry</div>,
 <div class="data-container">Blackberry</div>]

In [6]:
# Take the first matched tag once and reuse it below.
first_container = data_containers[0]

# The tag object itself prints as its HTML markup ...
print(first_container)

# ... while its .text attribute holds only the text inside the tag.
print(first_container.text)

<div class="data-container">Apple</div>
Apple


In [7]:
# Pull the text out of each matched <div>, keeping document order.
dat = [container.text for container in data_containers]

print(dat)

['Apple', 'Orange', 'Peach', 'Pear', 'Avocado', 'Strawberry', 'Grape', 'Blueberry', 'Blackberry']


In [8]:
# Put the scraped names into a single-column DataFrame called 'fruit_name'.
fruit_columns = {'fruit_name': dat}
df_fruit = pd.DataFrame(fruit_columns)
print(df_fruit)

   fruit_name
0       Apple
1      Orange
2       Peach
3        Pear
4     Avocado
5  Strawberry
6       Grape
7   Blueberry
8  Blackberry


## Scraping the data from the table

In [9]:
# Locate the first <table> whose id attribute is "data-table".
data_table = soup.find('table', attrs={'id': 'data-table'})

print(data_table)

<table id="data-table">
<tr><th>Address</th><th>Price</th></tr>
<tr><td>1 First St</td><td>100000</td></tr>
<tr><td>2 Second St</td><td>200000</td></tr>
<tr><td>3 Third St</td><td>300000</td></tr>
<tr><td>4 Fourth St</td><td>400000</td></tr>
<tr><td>5 Fifth St</td><td>500000</td></tr>
<tr><td>6 Sixth St</td><td>600000</td></tr>
</table>


In [10]:
# Gather every <td> cell inside the table as one flat list in document order
# (header cells are <th>, so they are not included).
data_table_tds = data_table.find_all(name='td')
data_table_tds

[<td>1 First St</td>,
 <td>100000</td>,
 <td>2 Second St</td>,
 <td>200000</td>,
 <td>3 Third St</td>,
 <td>300000</td>,
 <td>4 Fourth St</td>,
 <td>400000</td>,
 <td>5 Fifth St</td>,
 <td>500000</td>,
 <td>6 Sixth St</td>,
 <td>600000</td>]

In [11]:
# The flat <td> list alternates address, price, address, price, ...
# so the even positions are addresses and the odd positions are prices.
addresses = [cell.text for cell in data_table_tds[0::2]]
prices = [cell.text for cell in data_table_tds[1::2]]

In [12]:
# Combine the two parallel lists into a DataFrame, one column per list.
price_columns = {
    'addresses': addresses,
    'prices': prices,
}
df_prices = pd.DataFrame(price_columns)
print(df_prices)

     addresses  prices
0   1 First St  100000
1  2 Second St  200000
2   3 Third St  300000
3  4 Fourth St  400000
4   5 Fifth St  500000
5   6 Sixth St  600000
