### Activity Exercise: Scraping and Analyzing Static Websites with Python

#### Importing necessary libraries


In [1]:
from bs4 import BeautifulSoup as bs
import requests

#### Fetching the website itself

In [27]:
r = requests.get('https://www.scrapethissite.com/pages/simple/')

if r.status_code == 200:
    print('Successfully fetched the page!')
    html_content = r.text
else:
    print('Failed to retrieve the page:', r.status_code)

Successfully fetched the page!


In [28]:
# This code is the html content of the website
webpage = bs(r.content,'html.parser')
print(webpage.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta content="noindex

#### Printing the Title of the Website

In [32]:
title = webpage.title
print(title)

<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>


#### Getting all content from the webpage 

In [13]:
headers = webpage.find_all(['h1', 'h2', 'h3'])

for header in headers:
    print(f'{header.name}: {header.text.strip()}')

h1: Countries of the World: A Simple Example
                            250 items
h3: Andorra
h3: United Arab Emirates
h3: Afghanistan
h3: Antigua and Barbuda
h3: Anguilla
h3: Albania
h3: Armenia
h3: Angola
h3: Antarctica
h3: Argentina
h3: American Samoa
h3: Austria
h3: Australia
h3: Aruba
h3: Åland
h3: Azerbaijan
h3: Bosnia and Herzegovina
h3: Barbados
h3: Bangladesh
h3: Belgium
h3: Burkina Faso
h3: Bulgaria
h3: Bahrain
h3: Burundi
h3: Benin
h3: Saint Barthélemy
h3: Bermuda
h3: Brunei
h3: Bolivia
h3: Bonaire
h3: Brazil
h3: Bahamas
h3: Bhutan
h3: Bouvet Island
h3: Botswana
h3: Belarus
h3: Belize
h3: Canada
h3: Cocos [Keeling] Islands
h3: Democratic Republic of the Congo
h3: Central African Republic
h3: Republic of the Congo
h3: Switzerland
h3: Ivory Coast
h3: Cook Islands
h3: Chile
h3: Cameroon
h3: China
h3: Colombia
h3: Costa Rica
h3: Cuba
h3: Cape Verde
h3: Curacao
h3: Christmas Island
h3: Cyprus
h3: Czech Republic
h3: Germany
h3: Djibouti
h3: Denmark
h3: Dominica
h3: Dominican Repu

In [15]:
links = webpage.find_all()
for link in links:
    print('Link:', link.get('href'), 'Text:', link.text.strip())

Link: None Text: Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping

















                                Scrape This Site
                            




                                Sandbox
                            




                                Lessons
                            




                                FAQ
                            



                                Login
                            












                            Countries of the World: A Simple Example
                            250 items







                            A single page that lists information about all the countries in the world. Good for those just get started with web scraping.
                            Practice looking for patterns in the HTML that will allow you to extract information about each country. Then, build a simple web scraper that makes a request to this page, parses the HTML a

## Part 4 of the Activity 

Using a different website for this part


In [16]:
r2 = requests.get('https://www.scrapethissite.com/pages/forms/')

if r2.status_code == 200:
    print('Successfully fetched the page!')
    html_content = r2.text
else:
    print('Failed to retrieve the page:', r2.status_code)

Successfully fetched the page!


In [17]:
webpage2 = bs(r2.content,'html.parser')
print(webpage2.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta con

In [31]:
title2 = webpage2.title
print(title2)

<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>


In [23]:
tables = webpage2.find_all('table')

for table in tables:
    rows = table.find_all('tr')
for row in rows:
# Extract all cells in the row
    cells = row.find_all(['td', 'th'])
    cell_data = [cell.text.strip() for cell in cells]
    print(cell_data)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']
['Boston Bruins', '1990', '44', '24', '', '0.55', '299', '264', '35']
['Buffalo Sabres', '1990', '31', '30', '', '0.388', '292', '278', '14']
['Calgary Flames', '1990', '46', '26', '', '0.575', '344', '263', '81']
['Chicago Blackhawks', '1990', '49', '23', '', '0.613', '284', '211', '73']
['Detroit Red Wings', '1990', '34', '38', '', '0.425', '273', '298', '-25']
['Edmonton Oilers', '1990', '37', '37', '', '0.463', '272', '272', '0']
['Hartford Whalers', '1990', '31', '38', '', '0.388', '238', '276', '-38']
['Los Angeles Kings', '1990', '46', '24', '', '0.575', '340', '254', '86']
['Minnesota North Stars', '1990', '27', '39', '', '0.338', '256', '266', '-10']
['Montreal Canadiens', '1990', '39', '30', '', '0.487', '273', '249', '24']
['New Jersey Devils', '1990', '32', '33', '', '0.4', '272', '264', '8']
['New York Islanders', '1990', '25', '45', '', '0.312', '223', '290', '-6

#### Assigning the country-info class to be important

In [24]:
important_elements = webpage.find_all(class_='country-info') # Replace with actual class name
for element in important_elements:
    print('Important Element:', element.text.strip())

Important Element: Capital: Andorra la Vella
Population: 84000
Area (km2): 468.0
Important Element: Capital: Abu Dhabi
Population: 4975593
Area (km2): 82880.0
Important Element: Capital: Kabul
Population: 29121286
Area (km2): 647500.0
Important Element: Capital: St. John's
Population: 86754
Area (km2): 443.0
Important Element: Capital: The Valley
Population: 13254
Area (km2): 102.0
Important Element: Capital: Tirana
Population: 2986952
Area (km2): 28748.0
Important Element: Capital: Yerevan
Population: 2968000
Area (km2): 29800.0
Important Element: Capital: Luanda
Population: 13068161
Area (km2): 1246700.0
Important Element: Capital: None
Population: 0
Area (km2): 1.4E7
Important Element: Capital: Buenos Aires
Population: 41343201
Area (km2): 2766890.0
Important Element: Capital: Pago Pago
Population: 57881
Area (km2): 199.0
Important Element: Capital: Vienna
Population: 8205000
Area (km2): 83858.0
Important Element: Capital: Canberra
Population: 21515754
Area (km2): 7686850.0
Importan

In [34]:
specific_element = webpage2.find(id='footer')
print('Element with Specific ID:', specific_element.text.strip())

Element with Specific ID: Lessons and Videos © Hartley Brody 2023


#### Saving the data to a CSV file

In [35]:
import csv

tables = webpage2.find_all('table')

# Open a CSV file to write the output
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    
    for table in tables:
        rows = table.find_all('tr')
        
        for row in rows:
            # Extract all cells in the row
            cells = row.find_all(['td', 'th'])
            # Get the text from each cell and strip any surrounding whitespace
            cell_data = [cell.text.strip() for cell in cells]
            # Write the row data to the CSV file
            writer.writerow(cell_data)

print("Data has been written to output.csv")


Data has been written to output.csv
