# BeautifulSoup and requests

BeautifulSoup is a Python library for parsing HTML and XML documents. It builds a parse tree from a page's source code, which makes it easy to navigate the document and extract data.


In [None]:
from bs4 import BeautifulSoup

import requests


In [None]:
# Target page for the first scraping example; the query string sets
# per_page=10000.
url = (
    "https://www.scrapethissite.com/pages/forms/"
    "?per_page=10000"
)


In [None]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() turns an HTTP error response into an
# exception so later cells never parse an error page by mistake.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Preview only the beginning of the document rather than dumping all of it.
page.text[:1000]


'<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n    <title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>\n    <link rel="icon" type="image/png" href="/static/images/scraper-icon.png" />\n\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.">\n\n    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" crossorigin="anonymous">\n    <link href=\'https://fonts.googleapis.com/css?family=Lato:400,700\' rel=\'stylesheet\' type=\'text/css\'>\n    <link rel="stylesheet" type="text/css" href="/static/cs

In [None]:
# Build the parse tree from the downloaded HTML using the "html.parser" backend.
soup = BeautifulSoup(markup=page.text, features="html.parser")


In [None]:
# find() returns only the first matching element, here the first <div>.
soup.find(name="div")


<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

In [None]:
# Text of the first <p class="lead"> element, with surrounding whitespace
# removed.
soup.find("p", attrs={"class": "lead"}).get_text().strip()


'Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.\n                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.'

In [None]:
url2 = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"

# Fetch and parse the second page. `page` and `soup` are rebound here, so the
# cells below operate on this document. The timeout and raise_for_status()
# make a failed download stop the notebook instead of being parsed as if it
# were the real page.
page = requests.get(url2, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")


In [None]:
# Work with the first <table> element of the parsed page.
world_table = soup.find_all("table")[0]

# Keep the <tr> tags themselves in `table_rows`. Previously this name ended up
# as a list of None, because it collected the return values of list.append().
table_rows = world_table.find_all("tr")

# One list of cell strings per row: strip the row text, then split it on blank
# lines. The first entry is the header row. Building the list directly (rather
# than appending inside a comprehension) keeps the cell idempotent on re-run.
table = [row.text.strip().split("\n\n") for row in table_rows]


In [None]:
# Inspect the parsed rows; the first entry holds the column headings.
table


[['Rank',
  'Name',
  'Industry',
  'Revenue (USD millions)',
  'Revenue growth',
  'Employees',
  'Headquarters'],
 ['1',
  'Walmart',
  'Retail',
  '648,125',
  '  6.0%',
  '2,100,000',
  'Bentonville, Arkansas'],
 ['2',
  'Amazon',
  'Retail and cloud computing',
  '574,785',
  '  11.9%',
  '1,525,000',
  'Seattle, Washington'],
 ['3',
  'Apple',
  'Electronics industry',
  '383,482',
  '  -2.8%',
  '161,000',
  'Cupertino, California'],
 ['4',
  'UnitedHealth Group',
  'Healthcare',
  '371,622',
  '  14.6%',
  '440,000',
  'Minnetonka, Minnesota'],
 ['5',
  'Berkshire Hathaway',
  'Conglomerate',
  '364,482',
  '  20.7%',
  '396,500',
  'Omaha, Nebraska'],
 ['6',
  'CVS Health',
  'Healthcare',
  '357,776',
  '  10.9%',
  '259,500',
  'Woonsocket, Rhode Island'],
 ['7',
  'ExxonMobil',
  'Petroleum industry',
  '344,582',
  '  -16.7%',
  '61,500',
  'Spring, Texas'],
 ['8',
  'Alphabet',
  'Technology and cloud computing',
  '307,394',
  '  8.7%',
  '182,502',
  'Mountain View, Cal

In [None]:
# NOTE(review): disabled experiment. Every line below is commented out, so
# this cell does nothing when run. Even if re-enabled, the result of the
# " ".join(...) inside the loop is never assigned. Consider deleting the cell.
# table_rows[0][3] = " ".join(table_rows[0][3:5])
# del table_rows[0][4]


# for num in range(1, len(table_rows)):
#     " ".join(table_rows[num][-1:])

# table_rows


In [None]:
# First field of every data row (the rank); the header row is skipped.
[row[0] for row in table[1:]]


['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100']

In [None]:
import pandas as pd

# Build the frame from the scraped rows: the first row supplies the column
# names, the remaining rows are the data, and each row's first field (the
# rank) is also used as its index label.
df = pd.DataFrame(
    data=table[1:],
    index=[row[0] for row in table[1:]],
    columns=table[0],
)


In [None]:
# Display the frame built from the scraped table.
df


Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
1,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
2,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
3,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
4,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
5,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
96,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
97,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
98,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
99,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


In [None]:
# The scraped growth values carry leading spaces (e.g. '  6.0%'); trim them.
df["Revenue growth"] = df["Revenue growth"].str.strip()


In [None]:
# Use the "Rank" column as the index. set_index() removes the column, so
# running this cell a second time used to raise KeyError; the guard makes the
# cell safe to re-run. A frame that has no "Rank" column and is not already
# indexed by it still raises, so a genuinely missing column is not hidden.
if df.index.name != "Rank":
    df = df.set_index("Rank")


In [None]:
# Display the frame again, now indexed by "Rank".
df


Unnamed: 0_level_0,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...
96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


# Reading in files


In [None]:
# Load a CSV file from the local Dataset folder into a DataFrame.
# NOTE(review): this rebinds `df`, replacing the companies frame built earlier.
df = pd.read_csv(filepath_or_buffer="./Dataset/world_population.csv")


In [None]:
# Display the frame loaded from the CSV file.
df


Unnamed: 0,Rank,CCA3,Country,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage
0,36,AFG,Afghanistan,Kabul,Asia,41128771.0,38972230.0,33753499.0,28189672.0,19542982.0,10694796.0,12486631.0,10752971.0,652230.0,63.0587,1.0257,0.52
1,138,ALB,Albania,Tirana,Europe,2842321.0,2866849.0,2882481.0,2913399.0,3182021.0,3295066.0,2941651.0,2324731.0,28748.0,98.8702,0.9957,0.04
2,34,DZA,Algeria,Algiers,Africa,44903225.0,43451666.0,39543154.0,35856344.0,30774621.0,25518074.0,18739378.0,13795915.0,2381741.0,18.8531,1.0164,0.56
3,213,ASM,American Samoa,Pago Pago,Oceania,44273.0,46189.0,51368.0,54849.0,58230.0,47818.0,32886.0,27075.0,199.0,222.4774,0.9831,0.00
4,203,AND,Andorra,Andorra la Vella,Europe,79824.0,77700.0,71746.0,71519.0,66097.0,53569.0,35611.0,19860.0,468.0,170.5641,1.0100,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,226,WLF,Wallis and Futuna,Mata-Utu,Oceania,11572.0,11655.0,12182.0,13142.0,14723.0,13454.0,11315.0,9377.0,142.0,81.4930,0.9953,0.00
230,172,ESH,Western Sahara,El Aaiún,Africa,575986.0,556048.0,491824.0,413296.0,270375.0,178529.0,116775.0,76371.0,266000.0,2.1654,1.0184,0.01
231,46,YEM,Yemen,Sanaa,Asia,33696614.0,32284046.0,28516545.0,24743946.0,18628700.0,13375121.0,9204938.0,6843607.0,527968.0,63.8232,1.0217,0.42
232,63,ZMB,Zambia,Lusaka,Africa,20017675.0,18927715.0,,13792086.0,9891136.0,7686401.0,5720438.0,4281671.0,752612.0,26.5976,1.0280,0.25
