# Lab | Web scraping from multiple pages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# A.Expand the project
# If you're done, you can try to expand the project on your own. Here are a few suggestions:

#A1. Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
#A2. Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.
#A3. Wikipedia maintains a large collection of lists of songs: https://en.wikipedia.org/wiki/Lists_of_songs.

# B. Practice web scraping
# As you've seen, scraping the internet is a skill that can get you all sorts of information. Here are some little challenges that you can try to gain more experience in the field:

#B1.Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'
#B2. Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'
#B3. Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'
#B4. Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: url = 'https://www.emsc-csem.org/Earthquake/'
#B5. List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'
#B6. A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'
#B7. Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [3]:
# A1. Find other lists of hot songs on the internet and scrape
url = 'https://www.elportaldemusica.es/lists/top-100-canciones/2019/1'
response = requests.get(url)
response

<Response [200]>

In [4]:
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)

<!DOCTYPE html>

<html lang="es-ES">
<head>
<!-- Begin Cookie Consent plugin by Silktide - http://silktide.com/cookieconsent -->
<!-- End Cookie Consent plugin -->
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1.0, minimum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="_SEC-EPDM" name="csrf-param"/>
<meta content="P6tTbpw5NCJ7nA7gBDEuS5cbGR3MwrknOeFFRPVBPoxUmDwmzVZEYSLPVJg1XBoez1hQe7mozx9vtBEwoSsL2Q==" name="csrf-token"/>
<title>El portal de Música</title>
<link href="/assets/f4500e98173dde0b8b5e1adb635cc2de/themes/smoothness/jquery-ui.css?v=1606983066" rel="stylesheet"/>
<link href="/assets/9096fa34901acbd9446866c2f9819921/css/font-awesome.min.css?v=1606983066" rel="stylesheet"/>
<link href="/assets/d11896fb829797fa3954f6189be2a177/css/bootstrap.css?v=1606983066" rel="stylesheet"/>
<link href="/assets/8708857144c423dddc4ce8baeca7d432/min/jquery.rateyo.min.css?v=1606983066" rel="stylesheet"/>
<link href="/assets/7c440d420e0d516

In [5]:
song_titles = soup.select('.name')
artist_names = soup.select('.related')

In [6]:
songs = []
for song in song_titles:
    new_song = song.get_text().strip().lower()
    songs.append(new_song)

In [7]:
artists = []
for artist in artist_names:
    new_artists = artist.get_text().strip().lower()
    artists.append(new_artists)
artists

['paulo londra',
 '6ix9ine / anuel aa',
 'aitana',
 'daddy yankee y anuel aa',
 'bad bunny',
 'c. tangana & becky g.',
 'anuel aa / romeo santos',
 'dj snake, selena gomez, ozuna y cardi b',
 'luis fonsi y ozuna',
 'bad bunny',
 'anuel aa / haze',
 'bad bunny',
 'pedro capó & farruko',
 'lola indigo & mala rodriguez',
 'rosalía',
 'aitana',
 'ozuna / manuel turizo',
 'dj luian / mambo kingz / anuel aa / becky g  / prince roy',
 'marc anthony, will smith & bad bunny',
 'beret',
 'ozuna / romeo santos',
 'maikel delacalle',
 'brytiago / darell',
 'cauty / rafa pabon',
 'mozart la para / justin quiles',
 'maluma',
 '6ix9ine / anuel aa',
 'becky g & paulo londra',
 'j balvin',
 'becky g & natti natasha',
 'bad bunny',
 'lalo ebratt y trapical',
 'ana guerra',
 'ana guerra y juan magan',
 'c.tangana / rosalia',
 'dvicio ft reik y mau & ricky',
 'david bisbal y greeicy',
 'lola indigo',
 'karol g y anuel aa',
 'aitana y ana guerra',
 'bad bunny',
 'melendi / alejandro sanz / arkano',
 'maroo

In [8]:
final = [{"title": song, "artist": artist} for song, artist in zip(songs, artists)]

In [9]:
final

[{'title': 'adan y eva', 'artist': 'paulo londra'},
 {'title': 'mala', 'artist': '6ix9ine / anuel aa'},
 {'title': 'vas a quedarte', 'artist': 'aitana'},
 {'title': 'adictiva', 'artist': 'daddy yankee y anuel aa'},
 {'title': 'mia feat. drake', 'artist': 'bad bunny'},
 {'title': 'booty', 'artist': 'c. tangana & becky g.'},
 {'title': 'ella quiere beber', 'artist': 'anuel aa / romeo santos'},
 {'title': 'taki taki', 'artist': 'dj snake, selena gomez, ozuna y cardi b'},
 {'title': 'imposible', 'artist': 'luis fonsi y ozuna'},
 {'title': 'solo de mí', 'artist': 'bad bunny'},
 {'title': 'amanece', 'artist': 'anuel aa / haze'},
 {'title': 'ni bien ni mal', 'artist': 'bad bunny'},
 {'title': 'calma (remix)', 'artist': 'pedro capó & farruko'},
 {'title': 'mujer bruja', 'artist': 'lola indigo & mala rodriguez'},
 {'title': 'malamente', 'artist': 'rosalía'},
 {'title': 'teléfono', 'artist': 'aitana'},
 {'title': 'vaina loca', 'artist': 'ozuna / manuel turizo'},
 {'title': 'bubalu',
  'artist': 

In [10]:
df = pd.DataFrame(final)
df.head(101)

Unnamed: 0,title,artist
0,adan y eva,paulo londra
1,mala,6ix9ine / anuel aa
2,vas a quedarte,aitana
3,adictiva,daddy yankee y anuel aa
4,mia feat. drake,bad bunny
...,...,...
95,la romana,bad bunny / el alfa
96,flames,david guetta / sia
97,calypso,"luis fonsi, stefflon don"
98,la isla del amor,demarco flamenco / maki


In [11]:
# B. Practice web scraping

#B1.Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'


In [17]:
url ='https://en.wikipedia.org/wiki/Python'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [19]:
links = []

for link in soup.find_all('a'):
    links.append(link.get('href'))
links

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Python',
 '/w/index.php?title=Special:UserLogin&returnto=Python',
 '/w/index.php?title=Special:CreateAccount&returnto=Python',
 '/w/index.php?title=Special:UserLogin&returnto=Python',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wiki/Special:MyTalk',
 '#',
 '#Snakes',
 '#Computing',
 '#People',
 '#Roller_coasters',
 '#Vehicles',
 '#Weaponry',
 '#Other_uses',
 '#See_also',
 

In [115]:
#B3. Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'
url = 'https://www.fbi.gov/wanted/topten'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
most_wanted = []
all_images = soup.select('img')
images = all_images[3:-2]
for image in images:
    name = image.get('alt')
    most_wanted.append(name)
top_ten = most_wanted[:-2] + [most_wanted[-1]]
top_ten

['YULAN ADONAY ARCHAGA CARIAS',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'WILVER VILLEGAS-PALOMINO',
 'ALEJANDRO ROSALES CASTILLO',
 'RUJA IGNATOVA',
 'ARNOLDO JIMENEZ',
 'OMAR ALEXANDER CARDENAS',
 'ALEXIS FLORES',
 'MICHAEL JAMES PRATT',
 'JOSE RODOLFO VILLARREAL-HERNANDEZ']

In [133]:
#B4. Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: 
url = 'https://www.emsc-csem.org/Earthquake/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
rows = soup.select('tbody tr')
dates = []
times = []
latitudes = []
longitues = []
regions = []
for row in rows:
    text = row.a.text.split()
    if len(text) == 2:
        date= text[0]
        time = text[1]
        dates.append(date)
        times.append(time)
    else:
        print(f"Skipping row: {text}")


Skipping row: ['3']
Skipping row: ['8']


In [134]:
for row in rows:
    reg = row.select_one('.tb_region').text.strip()
    regions.append(reg)
    
regions

['OKLAHOMA',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'SOUTHERN SUMATRA, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'OFFSHORE BAJA CALIFORNIA, MEXICO',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'SUNDA STRAIT, INDONESIA',
 'EASTERN TURKEY',
 'CENTRAL TURKEY',
 'SUNDA STRAIT, INDONESIA',
 'GULF OF CALIFORNIA',
 'CENTRAL TURKEY',
 'CENTRAL TURKEY',
 'SULAWESI, INDONESIA',
 'SICILY, ITALY',
 'GULF OF CALIFORNIA',
 'NEAR COAST OF NICARAGUA',
 'ADRIATIC SEA',
 'GULF OF CALIFORNIA',
 'CENTRAL TURKEY',
 'CENTRAL TURKEY',
 'GREATER LOS ANGELES AREA, CALIF.',
 'FRANCE',
 'COLORADO',
 'FRANCE',
 'CENTRAL TURKEY',
 'WESTERN MEDITERRANEAN SEA',
 'CRETE, GREECE',
 'GULF OF CALIFORNIA',
 'TURKEY-SYRIA BORDER REGION',
 'GULF OF CALIFORNIA',
 'NEAR EAST COAST OF KAMCHATKA',
 'KEPULAUAN BATU, INDONESIA',
 'CENTRAL TURKEY',
 'MYANMAR',
 'NEAR SOUTH COAST OF FRANCE',
 'NEAR SOUTH COAST OF FRANC

In [135]:
for row in rows:
    lat = row.select_one('.tabev1').text.strip() + ' ' + row.select_one('.tabev2').text.strip()
    latitudes.append(lat)

In [138]:
data = {
    'Date': dates[:20],
    'Time': times[:20],
    'Latitude': latitudes[:20],
    'Region': regions[:20],
}
df = pd.DataFrame(data)
df.head(50)

Unnamed: 0,Date,Time,Latitude,Region
0,2023-05-10,10:38:20.4,35.23 N,OKLAHOMA
1,2023-05-10,10:38:19.0,6.42 S,"SUNDA STRAIT, INDONESIA"
2,2023-05-10,10:34:58.0,6.45 S,"SUNDA STRAIT, INDONESIA"
3,2023-05-10,10:31:41.0,6.44 S,"SUNDA STRAIT, INDONESIA"
4,2023-05-10,10:18:34.0,4.88 S,"SOUTHERN SUMATRA, INDONESIA"
5,2023-05-10,10:16:44.0,6.50 S,"SUNDA STRAIT, INDONESIA"
6,2023-05-10,10:11:48.0,6.45 S,"SUNDA STRAIT, INDONESIA"
7,2023-05-10,10:09:34.0,6.44 S,"SUNDA STRAIT, INDONESIA"
8,2023-05-10,10:08:23.0,30.76 N,"OFFSHORE BAJA CALIFORNIA, MEXICO"
9,2023-05-10,10:07:15.0,6.48 S,"SUNDA STRAIT, INDONESIA"


In [238]:
#B5. List all language names and number of related articles in the order they appear in wikipedia.org: 
url = 'https://www.wikipedia.org/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
central_features = soup.select('a', class_='link-box')
central_features

[<a class="link-box" data-slogan="The Free Encyclopedia" href="//en.wikipedia.org/" id="js-link-box-en" title="English — Wikipedia — The Free Encyclopedia">
 <strong>English</strong>
 <small><bdi dir="ltr">6 644 000+</bdi> <span>articles</span></small>
 </a>,
 <a class="link-box" data-slogan="Свободная энциклопедия" href="//ru.wikipedia.org/" id="js-link-box-ru" title="Russkiy — Википедия — Свободная энциклопедия">
 <strong>Русский</strong>
 <small><bdi dir="ltr">1 909 000+</bdi> <span>статей</span></small>
 </a>,
 <a class="link-box" data-slogan="フリー百科事典" href="//ja.wikipedia.org/" id="js-link-box-ja" title="Nihongo — ウィキペディア — フリー百科事典">
 <strong>日本語</strong>
 <small><bdi dir="ltr">1 370 000+</bdi> <span>記事</span></small>
 </a>,
 <a class="link-box" data-slogan="Die freie Enzyklopädie" href="//de.wikipedia.org/" id="js-link-box-de" title="Deutsch — Wikipedia — Die freie Enzyklopädie">
 <strong>Deutsch</strong>
 <small><bdi dir="ltr">2 792 000+</bdi> <span>Artikel</span></small>
 </a>,

In [240]:
languages = []
articles = []
for feature in central_features:
    language = feature.find('strong')
    article = feature.find('bdi')
    if language is not None and article is not None:
        languages.append(language.text.strip())
        
        article_text = article.text.strip().replace(",", "").replace("+", "")
        articles.append(article_text)
data = {
    'languages': languages,
    'number_pf_articles': articles
}

df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,languages,number_pf_articles
0,English,6 644 000
1,Русский,1 909 000
2,日本語,1 370 000
3,Deutsch,2 792 000
4,Español,1 854 000
5,Français,2 514 000
6,Italiano,1 806 000
7,中文,1 347 000
8,فارسی,فارسی
9,Português,1 101 000


In [3]:
#B6. A list with the different kind of datasets available in data.gov.uk: 

url = 'https://data.gov.uk/'
response = requests.get(url)
response

<Response [200]>

In [14]:
soup = BeautifulSoup(response.content, 'html.parser')

data_types = []

for link in soup.select('h3'):
    data_types.append(link.text)
data_types

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

In [16]:
#B7. Display the top 10 languages by number of native speakers stored in a pandas dataframe: 

url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
language = []
number = []


In [68]:
table = soup.find('table', class_='wikitable')
# headers = table.find_all('th')
rows = table.find_all('tr')[1:]
data = []
for row in rows:
    languages = row.find_all('td')
    values = [language.text.strip() for language in languages]
    data.append(values)
data  
df = pd.DataFrame(data,columns = ['languague', 'millions_of_speakers', 'x', 'y'])

final_df = df.head(11).drop(['x', 'y'], axis=1)
final_df

Unnamed: 0,languague,millions_of_speakers
0,"Mandarin Chinese(incl. Standard Chinese, but e...",939.0
1,Spanish,485.0
2,English,380.0
3,"Hindi(excl. Urdu, and other languages)",345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese(incl. Cantonese),86.1
9,Vietnamese,85.0
