# Lab | Web Scraping Multiple Pages

#### Instructions 

#### Expand the project

If you're done, you can try to expand the project on your own. Here are a few suggestions:

- Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
- Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.
- Wikipedia maintains a large collection of lists of songs: https://en.wikipedia.org/wiki/Lists_of_songs

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import requests
from bs4 import BeautifulSoup
import getpass
import pandas as pd
import re

In [3]:
client_id = "31bb38d4d2c54b0e9b994db2a71040d5"
client_secret = getpass.getpass('Write client secret:')

In [4]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id,
                                                           client_secret=client_secret))

In [5]:
country_codes= ['AD', 'AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', \
    'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'EC', 'SV', 'EE', 'FI', 'FR', 'DE', \
    'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'ID', 'IE', 'IT', 'JP', 'LV', 'LI', \
    'LT', 'LU', 'MY', 'MT', 'MX', 'MC', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', \
    'PE', 'PH', 'PL', 'PT', 'SG', 'ES', 'SK', 'SE', 'CH', 'TW', 'TR', 'GB', \
    'US', 'UY']

In [6]:
# Save & explore Spotify featured playlists 
playlists = sp.featured_playlists(limit=50)

print(playlists.keys(), "\n")
print(playlists['message'], "\n")
print(playlists['playlists'].keys(), "\n")
print(playlists['playlists']['items'][0].keys())

dict_keys(['message', 'playlists']) 

Editor's picks 

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total']) 

dict_keys(['collaborative', 'description', 'external_urls', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])


In [7]:
# Save all Spotify featured playlists worldwide
playlists = pd.DataFrame(columns=["playlist", "playlist_id"])

for country in country_codes:
    featured = sp.featured_playlists(country=country, limit=50)
    for playlist in featured['playlists']['items']:
        playlist_name = playlist["name"]
        playlist_id = playlist["id"]
        playlists = playlists.append({"playlist": playlist_name, \
            "playlist_id": playlist_id}, ignore_index=True)

playlists.drop_duplicates(inplace=True)

In [61]:
playlists.shape  # We found 452 unique playlists

(452, 2)

In [8]:
# Find playlists containing relevant keywords
keywords = ["hot", "top", "hit", "popular", "new", "chart"]
hot_playlists = []

for keyword in keywords:
    for playlist in playlists["playlist"]:
        if keyword in playlist:
            hot_playlists.append(playlist)

hot_playlists

['Sommerhits 2022', '#vainsuomihitit', 'Polskie hity']

We don't find a lot of playlists searching per country only, so it might be helpful to use a different database.

In [9]:
URL = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905"
page = requests.get(URL)

soup = BeautifulSoup(page.content)

In [10]:
# Scrape the artist names
artist_names = []

for element in soup.select("a[href*=artist]"): 
    artist_names.append(element.get_text()) 

In [11]:
# Scrape the song names
song_names = []

for element in soup.select("a[href*=recording]"): 
    song_names.append(element.get_text()) 

In [36]:
# Scrape all the 500 songs
artist_names = []
song_names = []

for i in range(1, 7):
    URL = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page="
    URL += str(i)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content)
    rows = soup.table.find_all("tr")

    for row in rows:

        # Skip the header row
        if row == rows[0]:
            continue
        else: 
            # Select all artist containers
            artist_containers = row.select("a[href*=artist]")

            # Extract the artist names from containers and add them to a list
            artists = []
            for artist_container in artist_containers:
                artist = artist_container.get_text()
                artists.append(artist)

            # Combine the artist names
            artist_name = " and ".join(artists)
            artist_names.append(artist_name) 

            # Follow the same procedure for songs
            song_containers = row.select("a[href*=recording]")
            songs = []
            for song_container in song_containers:
                song = song_container.get_text()
                songs.append(song)
            song_name = " and ".join(songs)
            song_names.append(song_name)
        
top_500 = pd.DataFrame(list(zip(artist_names, song_names)), columns=["artist", "song"])

In [37]:
top_500.reset_index(drop=True, inplace=True)

In [39]:
top_500.tail()

Unnamed: 0,artist,song
496,The Rolling Stones,Miss You
497,Weezer,Buddy Holly
498,Brook Benton,Rainy Night in Georgia
499,Thin Lizzy,The Boys Are Back in Town
500,Boston,More Than a Feeling
