# Extract Pink Floyd song data

In [2]:
import requests
import pandas as pd
import hashlib
from typing import Optional, List
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
from functools import cache

# Get remote web page

A simple page request function that uses `functools.cache` to add in-memory caching. This is useful when re-running cells that have already made the same request.

In [4]:
@cache
def _fetch_web_page(url: str, timeout: float) -> bytes:
    """Download ``url`` and return the raw response body.

    Raises ``requests.HTTPError`` for 4xx/5xx responses. Keeping the cache on
    this raising helper means only successful downloads are memoised: a call
    that raises stores nothing, so a failed request is retried the next time
    a cell asks for the same page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Accept": "*/*",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.content


def get_web_page(url: str, timeout: float = 30.0) -> Optional[bytes]:
    """Fetch a web page, reusing the in-memory copy of earlier successful fetches.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as bytes, or ``None`` when the server answers with
        an error status (4xx/5xx).

    Raises:
        requests.RequestException: For connection problems and timeouts;
            these propagate to the caller rather than being turned into ``None``.
    """
    try:
        return _fetch_web_page(url, timeout)
    except requests.HTTPError:
        return None

In [5]:
# Fetch the artist start page. The result (raw bytes, or None when the request
# was not successful) is kept in `response` and parsed by the discography cell below.
response = get_web_page("https://getsongkey.com/artist/pink-floyd/lx6")

Use data classes to build the data models. They're nicer to work with than untyped dictionaries.

In [6]:
@dataclass
class Track:
    """A single song on an album.

    Attributes:
        position: Track number as listed on the album page.
        title: Song title.
        length: Duration in seconds.
        bpm: Tempo in beats per minute; 0 when not supplied.
        key: Musical key label; empty string when not supplied.
    """

    position: int
    title: str
    length: int
    bpm: int = 0
    key: str = ""


@dataclass
class Album:
    """An album together with the tracks collected for it.

    Attributes:
        year: Release/copyright year.
        title: Album title.
        url: Site-relative URL of the album page.
        tracks: Tracks belonging to the album; starts out empty.
    """

    year: int
    title: str
    url: str
    # default_factory gives every Album its own list instead of a shared one.
    tracks: List[Track] = field(default_factory=list)

Go through the discography and build a list of albums with tracks.

In [7]:
# Both sites serve the same album paths; one provides tempo, the other the key.
BPM_SITE = "https://getsongbpm.com"
KEY_SITE = "https://getsongkey.com"


def _require(parent, selector: str):
    """Return the first element under ``parent`` matching ``selector``.

    Fails with an AssertionError naming the selector when nothing matches,
    so a markup change on the site is easy to locate.
    """
    element = parent.select_one(selector)
    assert element, f"no element matches {selector!r}"
    return element


def _parse_track_keys(key_soup: BeautifulSoup) -> dict[int, str]:
    """Build a ``track position -> key`` mapping from a key-site album page.

    Songs that have no key element are simply left out of the mapping.
    """
    keys: dict[int, str] = {}
    for song in key_soup.select("#songs .songs li"):
        position = _require(song, "div[itemprop='position']")
        if key := song.select_one(".key_listing .key"):
            keys[int(position.get_text().strip())] = (
                key.get_text().replace("Key of", "").strip()
            )
    return keys


def _parse_tracks(bpm_soup: BeautifulSoup, keys: dict[int, str]) -> List[Track]:
    """Build the track list from a bpm-site album page.

    The bpm site supplies title, duration, tempo and position; the key is
    looked up by position in ``keys`` and defaults to an empty string.
    """
    tracks: List[Track] = []
    for song in bpm_soup.select("#songs ul.songs li"):
        track_name = _require(song, ".track-meta [itemprop='name']").get_text().strip()

        # Durations are written as minutes'seconds, e.g. 4'26 -> 266 seconds.
        duration_text = _require(song, ".track-meta [itemprop='duration']").get_text()
        minutes, seconds = duration_text.split("'")
        length = int(minutes) * 60 + int(seconds)

        bpm_text = _require(song, ".track-bpm .track_info").get_text()
        bpm = int(bpm_text.strip().replace("BPM", ""))

        position = int(_require(song, "[itemprop='position']").get_text().strip())

        tracks.append(
            Track(
                position=position,
                title=track_name,
                length=length,
                bpm=bpm,
                key=keys.get(position, ""),
            )
        )
    return tracks


# Rebuilt from scratch on every run so re-executing the cell never duplicates albums.
albums: List[Album] = []

assert response, "artist page request failed"

artist_soup = BeautifulSoup(response, "html.parser")
for entry in artist_soup.select("#discography [itemprop='album']"):
    link = _require(entry, "a[itemprop='url']")
    album_url = str(link["href"])
    title = _require(link, ".meta h4[itemprop='name']").get_text()
    year = _require(link, ".meta p[itemprop='copyrightYear']").get_text()

    bpm_page = get_web_page(f"{BPM_SITE}{album_url}")
    key_page = get_web_page(f"{KEY_SITE}{album_url}")

    assert bpm_page, f"could not fetch {BPM_SITE}{album_url}"
    assert key_page, f"could not fetch {KEY_SITE}{album_url}"

    album = Album(int(year), title=title, url=album_url)

    # Keys are collected first so each track can pick up its key by position.
    keys = _parse_track_keys(BeautifulSoup(key_page, "html.parser"))
    album.tracks.extend(_parse_tracks(BeautifulSoup(bpm_page, "html.parser"), keys))

    albums.append(album)

print(albums[0].tracks[0])

Track(position=1, title='Things Left Unsaid', length=266, bpm=84, key='D♯')


In [8]:
# Flatten the album -> track hierarchy into one record per track; the dict
# key order fixes the column order of the resulting frame.
rows = [
    {
        "album": release.title,
        "position": song.position,
        "title": song.title,
        "year": release.year,
        "bpm": song.bpm,
        "key": song.key,
        "length": song.length,
    }
    for release in albums
    for song in release.tracks
]

df = pd.DataFrame(rows)
df.head(30)

Unnamed: 0,album,position,title,year,bpm,key,length
0,The Endless River,1,Things Left Unsaid,2014,84,D♯,266
1,The Endless River,2,It's What We Do,2014,144,Em,377
2,The Endless River,3,Ebb and Flow,2014,80,C,115
3,The Endless River,4,Sum,2014,112,Em,288
4,The Endless River,5,Skins,2014,124,A,157
5,The Endless River,6,Unsung,2014,129,C,67
6,The Endless River,7,Anisina,2014,143,C,196
7,The Endless River,8,The Lost Art of Conversation,2014,90,C♯m,102
8,The Endless River,9,On Noodle Street,2014,105,C♯m,102
9,The Endless River,10,Night Light,2014,130,Cm,102


In [13]:
# Persist the raw scrape. index=False because the frame only carries the default
# RangeIndex; writing it would add an unnamed first column that comes back as
# "Unnamed: 0" when the file is read with pd.read_csv.
df.to_csv("./data/pink-floyd-tracks-stage1.csv", index=False)

The current extract has some missing tracks. I filled in the missing entries manually; the completed file is saved in the data folder as `pink-floyd-tracks-stage2.csv`.

In [17]:
# Load the stage-2 track list from the data folder; from here on `df` refers to
# this file's contents rather than the frame built from the scrape.
df = pd.read_csv("./data/pink-floyd-tracks-stage2.csv")

# Give every track a stable identifier. It serves as the join key when the
# lyrics are stored separately.
def uid(x) -> str:
    """Return a 10-character hex id derived from a row's album and title.

    Args:
        x: Row-like object exposing ``album`` and ``title`` attributes
            (a DataFrame row when called through ``df.apply(..., axis=1)``).

    Returns:
        The first 10 hex digits of the MD5 digest of album + title.
    """
    # The two fields are joined without a separator; keep it that way so
    # previously generated ids stay valid.
    combined = str(x.album + x.title)
    digest = hashlib.md5(combined.encode()).hexdigest()
    return digest[:10]

# Compute the id for every row and promote it to the index in one chain.
df = df.assign(track_id=df.apply(uid, axis=1)).set_index("track_id")
df.head()

Unnamed: 0_level_0,album,position,title,year,bpm,key,length
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0a8e306a70,The Endless River,1,Things Left Unsaid,2014,84,D#,266
2a14b70199,The Endless River,2,It's What We Do,2014,144,Em,377
129cbef07e,The Endless River,3,Ebb and Flow,2014,80,C,115
0d8b215b70,The Endless River,4,Sum,2014,112,Em,288
d7b94a6211,The Endless River,5,Skins,2014,124,A,157


In [18]:
# Save the table with track ids. The index is written too, so track_id ends up
# as the first column of the file and can be used as the join key later on.
df.to_csv("./data/pink-floyd-tracks-stage3.csv")