# Import Libs

In [38]:
import polars as pl
import polars.selectors as cs
from collections import Counter
import re

# Import Annotations

In [2]:
anno_1 = pl.read_csv("../../data/DEAM/static_annotations_averaged_songs_1_2000.csv")
anno_1 = anno_1.rename({col: col.strip() for col in anno_1.columns})

anno_2 = pl.read_csv("../../data/DEAM/static_annotations_averaged_songs_2000_2058.csv")
anno_2 = anno_2.rename({col: col.strip() for col in anno_2.columns})

anno_1.shape, anno_2.shape

((1744, 5), (58, 13))

In [3]:
display(anno_1, anno_2)

song_id,valence_mean,valence_std,arousal_mean,arousal_std
i64,f64,f64,f64,f64
2,3.1,0.94,3.0,0.63
3,3.5,1.75,3.3,1.62
4,5.7,1.42,5.5,1.63
5,4.4,2.01,5.3,1.85
7,5.8,1.47,6.4,1.69
…,…,…,…,…
1996,3.9,1.87,5.9,2.21
1997,5.3,1.42,3.9,1.14
1998,6.4,1.5,6.2,1.6
1999,4.6,2.11,5.4,1.8


song_id,valence_mean,valence_std,valence_ max_mean,valence_max_std,valence_min_mean,valence_min_std,arousal_mean,arousal_std,arousal_max_mean,arousal_max_std,arousal_min_mean,arousal_min_std
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2001,3.2,0.98,5.0,1.41,2.2,0.98,6.6,0.8,8.6,0.49,3.4,1.02
2002,6.4,0.49,8.2,0.98,5.0,1.1,5.2,1.17,7.4,1.36,2.2,1.17
2003,5.4,1.5,7.2,1.17,3.4,1.02,4.6,1.85,6.2,2.04,1.4,0.49
2004,5.0,1.1,6.4,1.02,3.2,1.17,4.8,1.6,6.0,2.28,2.8,0.98
2005,3.8,1.17,5.0,1.1,1.6,0.8,5.2,0.75,8.4,0.8,2.0,1.1
…,…,…,…,…,…,…,…,…,…,…,…,…
2054,5.4,1.2,6.0,1.1,4.4,1.36,3.6,1.36,4.2,1.33,2.4,0.8
2055,5.0,1.41,6.0,1.41,4.2,1.33,5.2,1.47,6.4,1.62,4.0,1.41
2056,5.0,1.41,5.6,1.62,4.4,1.36,4.6,1.74,5.0,1.79,4.0,1.67
2057,3.17,1.07,4.5,0.76,2.17,0.9,6.83,0.37,8.17,0.9,4.83,1.46


In [11]:
all_annotations = pl.concat(
    [
        anno_1,
        anno_2.select(
            "song_id", "valence_mean", "valence_std", "arousal_mean", "arousal_std"
        ),
    ],
    how="vertical",
)


# Import Metadata

In [5]:
metadata_2013 = pl.read_csv("../../data/DEAM/metadata_2013.csv")
metadata_2014 = pl.read_csv(
    "../../data/DEAM/metadata_2014.csv", truncate_ragged_lines=True
)
metadata_2015 = pl.read_csv(
    "../../data/DEAM/metadata_2015.csv", truncate_ragged_lines=True
)

In [6]:
metadata_2013 = (
    metadata_2013.select("song_id", "Artist", "Song title", "Genre")
    .rename({"Artist": "artist", "Song title": "song_title", "Genre": "genre"})
    .with_columns(
        pl.col(pl.Utf8)
        .str.replace_all("\u00a0", " ")  # normalize non-breaking spaces if present
        .str.strip_chars()  # strip whitespace (space, tab, newline, etc.)
        .str.strip_chars('"')  # then strip leading/trailing double quotes
    )
)

In [7]:
metadata_2014 = (
    metadata_2014.select("Id", "Artist", "Track", "Genre")
    .rename(
        {
            "Id": "song_id",
            "Artist": "artist",
            "Track": "song_title",
            "Genre": "genre",
        }
    )
    .with_columns(
        pl.col(pl.Utf8)
        .str.replace_all("\u00a0", " ")  # normalize non-breaking spaces if present
        .str.strip_chars()  # strip whitespace (space, tab, newline, etc.)
        .str.strip_chars('"')  # then strip leading/trailing double quotes
    )
)

In [8]:
metadata_2015 = (
    metadata_2015.select("id", "artist", "title", "genre")
    .rename(
        {
            "id": "song_id",
            "title": "song_title",
        }
    )
    .with_columns(
        pl.col(pl.Utf8)
        .str.replace_all("\u00a0", " ")  # normalize non-breaking spaces if present
        .str.strip_chars()  # strip whitespace (space, tab, newline, etc.)
        .str.strip_chars('"')  # then strip leading/trailing double quotes
    )
)

In [None]:
all_metadata = pl.concat(
    [metadata_2013, metadata_2014, metadata_2015],
    how="vertical",
)

# Join

In [21]:
merged = all_annotations.join(all_metadata, on="song_id", how="left").with_columns(
    pl.col("genre").str.to_lowercase().alias("genre")
)
merged

song_id,valence_mean,valence_std,arousal_mean,arousal_std,artist,song_title,genre
i64,f64,f64,f64,f64,str,str,str
2,3.1,0.94,3.0,0.63,"""The New Mystikal Troubadours""","""Tonight A Lonely Century""","""blues"""
3,3.5,1.75,3.3,1.62,"""Kevin MacLeod""","""DD Groove""","""blues"""
4,5.7,1.42,5.5,1.63,"""Kevin MacLeod""","""Slow Burn""","""blues"""
5,4.4,2.01,5.3,1.85,"""My Bubba & Mi""","""Nothing Much""","""blues"""
7,5.8,1.47,6.4,1.69,"""Kevin MacLeod""","""Hustle""","""blues"""
…,…,…,…,…,…,…,…
2054,5.4,1.2,3.6,1.36,"""Tom La Meche""","""Interlude""","""jazz"""
2055,5.0,1.41,5.2,1.47,"""Goo Goo Cluster""","""Vide grenier""","""reggae"""
2056,5.0,1.41,4.6,1.74,"""Ruediger Kramer""","""happy child singing""","""jazz"""
2057,3.17,1.07,6.83,0.37,"""La Verue""","""Au Feu""","""rock"""


# Simplifying Genres

In [31]:
unique_genres = merged.select("genre").unique().get_column("genre").to_list()

In [None]:
# 10 umbrella bins
UMBRELLAS = [
    "rock",
    "pop",
    "hip-hop",
    "electronic",
    "jazz",
    "classical",
    "folk/country",
    "blues/soul",
    "world",
    "other",
]


In [28]:
KEYWORD_TO_BIN = {
    # core
    "rock": "rock",
    "pop": "pop",
    "hip-hop": "hip-hop",
    "hiphop": "hip-hop",
    "rap": "hip-hop",
    "electronic": "electronic",
    "jazz": "jazz",
    "classical": "classical",
    "folk": "folk/country",
    "country": "folk/country",
    "blues": "blues/soul",
    "soulrb": "blues/soul",
    "soul": "blues/soul",
    "world": "world",
    "reggae": "world",
    # helpful hints -> fold into umbrellas
    "acoustic": "folk/country",
    "singer/songwriter": "folk/country",
    "singersongwriter": "folk/country",  # just in case
    # noise/modifiers (we won’t map these; leave here for clarity):
    # "experimental", "instrumental", "international", "spoken", "fusion"
}

# If multiple umbrellas tie, prefer earlier in this list:
TIE_BREAK_PRIORITY = [
    "hip-hop",
    "rock",
    "pop",
    "electronic",
    "jazz",
    "classical",
    "folk/country",
    "blues/soul",
    "world",
    "other",
]

NOISE_WORDS = {
    "experimental",
    "instrumental",
    "international",
    "spoken",
    "fusion",
    "n/a",
}


In [29]:
def normalize_text(s: str) -> str:
    s = (s or "").strip().lower()
    # standardize separators
    s = s.replace("/", "-")
    # compress whitespace and dashes
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"-+", "-", s)
    return s


def classify_genre(raw: str) -> str:
    s = normalize_text(raw)

    if not s or s in NOISE_WORDS or s == "n/a":
        return "other"

    votes = Counter()

    # substring voting
    for kw, bin_name in KEYWORD_TO_BIN.items():
        if kw in s:
            votes[bin_name] += 1

    # if no keywords matched, try token-wise fallback
    if not votes:
        tokens = re.split(r"[-\s]+", s)
        tokens = [t for t in tokens if t and t not in NOISE_WORDS]
        for t in tokens:
            bin_name = KEYWORD_TO_BIN.get(t)
            if bin_name:
                votes[bin_name] += 1

    if not votes:
        # final fallback: heuristic keywords
        if "groove" in s or "bird of youth" in s:
            return "other"
        return "other"

    # choose the umbrella with max votes; break ties by priority
    top_count = max(votes.values())
    candidates = [b for b, c in votes.items() if c == top_count]
    for pref in TIE_BREAK_PRIORITY:
        if pref in candidates:
            return pref
    return "other"


In [32]:
collapsed = [classify_genre(g) for g in unique_genres]
for g, c in zip(unique_genres, collapsed):
    print(f"{g:60s} -> {c}")


blues                                                        -> blues/soul
international-folk-experimental-rock                         -> rock
jazz                                                         -> jazz
groove                                                       -> other
classical-soulrb-folk                                        -> blues/soul
electronic-rock                                              -> rock
country-rock                                                 -> rock
reggae                                                       -> world
jazz-international-pop                                       -> pop
classical-instrumental-electronic                            -> electronic
folk-experimental-electronic-rock                            -> rock
world                                                        -> world
international-pop-rock                                       -> rock
soulrb-blues-rock                                            -> blues/soul
folk-ele

In [None]:
merged_genres = merged.with_columns(
    pl.col("genre").map_elements(classify_genre, return_dtype=pl.String).alias("genre")
).with_columns(
    pl.when(pl.col("genre") == "world")
    .then(pl.lit("other"))
    .otherwise(pl.col("genre"))
    .alias("genre")
)

merged_genres

song_id,valence_mean,valence_std,arousal_mean,arousal_std,artist,song_title,genre
i64,f64,f64,f64,f64,str,str,str
2,3.1,0.94,3.0,0.63,"""The New Mystikal Troubadours""","""Tonight A Lonely Century""","""blues/soul"""
3,3.5,1.75,3.3,1.62,"""Kevin MacLeod""","""DD Groove""","""blues/soul"""
4,5.7,1.42,5.5,1.63,"""Kevin MacLeod""","""Slow Burn""","""blues/soul"""
5,4.4,2.01,5.3,1.85,"""My Bubba & Mi""","""Nothing Much""","""blues/soul"""
7,5.8,1.47,6.4,1.69,"""Kevin MacLeod""","""Hustle""","""blues/soul"""
…,…,…,…,…,…,…,…
2054,5.4,1.2,3.6,1.36,"""Tom La Meche""","""Interlude""","""jazz"""
2055,5.0,1.41,5.2,1.47,"""Goo Goo Cluster""","""Vide grenier""","""other"""
2056,5.0,1.41,4.6,1.74,"""Ruediger Kramer""","""happy child singing""","""jazz"""
2057,3.17,1.07,6.83,0.37,"""La Verue""","""Au Feu""","""rock"""


In [49]:
merged_genres["genre"].value_counts(sort=True)

genre,count
str,u32
"""rock""",378
"""folk/country""",320
"""electronic""",242
"""blues/soul""",205
"""classical""",180
"""jazz""",171
"""pop""",170
"""hip-hop""",110
"""other""",26
