In [None]:
import pandas as pd
import re

# Source file and destination file for this step.
INPUT_CSV = "data/spotify_genres_exploded.csv"
OUTPUT_CSV = "data/spotify_unique_genre_buckets.csv"

df = pd.read_csv(INPUT_CSV)

# Normalise the genre labels (string type, no surrounding whitespace,
# lowercase) so the lowercase bucket patterns can match them.
genre_text = df["artist_genres"].astype(str)
df["artist_genres"] = genre_text.str.strip().str.lower()

# Ordered (bucket label, regex patterns) pairs consumed by map_bucket().
# Matching is first-bucket-wins: a genre that fits several buckets is assigned
# to whichever bucket appears first in this list, so order matters.
# Lookarounds are used where a later bucket must claim a compound genre that
# an earlier, broader pattern would otherwise capture.
BUCKETS = [
    ("EDM & Progressive", [
        r"\bedm\b", r"\bprogressive house\b", r"\bbig room\b", r"\bfuture house\b",
        r"\btropical house\b", r"\belectro house\b", r"\bslap house\b", r"\bmelodic techno\b",
        r"\btechno\b", r"\btrance house\b", r"\bstutter house\b"
    ]),
    ("Trance", [
        r"\btrance\b", r"\bprogressive trance\b", r"\buplifting trance\b", r"\bpsytrance\b"
    ]),
    ("Electronica / Chill", [
        # "ambient ... folk" is left for the Folk bucket further down.
        r"\belectronica\b", r"\belectronic\b", r"\bambient(?!.*folk)\b", r"\bdowntempo\b",
        r"\bvaporwave\b", r"\bsynthwave\b", r"\bchillwave\b", r"\bfuture bass\b",
        r"\bmelodic bass\b", r"\blounge\b", r"\bspace music\b"
    ]),
    ("Lo-Fi / Chillhop", [
        r"\blo-?fi\b", r"\blo-?fi beats\b", r"\blo-?fi hip hop\b", r"\bchillhop\b", r"\bjazz beats\b"
    ]),
    ("Pop & Regional Pop", [
        r"\bpop\b", r"\beuropop\b", r"\bnorwegian pop\b", r"\bfrench pop\b", r"\bswedish pop\b",
        r"\bitalo dance\b", r"\beurodance\b", r"\bdance pop\b", r"\belectropop\b",
        r"\bsoft pop\b", r"\bbedroom pop\b", r"\bk-?pop\b", r"\bj-?pop\b", r"\bc-?pop\b",
        r"\bmandopop\b", r"\bpop punk\b", r"\bpop rock\b"
    ]),
    ("Rock / Metal / Core", [
        # The bare "rock" and "emo" patterns skip "folk rock" and "emo rap":
        # those compounds are listed under Folk and Hip-Hop / Rap below, and
        # would never reach those buckets if this earlier bucket matched them.
        r"\b(?<!\bfolk )rock\b", r"\bmetal\b", r"\bmetalcore\b", r"\bpost-hardcore\b", r"\bpost-rock\b",
        r"\bprogressive (metal|rock)\b", r"\bdjent\b", r"\bscreamo\b", r"\bemo\b(?! rap\b)",
        r"\bgrunge\b", r"\bpunk\b", r"\bhard rock\b", r"\bclassic rock\b",
        r"\bfolk metal\b", r"\balternative (rock|metal)\b"
    ]),
    ("Folk / Acoustic / Celtic", [
        r"\bfolk(?! metal)\b", r"\bfolk rock\b", r"\bindie folk\b", r"\bsea shanties\b",
        r"\bceltic\b", r"\bmedieval\b", r"\bsinger-?songwriter\b", r"\bacoustic\b"
    ]),
    ("Hip-Hop / Rap", [
        # "trap" followed anywhere by "metal" or "soul" is not treated as hip-hop.
        r"\bhip hop\b", r"\brap\b", r"\btrap(?!.*(metal|soul))\b", r"\bdrill\b", r"\buk drill\b",
        r"\bgrime\b", r"\bphonk\b", r"\bemo rap\b", r"\bcloud rap\b"
    ]),
    ("Soundtrack / Score / Musicals", [
        r"\bsoundtrack\b", r"\bmusicals?\b", r"\bvgm\b", r"\b(score|original score)\b"
    ]),
]

def map_bucket(genre: str) -> str:
    """Return the label of the first BUCKETS entry whose patterns match *genre*.

    The input is converted to a string, lowercased and stripped before
    matching. Buckets are tried in list order; if no pattern of any bucket
    matches, ``"Others"`` is returned.
    """
    normalized = str(genre).lower().strip()
    matching_labels = (
        label
        for label, regexes in BUCKETS
        if any(re.search(regex, normalized) for regex in regexes)
    )
    return next(matching_labels, "Others")

# Label every row with the coarse bucket its genre falls into.
df["genre_bucket"] = df["artist_genres"].apply(map_bucket)

# Build a per-play key from the timestamp and the track id, joined by "§".
timestamp_text = df["ts"].astype(str)
track_text = df["track_id"].astype(str)
df["play_id"] = timestamp_text + "§" + track_text

# Keep one row per (play, bucket) pair and renumber the index from zero.
dedup = df.drop_duplicates(subset=["play_id", "genre_bucket"], ignore_index=True)

dedup.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved deduplicated data: {OUTPUT_CSV}")
print(f"Rows before: {len(df)}, after dedup: {len(dedup)}")


✅ Saved deduplicated data: data/spotify_unique_genre_buckets.csv
Rows before: 56058, after dedup: 37152
