# Phase 1: Data Cleaning

For the prediction model


## Phase 1: Clean the Data
Database: billboard_global_200.csv<br>
All of the Global 200 rankings on Billboard since its beginning.

In [None]:
import csv
import re

# Raw Billboard Global 200 export read by this phase.
# NOTE(review): "..data/" names a directory literally called "..data"; if the
# "data" folder one level up was intended this should be "../data/" — confirm.
address = "..data/billboard_global_200.csv"
# Output of this phase: the first row seen for each canonical (song, artists) key.
first_appearance_addy = "..data/clean1.csv"

# Canonical keys that have already been written; later rows with the same key are dropped.
seen = set()

# Separators between credited artists.  The word separators are anchored on
# word boundaries so that names which merely contain them ("Ariana Grande",
# "Maxwell", "Withers") are not cut apart, and the feature markers are matched
# as whole words so "featuring" is consumed entirely instead of stopping after
# "feat" and leaving "uring" glued to the next artist.
_ARTIST_SEPARATORS = re.compile(
    r',|&|\+|\bw/|\b(?:featuring|feat|ft)\b\.?|\b(?:and|with|x)\b',
    flags=re.IGNORECASE,
)

# Placeholder values that mean "no artist in this column".
_MISSING_ARTIST = frozenset({"n/a", "none"})


def normalize(s: str) -> str:
    """Return *s* lower-cased, trimmed, with whitespace runs collapsed to one space."""
    return re.sub(r'\s+', ' ', s.lower().strip())


def canonical_key(row):
    """Build an order- and formatting-insensitive identity for a chart row.

    Args:
        row: a CSV row laid out as
            [date, rank, title, main_artist, featured_artists].

    Returns:
        ``(title, artists)`` where ``title`` is the normalized song title and
        ``artists`` is a sorted tuple of the distinct normalized artist names
        found in both artist columns.  Rows that credit the same people with
        different separators ("A feat. B", "A featuring B", "A & B") or in a
        different order therefore map to the same key.
    """
    song = normalize(row[2])

    # Both artist columns are pooled: the key does not care who is "main".
    artist_blob = f"{row[3]},{row[4]}"

    names = set()
    for piece in _ARTIST_SEPARATORS.split(artist_blob):
        cleaned = normalize(piece)
        if cleaned and cleaned not in _MISSING_ARTIST:
            names.add(cleaned)

    return (song, tuple(sorted(names)))


# Stream the raw chart once and keep only the first row seen for every
# canonical (song, artists) key.
# newline='' on the input lets the csv module handle line endings itself, so
# quoted fields containing line breaks are parsed correctly.
with open(address, 'r', newline='', encoding='utf-8') as file, \
     open(first_appearance_addy, 'w', newline='', encoding='utf-8') as write_file:

    reader = csv.reader(file)
    writer = csv.writer(write_file)

    # An empty input has no header row; write nothing rather than raise StopIteration.
    header = next(reader, None)
    if header is not None:
        writer.writerow(header)

    for row in reader:
        # canonical_key reads columns 2-4, so shorter rows cannot be keyed.
        if len(row) < 5:
            continue

        key = canonical_key(row)
        if key not in seen:
            seen.add(key)
            writer.writerow(row)

print(f"{len(seen)} unique songs written.")
print("Data cleaning phase 1 done.")


4325 unique songs written.
Data cleaning phase 1 done.


## Data Cleaning Phase 2
- Convert dates to a numerical format (decades since September 5th, 2020), which keeps the value below 1 for the first decade of data without losing accuracy
- Normalize ranking into the range (0, 1] as 1/sqrt(rank), so the #1 rank is 1 and the #200 rank is about 0.07; greater is better.
- Separate collaborations properly(!!!)
- Replace N/A values (when applicable) with "None"

In [None]:
import csv
import datetime
import math
import re

# Input: the de-duplicated rows written by Phase 1.
# NOTE(review): "..data/" names a directory literally called "..data"; if the
# "data" folder one level up was intended this should be "../data/" — confirm.
address = "..data/clean1.csv"
# Output of this phase.
target = "..data/clean2.csv"

# Header of the output file; it is split on ',' when written.
# NOTE(review): the spaces after some commas end up inside the header cells
# (" rank_norm", " title") unless the writer strips them.
columns = "decades_since_2020_Sept,date,rank, rank_norm, title,main_artist,featured_artists"

# Reference date: chart dates are expressed as an offset from this day.
day_one = datetime.datetime(2020, 9, 5)

# Helpers
def graft_exceptions(artist_list, exceptions=None, splitters=None):
    """Re-join pieces of protected artist names that the splitters cut apart.

    Names such as "Earth, Wind & Fire" or "AC/DC" contain separator
    characters, so splitting a credit line breaks them into several pieces.
    The separators themselves are discarded by the split, which means the
    pieces cannot be glued back with a fixed string and compared to the
    exception list.  Instead every exception is split with the very same
    splitters, and any run of consecutive pieces equal to an exception's
    fragments is replaced by the exception's canonical spelling.

    A piece ending in "And His Orchestra" contains no splitter and therefore
    already arrives whole; it is passed through untouched.  (Merging on that
    suffix could only ever glue unrelated preceding artists onto it.)

    Args:
        artist_list: pieces produced by splitting one credit string, in order.
        exceptions: protected names; defaults to the module-level EXCEPTIONS.
        splitters: separator strings used for splitting; defaults to the
            module-level SPLITTERS.

    Returns:
        A list in which each matched run of pieces is replaced by the
        protected name.  Lists of length 0 or 1 are returned unchanged.
    """
    if len(artist_list) <= 1:
        return artist_list

    if exceptions is None:
        exceptions = EXCEPTIONS
    if splitters is None:
        splitters = SPLITTERS

    def fragments_of(name):
        # Mirror the caller's splitting: apply each splitter in turn, trim,
        # and drop empty pieces.
        pieces = [name]
        for sep in splitters:
            pieces = [part.strip() for piece in pieces for part in piece.split(sep)]
        return tuple(part for part in pieces if part)

    # fragments -> canonical name; sorted() keeps the choice deterministic if
    # two exceptions ever split into the same fragments.
    fragments_to_name = {}
    for name in sorted(exceptions):
        fragments = fragments_of(name)
        if len(fragments) > 1:
            fragments_to_name.setdefault(fragments, name)

    if not fragments_to_name:
        return artist_list

    longest = max(len(fragments) for fragments in fragments_to_name)
    total = len(artist_list)

    grafted = []
    i = 0
    while i < total:
        # Try the longest possible run first so "Earth, Wind & Fire" wins
        # over any shorter exception sharing its leading fragments.
        upper = min(i + longest, total)
        for j in range(upper, i + 1, -1):
            name = fragments_to_name.get(tuple(artist_list[i:j]))
            if name is not None:
                grafted.append(name)
                i = j
                break
        else:
            grafted.append(artist_list[i])
            i += 1
    return grafted

# Feature markers, matched as whole words only (optionally followed by a dot)
# so names that merely contain the letters ("Taylor Swift", "Daft Punk") and
# the word "featuring" itself are handled correctly.
_FEAT_MARKER = re.compile(r'\b(?:featuring|Featuring|feat|ft)\b\.?')
# Stand-alone lowercase "x" used as a collaboration marker ("A x B").
_LOOSE_X = re.compile(r'\s+x\s+')
# Capitalised "With" glued to a neighbouring word, e.g. "SingersWithJohn".
# It must be followed by an upper-case letter or a word boundary, which keeps
# surnames such as "Withers" intact.
_GLUED_WITH = re.compile(r'(?:(?<=[a-z\d.)])|\b)With(?:(?=[A-Z])|\b)')
# Stand-alone "with" in any casing, together with the whitespace around it.
_LOOSE_WITH = re.compile(r'\s*\bwith\b\s*', re.IGNORECASE)


def split_artists(main, sec):
    """Split the two raw artist columns into one main artist and the features.

    Args:
        main: raw text of the main-artist column.
        sec: raw text of the featured-artists column ("N/A", "None", empty
            or None all mean "no featured artists").

    Returns:
        ``(main_artists, sec_artists)``: ``main_artists`` holds at most one
        name; every additional artist found in the main column is moved to
        the front of ``sec_artists``.

    Relies on the module-level SPLITTERS and EXCEPTIONS and on
    graft_exceptions().
    """
    def normalize_artists(s, sec_column_nonempty=False):
        # Bring collaboration markers into the exact spelling the splitter
        # list expects and collapse whitespace.
        if not s or s.strip() in {'N/A', 'None'}:
            return ''
        if sec_column_nonempty:
            s = _FEAT_MARKER.sub(' feat. ', s)
        s = _LOOSE_X.sub(' x ', s)
        s = _GLUED_WITH.sub(' With ', s)
        s = _LOOSE_WITH.sub(' With ', s)
        s = re.sub(r'\s+', ' ', s)
        return s.strip()

    def split_names(names, is_feature=False):
        names = normalize_artists(names)
        if not names:
            return []

        # A string that is exactly a protected name is one artist, whichever
        # column it came from.
        if names in EXCEPTIONS:
            return [names]
        if is_feature and names.endswith("And His Orchestra"):
            return [names]

        # Otherwise cut on every splitter in turn.
        pieces = [names]
        for splitter in SPLITTERS:
            pieces = [part.strip() for piece in pieces for part in piece.split(splitter)]

        # Drop empties and repeats while keeping first-seen order.
        unique = list(dict.fromkeys(piece for piece in pieces if piece))
        return graft_exceptions(unique)

    # Feature markers in the main column are only rewritten when the
    # featured column actually carries something.
    sec_text = (sec or '').strip()
    has_features = bool(sec_text and sec_text != "N/A")

    main_artists = split_names(normalize_artists(main, has_features))
    sec_artists = split_names(sec, is_feature=True)

    if len(main_artists) > 1:
        sec_artists = main_artists[1:] + sec_artists
        main_artists = [main_artists[0]]

    return main_artists, sec_artists


# Separator strings used to cut a credit line into artists.  They are applied
# in list order, so 'Duet With' must come before ' With ' or it could never
# match (the shorter splitter would already have consumed the "With").
SPLITTERS = [',', '&', ' + ', ' and ',
             ' feat. ', ' ft. ', ' featuring ', ' Featuring ',
             ' x ', ' X ', 'Duet With', ' with ', ' With ', 'w/', "/"]

# Artist names that legitimately contain a separator and must stay whole.
EXCEPTIONS = {
    "Tyler, the Creator",
    "Tyler, The Creator",

    "John Scott Trotter & His Orchestra",
    "Ralph Carmichael Orchestra and Chorus",
    "Georgie Stoll & His Orchestra",

    "AC/DC",
    "Earth, Wind & Fire",
    "TWS: 24/7",
    "HUNTR/X",
}

# newline='' on the input lets the csv module handle line endings itself.
with open(address, 'r', newline='', encoding='utf-8') as file, \
     open(target, 'w', newline='', encoding='utf-8') as write_file:

    cursor = csv.reader(file)
    writer = csv.writer(write_file)
    # Strip each header cell: `columns` has spaces after some commas.
    writer.writerow([name.strip() for name in columns.split(',')])
    next(cursor, None)  # skip the input header (tolerates an empty file)

    for row in cursor:
        # Columns 0-4 are read below; skip rows that are too short.
        if len(row) < 5:
            continue

        try:
            date = datetime.datetime.strptime(row[0], "%Y-%m-%d")
            rank = int(row[1])
        except ValueError:
            continue
        # Ranks start at 1; zero or negative values cannot be normalized.
        if rank < 1:
            continue

        # Offset from day_one in units of 3652 days, plus a tiny epsilon.
        diff = ((date - day_one).days / 3652) + 0.000001
        # 1 for rank #1, falling towards 0 as the rank gets worse.
        rank_norm = 1 / math.sqrt(rank)

        main = row[3]
        sec = row[4]

        n1, n2 = split_artists(main, sec)

        # Names are wrapped in single quotes; the feature list is additionally
        # wrapped in literal double quotes.
        main_out = f"'{n1[0]}'" if n1 else "'None'"
        sec_out = '"' + ", ".join([f"'{artist}'" for artist in n2]) + '"' if n2 else '"None"'

        writer.writerow([diff, row[0], row[1], rank_norm, row[2], main_out, sec_out])

print("Finished cleaning phase 2")


Finished cleaning phase 2


## Data Cleaning Phase 3

- Use embeddings for artist names to see which artists are closer to each other
- Create charting recency bias (more recent data weighted more heavily), using numerical format of date

**Feature Engineering**
- For each song, aggregate the past performance of the main artist up to that point
- For each song, aggregate the combined past performance of collaborators (again, w/ recency bias)

Keep in mind that the release date and song title are ignored by the model, but they are kept in the data in case I want to use them in the future.

* In the future, possibly use non-debut data to examine 'longevity' of artists (might be a waste of time). That is, don't ONLY consider the debut of a given artist's songs, just consider it more heavily.

In [None]:
import csv
import math
import re

# Input: the rows written by Phase 2.
# NOTE(review): "..data/" names a directory literally called "..data"; if the
# "data" folder one level up was intended this should be "../data/" — confirm.
address = "..data/clean2.csv"
# Output of this phase: the input columns plus two engineered features.
target = "..data/clean3.csv"

# Phase 2 header, kept for reference:
# columns = "decades_since_2020_Sept,date,rank, rank_norm, title,main_artist,featured_artists"
# Header of the output file; each cell is stripped when written.
new_cols = "decades_since_2020_Sept,date,rank, rank_norm,title,main_artist,featured_artists,artist_past_performance,features_past_performance"

# Rate of the exponential decay applied to an artist's history; the time axis
# is the decades column, so history is scaled by exp(-DECAY_LAMBDA * decades).
# Less aggressive decay (decades axis). Adjust if you want longer memory.
DECAY_LAMBDA = 1.0
# Feature score used when a song has no featured artists.
# Make feature default consistent with main artist default
FEATURES_DEFAULT = 0.5

artist_pat = re.compile(r"'([^']*)'")  # names inside single quotes
# Comma together with any whitespace around it.
COMMA_SPLIT = re.compile(r'\s*,\s*')


def parse_time(s: str):
    """Parse the decades_since_2020_Sept column.

    Accepts a plain float ("0.123") or a bracketed one ("[0.123]").

    Returns:
        The value as a float, or None when the cell is missing, empty,
        unparsable or not a finite number.  Bad values are deliberately NOT
        coerced to 0.0 (that would inflate early scores), and "nan"/"inf" are
        rejected because they would corrupt both the chronological sort and
        the decay state.
    """
    if s is None:
        return None

    text = s.strip()
    # Unwrap a bracketed value like "[0.123]".
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1].strip()
    if not text:
        return None

    try:
        value = float(text)
    except ValueError:
        return None
    return value if math.isfinite(value) else None


def parse_main(name: str) -> str:
    """Return the main-artist cell without its surrounding single quotes.

    A missing cell (None) yields the empty string.  Whitespace is trimmed
    both outside and inside the quotes.
    """
    if name is None:
        return ""
    unquoted = name.strip().strip("'")
    return unquoted.strip()


def parse_features(s: str) -> list[str]:
    """Parse the featured-artists cell into a list of names.

    Primary format (Phase 2): single-quoted names joined by commas, the whole
    list optionally wrapped in double quotes, e.g. ``"'A', 'B'"``.  The list
    is cut on the quote-comma-quote delimiter rather than by matching quote
    pairs, so names that contain an apostrophe ("Guns N' Roses") survive.

    Fallback: anything else is split on commas and stripped of quotes.

    Returns:
        The names in order, without empty entries; an empty list for None,
        an empty cell, ``None`` or ``"None"``.
    """
    if s is None:
        return []
    text = s.strip()
    if text in ('', 'None', '"None"'):
        return []

    # Remove the double quotes wrapping the whole list, if present.
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1].strip()

    if len(text) >= 2 and text.startswith("'") and text.endswith("'"):
        # Drop the outermost quotes, then cut on "', '" (spacing tolerant).
        names = re.split(r"'\s*,\s*'", text[1:-1])
    else:
        # Not in Phase 2 format: plain comma-separated names.
        names = [part.strip().strip("'").strip().strip('"') for part in text.split(',')]

    return [name.strip() for name in names if name.strip()]


def decay_to(t_now: float, state_tuple):
    """Age an artist's running totals from their last update to ``t_now``.

    ``state_tuple`` is ``(S, W, t0)``: the decayed score sum, the decayed
    weight sum and the time of the last update.  Both sums are scaled by
    ``exp(-DECAY_LAMBDA * (t_now - t0))`` and the time moves to ``t_now``.

    When ``t_now`` is None or not later than ``t0`` the state is returned
    unchanged, so the last-update time never moves backward.
    """
    total, weight, last_seen = state_tuple

    # Nothing to age for a missing time or an out-of-order row.
    if t_now is None or t_now <= last_seen:
        return total, weight, last_seen

    elapsed = t_now - last_seen
    factor = math.exp(-DECAY_LAMBDA * elapsed)
    return total * factor, weight * factor, t_now


# Phase 2 writes this name when a row has no main artist; it is a placeholder
# and must not accumulate a shared "artist" history.
_NO_ARTIST = ("", "None")


def _past_performance(name, t, state, default=0.5):
    """Decayed mean rank_norm of `name` just before time `t`; `default` without history."""
    S, W, _ = decay_to(t, state.get(name, (0.0, 0.0, t)))
    return (S / W) if (W and W > 1e-8) else default


with open(address, "r", newline="", encoding="utf-8") as f_in, \
     open(target, "w", newline="", encoding="utf-8") as f_out:

    r = csv.reader(f_in)
    w = csv.writer(f_out)

    # write header
    w.writerow([c.strip() for c in new_cols.split(",")])
    next(r, None)  # skip original header

    # Keep only well-formed rows with a usable time, paired with that time.
    # Filtering before the sort means blank or short rows can no longer make
    # the sort key fail, and they would have been skipped afterwards anyway.
    rows = []
    for row in r:
        if len(row) < 7:
            continue
        t = parse_time(row[0])
        if t is None:
            continue
        rows.append((t, row))

    # Chronological order; ties are broken by date text, then by rank.
    def sort_key(item):
        t, row = item
        try:
            rank_int = int(row[2]) if row[2].strip() != "" else 9999
        except ValueError:
            rank_int = 9999
        return (t, row[1], rank_int)

    rows.sort(key=sort_key)  # stable; ok for Billboard-size data

    # per-artist state: name -> (S, W, last_time)
    state: dict[str, tuple[float, float, float]] = {}

    for t, row in rows:
        # rank_norm: prefer column 3; fall back to computing it from the rank
        # (column 2); final fallback 0.5
        try:
            r_norm = float(row[3])
        except ValueError:
            try:
                rank_raw = int(row[2])
            except ValueError:
                r_norm = 0.5
            else:
                r_norm = 1.0 / math.sqrt(rank_raw) if rank_raw > 0 else 0.5

        main = parse_main(row[5])
        feats = [a for a in parse_features(row[6]) if a not in _NO_ARTIST]

        # past performance before the current song is added (no leakage)
        if main in _NO_ARTIST:
            main_perf = 0.5
        else:
            main_perf = _past_performance(main, t, state)

        feat_perfs = [_past_performance(a, t, state) for a in feats]
        feats_perf = (sum(feat_perfs) / len(feat_perfs)) if feat_perfs else FEATURES_DEFAULT

        # write engineered features
        w.writerow(row + [main_perf, feats_perf])

        # update states with the current song for all real participants
        participants = {main, *feats} - set(_NO_ARTIST)
        for a in participants:
            S0, W0, t0 = state.get(a, (0.0, 0.0, t))
            S0, W0, _ = decay_to(t, (S0, W0, t0))
            S0 += r_norm
            W0 += 1.0
            # ensure last_time never moves backwards
            last_time = t if (t >= t0) else t0
            state[a] = (S0, W0, last_time)

print("Finished Phase 3")


Finished Phase 3
