In [2]:
import pandas as pd

import os 
import glob

In [3]:
# Root directory of the MUSDB18-HQ data. The following cells list its "train"
# and "test" subfolders to collect the MUSDB18 track names. The leading "~" is
# left unexpanded here; each use site calls os.path.expanduser on it.

MUSDB_PATH = "~/Documents/data/musdb18hq/canonical"

In [4]:
# Names of the entries directly under <MUSDB_PATH>/train (one entry per track).
musdb_train_dir = os.path.join(os.path.expanduser(MUSDB_PATH), "train")
musdb_train = [
    os.path.basename(entry)
    for entry in glob.glob(os.path.join(musdb_train_dir, "*"))
]

In [5]:
# Names of the entries directly under <MUSDB_PATH>/test (one entry per track).
musdb_test_dir = os.path.join(os.path.expanduser(MUSDB_PATH), "test")
musdb_test_entries = glob.glob(os.path.join(musdb_test_dir, "*"))
musdb_test = list(map(os.path.basename, musdb_test_entries))

In [6]:
# Validation track names are taken from the .npz files found in
# <MUSDB_PATH>/../intermediates/npz/val (one file per validation track).
musdb_val_dir = os.path.join(os.path.expanduser(MUSDB_PATH), "../intermediates/npz/val")

# Strip only the trailing ".npz". Splitting on the first "." would truncate any
# track name that itself contains a dot (e.g. "... (feat. Someone)").
# Sorted because os.listdir returns entries in arbitrary order.
musdb_val = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(musdb_val_dir)
    if f.endswith(".npz")
)

musdb_val

['ANiMAL - Rockshow',
 'Actions - One Minute Smile',
 'Alexander Ross - Goodbye Bolero',
 'Clara Berry And Wooldog - Waltz For My Victims',
 'Fergessen - Nos Palpitants',
 'James May - On The Line',
 'Johnny Lokke - Promises & Lies',
 'Leaf - Summerghost',
 'Meaxic - Take A Step',
 'Patrick Talbot - A Reason To Leave',
 'Skelpolu - Human Mistakes',
 'Traffic Experiment - Sirens',
 'Triviul - Angelsaint',
 'Young Griffo - Pennies']

In [9]:
# Root directory of the rawstems data; every entry directly below it is
# treated as one track folder.
RAWSTEMS_PATH = "~/Documents/data/rawstems/canonical/"

rawstems_entries = glob.glob(os.path.join(os.path.expanduser(RAWSTEMS_PATH), "*"))
rawstems = list(map(os.path.basename, rawstems_entries))

In [8]:
# Split every folder name on " - " into its components (expected: artist, title;
# the loop that consumes this list unpacks exactly two values per entry).
# This cell rebinds `rawstems` from a list of strings to a list of lists, so
# entries that were already split are passed through untouched; without the
# guard, running the cell a second time raises AttributeError (list has no
# .split).
rawstems = [fn.split(" - ") if isinstance(fn, str) else fn for fn in rawstems]

rawstems

[]

In [28]:
# Human-readable "Artist - Title" names of all rawstems tracks. Despite the
# name this is a plain list of strings, not a DataFrame.
rawstems_df = []

for artist, title in rawstems:
    # Folder names use "_" in place of spaces; undo that and trim the ends.
    artist, title = (part.replace("_", " ").strip() for part in (artist, title))

    track_name = " - ".join((artist, title))
    rawstems_df.append(track_name)

In [29]:
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
MUSDB_TRACKLIST_CSV = (
    "/home/hice1/kwatchar3/scratch/musdb25-multitrack/content/"
    "cleaned_tracklist_w_path_final.csv"
)

musdb_tracklist = pd.read_csv(MUSDB_TRACKLIST_CSV)

In [30]:
# Keep only the rows whose Source is "DSD". The explicit copy makes the result
# an independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
musdb_tracklist = musdb_tracklist.loc[musdb_tracklist["Source"] == "DSD"].copy()

In [31]:
# Derive the split from the path: the first path component after "canonical/".
# .assign returns a new frame instead of writing into one that may be a filtered
# view of the original, which avoids SettingWithCopyWarning.
musdb_tracklist = musdb_tracklist.assign(
    split=musdb_tracklist["musdb18_path"].apply(
        lambda path: path.split("canonical/")[-1].split("/")[0]
    )
)

In [32]:
# Number of tracklist rows per split value.
split_counts = musdb_tracklist.value_counts(subset="split")
split_counts

split
train    54
test     46
Name: count, dtype: int64

In [36]:
# Sort every tracklist row that also exists in rawstems into the MUSDB18
# train / val / test lists. Tracks whose split is "train" and whose name is in
# `musdb_val` count as validation tracks, and their rawstems folder is renamed
# from a ".../musdb18_train/..." path to the matching ".../musdb18_val/..." one.
track_in_musdb_train = []
track_in_musdb_val = []
track_in_musdb_test = []

# Tracklist names that must be mapped to a different name before use.
track_name_aliases = {"Triviul - Widow": "Triviul - Widow (feat. The Fiend)"}

# Sets give constant-time membership checks inside the loop.
rawstems_names = set(rawstems_df)
musdb_val_names = set(musdb_val)

for _, row in musdb_tracklist.iterrows():
    track_name = f"{row['Artist'].strip()} - {row['Title'].strip()}"

    if track_name in track_name_aliases:
        print("Found special case")
        track_name = track_name_aliases[track_name]
    elif track_name not in rawstems_names:
        # Not a rawstems track: nothing to record for this row.
        continue

    if row["split"] == "train":
        if track_name in musdb_val_names:
            original_path = os.path.join(
                os.path.expanduser(RAWSTEMS_PATH),
                f"{row['Artist'].replace(' ', '_')} - {row['Title'].replace(' ', '_')}",
            )
            # NOTE(review): this only changes the path when RAWSTEMS_PATH contains
            # "musdb18_train"; the value assigned earlier in this notebook does
            # not, in which case source and destination are identical.
            new_path = original_path.replace("musdb18_train", "musdb18_val")

            # Re-running the cell after a successful move must not fail: skip the
            # rename when only the destination exists.
            already_moved = os.path.exists(new_path) and not os.path.exists(original_path)
            if not already_moved:
                os.makedirs(os.path.dirname(new_path), exist_ok=True)
                os.rename(original_path, new_path)

            track_in_musdb_val.append(track_name)
            print(f"Found in val: {track_name}")
        else:
            track_in_musdb_train.append(track_name)
    elif row["split"] == "test":
        track_in_musdb_test.append(track_name)
    else:
        raise ValueError(f"Unknown split: {row['split']}")

Found in val: Actions - One Minute Smile
Found in val: Fergessen - Nos Palpitants
Found in val: James May - On The Line
Found in val: Leaf - Summerghost
Found in val: Skelpolu - Human Mistakes
Found in val: Traffic Experiment - Sirens
Found in val: Triviul - Angelsaint
Found special case
Found in val: Young Griffo - Pennies


In [None]:
from collections import defaultdict
import pandas as pd



def _canonical_stem_path(stem):
    """Return the relative stem path `stem` with the known naming typos fixed."""
    stem = stem.replace("Bss", "Bass")
    stem = stem.replace("VocLV", "Voc/LV")
    if "Voc/LV" not in stem:
        # Covers paths where the leading "V" of "Voc/LV" is missing.
        stem = stem.replace("oc/LV", "Voc/LV")
    stem = stem.replace("Rhy/OERC", "Rhy/PERC")

    if "Room-AR70.wav" in stem:
        stem = "Misc/Room/Room-AR70.wav"

    return stem


# Build one row per rawstems track folder: artist, title, MUSDB18 split and the
# number of stem files per stem-class folder. Every stem path that had to be
# corrected is recorded in `error_log`.
rawstems = glob.glob(os.path.join(os.path.expanduser(RAWSTEMS_PATH), "*"))
# Sorted so that the row order does not depend on directory listing order.
rawstems = sorted(os.path.basename(p) for p in rawstems)

error_log = []

records = []

for filename in rawstems:
    # Folder names are expected to look like "<artist>|<title>".
    comp = filename.split("|")

    if len(comp) != 2:
        raise ValueError(f"Unexpected filename format: {filename}")

    # Folder names use "_" in place of spaces.
    artist, title = (part.replace("_", " ").strip() for part in comp)

    track_name = f"{artist} - {title}"
    if artist == "Triviul":
        print(track_name)
    item = {
        "artist": artist,
        "title": title,
        "folder_name": filename
    }

    if track_name in track_in_musdb_train:
        item['musdb18_split'] = 'train'
    elif track_name in track_in_musdb_val:
        # Validation tracks are kept in their own list; without this branch they
        # would be labelled 'none', as if they were not part of MUSDB18 at all.
        item['musdb18_split'] = 'val'
    elif track_name in track_in_musdb_test:
        item['musdb18_split'] = 'test'
    else:
        item['musdb18_split'] = 'none'

    folder = os.path.join(os.path.expanduser(RAWSTEMS_PATH), filename)
    # Escape the folder part: track names may contain glob metacharacters
    # ("?", "[", "*"), which would otherwise be interpreted as wildcards.
    stems = glob.glob(os.path.join(glob.escape(folder), "**/*.*"), recursive=True)
    stems = [os.path.relpath(s, folder) for s in stems]

    num_stems = len(stems)
    stem_distribution = defaultdict(int)
    top_level_stem_distribution = defaultdict(int)
    for original_stem in stems:
        stem = _canonical_stem_path(original_stem)

        if original_stem != stem:
            error_log.append({
                "stem_type": stem.split("/")[0],
                "track_id": filename,
                "original_path": original_stem,
                "corrected_path": stem,
            })

        # Everything except the file name itself is the stem class.
        stem_classes = stem.split("/")[:-1]
        if len(stem_classes) == 0:
            print(f"Unknown stem class for stem: {stem}")
            raise ValueError(f"Unknown stem class for stem: {stem}")

        top_level_stem = stem_classes[0]
        stem_classes = "/".join(stem_classes)

        stem_distribution[stem_classes] += 1
        top_level_stem_distribution[top_level_stem] += 1

    item.update(stem_distribution)

    # Per-top-level totals are stored under "<top level>/**".
    for k, v in top_level_stem_distribution.items():
        item[f"{k}/**"] = v

    item['num_stems'] = num_stems

    records.append(item)

# A stem class missing for a track means zero stems of that class.
df = pd.DataFrame(records).fillna(0)
idx_keys = ['artist', 'title', 'folder_name', 'musdb18_split', 'num_stems']
other_keys = [k for k in df.columns if k not in idx_keys]
# fillna turned the count columns into floats; cast them back to int in one go.
df = df[idx_keys + sorted(other_keys)].astype({col: int for col in other_keys})

# df.to_csv(os.path.join(os.path.expanduser(RAWSTEMS_PATH), "../..", "tracklist.csv"), index=False)

Triviul - Alright?
Triviul - Angelsaint
Triviul - Better?
Triviul - Dorothy
Triviul - Gimme
Triviul - To Sam Rawfers
Triviul - Widow (feat. The Fiend)


In [1]:
# Display the assembled tracklist. The NameError recorded for this cell comes
# from running it with execution count 1, i.e. before the cell that creates
# `df`; run the notebook top to bottom to get the table.
df

NameError: name 'df' is not defined

In [None]:
# Absolute location of tracklist.csv, two directory levels above RAWSTEMS_PATH.
# NOTE(review): with RAWSTEMS_PATH as assigned in this notebook this resolves to
# ".../Documents/data/tracklist.csv", which differs from the recorded output;
# the output was presumably produced with a different RAWSTEMS_PATH - confirm.
rawstems_root = os.path.expanduser(RAWSTEMS_PATH)
os.path.abspath(os.path.join(rawstems_root, os.pardir, os.pardir, "tracklist.csv"))

'/home/hice1/kwatchar3/Documents/data/rawstems/tracklist.csv'

In [None]:
# Turn the list of correction records into a table. The columns are named
# explicitly so that an empty log still yields a frame with the expected
# columns (sorting by 'stem_type' / 'track_id' would otherwise raise KeyError).
error_log = pd.DataFrame(
    error_log,
    columns=["stem_type", "track_id", "original_path", "corrected_path"],
)

In [None]:
# Order the log by stem type (descending), then by track id (ascending).
error_log_sort_ascending = {"stem_type": False, "track_id": True}
error_log = error_log.sort_values(
    by=list(error_log_sort_ascending),
    ascending=list(error_log_sort_ascending.values()),
)

In [None]:
# error_log.to_csv("rawstems_error_log.csv", index=False)