In [None]:
# Sort train samples into genre folders
import os
import shutil
import json

# Paths
base_folder = "tmp"
metadata_file = "metadata.json"


def _lookup_genre(metadata, key, default="none"):
    """Return the genre folder name stored for ``key`` in ``metadata``.

    Reads ``metadata[key]["metadata"]["genre"]``.

    Args:
        metadata: Object parsed from the metadata JSON file.
        key: Sample key taken from the file name.
        default: Folder name used when no usable genre is found.

    Returns:
        The genre string, or ``default`` when the key is absent, when an
        intermediate level is not a dict (e.g. JSON ``null``), or when the
        genre is not a non-blank string.
    """
    entry = metadata.get(key) if isinstance(metadata, dict) else None
    if not isinstance(entry, dict):
        return default

    details = entry.get("metadata")
    if not isinstance(details, dict):
        return default

    genre = details.get("genre")
    # A null/non-string genre would make os.path.join raise, and a blank one
    # would resolve to base_folder itself (copying a file onto itself).
    if isinstance(genre, str) and genre.strip():
        return genre
    return default


# Load metadata JSON (JSON text is UTF-8; do not rely on the locale default)
with open(metadata_file, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Ensure "none" folder exists for missing metadata or genre
none_folder = os.path.join(base_folder, "none")
os.makedirs(none_folder, exist_ok=True)

# Iterate over all entries in base_folder; sub-directories (including the
# genre folders created below) are skipped by the isfile check.
for filename in os.listdir(base_folder):
    filepath = os.path.join(base_folder, filename)
    if not os.path.isfile(filepath):
        continue

    # The key is everything before the first underscore of the file name
    key = filename.split("_")[0]

    # Resolve the genre, falling back to the "none" folder
    genre = _lookup_genre(metadata, key)

    # Create genre folder if it doesn't exist
    # NOTE: the genre string is used verbatim as the folder name.
    genre_folder = os.path.join(base_folder, genre)
    os.makedirs(genre_folder, exist_ok=True)

    # Copy (not move) the file into its genre folder; the original stays
    # in base_folder.
    shutil.copy(filepath, genre_folder)

In [None]:
import os
import pandas as pd

# Path to your samples folder
# Expected layout: <root_dir>/<genre>/<origin>/<file>.mp3
root_dir = "samples"

# Column order of the resulting table. Passed explicitly to the DataFrame
# constructor so that a scan which finds no mp3 files still produces a frame
# with these columns (otherwise sort_values below raises KeyError).
sample_columns = ["sample_id", "genre", "origin", "file_path"]

# Prepare list to collect sample data
samples_data = []

# Walk through each genre subfolder
for genre_folder in sorted(os.listdir(root_dir)):
    genre_path = os.path.join(root_dir, genre_folder)

    # Skip stray files sitting directly in root_dir; calling os.listdir on
    # them would raise NotADirectoryError.
    if not os.path.isdir(genre_path):
        continue

    # Walk through each origin subfolder
    for origin_folder in sorted(os.listdir(genre_path)):
        genre_origin_path = os.path.join(genre_path, origin_folder)

        if not os.path.isdir(genre_origin_path):
            continue

        # Sorted listing of everything in the origin folder
        files = sorted(os.listdir(genre_origin_path))

        # NOTE: `i` is the position in the full sorted listing (non-mp3
        # entries included), so sample_ids can have gaps and will shift
        # when files are added to or removed from the folder.
        for i, file in enumerate(files):
            if file.endswith(".mp3"):
                sample_id = f"{genre_folder}_{origin_folder}_{i:04d}"
                # Normalise Windows separators so stored paths are portable
                file_path = os.path.join(genre_origin_path, file).replace("\\", "/")

                samples_data.append({
                    "sample_id": sample_id,
                    "genre": genre_folder,
                    "origin": origin_folder,
                    "file_path": file_path
                })

# Convert to DataFrame
samples_df = pd.DataFrame(samples_data, columns=sample_columns)

# Sort for cleanliness
samples_df = samples_df.sort_values(["genre", "origin", "sample_id"])

# Preview
samples_df.head()

In [None]:
import pandas as pd
import os

# Requires `samples_df` built by the earlier scanning cell.

# Location of the master CSV that accumulates samples across runs
master_csv_path = "samples.csv"

if not os.path.exists(master_csv_path):
    # First run: nothing to merge with, the fresh scan becomes the master table
    combined_df = samples_df
else:
    # sample_id is read back as text so it is not reinterpreted as a number
    existing_samples_df = pd.read_csv(master_csv_path, dtype={"sample_id": str})

    # Append the new rows after the stored ones, then keep one row per
    # sample_id. drop_duplicates keeps the first occurrence by default, so a
    # row already in the CSV wins over a freshly scanned row with the same id.
    combined_df = (
        pd.concat([existing_samples_df, samples_df], ignore_index=True)
        .drop_duplicates(subset=["sample_id"])
    )

# Write the merged table back over the master CSV
combined_df.to_csv(master_csv_path, index=False)

print("✅ samples.csv has been updated successfully!")