In [1]:
import os

def get_file_list(audio_folder, text_folder):
    """
    This function gets the base file names that are present in both the audio and text folders.

    Args:
    - audio_folder: Path to the folder containing the audio files.
    - text_folder: Path to the folder containing the text files.

    Returns:
    - A list of base file names (without extensions) that are present in both folders.
    """
    # Get the set of audio files (without extensions)
    audio_files = set(os.path.splitext(f)[0] for f in os.listdir(audio_folder) if f.endswith('.wav'))

    # Get the set of text files (without extensions)
    text_files = set(os.path.splitext(f)[0] for f in os.listdir(text_folder) if f.endswith('.txt'))

    # Find the intersection of both sets (i.e., files present in both folders)
    common_files = list(audio_files.intersection(text_files))

    return sorted(common_files)


In [13]:

# Example usage
audio_folder = '/root/dev/data/audio'
text_folder = '/root/dev/data/text'
file_list = get_file_list(audio_folder, text_folder)

print(f"Found {len(file_list)} matching files.")
for file in file_list:
    print(file)


Found 270 matching files.
MAPS_MUS-alb_esp2_AkPnCGdD
MAPS_MUS-alb_esp2_AkPnStgb
MAPS_MUS-alb_esp2_SptkBGAm
MAPS_MUS-alb_esp2_SptkBGCl
MAPS_MUS-alb_esp3_AkPnCGdD
MAPS_MUS-alb_esp4_AkPnStgb
MAPS_MUS-alb_esp5_SptkBGCl
MAPS_MUS-alb_esp6_SptkBGCl
MAPS_MUS-alb_se2_ENSTDkCl
MAPS_MUS-alb_se2_StbgTGd2
MAPS_MUS-alb_se3_AkPnBcht
MAPS_MUS-alb_se3_AkPnBsdf
MAPS_MUS-alb_se4_AkPnBsdf
MAPS_MUS-alb_se6_AkPnStgb
MAPS_MUS-alb_se7_AkPnBsdf
MAPS_MUS-alb_se8_SptkBGAm
MAPS_MUS-appass_1_AkPnBsdf
MAPS_MUS-appass_1_SptkBGCl
MAPS_MUS-appass_3_AkPnStgb
MAPS_MUS-bach_846_AkPnBcht
MAPS_MUS-bach_846_SptkBGAm
MAPS_MUS-bach_847_AkPnBcht
MAPS_MUS-bach_847_AkPnStgb
MAPS_MUS-bach_847_SptkBGCl
MAPS_MUS-bach_850_AkPnBsdf
MAPS_MUS-bk_xmas1_AkPnBsdf
MAPS_MUS-bk_xmas1_AkPnStgb
MAPS_MUS-bk_xmas1_ENSTDkAm
MAPS_MUS-bk_xmas1_ENSTDkCl
MAPS_MUS-bk_xmas1_StbgTGd2
MAPS_MUS-bk_xmas2_AkPnStgb
MAPS_MUS-bk_xmas2_SptkBGCl
MAPS_MUS-bk_xmas3_AkPnStgb
MAPS_MUS-bk_xmas4_ENSTDkCl
MAPS_MUS-bk_xmas4_SptkBGCl
MAPS_MUS-bk_xmas4_StbgTGd2
MAPS_MUS-b

In [9]:
import os

# Assuming your dataset is structured like this:
maps_files = file_list

# Extract the unique instruments and pieces
instruments = set()
pieces = set()

for file_name in maps_files:
    # Split by "-" to get the piece part
    piece_name = file_name.split('-')[1]  # The second element after splitting by "-"

    # Split by "_" and take the last element for the instrument name
    instrument_name = file_name.split('_')[-1]

    # Add to sets
    instruments.add(instrument_name)
    pieces.add(piece_name)

# Display the unique instruments and pieces
print(f"{len(instruments)}Unique Instruments (Categories for Training/Validation/Testing):")
for instrument in instruments:
    print(instrument)

print(f"\n{len(pieces)} Unique Pieces (Song Names):")
for piece in pieces:
    print(piece)


9Unique Instruments (Categories for Training/Validation/Testing):
ENSTDkCl
StbgTGd2
AkPnBcht
AkPnBsdf
AkPnStgb
SptkBGAm
ENSTDkAm
SptkBGCl
AkPnCGdD

230 Unique Pieces (Song Names):
deb_pass_SptkBGAm
muss_2_AkPnCGdD
mendel_op62_5_SptkBGAm
schu_143_2_AkPnCGdD
grieg_halling_AkPnBsdf
bk_xmas1_ENSTDkAm
gra_esp_3_SptkBGCl
schu_143_3_StbgTGd2
ty_mai_SptkBGAm
chp_op18_AkPnCGdD
scn16_5_SptkBGAm
alb_esp2_SptkBGAm
grieg_zwerge_AkPnBcht
mz_333_3_ENSTDkCl
scn15_7_SptkBGAm
bk_xmas2_AkPnStgb
mz_333_2_AkPnCGdD
mz_333_3_StbgTGd2
scn16_4_AkPnBsdf
waldstein_1_AkPnStgb
chpn
liz_et6_ENSTDkCl
mos_op36_6_AkPnBsdf
liz_et6_StbgTGd2
pathetique_1_StbgTGd2
ty_mai_ENSTDkAm
bor_ps6_ENSTDkCl
scn16_4_AkPnCGdD
bk_xmas5_ENSTDkCl
mz_545_3_StbgTGd2
chp_op31_AkPnBcht
br_im5_AkPnCGdD
liz_et3_AkPnStgb
scn15_5_SptkBGCl
scn15_11_ENSTDkAm
ty_mai_AkPnBcht
mz_333_2_StbgTGd2
scn15_9_ENSTDkAm
scn16_7_SptkBGAm
chpn_op33_2_SptkBGAm
bor_ps5_AkPnStgb
chpn_op27_2_SptkBGCl
scn16_4_ENSTDkCl
scn15_11_StbgTGd2
gra_esp_2_AkPnStgb
liz_et6_AkP

In [17]:
from collections import defaultdict
import re

def identify_pieces_by_instrument(files):
    """
    Identifies which pieces are played by which instruments using a single regex pattern.

    Args:
    - files: A list of file names (without extensions).

    Returns:
    - pieces_dict: A dictionary where keys are piece names and values are the instruments that play them.
    - unseen_pieces: A set of pieces that are unique to the training set (not repeated).
    - repeated_pieces: A set of pieces that are repeated across multiple instruments.
    """
    # Regex pattern to capture both piece name and instrument name
    pattern = re.compile(r'MAPS_MUS-(.*)_(\w+)')
    pieces_dict = defaultdict(set)

    for file_name in files:
        match = pattern.search(file_name)

        if match:
            piece_name = match.group(1)  # First capturing group for the piece name
            instrument_name = match.group(2)  # Second capturing group for the instrument name

            pieces_dict[piece_name].add(instrument_name)

    unseen_pieces = set()
    repeated_pieces = set()

    for piece, instruments in pieces_dict.items():
        if len(instruments) == 1:
            unseen_pieces.add(piece)
        else:
            repeated_pieces.add(piece)

    return pieces_dict, unseen_pieces, repeated_pieces

pieces_dict, unseen_pieces, repeated_pieces = identify_pieces_by_instrument(maps_files)

print(f"{len(repeated_pieces)} Repeated Pieces (Seen across multiple instruments): {repeated_pieces}")
print(f"{len(unseen_pieces)} Unseen Pieces (Unique to one instrument): {unseen_pieces}")


70 Repeated Pieces (Seen across multiple instruments): {'mz_570_1', 'schumm-1', 'bk_xmas5', 'ty_november', 'chpn-p6', 'grieg_elfentanz', 'mz_331_3', 'alb_se2', 'mz_311_1', 'chpn-p7', 'deb_clai', 'bk_xmas1', 'chp_op31', 'scn15_11', 'mond_1', 'mz_331_2', 'chpn_op66', 'scn15_7', 'bk_xmas2', 'liz_et_trans5', 'grieg_walzer', 'liz_et2', 'schu_143_1', 'chpn-p14', 'br_im5', 'chpn_op25_e3', 'pathetique_1', 'ty_februar', 'ty_maerz', 'grieg_wanderer', 'waldstein_1', 'mendel_op62_5', 'scn16_3', 'deb_menu', 'appass_1', 'liz_rhap09', 'grieg_butterfly', 'chpn-p4', 'bor_ps6', 'chpn_op27_2', 'mz_545_3', 'chpn-p19', 'liz_rhap02', 'scn16_2', 'bach_846', 'ty_mai', 'ty_september', 'alb_esp2', 'alb_se3', 'bk_xmas4', 'chpn-e01', 'grieg_kobold', 'schuim-1', 'chpn-p13', 'mond_2', 'chpn_op33_2', 'scn16_4', 'mz_333_3', 'scn15_12', 'ty_juli', 'chpn-p15', 'mz_333_2', 'gra_esp_3', 'schub_d960_3', 'br_im2', 'chpn-p1', 'liz_et6', 'bach_847', 'schu_143_3', 'mz_332_2'}
90 Unseen Pieces (Unique to one instrument): {'scn

In [18]:
import re
from collections import defaultdict
import random

def split_dataset(files):
    """
    Splits the dataset into balanced train_tr, train_va, and test sets based on seen/unseen pieces
    and instrument categories.

    Args:
    - files: A list of file names (without extensions).

    Returns:
    - train_tr_files: A list of files for the training set.
    - train_va_files: A list of files for the validation set.
    - test_set1_files: A list of files for test set 1 (unseen real piano recordings).
    - test_set2_files: A list of files for test set 2 (both seen and unseen real piano recordings).
    """
    # Virtual instruments (for training)
    virtual_instruments = {'AkPnCGdD', 'AkPnStgb', 'AkPnBcht', 'AkPnBsdf', 'SptkBGCl', 'SptkBGAm', 'StbgTGd2'}

    # Real pianos (for testing)
    real_instruments = {'ENSTDkAm', 'ENSTDkCl'}

    # Regex pattern to extract piece name and instrument
    pattern = re.compile(r'MAPS_MUS-(.*)_(\w+)')

    # Dictionaries to store pieces by instrument
    pieces_dict = defaultdict(set)

    # Categorize pieces by instrument
    for file_name in files:
        match = pattern.search(file_name)
        if match:
            piece_name = match.group(1)
            instrument_name = match.group(2)
            pieces_dict[piece_name].add(instrument_name)

    # Split pieces into seen and unseen
    unseen_pieces = {piece for piece, instruments in pieces_dict.items() if len(instruments) == 1}
    seen_pieces = {piece for piece in pieces_dict if piece not in unseen_pieces}

    # Lists to store final file lists
    train_tr_files = []
    train_va_files = []
    test_set1_files = []
    test_set2_files = []

    # Step 1: Train (seen pieces from virtual instruments)
    for file_name in files:
        match = pattern.search(file_name)
        if match:
            piece_name = match.group(1)
            instrument_name = match.group(2)
            if instrument_name in virtual_instruments:
                if piece_name in seen_pieces:
                    train_tr_files.append(file_name)
                elif piece_name in unseen_pieces:
                    train_va_files.append(file_name)  # Validation will use unseen virtual pieces

    # Step 2: Test sets from real instruments
    for file_name in files:
        match = pattern.search(file_name)
        if match:
            piece_name = match.group(1)
            instrument_name = match.group(2)
            if instrument_name in real_instruments:
                if piece_name in unseen_pieces:
                    test_set1_files.append(file_name)  # Test Set 1: Unseen pieces
                test_set2_files.append(file_name)      # Test Set 2: Both seen and unseen pieces

    # Ensure validation set is balanced
    if len(train_va_files) > len(train_tr_files) * 0.2:  # Optional, if you want to limit validation size
        train_va_files = random.sample(train_va_files, int(len(train_tr_files) * 0.2))

    return train_tr_files, train_va_files, test_set1_files, test_set2_files

train_tr_files, train_va_files, test_set1_files, test_set2_files = split_dataset(file_list)

# Output the splits
print(f"Training Files: {len(train_tr_files)}")
print(f"Validation Files: {len(train_va_files)}")
print(f"Test Set 1 Files: {len(test_set1_files)}")
print(f"Test Set 2 Files: {len(test_set2_files)}")


Training Files: 130
Validation Files: 26
Test Set 1 Files: 10
Test Set 2 Files: 60


In [19]:
import os
import shutil

def create_paper_split(audio_folder, text_folder, train_tr_files, train_va_files, test_set1_files, test_set2_files, output_dir):
    """
    Creates a 'paper_split' folder with subfolders for train_tr, train_va, test_set1, and test_set2.
    Copies the appropriate files from the audio and text folders into these subfolders.

    Args:
    - audio_folder: Path to the folder containing the original audio files.
    - text_folder: Path to the folder containing the original text files.
    - train_tr_files: List of files for the training set.
    - train_va_files: List of files for the validation set.
    - test_set1_files: List of files for test set 1.
    - test_set2_files: List of files for test set 2.
    - output_dir: Path to the output directory where the "paper_split" folder will be created.
    """

    # Define the subfolder structure for the paper split
    split_folders = {
        'train_tr': train_tr_files,
        'train_va': train_va_files,
        'test_set1': test_set1_files,
        'test_set2': test_set2_files
    }

    # Create the 'paper_split' folder
    paper_split_dir = os.path.join(output_dir, 'paper_split')
    os.makedirs(paper_split_dir, exist_ok=True)

    # Iterate over the splits and copy the files
    for split_name, file_list in split_folders.items():
        split_dir = os.path.join(paper_split_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)

        for file_base in file_list:
            # Find the corresponding audio and text files
            audio_file = os.path.join(audio_folder, f"{file_base}.wav")
            text_file = os.path.join(text_folder, f"{file_base}.txt")

            # Copy audio and text files into the new subfolder
            if os.path.exists(audio_file):
                shutil.copy(audio_file, split_dir)
            if os.path.exists(text_file):
                shutil.copy(text_file, split_dir)

    print(f"Dataset has been split and copied into {paper_split_dir}.")

# Example usage
audio_folder = '/root/dev/data/audio'
text_folder = '/root/dev/data/text'
output_dir = '/root/dev/data/paper_split'

# Assuming train_tr_files, train_va_files, test_set1_files, test_set2_files are already generated from the split
create_paper_split(audio_folder, text_folder, train_tr_files, train_va_files, test_set1_files, test_set2_files, output_dir)


Dataset has been split and copied into /root/dev/data/paper_split/paper_split.
