In [1]:
import glob
import pandas as pd
import os
from pydx7 import load_patch_from_bulk, load_patch
from utils import render_from_specs, validate_specs, serialize_specs

In [2]:
# Load the YAMAHA training table from an absolute, machine-specific path.
# NOTE(review): `data` is rebound to a plain dict in a later cell before this
# frame is used anywhere visible in the notebook — confirm the load is still needed.
data = pd.read_csv('/home/hoyeol/projects/GCT634_final/data/DX7_YAMAHA_train.csv')

In [3]:
def find_syx_files(directory):
    """Recursively list every ``.syx`` file found below ``directory``.

    The extension test is case-insensitive, so ``BANK.SYX`` matches too.

    Args:
        directory: Root folder to walk.

    Returns:
        list[str]: Paths relative to ``directory``, in the order ``os.walk``
        yields them (no sorting is applied).
    """
    return [
        os.path.relpath(os.path.join(folder, filename), directory)
        for folder, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if filename.lower().endswith(".syx")
    ]

# List the bulk dumps of the YAMAHA collection, then show the first relative
# path and how many files were found as a quick sanity check.
target_directory = "/home/hoyeol/projects/GCT634_final/data/dx7_patches/DX7_YAMAHA"
syx_files = find_syx_files(target_directory)
print(syx_files[0])
print(len(syx_files))

GreyMatter E! Card/7.syx
35


In [5]:
# Walk the ROM1A bank and stop at the first patch that reports fixed-frequency
# operators, ignoring the two patches excluded by name. `patch_data` keeps the
# patch the loop stopped on (or the last one loaded if nothing matched).
for i in range(100):
    patch_data = load_patch_from_bulk(
        '/home/hoyeol/projects/GCT634_final/data/dx7_patches/DX7_YAMAHA/Factory patches/rom1a.syx',
        patch_number=i,
        load_from_sysex=True,
    )
    if patch_data['name'] in ['TUB BELLS', 'STEEL DRU']:
        continue
    if patch_data['has_fixed_freqs']:
        break

In [6]:
import IPython.display as ipd
# Render the patch selected above and embed a player for it in the notebook.
audio = render_from_specs(patch_data)
# NOTE(review): assumes render_from_specs produces 48 kHz audio — confirm against utils.
ipd.Audio(audio, rate=48000)

In [7]:
# Print the selected patch as a `specs = {...}` snippet built by serialize_specs.
print("specs = " + serialize_specs(patch_data))

specs = {
    'name': 'REFS WHIS',
    'modmatrix': [
		[0, 1, 1, 1, 0, 0],
		[0, 0, 0, 0, 0, 0],
		[0, 0, 1, 0, 0, 0],
		[0, 0, 0, 0, 1, 0],
		[0, 0, 0, 0, 0, 1],
		[0, 0, 0, 0, 0, 0]
    ],
    'outmatrix': [1, 0, 0, 0, 0, 0],
    'feedback': 2,
    'fixed_freqs': [1, 1, 1, 1, 1, 1],
    'coarse': [1, 0, 3, 1, 1, 3],
    'fine': [0, 0, 82, 67, 53, 32],
    'detune': [0, 0, 0, 0, 0, 0],
    'transpose': 0,
    'ol': [78, 64, 75, 66, 93, 90],
    'eg_rate': [
		[94, 99, 94, 60, 60, 60],
		[56, 0, 68, 39, 39, 39],
		[24, 0, 24, 8, 28, 28],
		[55, 0, 55, 0, 45, 49]
    ],
    'eg_level': [
		[96, 99, 96, 99, 99, 99],
		[78, 0, 89, 99, 99, 99],
		[0, 0, 0, 99, 99, 99],
		[0, 0, 0, 0, 0, 0]
    ],
    'sensitivity': [0, 0, 0, 0, 0, 1],
    'has_fixed_freqs': True,
}


In [8]:
def load_patch_from_bulk(bulk_patches,patch_number:int = 0,load_from_sysex=False):
    '''
    Decode a single voice from an in-memory DX7 bulk dump.

    Note: this definition rebinds the name imported from ``pydx7`` at the top
    of the notebook. Unlike the call made before this cell, it expects the raw
    bytes of the cart, not a file path.

    TODO: Incorporate:
        - KB RATE Scaling (from dx7 users manual: "The EG for each operator can be set for a
                        long bass decay and a short treble decay - as in an acoustic piano")
        - OP Detune parameter

    Args:
        bulk_patches: Sliceable sequence holding the raw bytes of the cart
            (the notebook passes the uint8 array returned by ``np.fromfile``).
        patch_number: Zero-based position of the patch within the cart.
        load_from_sysex: Set it to 'True' when the cart is a sysex dump; the
            first 6 bytes (the sysex header) are then skipped.

    Returns:
        Whatever ``load_patch`` returns for the selected 128-byte record.

    Raises:
        IndexError: If ``patch_number`` is negative or the dump does not hold
            a complete 128-byte record at that position.
    '''
    patch_size = 128
    patch_offset = 6 if load_from_sysex else 0
    start = patch_offset + patch_number * patch_size
    end = start + patch_size

    # A plain slice would quietly hand a short (or empty) record to load_patch.
    if patch_number < 0 or end > len(bulk_patches):
        raise IndexError(
            f"patch_number {patch_number} is out of range for a dump of {len(bulk_patches)} bytes"
        )

    return load_patch(bulk_patches[start:end])

In [10]:
def valid_char(c, invalid_chars=['/', '\\']):
    """Tell whether a single character may stay in a patch name.

    Args:
        c: One-character string.
        invalid_chars: Extra characters to reject (path separators by default).

    Returns:
        bool: False for ASCII control characters (code point below 32), DEL
        (127) and anything listed in ``invalid_chars``; True otherwise.
    """
    return not (ord(c) < 32 or ord(c) == 127 or c in invalid_chars)

def is_invalid_name(name):
    """Return True when ``name`` is not a string or contains a rejected character."""
    return not isinstance(name, str) or any(not valid_char(c) for c in name)

def clean_name(name, replace='_NULL_', nan_names = ['NULL', 'N/A', 'NaN'], length=10):
    """Sanitize a patch name so it is safe to store and to read back.

    Args:
        name: Raw patch name; anything that is not a string is replaced.
        replace: Placeholder used when no usable name remains.
        nan_names: Names that must not be written as-is (pandas.read_csv
            parses these strings as missing values by default).
        length: Maximum number of characters kept.

    Returns:
        str: ``name`` without rejected characters, cut to ``length``
        characters, or ``replace`` when the input is not a string, is empty
        after filtering, or equals one of ``nan_names`` before or after
        sanitizing.
    """
    if not isinstance(name, str) or name in nan_names:
        return replace
    # Keep only printable characters (strips null bytes, \x1c, etc.)
    cleaned = ''.join(c for c in name if valid_char(c))
    cleaned = (cleaned if cleaned else replace)[:length]
    # Filtering or truncation can itself produce a reserved name
    # (e.g. 'NU\x00LL' -> 'NULL'), so the check is repeated on the result.
    if cleaned in nan_names:
        return replace
    return cleaned

In [11]:
from tqdm import tqdm
import numpy as np
# Decode every voice of every bulk dump under DX7_YAMAHA into one row per patch.
target_directory = "/home/hoyeol/projects/GCT634_final/data/dx7_patches/DX7_YAMAHA"
data = {
    'patch_number': [],
    'name': [],
    'patch_data': [],
    'file_path': [],
    'syx_file': [],
    'has_fixed_freqs': []
}
syx_files = find_syx_files(target_directory)
total = 0
for syx_file in tqdm(syx_files):
    bulk_patches = np.fromfile(os.path.join(target_directory, syx_file), dtype=np.uint8)
    # Number of whole 128-byte records that fit after the 6 leading bytes.
    num_patches = (bulk_patches.shape[0]-6) // 128
    for i in range(num_patches):
        patch_data = load_patch_from_bulk(bulk_patches, patch_number=i, load_from_sysex=True)
        # Return value is not used here; presumably this raises on a bad spec — TODO confirm.
        validate_specs(patch_data)
        total += 1
        data['patch_number'].append(i)
        data['patch_data'].append(serialize_specs(patch_data))
        data['file_path'].append('DX7_YAMAHA/' + syx_file)
        data['syx_file'].append(os.path.basename(syx_file))
        data['name'].append(patch_data['name'])
        data['has_fixed_freqs'].append(patch_data['has_fixed_freqs'])
print("done")
df = pd.DataFrame(data)
# Sequential ids starting at 0 for the YAMAHA rows.
df['id'] = list(range(len(df)))
df

100%|██████████| 35/35 [00:00<00:00, 74.27it/s]

done





Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,0
1,1,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,1
2,2,Snare Snp,"{\n 'name': 'Snare Snp',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,2
3,3,Windchime,"{\n 'name': 'Windchime',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,3
4,4,Crasher 1,"{\n 'name': 'Crasher 1',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,4
...,...,...,...,...,...,...,...
1115,27,BANJO,"{\n 'name': 'BANJO ',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1115
1116,28,HARP 1,"{\n 'name': 'HARP 1',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1116
1117,29,HARP 2,"{\n 'name': 'HARP 2',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1117
1118,30,BASS 3,"{\n 'name': 'BASS 3',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1118


In [12]:
# Report how many names fail is_invalid_name, then sanitize every name and save.
print(df['name'].apply(is_invalid_name).sum())
df['name'] = df['name'].apply(clean_name)
df.to_csv('DX7_YAMAHA.csv')
# Row count reused by a later cell as the first id of the AllTheWeb rows.
df1_len = len(df)

6


In [13]:
from tqdm import tqdm
import numpy as np
# Decode every voice of every bulk dump under DX7_AllTheWeb into one row per patch.
target_directory = "/home/hoyeol/projects/GCT634_final/data/dx7_patches/DX7_AllTheWeb"
data = {
    'patch_number': [],
    'name': [],
    'patch_data': [],
    'file_path': [],
    'syx_file': [],
    'has_fixed_freqs': []
}
syx_files = find_syx_files(target_directory)
total = 0
for syx_file in tqdm(syx_files):
    bulk_patches = np.fromfile(os.path.join(target_directory, syx_file), dtype=np.uint8)
    # Number of whole 128-byte records that fit after the 6 leading bytes.
    num_patches = (bulk_patches.shape[0]-6) // 128
    for i in range(num_patches):
        patch_data = load_patch_from_bulk(bulk_patches, patch_number=i, load_from_sysex=True)
        # Return value is not used here; presumably this raises on a bad spec — TODO confirm.
        validate_specs(patch_data)
        total += 1
        data['patch_number'].append(i)
        data['patch_data'].append(serialize_specs(patch_data))
        data['file_path'].append('DX7_AllTheWeb/' + syx_file)
        data['syx_file'].append(os.path.basename(syx_file))
        data['name'].append(patch_data['name'])
        data['has_fixed_freqs'].append(patch_data['has_fixed_freqs'])
print("done")
df = pd.DataFrame(data)
# Ids continue after the YAMAHA rows (df1_len is set when the YAMAHA table is saved).
df['id'] = list(range(df1_len, df1_len + len(df)))
df

100%|██████████| 12528/12528 [02:41<00:00, 77.36it/s]


done


Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,LILPEOPLE,"{\n 'name': 'LILPEOPLE',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,1120
1,1,SPI EFX,"{\n 'name': 'SPI EFX ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1121
2,2,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1122
3,3,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1123
4,4,DINGL 19.,"{\n 'name': 'DINGL 19.',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,1124
...,...,...,...,...,...,...,...
390576,27,SpacedrUm,"{\n 'name': 'SpacedrUm',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TX70.SYX,TX70.SYX,False,391696
390577,28,RandmNots,"{\n 'name': 'RandmNots',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TX70.SYX,TX70.SYX,False,391697
390578,29,RndmNotes,"{\n 'name': 'RndmNotes',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TX70.SYX,TX70.SYX,True,391698
390579,30,RandmNts3,"{\n 'name': 'RandmNts3',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TX70.SYX,TX70.SYX,False,391699


In [14]:
# Report how many names fail is_invalid_name, then sanitize every name and save.
print(df['name'].apply(is_invalid_name).sum())
df['name'] = df['name'].apply(clean_name)
df.to_csv('DX7_AllTheWeb.csv')

33497


In [15]:
from scipy.io.wavfile import write
import IPython.display as ipd
import pandas as pd
import numpy as np

# Reload the saved table and count names that came back as missing values;
# a non-zero count would mean a cleaned name was parsed as NaN on read.
df = pd.read_csv('DX7_AllTheWeb.csv')
df['name'].isna().sum()

0

In [16]:
from scipy.io.wavfile import write
import IPython.display as ipd
import pandas as pd
import numpy as np
from tqdm import tqdm

In [18]:
import copy

def _make_hashable_representation(data_item, ignore_keys_set):
    """
    Creates a hashable representation of a dictionary item,
    excluding specified keys and converting lists to tuples.
    """
    # Create a temporary dictionary with only the keys to compare
    temp_dict = {}
    for key, value in data_item.items():
        if key not in ignore_keys_set:
            # Recursively convert lists to tuples to make them hashable
            if isinstance(value, list):
                temp_dict[key] = _convert_lists_to_tuples_recursive(value)
            # Add elif for dicts if dicts can be values and need to be part of the signature
            # elif isinstance(value, dict):
            #     temp_dict[key] = tuple(sorted((k, _convert_lists_to_tuples_recursive(v)) for k, v in value.items()))
            else:
                temp_dict[key] = value
    
    # Sort items by key to ensure consistent order for the hashable representation
    # Then convert to a tuple of (key, value) pairs, making it hashable
    return tuple(sorted(temp_dict.items()))

def _convert_lists_to_tuples_recursive(item):
    """
    Recursively converts lists within a structure to tuples.
    """
    if isinstance(item, list):
        return tuple(_convert_lists_to_tuples_recursive(sub_item) for sub_item in item)
    # Add elif for dicts if they can appear as values inside lists/other structures
    # elif isinstance(item, dict):
    #     return tuple(sorted((k, _convert_lists_to_tuples_recursive(v)) for k, v in item.items()))
    return item

def mark_duplicates(data_list, ignore_keys=('name', 'transpose', 'sensitivity')):
    """
    Find the positions of dictionaries that repeat an earlier entry.

    Two entries count as duplicates when they are equal on every key that is
    not listed in ``ignore_keys``. The first occurrence of each signature is
    kept; only later occurrences are reported. ``data_list`` is not modified.

    Args:
        data_list (list): A list of dictionaries.
        ignore_keys (tuple, optional): Keys to leave out of the comparison.
            Defaults to ('name', 'transpose', 'sensitivity').

    Returns:
        list[int]: Indices into ``data_list`` of the duplicate entries, in
        ascending order.

    Raises:
        TypeError: If ``data_list`` is not a list.
        ValueError: If any element of ``data_list`` is not a dictionary.
    """
    if not isinstance(data_list, list):
        raise TypeError("Input must be a list of dictionaries.")
    if not all(isinstance(item, dict) for item in data_list):
        raise ValueError("All items in the list must be dictionaries.")

    seen_signatures = set()
    ignore_keys_set = set(ignore_keys)  # set membership is cheaper than scanning a tuple
    duplicates_idx = []
    for i, item in enumerate(tqdm(data_list)):
        # Hashable summary of the item without the ignored keys.
        signature = _make_hashable_representation(item, ignore_keys_set)

        if signature in seen_signatures:
            duplicates_idx.append(i)
        else:
            # First time this signature shows up: remember it.
            seen_signatures.add(signature)

    return duplicates_idx

In [19]:
# Reload the YAMAHA table, using the first CSV column (the saved index) as the index.
df = pd.read_csv('DX7_YAMAHA.csv', index_col=0)
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,0
1,1,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,1
2,2,Snare Snp,"{\n 'name': 'Snare Snp',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,2
3,3,Windchime,"{\n 'name': 'Windchime',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,3
4,4,Crasher 1,"{\n 'name': 'Crasher 1',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,4
...,...,...,...,...,...,...,...
1115,27,BANJO,"{\n 'name': 'BANJO ',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1115
1116,28,HARP 1,"{\n 'name': 'HARP 1',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1116
1117,29,HARP 2,"{\n 'name': 'HARP 2',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1117
1118,30,BASS 3,"{\n 'name': 'BASS 3',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1118


In [20]:
# Rebuild each spec dict from its serialized text and collect the rows that
# repeat an earlier patch.
# NOTE(review): eval() executes whatever expression the CSV cell contains. If
# serialize_specs only emits Python literals, ast.literal_eval would be a safer
# parser — confirm before switching.
dup_idxs = mark_duplicates([eval(x) for x in df['patch_data'].copy().tolist()])
print(len(dup_idxs))

100%|██████████| 1120/1120 [00:00<00:00, 43029.54it/s]

62





In [21]:
# Drop the rows flagged as duplicates, then renumber both the index and `id`
# from 0 so they stay contiguous in the saved file.
df = df.drop(dup_idxs, axis=0).reset_index(drop=True)
df['id'] = list(range(len(df)))
df.to_csv('DX7_YAMAHA_deduplicated.csv')
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,0
1,1,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,1
2,2,Snare Snp,"{\n 'name': 'Snare Snp',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,2
3,3,Windchime,"{\n 'name': 'Windchime',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,3
4,4,Crasher 1,"{\n 'name': 'Crasher 1',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,4
...,...,...,...,...,...,...,...
1053,26,LUTE,"{\n 'name': 'LUTE ',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1053
1054,28,HARP 1,"{\n 'name': 'HARP 1',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1054
1055,29,HARP 2,"{\n 'name': 'HARP 2',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1055
1056,30,BASS 3,"{\n 'name': 'BASS 3',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1056


In [22]:
# Reload the AllTheWeb table, using the first CSV column (the saved index) as the index.
df = pd.read_csv('DX7_AllTheWeb.csv', index_col=0)

In [23]:
# print(len(df), len(df[df['name'] != 'INVALID']))
# df = df[df['name'] != 'INVALID'].reset_index(drop=True) #filter out invalid names

In [24]:
# Rebuild each spec dict from its serialized text and collect the rows that
# repeat an earlier patch.
# NOTE(review): eval() executes whatever expression the CSV cell contains. If
# serialize_specs only emits Python literals, ast.literal_eval would be a safer
# parser — confirm before switching.
dup_idxs = mark_duplicates([eval(x) for x in df['patch_data'].copy().tolist()])
print(len(dup_idxs))

100%|██████████| 390581/390581 [00:08<00:00, 48716.73it/s]


351264


In [25]:
# Drop the rows flagged as duplicates and reset the index. Unlike the YAMAHA
# cell, `id` keeps its pre-deduplication values here; it is renumbered in a
# later cell, after the cross-set deduplication.
df = df.drop(dup_idxs, axis=0).reset_index(drop=True)
df.to_csv('DX7_AllTheWeb_deduplicated.csv')
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,LILPEOPLE,"{\n 'name': 'LILPEOPLE',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,1120
1,1,SPI EFX,"{\n 'name': 'SPI EFX ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1121
2,2,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1122
3,4,DINGL 19.,"{\n 'name': 'DINGL 19.',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,1124
4,5,BELLO,"{\n 'name': 'BELLO ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1125
...,...,...,...,...,...,...,...
39312,26,BSlvrMn,"{\n 'name': 'B/SlvrMn ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,385967
39313,27,BsNylGtr,"{\n 'name': 'Bs/NylGtr',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,385968
39314,29,BsE.Tack,"{\n 'name': 'BsE.Tack ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TOP40A.SYX,TOP40A.SYX,True,387186
39315,22,BSqrLead,"{\n 'name': 'B/SqrLead',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/RYTHSECB.SYX,RYTHSECB.SYX,True,390251


In [26]:
# Stack both deduplicated sets (YAMAHA rows first) and remove patches that
# appear in both.
df1 = pd.read_csv('DX7_YAMAHA_deduplicated.csv', index_col=0)
df2 = pd.read_csv('DX7_AllTheWeb_deduplicated.csv', index_col=0)
df = pd.concat([df1, df2], ignore_index=True)
# NOTE(review): eval() executes whatever expression the CSV cell contains;
# consider ast.literal_eval if serialize_specs only emits literals.
patch_dicts = [eval(serialized) for serialized in df['patch_data']]
dup_idxs = mark_duplicates(patch_dicts)
print(len(dup_idxs))
# Only first occurrences are kept, so every dropped row must come from the
# AllTheWeb part; the next cell relies on the YAMAHA rows being intact.
assert all(idx >= len(df1) for idx in dup_idxs)
df = df.drop(dup_idxs, axis=0).reset_index(drop=True)
df.to_csv('DX7_combined_deduplicated.csv')
df

100%|██████████| 40375/40375 [00:01<00:00, 32612.59it/s]


1058


Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,0
1,1,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,1
2,2,Snare Snp,"{\n 'name': 'Snare Snp',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,2
3,3,Windchime,"{\n 'name': 'Windchime',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,3
4,4,Crasher 1,"{\n 'name': 'Crasher 1',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,4
...,...,...,...,...,...,...,...
39312,26,BSlvrMn,"{\n 'name': 'B/SlvrMn ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,385967
39313,27,BsNylGtr,"{\n 'name': 'Bs/NylGtr',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,385968
39314,29,BsE.Tack,"{\n 'name': 'BsE.Tack ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TOP40A.SYX,TOP40A.SYX,True,387186
39315,22,BSqrLead,"{\n 'name': 'B/SqrLead',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/RYTHSECB.SYX,RYTHSECB.SYX,True,390251


In [27]:
# Keep only the rows that follow the first len(df1) entries of the combined
# frame (the YAMAHA rows were concatenated first and none of them was dropped),
# renumber `id` from 0 and overwrite the AllTheWeb deduplicated CSV.
df = df[len(df1):].reset_index(drop=True)
df['id'] = list(range(len(df)))
df.to_csv('DX7_AllTheWeb_deduplicated.csv')
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id
0,0,LILPEOPLE,"{\n 'name': 'LILPEOPLE',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,0
1,1,SPI EFX,"{\n 'name': 'SPI EFX ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1
2,2,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,2
3,4,DINGL 19.,"{\n 'name': 'DINGL 19.',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,3
4,5,BELLO,"{\n 'name': 'BELLO ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,4
...,...,...,...,...,...,...,...
38254,26,BSlvrMn,"{\n 'name': 'B/SlvrMn ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38254
38255,27,BsNylGtr,"{\n 'name': 'Bs/NylGtr',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38255
38256,29,BsE.Tack,"{\n 'name': 'BsE.Tack ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TOP40A.SYX,TOP40A.SYX,True,38256
38257,22,BSqrLead,"{\n 'name': 'B/SqrLead',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/RYTHSECB.SYX,RYTHSECB.SYX,True,38257


In [28]:
# Fixed seed so that re-running this cell always selects the same test rows.
SPLIT_SEED = 0
# Number of YAMAHA patches held out for testing.
N_TEST = 100

def _wav_path_column(frame):
    """Build the relative wav path of each row.

    The result is the bank path without its 4-character extension, followed by
    '/<patch_number>_<name>.wav'; any '/' in the name is replaced with '_'.
    """
    return frame['file_path'].str[:-4] + '/' + frame['patch_number'].astype(str) + '_'+ frame['name'].str.replace('/', '_', regex=False) + '.wav'

# YAMAHA set: everything is 'train' except N_TEST randomly chosen rows.
df = pd.read_csv('DX7_YAMAHA_deduplicated.csv', index_col=0)
df['split'] = 'train'
random_idxs = np.random.default_rng(SPLIT_SEED).permutation(len(df))[:N_TEST]
df['wav_path'] = _wav_path_column(df)
df.loc[random_idxs, 'split'] = 'test'

df.to_csv('DX7_YAMAHA_deduplicated.csv')

# AllTheWeb set: used for training only.
df = pd.read_csv('DX7_AllTheWeb_deduplicated.csv', index_col=0)
df['split'] = 'train'
df['wav_path'] = _wav_path_column(df)
df.to_csv('DX7_AllTheWeb_deduplicated.csv')
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id,split,wav_path
0,0,LILPEOPLE,"{\n 'name': 'LILPEOPLE',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,0,train,DX7_AllTheWeb/Aminet/7/0_LILPEOPLE.wav
1,1,SPI EFX,"{\n 'name': 'SPI EFX ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1,train,DX7_AllTheWeb/Aminet/7/1_SPI EFX .wav
2,2,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,2,train,DX7_AllTheWeb/Aminet/7/2_CASTENETS.wav
3,4,DINGL 19.,"{\n 'name': 'DINGL 19.',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,3,train,DX7_AllTheWeb/Aminet/7/4_DINGL 19..wav
4,5,BELLO,"{\n 'name': 'BELLO ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,4,train,DX7_AllTheWeb/Aminet/7/5_BELLO .wav
...,...,...,...,...,...,...,...,...,...
38254,26,BSlvrMn,"{\n 'name': 'B/SlvrMn ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38254,train,DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA/26_BSlvrMn .wav
38255,27,BsNylGtr,"{\n 'name': 'Bs/NylGtr',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38255,train,DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA/27_BsNylGtr.wav
38256,29,BsE.Tack,"{\n 'name': 'BsE.Tack ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TOP40A.SYX,TOP40A.SYX,True,38256,train,DX7_AllTheWeb/LiVeMuSiC/TOP40A/29_BsE.Tack .wav
38257,22,BSqrLead,"{\n 'name': 'B/SqrLead',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/RYTHSECB.SYX,RYTHSECB.SYX,True,38257,train,DX7_AllTheWeb/LiVeMuSiC/RYTHSECB/22_BSqrLead.wav


In [29]:
import pandas as pd
# Write the YAMAHA train and test rows to separate CSV files without the
# `split` column. reset_index() keeps the previous index as an 'index' column.
df = pd.read_csv('DX7_YAMAHA_deduplicated.csv', index_col=0)
split_labels = df['split']
df_train = df.loc[split_labels == 'train']
df_test = df.loc[split_labels == 'test']
for frame, csv_name in ((df_train, 'DX7_YAMAHA_train.csv'), (df_test, 'DX7_YAMAHA_test.csv')):
    frame.drop(columns=['split']).reset_index().to_csv(csv_name)

In [None]:
# import pandas as pd
# df = pd.read_csv('data/DX7_YAMAHA_test_gemini.csv', index_col=0)
# df = df.reset_index()
# df.to_csv('data/DX7_YAMAHA_test_gemini.csv')

In [31]:
# Show the rows assigned to the test split.
df_test

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id,split,wav_path
1,1,Triangle,"{\n 'name': 'Triangle ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,1,test,DX7_YAMAHA/GreyMatter E! Card/7/1_Triangle .wav
8,8,Cowbell 1,"{\n 'name': 'Cowbell 1',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,True,8,test,DX7_YAMAHA/GreyMatter E! Card/7/8_Cowbell 1.wav
18,18,Timpani,"{\n 'name': 'Timpani ',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/7.syx,7.syx,False,18,test,DX7_YAMAHA/GreyMatter E! Card/7/18_Timpani .wav
32,1,IceCaves,"{\n 'name': '\\IceCaves',\n 'modmatrix':...",DX7_YAMAHA/GreyMatter E! Card/5.syx,5.syx,True,32,test,DX7_YAMAHA/GreyMatter E! Card/5/1_IceCaves.wav
43,12,YAMATALK-,"{\n 'name': 'YAMATALK-',\n 'modmatrix': ...",DX7_YAMAHA/GreyMatter E! Card/5.syx,5.syx,True,43,test,DX7_YAMAHA/GreyMatter E! Card/5/12_YAMATALK-.wav
...,...,...,...,...,...,...,...,...,...
1007,26,MULTI NOT,"{\n 'name': 'MULTI NOT',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom4b.syx,rom4b.syx,False,1007,test,DX7_YAMAHA/Factory patches/rom4b/26_MULTI NOT.wav
1010,1,FLUTE 2,"{\n 'name': 'FLUTE 2',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom2a.syx,rom2a.syx,False,1010,test,DX7_YAMAHA/Factory patches/rom2a/1_FLUTE 2.wav
1033,3,E.PIANO 3,"{\n 'name': 'E.PIANO 3',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1033,test,DX7_YAMAHA/Factory patches/rom1b/3_E.PIANO 3.wav
1045,18,PIPES 4,"{\n 'name': 'PIPES 4',\n 'modmatrix': ...",DX7_YAMAHA/Factory patches/rom1b.syx,rom1b.syx,False,1045,test,DX7_YAMAHA/Factory patches/rom1b/18_PIPES 4.wav


In [1]:
# You may need to install librosa: !pip install librosa
import os
import librosa
# numpy has to be bound to `np`: is_inaudible below calls np.max.
import numpy as np
import pandas as pd

# --- Parameters ---
wav_dir = 'data/wav'
initial_seconds = 5
inaudibility_threshold = 0.001 # You can adjust this threshold if needed

# --- Function to check for inaudibility ---
def is_inaudible(wav_path, duration, threshold):
    """
    Decide whether the start of a WAV file is close to silent.

    Args:
        wav_path (str): Location of the WAV file.
        duration (int): Length, in seconds, of the leading segment to load.
        threshold (float): The file counts as inaudible when the largest RMS
            value of the segment is below this number.

    Returns:
        bool: True when the segment is empty or stays below ``threshold``.
        False otherwise, and also when the file is missing or cannot be
        processed (a message is printed in both of those cases).
    """
    if not os.path.exists(wav_path):
        print(f"Warning: File not found {wav_path}")
        return False # Or handle as an error

    try:
        # Only the first `duration` seconds are read.
        samples, _sample_rate = librosa.load(wav_path, duration=duration)

        # Nothing was loaded: treat as inaudible.
        if len(samples) == 0:
            return True

        # Compare the loudest RMS value with the threshold.
        frame_rms = librosa.feature.rms(y=samples)
        return bool(np.max(frame_rms) < threshold)
    except Exception as e:
        print(f"Error processing {wav_path}: {e}")
        return False

# # --- Main script ---
# inaudible_files = {}

# print("Checking audio files...")
# # Walk through all subdirectories and files
# for subdir, _, files in os.walk(wav_dir):
#     for file in files:
#         if file.lower().endswith('.wav'):
#             wav_path = os.path.join(subdir, file)

#             # Check if the file is inaudible
#             inaudible_status = is_inaudible(wav_path, initial_seconds, inaudibility_threshold)

#             # Store the result
#             inaudible_files[wav_path] = inaudible_status

# print("Check complete.")
# # --- Display results ---
# # You can create a pandas DataFrame from this dictionary
# df_inaudible = pd.DataFrame(list(inaudible_files.items()), columns=['wav_path', 'inaudible'])

# print(f"\nTotal files checked: {len(df_inaudible)}")
# print(f"Inaudible files found: {df_inaudible['inaudible'].sum()}")

# print("\nList of inaudible files:")
# for path, is_inaudible_flag in inaudible_files.items():
#     if is_inaudible_flag:
#         print(path)

# # Display the first few rows of the DataFrame
# print("\nDataFrame head:")
# print(df_inaudible.head())


In [20]:
import pandas as pd
import os
import librosa
import numpy as np
import pandas as pd


# Load the exported splits.
df_train_yamaha = pd.read_csv('data/DX7_YAMAHA_train.csv', index_col=0)
df_test_yamaha = pd.read_csv('data/DX7_YAMAHA_test.csv', index_col=0)
df_train_alltheweb = pd.read_csv('data/DX7_AllTheWeb_train.csv', index_col=0)


# --- Parameters ---
wav_dir = 'data/wav'
initial_seconds = 3
inaudibility_threshold = 0.001 # You can adjust this threshold if needed

# Check every rendered file of the chosen table, print the inaudible ones and
# store the result in a new 'inaudible' column.
inaudibles = []
df = df_test_yamaha
for relative_wav in df['wav_path']:
    wav_path = os.path.join(wav_dir, relative_wav)
    silent = is_inaudible(wav_path, initial_seconds, inaudibility_threshold)
    if silent:
        print(wav_path)
    inaudibles.append(silent)
df['inaudible'] = inaudibles

In [21]:
# Overwrite the test CSV so it carries the 'inaudible' column added above.
df.to_csv('data/DX7_YAMAHA_test.csv')

In [15]:
# NOTE(review): the saved output of this cell lists AllTheWeb rows and its
# execution count is lower than that of the cells above, so it appears to come
# from an earlier run with a different `df` — re-run to refresh.
df

Unnamed: 0,patch_number,name,patch_data,file_path,syx_file,has_fixed_freqs,id,split,wav_path,inaudible
0,0,LILPEOPLE,"{\n 'name': 'LILPEOPLE',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,0,train,DX7_AllTheWeb/Aminet/7/0_LILPEOPLE.wav,False
1,1,SPI EFX,"{\n 'name': 'SPI EFX ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,1,train,DX7_AllTheWeb/Aminet/7/1_SPI EFX .wav,False
2,2,CASTENETS,"{\n 'name': 'CASTENETS',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,2,train,DX7_AllTheWeb/Aminet/7/2_CASTENETS.wav,False
3,4,DINGL 19.,"{\n 'name': 'DINGL 19.',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,False,3,train,DX7_AllTheWeb/Aminet/7/4_DINGL 19..wav,False
4,5,BELLO,"{\n 'name': 'BELLO ',\n 'modmatrix': ...",DX7_AllTheWeb/Aminet/7.syx,7.syx,True,4,train,DX7_AllTheWeb/Aminet/7/5_BELLO .wav,False
...,...,...,...,...,...,...,...,...,...,...
38254,26,BSlvrMn,"{\n 'name': 'B/SlvrMn ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38254,train,DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA/26_BSlvrMn .wav,False
38255,27,BsNylGtr,"{\n 'name': 'Bs/NylGtr',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA.SYX,SPRSPLTA.SYX,False,38255,train,DX7_AllTheWeb/LiVeMuSiC/SPRSPLTA/27_BsNylGtr.wav,False
38256,29,BsE.Tack,"{\n 'name': 'BsE.Tack ',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/TOP40A.SYX,TOP40A.SYX,True,38256,train,DX7_AllTheWeb/LiVeMuSiC/TOP40A/29_BsE.Tack .wav,False
38257,22,BSqrLead,"{\n 'name': 'B/SqrLead',\n 'modmatrix': ...",DX7_AllTheWeb/LiVeMuSiC/RYTHSECB.SYX,RYTHSECB.SYX,True,38257,train,DX7_AllTheWeb/LiVeMuSiC/RYTHSECB/22_BSqrLead.wav,False
