In [1]:
def note_name_to_midi_number(note_name): # Note names must be capital letters
                                         # Valid inputs are C0 through G9
    note_letter = note_name[:-1]
    octave_number = int(note_name[-1])
    note_number = {'C' : 0,
                   'C#' : 1, 'Db': 1,
                   'D' : 2,
                   'D#' : 3, 'Eb' : 3,
                   'E' : 4,
                   'F' : 5,
                   'F#' : 6, 'Gb': 6,
                   'G' : 7,
                   'G#': 8, 'Ab': 8,
                   'A' : 9,
                   'A#' : 10, 'Bb': 10,
                   'B' : 11}
    return 12*(1+octave_number) + note_number[note_letter]

def note_midi_number_to_name(note_midi_number): # MIDI numbers must be integers
                                                # Valid inputs are 12 through 127
    octave_number = (note_midi_number // 12) - 1
    note_number = note_midi_number % 12
    note_letter = {0 : 'C',
                   1: 'Db',
                   2: 'D',
                   3: 'Eb',
                   4 : 'E',
                   5 : 'F',
                   6: 'Gb',
                   7: 'G',
                   8: 'Ab',
                   9: 'A',
                   10: 'Bb',
                   11: 'B'}
    return note_letter[note_number] + str(octave_number)

In [2]:
# from os import listdir, getcwd, rename, remove
# from os.path import isfile, join

# path = join(getcwd(), 'Soundfont Creation', 'freepats-tools', 'samples')

# files = [f for f in listdir(path) if isfile(join(path, f)) and f[0] != '.']

# samples = []
# for f in files:
#     #print(f.split('.'))
#     sample = {'instrument' : f.split('.')[0],
#               'note' : f.split('.')[-3],
#               'midi_number' : note_name_to_midi_number(f.split('.')[-3]),
#               'expression' : '_'.join(f.split('.')[1:-3]) if len(f.split('.')) > 3 else '',
#               'filename' : f,
#               'extension' : f.split('.')[-1]}
#     samples.append(sample)
# for s in sorted(samples, key=lambda x: x['midi_number']):
#     print(s)
#     print('')

In [3]:
import soundfile as sf
from librosa.onset import onset_detect
from librosa.core import to_mono

from os import listdir, getcwd, rename, remove, makedirs
from os.path import isfile, join, basename, exists

def preprocess_sample_onsets(sample_info):
    # extract sample information
    input_filename = sample_info['filename']
    instrument = sample_info['instrument']
    expression = sample_info['expression']
    midi_number = sample_info['midi_number']
    
    # read sample from storage
    input_data, samplerate = sf.read(input_filename)
    
    # detect onset with librosa, then clip input data to start of onset with backtracking
    onsets = onset_detect(y=to_mono(input_data.T), sr=samplerate, hop_length=1, backtrack=True, units='samples')
    if len(onsets) > 0:
        onset = onsets[0]
    else:
        onset = 0
    output_data = input_data[onset:,:]
    
    # create directory for preprocessed samples if does not already exist
    if not exists('preprocessed_samples'):
        makedirs('preprocessed_samples')
    
    # write preprocessed sample as 24-bit PCM wave file
    output_filename = join(getcwd(), 'preprocessed_samples', f"{instrument}_{expression}_{midi_number}.wav")
    print(f"Writing {output_filename} with offset {onset}")
    sf.write(output_filename, output_data, samplerate, subtype='PCM_24')
    
    # return MySQL query and values (in this case, nothing)
    return None, None

In [4]:
import config

import mysql
import mysql.connector
from mysql.connector import errorcode

import logging
import boto3
from botocore.exceptions import ClientError

def connect(db_name=None):
    if db_name:
        cnx = mysql.connector.connect(
            host = config.rds_host,
            user = config.rds_user,
            passwd = config.rds_password,
            database = db_name)
    else:
        cnx = mysql.connector.connect(
            host = config.rds_host,
            user = config.rds_user,
            passwd = config.rds_password)
    cursor = cnx.cursor()
    return cursor, cnx

def download_file(object_name, bucket='instrument-samples-1337', file_name=None):
    """Download a file from an S3 bucket

    :param object_name: S3 object to download
    :param bucket: Bucket to download from
    :param file_name: file name to download to. If not specified then object_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if file_name is None:
        file_name = object_name

    # Upload the file
    s3_client = boto3.client('s3',
                             aws_access_key_id=config.aws_access_key_id,
                             aws_secret_access_key=config.aws_secret_access_key)
    try:
        response = s3_client.download_file(bucket, object_name, file_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [5]:
from os import listdir, getcwd, rename, remove, makedirs
from os.path import isfile, join, basename, exists
import pandas as pd

def process_sample(sample_id, processing_function):
    cur, cnx = connect('sound_shift')
    cur.execute(f"""SELECT * FROM sound_shift.samples
                    WHERE sample_id = {sample_id}""")
    df = pd.DataFrame(cur.fetchall())
    df.columns = [i[0] for i in cur.description]
    sample_metadata = df.iloc[0]
    print('Attemping to download the following sample:')
    print(sample_metadata)
    sample_info = {'id' : int(sample_metadata['sample_id']),
                   'instrument' : sample_metadata['instrument_name'],
                   'note' : sample_metadata['note'],
                   'midi_number' : note_name_to_midi_number(sample_metadata['note']),
                   'expression' : '_'.join(sample_metadata['expression'].split(' ')),
                   'filename' : join(getcwd(), 'tempfiles', sample_metadata['file_name']),
                   'object_name' : f"{sample_metadata['sample_id']}.{sample_metadata['file_extension']}",
                   'extension' : sample_metadata['file_extension']}
    sample_id = sample_info['id']
    sample_file_ext = sample_info['extension']
    object_name = sample_info['object_name']
    file_name = sample_info['filename']
    print(object_name)
    if not exists('tempfiles'):
        makedirs('tempfiles')
    if download_file(object_name, file_name=file_name):
        print(f'Processing sample with {processing_function} ...')
        query, values = processing_function(sample_info)
        if query:
            cur.executemany(query, values)
        remove(file_name)
        cnx.commit()
        cnx.close()
        print('Done!')
        print('')
    else:
        print('Download failed.')
        print('')
        cnx.close()

In [None]:
import gc

for i in range(2219,2230):
    process_sample(i, preprocess_sample_onsets)
    gc.collect()

#### Alternative multithreaded preprocessing

In [6]:
# Alternative Multithreaded code to perform preprocessing of samples.
# This is naive multithreaded code, so use with caution. Try using
# relatively small batch sizes so that you can catch failures early.

import _thread as thread
import gc

def batch_preprocess_by_sample_index(index_list):
    for i in index_list:
        process_sample(i, preprocess_sample_onsets)
        gc.collect()
    print(f'COMPLETED PREPROCESSING SAMPLES IN {index_list}')

    
# Example of processing four batches in parallel

thread.start_new_thread(batch_preprocess_by_sample_index, (list(range(2047,2100)),) )
thread.start_new_thread(batch_preprocess_by_sample_index, (list(range(2289,2300)),) )

123145544892416

Attemping to download the following sample:
sample_id                                         2047
instrument_name                             Vibraphone
note                                                B5
expression                                  sustain ff
source                                        Iowa2012
file_name          Vibraphone.sustain.ff.B5.stereo.aif
file_extension                                     aif
Name: 0, dtype: object
2047.aif
Attemping to download the following sample:
sample_id                                  2289
instrument_name                           bells
note                                         C6
expression                             brass ff
source                                 Iowa2012
file_name          bells.brass.ff.C6.stereo.aif
file_extension                              aif
Name: 0, dtype: object
2289.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Processing sample with <function preprocess_sampl

Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/bells_brass_ff_108.wav with offset 5574
Done!

Attemping to download the following sample:
sample_id                                   2298
instrument_name                            bells
note                                         Ab5
expression                              brass ff
source                                  Iowa2012
file_name          bells.brass.ff.Ab5.stereo.aif
file_extension                               aif
Name: 0, dtype: object
2298.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/bells_brass_ff_80.wav with offset 3740
Done!

Attemping to download the following sample:
sample_id                                  2299
instrument_name                           bells
note                          

Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_sustain_ff_53.wav with offset 8621
Done!

Attemping to download the following sample:
sample_id                                         2062
instrument_name                             Vibraphone
note                                                E4
expression                                  sustain ff
source                                        Iowa2012
file_name          Vibraphone.sustain.ff.E4.stereo.aif
file_extension                                     aif
Name: 0, dtype: object
2062.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_sustain_ff_64.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id                                              2063
instrument_

Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_56.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id                                               2074
instrument_name                                   Vibraphone
note                                                     Ab4
expression                                   shortsustain ff
source                                              Iowa2012
file_name          Vibraphone.shortsustain.ff.Ab4.stereo.aif
file_extension                                           aif
Name: 0, dtype: object
2074.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_68.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id          

Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_77.wav with offset 2538
Done!

Attemping to download the following sample:
sample_id                                              2086
instrument_name                                  Vibraphone
note                                                     G4
expression                                  shortsustain ff
source                                             Iowa2012
file_name          Vibraphone.shortsustain.ff.G4.stereo.aif
file_extension                                          aif
Name: 0, dtype: object
2086.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_67.wav with offset 3032
Done!

Attemping to download the following sample:
sample_id                 

Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_69.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id                                               2098
instrument_name                                   Vibraphone
note                                                     Db6
expression                                   shortsustain ff
source                                              Iowa2012
file_name          Vibraphone.shortsustain.ff.Db6.stereo.aif
file_extension                                           aif
Name: 0, dtype: object
2098.aif
Processing sample with <function preprocess_sample_onsets at 0x109911158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_shortsustain_ff_85.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id          

In [72]:
cur, cnx = connect('sound_shift')
cur.execute(f"""SELECT * FROM sound_shift.samples""")
df = pd.DataFrame(cur.fetchall())
df.columns = [i[0] for i in cur.description]
cnx.close()

In [73]:
def g(row):
    result = row['instrument_name']
    result += '_' + '_'.join(row['expression'].split())
    note_number = note_name_to_midi_number(row['note'])
    result += '_' + str(note_number) + '.wav'
    return result

In [74]:
df['filename'] = df.apply(g, axis=1)

In [75]:
df

Unnamed: 0,sample_id,instrument_name,note,expression,source,file_name,file_extension,filename
0,1,Flute,Gb6,vib ff,Iowa2012,Flute.vib.ff.Gb6.stereo.aif,aif,Flute_vib_ff_90.wav
1,2,Flute,Bb5,vib ff,Iowa2012,Flute.vib.ff.Bb5.stereo.aif,aif,Flute_vib_ff_82.wav
2,3,Flute,G4,vib ff,Iowa2012,Flute.vib.ff.G4.stereo.aif,aif,Flute_vib_ff_67.wav
3,4,Flute,Eb5,vib ff,Iowa2012,Flute.vib.ff.Eb5.stereo.aif,aif,Flute_vib_ff_75.wav
4,5,Flute,A6,vib ff,Iowa2012,Flute.vib.ff.A6.stereo.aif,aif,Flute_vib_ff_93.wav
...,...,...,...,...,...,...,...,...
2228,2333,Crotale,Gb6,ff,Iowa2012,Crotale.ff.Gb6.stereo.aif,aif,Crotale_ff_90.wav
2229,2334,Crotale,A7,ff,Iowa2012,Crotale.ff.A7.stereo.aif,aif,Crotale_ff_105.wav
2230,2335,Crotale,E6,ff,Iowa2012,Crotale.ff.E6.stereo.aif,aif,Crotale_ff_88.wav
2231,2336,Crotale,Ab6,ff,Iowa2012,Crotale.ff.Ab6.stereo.aif,aif,Crotale_ff_92.wav


In [76]:
from os import listdir, getcwd, rename, remove
from os.path import isfile, join

path = join(getcwd(), 'preprocessed_samples')

files = [f for f in listdir(path) if isfile(join(path, f)) and f[0] != '.']


In [77]:
df['processed'] = df.apply(lambda x: (x['filename'] in files), axis=1)

In [78]:
files2 = list(df['filename'])
len(set(files2))

2233

In [79]:
len(files2)

2233

In [80]:
[(files2[i],files2[j]) for i in range(2275) for j in range(2275) if files2[i] == files2[j] and i != j]

IndexError: list index out of range

In [81]:
len(files)

2233

In [82]:
len(df)

2233

In [50]:
df.iloc[810:830,:]

Unnamed: 0,sample_id,instrument_name,note,expression,source,file_name,file_extension,filename,processed
810,811,Violin,B5,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.B5.stereo.aif,aif,Violin_pizz_ff_sulA_83.wav,True
811,812,Violin,Ab5,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.Ab5.stereo.aif,aif,Violin_pizz_ff_sulA_80.wav,True
812,813,Violin,E6,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.E6.stereo.aif,aif,Violin_pizz_ff_sulA_88.wav,True
813,814,Violin,D6,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.D6.stereo.aif,aif,Violin_pizz_ff_sulA_86.wav,True
814,815,Violin,Db6,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.Db6.stereo.aif,aif,Violin_pizz_ff_sulA_85.wav,True
815,816,Violin,F6,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.F6.stereo.aif,aif,Violin_pizz_ff_sulA_89.wav,True
816,817,Violin,A5,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.A5.stereo.aif,aif,Violin_pizz_ff_sulA_81.wav,True
817,818,Violin,Bb4,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.Bb4.stereo.aif,aif,Violin_pizz_ff_sulA_70.wav,True
818,819,Violin,D5,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.D5.stereo.aif,aif,Violin_pizz_ff_sulA_74.wav,True
819,820,Violin,C6,pizz ff sulA,Iowa2012,Violin.pizz.ff.sulA.C6.stereo.aif,aif,Violin_pizz_ff_sulA_84.wav,True
