In [1]:
def note_name_to_midi_number(note_name):
    """Convert a note name such as 'C4', 'Db3' or 'F#-1' to its MIDI note number.

    A note name is a capital note letter, optionally followed by '#' or 'b',
    followed by an integer octave. Octave numbering follows the convention
    used by the formula below: 'C0' is 12, 'C4' is 60 and 'G9' is 127, so
    octave -1 covers MIDI numbers 0 through 11.

    :param note_name: note name string, e.g. 'Ab4'
    :return: integer MIDI note number
    :raises KeyError: if the pitch part is not one of the recognised spellings
    :raises ValueError: if the name does not end in an integer octave
    """
    note_number = {'C' : 0,
                   'C#' : 1, 'Db': 1,
                   'D' : 2,
                   'D#' : 3, 'Eb' : 3,
                   'E' : 4,
                   'F' : 5,
                   'F#' : 6, 'Gb': 6,
                   'G' : 7,
                   'G#': 8, 'Ab': 8,
                   'A' : 9,
                   'A#' : 10, 'Bb': 10,
                   'B' : 11}

    # Split off the whole trailing run of digits rather than only the last
    # character, so multi-digit octaves are read correctly.
    note_letter = note_name.rstrip('0123456789')
    octave_text = note_name[len(note_letter):]

    # A '-' directly before the digits is the sign of the octave (e.g. 'C-1'),
    # not part of the pitch spelling.
    if note_letter.endswith('-'):
        note_letter = note_letter[:-1]
        octave_text = '-' + octave_text

    octave_number = int(octave_text)
    return 12*(1+octave_number) + note_number[note_letter]

def note_midi_number_to_name(note_midi_number):
    """Convert a MIDI note number to a note name, spelling accidentals as flats.

    MIDI numbers must be integers; the documented input range is 12 through
    127 (12 -> 'C0', 60 -> 'C4', 127 -> 'G9').

    :param note_midi_number: integer MIDI note number
    :return: note name string such as 'Db3'
    """
    # One divmod gives both the 12-semitone block and the position inside it.
    octave_block, pitch_class = divmod(note_midi_number, 12)
    flat_spellings = dict(enumerate(
        ('C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B')))
    # Block 0 holds MIDI 0-11, which this naming scheme calls octave -1.
    return flat_spellings[pitch_class] + str(octave_block - 1)

In [2]:
# from os import listdir, getcwd, rename, remove
# from os.path import isfile, join

# path = join(getcwd(), 'Soundfont Creation', 'freepats-tools', 'samples')

# files = [f for f in listdir(path) if isfile(join(path, f)) and f[0] != '.']

# samples = []
# for f in files:
#     #print(f.split('.'))
#     sample = {'instrument' : f.split('.')[0],
#               'note' : f.split('.')[-3],
#               'midi_number' : note_name_to_midi_number(f.split('.')[-3]),
#               'expression' : '_'.join(f.split('.')[1:-3]) if len(f.split('.')) > 3 else '',
#               'filename' : f,
#               'extension' : f.split('.')[-1]}
#     samples.append(sample)
# for s in sorted(samples, key=lambda x: x['midi_number']):
#     print(s)
#     print('')

In [3]:
import soundfile as sf
from librosa.onset import onset_detect
from librosa.core import to_mono

from os import listdir, getcwd, rename, remove, makedirs
from os.path import isfile, join, basename, exists

def preprocess_sample_onsets(sample_info):
    """Trim a sample so it starts at its first detected onset and save it as WAV.

    Reads the audio file named by ``sample_info['filename']``, detects onsets
    with librosa (with backtracking), drops everything before the first onset
    and writes the remainder as a 24-bit PCM wave file to
    ``<cwd>/preprocessed_samples/<instrument>_<expression>_<midi_number>.wav``.
    If no onset is detected the audio is written untrimmed.

    :param sample_info: dict with keys 'filename', 'instrument', 'expression'
        and 'midi_number'
    :return: ``(None, None)`` -- the (query, values) pair expected by the
        caller; this step has no database update to perform
    """
    # extract sample information
    input_filename = sample_info['filename']
    instrument = sample_info['instrument']
    expression = sample_info['expression']
    midi_number = sample_info['midi_number']

    # read sample from storage
    input_data, samplerate = sf.read(input_filename)

    # detect onset with librosa, then clip input data to start of onset with backtracking
    onsets = onset_detect(y=to_mono(input_data.T), sr=samplerate, hop_length=1, backtrack=True, units='samples')
    onset = onsets[0] if len(onsets) > 0 else 0

    # Slice along the first (frame) axis only: a mono file is read as a 1-D
    # array, for which a two-axis index such as [onset:, :] raises IndexError.
    output_data = input_data[onset:]

    # create directory for preprocessed samples if it does not already exist;
    # exist_ok avoids a check-then-create race when several threads run this
    output_directory = join(getcwd(), 'preprocessed_samples')
    makedirs(output_directory, exist_ok=True)

    # write preprocessed sample as 24-bit PCM wave file
    output_filename = join(output_directory, f"{instrument}_{expression}_{midi_number}.wav")
    print(f"Writing {output_filename} with offset {onset}")
    sf.write(output_filename, output_data, samplerate, subtype='PCM_24')

    # return MySQL query and values (in this case, nothing)
    return None, None

In [4]:
import config

import mysql
import mysql.connector
from mysql.connector import errorcode

import logging
import boto3
from botocore.exceptions import ClientError

def connect(db_name=None):
    """Open a MySQL connection using the credentials in ``config``.

    :param db_name: optional database to select; when falsy the connection is
        opened without a default database
    :return: ``(cursor, connection)`` tuple
    """
    connection_args = {'host': config.rds_host,
                       'user': config.rds_user,
                       'passwd': config.rds_password}
    # Only pass a database when one was actually requested.
    if db_name:
        connection_args['database'] = db_name
    connection = mysql.connector.connect(**connection_args)
    return connection.cursor(), connection

def download_file(object_name, bucket='instrument-samples-1337', file_name=None):
    """Download a file from an S3 bucket

    :param object_name: S3 object to download
    :param bucket: Bucket to download from
    :param file_name: Local file name to download to. If not specified then object_name is used
    :return: True if the file was downloaded, False if a ClientError occurred
        (the error is logged)
    """

    # If the local file_name was not specified, use the S3 object_name
    if file_name is None:
        file_name = object_name

    # Download the file
    s3_client = boto3.client('s3',
                             aws_access_key_id=config.aws_access_key_id,
                             aws_secret_access_key=config.aws_secret_access_key)
    try:
        s3_client.download_file(bucket, object_name, file_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [5]:
from os import listdir, getcwd, rename, remove, makedirs
from os.path import isfile, join, basename, exists
import pandas as pd

def process_sample(sample_id, processing_function):
    """Fetch one sample's metadata and audio, then run a processing step on it.

    Looks up ``sample_id`` in ``sound_shift.samples``, downloads the S3 object
    ``<sample_id>.<file_extension>`` into ``<cwd>/tempfiles/<file_name>`` and
    calls ``processing_function(sample_info)``. The function must return a
    ``(query, values)`` pair; when ``query`` is truthy it is run with
    ``executemany`` and committed. The temporary file is removed and the
    database connection closed whether or not processing succeeds.

    :param sample_id: integer id of the row in ``sound_shift.samples``
    :param processing_function: callable taking the ``sample_info`` dict
        (keys 'id', 'instrument', 'note', 'midi_number', 'expression',
        'filename', 'object_name', 'extension') and returning (query, values)
    :return: None
    :raises ValueError: if no row exists for ``sample_id``
    """
    cur, cnx = connect('sound_shift')
    try:
        # Let the driver bind the id instead of formatting it into the SQL text.
        cur.execute("""SELECT * FROM sound_shift.samples
                       WHERE sample_id = %s""", (int(sample_id),))
        rows = cur.fetchall()
        if not rows:
            # Fail with a clear message instead of an opaque column-length
            # mismatch from pandas on an empty result.
            raise ValueError(f'No sample found with sample_id {sample_id}')
        df = pd.DataFrame(rows, columns=[i[0] for i in cur.description])
        sample_metadata = df.iloc[0]
        print('Attemping to download the following sample:')
        print(sample_metadata)
        sample_info = {'id' : int(sample_metadata['sample_id']),
                       'instrument' : sample_metadata['instrument_name'],
                       'note' : sample_metadata['note'],
                       'midi_number' : note_name_to_midi_number(sample_metadata['note']),
                       'expression' : '_'.join(sample_metadata['expression'].split(' ')),
                       'filename' : join(getcwd(), 'tempfiles', sample_metadata['file_name']),
                       'object_name' : f"{sample_metadata['sample_id']}.{sample_metadata['file_extension']}",
                       'extension' : sample_metadata['file_extension']}
        object_name = sample_info['object_name']
        file_name = sample_info['filename']
        print(object_name)

        # exist_ok avoids a check-then-create race when batches run in threads.
        makedirs('tempfiles', exist_ok=True)

        if not download_file(object_name, file_name=file_name):
            print('Download failed.')
            print('')
            return

        try:
            print(f'Processing sample with {processing_function} ...')
            query, values = processing_function(sample_info)
            if query:
                cur.executemany(query, values)
            cnx.commit()
        finally:
            # Always discard the downloaded temp file, even if processing failed.
            if exists(file_name):
                remove(file_name)
        print('Done!')
        print('')
    finally:
        # Release the connection on every path, including errors.
        cnx.close()

In [7]:
import gc

# Preprocess samples 2219 through 2229 one at a time with the onset-trimming step.
for i in range(2219,2230):
    process_sample(i, preprocess_sample_onsets)
    # Run a garbage-collection pass after each sample
    # (presumably to release the audio buffers early -- TODO confirm it is needed).
    gc.collect()

Attemping to download the following sample:
sample_id                                   2219
instrument_name                       Vibraphone
note                                         Db3
expression                                   bow
source                                  Iowa2012
file_name          Vibraphone.bow.Db3.stereo.aif
file_extension                               aif
Name: 0, dtype: object
2219.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_bow_49.wav with offset 8358
Done!

Attemping to download the following sample:
sample_id                                  2220
instrument_name                      Vibraphone
note                                         D4
expression                                  bow
source                                 Iowa2012
file_name          Vibraphone.bow.D4.stereo.aif
file_extension                              

#### Alternative multithreaded preprocessing

In [8]:
# Alternative Multithreaded code to perform preprocessing of samples.
# This is naive multithreaded code, so use with caution. Try using
# relatively small batch sizes so that you can catch failures early.

import _thread as thread
import gc

def batch_preprocess_by_sample_index(index_list):
    """Run onset preprocessing on every sample id in ``index_list``, in order.

    Samples are handled one after another; an exception from any sample
    propagates and stops the rest of the batch. A completion message naming
    the batch is printed once every sample has been processed.

    :param index_list: iterable of sample ids to preprocess
    :return: None
    """
    for sample_index in index_list:
        process_sample(sample_index, preprocess_sample_onsets)
        # Collect garbage after each sample before moving to the next one.
        gc.collect()
    completion_message = f'COMPLETED PREPROCESSING SAMPLES IN {index_list}'
    print(completion_message)

    
# Example of processing two batches in parallel

# Each call starts a background thread that runs one batch of sample ids.
# start_new_thread returns the new thread's identifier; as the last expression
# in the cell, the second call's identifier is what the notebook displays.
# NOTE(review): nothing here waits for the threads to finish or reports their
# exceptions back to the cell -- watch the printed output for failures.
thread.start_new_thread(batch_preprocess_by_sample_index, (list(range(2000,2100)),) )
thread.start_new_thread(batch_preprocess_by_sample_index, (list(range(2230,2300)),) )

123145572655104

Attemping to download the following sample:
sample_id                                  2230
instrument_name                      Vibraphone
note                                         D6
expression                                  bow
source                                 Iowa2012
file_name          Vibraphone.bow.D6.stereo.aif
file_extension                              aif
Name: 0, dtype: object
2230.aif
Attemping to download the following sample:
sample_id                                               2000
instrument_name                                    Xylophone
note                                                     Ab4
expression                                  rosewood roll ff
source                                              Iowa2012
file_name          Xylophone.rosewood.roll.ff.Ab4.stereo.aif
file_extension                                           aif
Name: 0, dtype: object
2000.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Process

Attemping to download the following sample:
sample_id                                              2010
instrument_name                                   Xylophone
note                                                     D6
expression                                 rosewood roll ff
source                                             Iowa2012
file_name          Xylophone.rosewood.roll.ff.D6.stereo.aif
file_extension                                          aif
Name: 0, dtype: object
2010.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Xylophone_rosewood_roll_ff_86.wav with offset 1217
Done!

Attemping to download the following sample:
sample_id                                              2011
instrument_name                                   Xylophone
note                                                     B4
expression                                 rosewood roll ff
sou

Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Xylophone_rosewood_roll_ff_90.wav with offset 1066
Done!

Attemping to download the following sample:
sample_id                                              2020
instrument_name                                   Xylophone
note                                                     E5
expression                                 rosewood roll ff
source                                             Iowa2012
file_name          Xylophone.rosewood.roll.ff.E5.stereo.aif
file_extension                                          aif
Name: 0, dtype: object
2020.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Xylophone_rosewood_roll_ff_76.wav with offset 1389
Done!

Attemping to download the following sample:
sample_id                 

Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_sustain_ff_86.wav with offset 2751
Done!

Attemping to download the following sample:
sample_id                                         2025
instrument_name                             Vibraphone
note                                                A3
expression                                  sustain ff
source                                        Iowa2012
file_name          Vibraphone.sustain.ff.A3.stereo.aif
file_extension                                     aif
Name: 0, dtype: object
2025.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/bells_plastic_ff_105.wav with offset 1989
Done!

Attemping to download the following sample:
sample_id                                    2242
instrument_name          

Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/bells_plastic_ff_92.wav with offset 4813
Done!

Attemping to download the following sample:
sample_id                                    2250
instrument_name                             bells
note                                           B5
expression                             plastic ff
source                                   Iowa2012
file_name          bells.plastic.ff.B5.stereo.aif
file_extension                                aif
Name: 0, dtype: object
2250.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_sustain_ff_85.wav with offset 2118
Done!

Attemping to download the following sample:
sample_id                                         2030
instrument_name                             Vibraphone
n

Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/bells_plastic_ff_95.wav with offset 2719
Done!

Attemping to download the following sample:
sample_id                                     2258
instrument_name                              bells
note                                           Ab5
expression                              plastic ff
source                                    Iowa2012
file_name          bells.plastic.ff.Ab5.stereo.aif
file_extension                                 aif
Name: 0, dtype: object
2258.aif
Processing sample with <function preprocess_sample_onsets at 0x10f31f158> ...
Writing /Users/flatrionschool/Flatiron/Projects/sound-shift/preprocessed_samples/Vibraphone_sustain_ff_67.wav with offset 1024
Done!

Attemping to download the following sample:
sample_id                                          2035
instrument_name                              Vib