In [1]:
# Loosely inspired by: https://github.com/anbrjohn/BachMaker/blob/master/get_training_data.py
# Time-out code: https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time


# Code to generate a corpus of midi files by scraping from a webpage and augmenting the data set


from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen, urlretrieve
from music21 import converter, interval, pitch
from music21.midi import MidiException
import time
import os
import signal
import sys
import warnings


# Silence every warning category, but only when the user has not supplied
# their own warning options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter('ignore')


# Custom exception class for time-out
class TimeoutException(Exception):
    """Signals that a time limit expired before the guarded work finished."""

# Custom signal handler
def timeout_handler(signum, frame):
    """Signal handler: convert the delivered signal into a TimeoutException.

    Args:
        signum: Number of the signal being handled (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        TimeoutException: Always.
    """
    raise TimeoutException()
    
# Install timeout_handler for SIGALRM so that an alarm armed with
# signal.alarm(n) raises TimeoutException in the running code.
# NOTE(review): this is a module-level side effect that replaces any SIGALRM
# handler installed earlier; SIGALRM is a Unix-only signal.
signal.signal(signal.SIGALRM, timeout_handler)


# Scraping function returns list of files matching arguments
def scrape(webpage, extension='.mid'):
    """Collect the link targets on a web page that end with `extension`.

    Args:
        webpage: URL of the HTML page to fetch.
        extension: Suffix an href must end with to be kept. An empty string
            keeps every href.

    Returns:
        A list of href strings in document order, exactly as written in the
        page (relative links are not resolved against `webpage`).
    """
    # Request html; the context manager closes the connection even if read() fails
    with urlopen(webpage) as response:
        html = response.read()

    # bs4 names the filter argument `parse_only` (`parseOnlyThese` is the
    # BeautifulSoup 3 spelling). The parser is still left to bs4's default
    # choice, as before.
    soup = BeautifulSoup(html, parse_only=SoupStrainer('a'))

    # find_all only yields <a> tags that carry an href, so stray non-tag nodes
    # in the parsed document (which have no attributes) are never touched.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith(extension)
    ]


# Downloads files from list if they don't already exist in specified dest_dir
def download(prefix, dest_dir, files, delay=0):
    """Fetch each listed file into `dest_dir`, skipping ones already present.

    Args:
        prefix: String prepended to every entry of `files` to form its URL.
        dest_dir: Directory the files are saved into; created if missing.
        files: List of link names, e.g. as returned by `scrape`. Any '/' in
            a name becomes '_' in the local file name so that everything
            lands flat in `dest_dir`.
        delay: Seconds to sleep after each actual download.

    A progress line is printed for every entry, including ones that were
    skipped because the local file already existed.
    """
    # urlretrieve cannot write into a directory that does not exist yet.
    # An empty dest_dir means "current directory" and needs no creation.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    total = len(files)
    for i, link in enumerate(files, start=1):
        url = prefix + link
        new_file = os.path.join(dest_dir, link.replace('/', '_'))
        if not os.path.exists(new_file):
            urlretrieve(url, new_file)
            time.sleep(delay)
        print("Downloaded file", i, "out of", total)
        

# Augments midis in source_dir and saves them to dest_dir based on a list of intervals
def augment_midis(source_dir, dest_dir, intervals=('P1', 'M2', 'M3', 'M6', 'M7', 'P4', 'P5'), timeout=30):
    """Write a transposed copy of every MIDI in `source_dir` for each interval.

    Args:
        source_dir: Directory scanned (non-recursively) for '*.mid' files.
        dest_dir: Directory receiving '<name>_<interval>.mid'; created if
            missing.
        intervals: Interval names, each passed to the parsed score's
            `transpose` and used in the output file name.
        timeout: Seconds allowed per source file (parse plus all of its
            transpositions) before it is abandoned.

    Outputs that already exist are left alone, so an interrupted run can be
    resumed. Files that time out or raise MidiException/IndexError are
    reported and skipped.
    """
    os.makedirs(dest_dir, exist_ok=True)

    # Iterate over midis in directory
    midi_list = [file for file in os.listdir(source_dir) if file.endswith('.mid')]
    total = len(midi_list)
    for i, file in enumerate(midi_list, start=1):
        stem = os.path.splitext(os.path.basename(file))[0]
        try:
            # Set time-out alarm (seconds) in case transposing is taking too long
            signal.alarm(timeout)
            try:
                s1 = converter.parse(os.path.join(source_dir, file))
                for step in intervals:
                    augmented_midi = os.path.join(dest_dir, stem + '_' + step + '.mid')
                    # Only augment if it hasn't already been done
                    if not os.path.exists(augmented_midi):
                        s2 = s1.transpose(step)
                        s2.write('midi', augmented_midi)
            finally:
                # Always disarm: an alarm left pending after a failure would
                # fire later, outside any handler for TimeoutException.
                signal.alarm(0)
        except TimeoutException:
            print("Time-out augmenting file", i, "out of", total, "(", file, ")")
        except (MidiException, IndexError):
            print("Exception augmenting file", i, "out of", total, "(", file, ")")
        else:
            print("Augmented file", i, "out of", total)

# Normalize midis in source_dir to a consistent key
def normalize_midis(source_dir, dest_dir, final_pitch='C', timeout=30):
    """Transpose every MIDI in `source_dir` so its analysed tonic is `final_pitch`.

    Args:
        source_dir: Directory scanned (non-recursively) for '*.mid' files.
        dest_dir: Directory receiving '<name>_<final_pitch>.mid'; created if
            missing.
        final_pitch: Pitch name the analysed key's tonic is moved onto. Only
            the tonic is aligned; the result of the key analysis is not
            otherwise used.
        timeout: Seconds allowed per file before it is abandoned.

    Outputs that already exist are left alone, so an interrupted run can be
    resumed. Files that time out or raise MidiException/IndexError are
    reported and skipped.
    """
    os.makedirs(dest_dir, exist_ok=True)

    # Iterate over midis in directory
    midi_list = [file for file in os.listdir(source_dir) if file.endswith('.mid')]
    total = len(midi_list)
    for i, file in enumerate(midi_list, start=1):
        stem = os.path.splitext(os.path.basename(file))[0]
        normalized_midi = os.path.join(dest_dir, stem + '_' + final_pitch + '.mid')
        try:
            # Set time-out alarm (seconds) in case transposing is taking too long
            signal.alarm(timeout)
            try:
                # Only normalize if it hasn't already been done
                if not os.path.exists(normalized_midi):
                    s1 = converter.parse(os.path.join(source_dir, file))
                    s1_k = s1.analyze('key')
                    s1_i = interval.Interval(s1_k.tonic, pitch.Pitch(final_pitch))
                    s2 = s1.transpose(s1_i)
                    s2.write('midi', normalized_midi)
            finally:
                # Always disarm: an alarm left pending after a failure would
                # fire later, outside any handler for TimeoutException.
                signal.alarm(0)
        except TimeoutException:
            print("Time-out normalizing file", i, "out of", total, "(", file, ")")
        except (MidiException, IndexError):
            print("Exception normalizing file", i, "out of", total, "(", file, ")")
        else:
            print("Normalized file", i, "out of", total)
            
            
# Location of download links
webpage = "http://www.bachcentral.com/midiindexcomplete.html"

# What all the download links begin with
file_prefix = "http://www.bachcentral.com/"

# Destination directory for the downloaded midis
output = './bach_midis/'

# Destination directory for augmented dataset
augmented = './bach_midis/augmented/'

# Destination directory for normalized dataset
normalized = './bach_midis/normalized/'


# Run the pipeline: list the links, download them (pausing 2 seconds after
# each fetch), then write the augmented and normalized copies with a
# 90-second limit per file.
corpus = scrape(webpage)
download(file_prefix, output, corpus, 2)
augment_midis(output, augmented, timeout=90)
normalize_midis(output, normalized, timeout=90)
print('Complete!')

Downloaded file 1 out of 223
Downloaded file 2 out of 223
Downloaded file 3 out of 223
Downloaded file 4 out of 223
Downloaded file 5 out of 223
Downloaded file 6 out of 223
Downloaded file 7 out of 223
Downloaded file 8 out of 223
Downloaded file 9 out of 223
Downloaded file 10 out of 223
Downloaded file 11 out of 223
Downloaded file 12 out of 223
Downloaded file 13 out of 223
Downloaded file 14 out of 223
Downloaded file 15 out of 223
Downloaded file 16 out of 223
Downloaded file 17 out of 223
Downloaded file 18 out of 223
Downloaded file 19 out of 223
Downloaded file 20 out of 223
Downloaded file 21 out of 223
Downloaded file 22 out of 223
Downloaded file 23 out of 223
Downloaded file 24 out of 223
Downloaded file 25 out of 223
Downloaded file 26 out of 223
Downloaded file 27 out of 223
Downloaded file 28 out of 223
Downloaded file 29 out of 223
Downloaded file 30 out of 223
Downloaded file 31 out of 223
Downloaded file 32 out of 223
Downloaded file 33 out of 223
Downloaded file 34 

In [2]:
# Loosely inspired by: https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5


# From a corpus of midi files, generate training tokens for MaskGAN


from music21 import converter, instrument, note, chord
from music21.midi import MidiException
import time
import os
import signal
import sys
import warnings


# Silence every warning category, but only when the user has not supplied
# their own warning options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter('ignore')

            
# Tokenize midis in source_dir for training sequence model
def tokenize_midis(source_dir, dest_file, timeout=30, chordify=True):
    """Write one line of space-separated note/chord tokens per MIDI file.

    Args:
        source_dir: Directory scanned (non-recursively) for '*.mid' files.
        dest_file: Text file to (over)write with the tokens.
        timeout: Seconds allowed per file before it is abandoned.
        chordify: If true, call `chordify()` on the parsed score before
            extracting tokens.

    A note becomes str(pitch); a chord becomes its pitches (name + octave)
    joined with '.'. Files that time out or raise MidiException/IndexError
    are reported and contribute no line. Finally the output path and the
    number of distinct tokens in it are printed.
    """
    # Iterate over midis in directory
    midi_list = [file for file in os.listdir(source_dir) if file.endswith('.mid')]
    total = len(midi_list)

    # The context manager closes the file even if an unexpected error escapes
    with open(dest_file, 'w') as outfile:
        for i, file in enumerate(midi_list, start=1):
            tokens = []
            try:
                # Set time-out alarm (seconds) in case tokenizing is taking too long
                signal.alarm(timeout)
                try:
                    s1 = converter.parse(os.path.join(source_dir, file))
                    if chordify:
                        s1 = s1.chordify()
                    parts = instrument.partitionByInstrument(s1)
                    if parts:  # file has instrument parts
                        notes_to_parse = parts.parts[0].recurse()
                    else:  # file has notes in a flat structure
                        notes_to_parse = s1.flat.notes
                    for element in notes_to_parse:
                        if isinstance(element, note.Note):
                            tokens.append(str(element.pitch))
                        elif isinstance(element, chord.Chord):
                            tokens.append('.'.join((p.name + str(p.octave)) for p in element.pitches))
                finally:
                    # Always disarm: an alarm left pending after a failure
                    # would fire later, outside any handler for
                    # TimeoutException.
                    signal.alarm(0)
            except TimeoutException:
                print("Time-out tokenizing file", i, "out of", total, "(", file, ")")
                continue
            except (MidiException, IndexError):
                print("Exception tokenizing file", i, "out of", total, "(", file, ")")
                continue

            print("Tokenized file", i, "out of", total)
            outfile.write(" ".join(tokens) + "\n")

    print('Tokens written to %s' % dest_file)
    # Count distinct tokens from what was actually written
    with open(dest_file) as infile:
        vocabulary = set(infile.read().split())
    print('Vocabulary size is %i' % len(vocabulary))


# Tokenize the key-normalized midis (chordified, 30-second limit per file)
# into a single corpus text file.
tokenize_midis('./bach_midis/normalized/', './bach_midis/normalized_corpus.txt', timeout=30, chordify=True)

Tokenized file 1 out of 214
Tokenized file 2 out of 214
Tokenized file 3 out of 214
Tokenized file 4 out of 214
Tokenized file 5 out of 214
Tokenized file 6 out of 214
Tokenized file 7 out of 214
Tokenized file 8 out of 214
Tokenized file 9 out of 214
Tokenized file 10 out of 214
Tokenized file 11 out of 214
Tokenized file 12 out of 214
Tokenized file 13 out of 214
Tokenized file 14 out of 214
Tokenized file 15 out of 214
Tokenized file 16 out of 214
Tokenized file 17 out of 214
Tokenized file 18 out of 214
Tokenized file 19 out of 214
Tokenized file 20 out of 214
Tokenized file 21 out of 214
Tokenized file 22 out of 214
Tokenized file 23 out of 214
Tokenized file 24 out of 214
Tokenized file 25 out of 214
Tokenized file 26 out of 214
Tokenized file 27 out of 214
Tokenized file 28 out of 214
Tokenized file 29 out of 214
Tokenized file 30 out of 214
Tokenized file 31 out of 214
Tokenized file 32 out of 214
Tokenized file 33 out of 214
Tokenized file 34 out of 214
Tokenized file 35 out o