# Filtering the clean_midi data set
In this notebook we filter out some files from the `clean_midi` data set. Our criteria are
* At most one tempo change event (the one that sets the initial tempo)
* At most one time signature change event (the one that sets the initial time signature)
* The time signature must be 4/4
* Loading the midi file using `pretty_midi` must succeed without warnings


In [9]:
import pretty_midi
import numpy as np
import collections
import os
import warnings 
# Turn every warning into an exception, so that a file which only loads
# with warnings raises inside a try/except just like a file that fails to load.
warnings.filterwarnings('error')

In [10]:
# Walk the raw data directory and gather the path of every file below it
dirName = os.path.join("..", "data_raw", "clean_midi") # path to raw data
listOfFiles = []
for (root, _subdirs, names) in os.walk(dirName):
    listOfFiles.extend(os.path.join(root, name) for name in names)

print("Number of files found: ", len(listOfFiles))

# Keep only '.mid' files whose name (extension stripped) does not end in a
# digit -- presumably numbered duplicates of the same song; TODO confirm
mfiles = [
    path for path in listOfFiles
    if path.endswith('.mid') and not os.path.splitext(path)[0][-1].isdigit()
]

print("Unique MIDI files found: ", len(mfiles))

Number of files found:  5051


In [5]:
# Checks if time signature is 4/4
def check_ts(ts):
    """Return True when ``ts`` has numerator 4 and denominator 4, else False.

    ``ts`` only needs to expose ``numerator`` and ``denominator`` attributes.
    """
    return bool(ts.numerator == 4 and ts.denominator == 4)

# Check if file satisfies our criteria
def check_file(filename):
    """Load ``filename`` with pretty_midi and evaluate the filter criteria.

    Parameters
    ----------
    filename : str
        Path of the MIDI file to inspect.

    Returns
    -------
    list of bool
        ``[tempo_ok, signature_count_ok, is_four_four]`` where
        * ``tempo_ok`` -- at most one entry in the first array returned by
          ``get_tempo_changes()``,
        * ``signature_count_ok`` -- at most one time signature change,
        * ``is_four_four`` -- the first time signature passes ``check_ts``.
        ``[False, False, False]`` is returned when the file cannot be loaded
        (any exception, including warnings that have been escalated to
        errors) or when it contains no time signature event at all.
    """
    try:
        # Use the function's own argument; the previous code read a global
        # named `fn`, which only worked by accident inside the driver loop.
        pm = pretty_midi.PrettyMIDI(filename)
        tempo_changes = pm.get_tempo_changes()[0]
        signatures = pm.time_signature_changes
    except Exception:
        # Deliberately broad: any load failure rejects the file. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt stop a long run.
        return [False, False, False]

    # A file without any time signature event is rejected outright (this
    # matches the earlier behaviour, where indexing the empty list failed).
    if not signatures:
        return [False, False, False]

    return [
        len(tempo_changes) <= 1,
        len(signatures) <= 1,
        bool(check_ts(signatures[0])),
    ]

# Function for naming
def create_fn(fn, d):
    """Build the output path for ``fn`` inside directory ``d``.

    The directory part of ``fn`` is dropped, the remaining file name is
    lower-cased and its spaces are replaced by underscores, and the result
    is joined onto ``d``.

    Parameters
    ----------
    fn : str
        Path of the source file.
    d : str
        Target directory.

    Returns
    -------
    str
        ``d`` joined with the normalised file name.

    Notes
    -----
    Only the base name is kept, so two source files with the same name in
    different directories map to the same output path.
    """
    # os.path.basename understands the platform's separator(s); splitting on
    # a literal "/" left the whole path intact for backslash-separated paths.
    base = os.path.basename(fn)
    return os.path.join(d, base.lower().replace(" ", "_"))

In [None]:
# Run the filtering (will take half an hour or so)
ok_files = []
location = os.path.join('..', 'clean_midi_filtered')
# Make sure the output directory exists before the long run starts, so the
# first accepted file does not fail on a missing target directory.
os.makedirs(location, exist_ok=True)

for j, fn in enumerate(mfiles, start=1):
    checks = check_file(fn)
    # A file is kept only when every criterion reported by check_file holds
    if all(checks):
        ok_files.append(fn)
        # check_file returns only flags, so the file is loaded again here
        # in order to write it out under its normalised name.
        pm = pretty_midi.PrettyMIDI(fn)
        new_name = create_fn(fn, location)
        pm.write(new_name)

    # Progress report: files processed so far, files accepted so far
    if j % 100 == 0:
        print(j, len(ok_files))
        