In [None]:
import pandas as pd

# Raise pandas' display limits so large frames and long ABC strings are
# truncated far less when a cell displays them.
pd.options.display.max_rows = 2000
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 500

# Load the unmodified tune data from the JSON file next to the notebook.
raw_tunes = pd.read_json("tunes.json")

In [None]:
# create new dataframe for cleaning tunes
tunes = raw_tunes.copy(deep=True)

print('initial length of tunes:', len(tunes))

# Remove line breaks and spaces in a single pass, so that every pattern below
# is matched against one contiguous string per tune.
tunes['abc'] = tunes['abc'].str.replace(r'[\n\r ]', '', regex=True)

# Each rule is (regex patterns, label). A tune is dropped as soon as any
# pattern of the rule matches its ABC; after the rule has been applied the
# label is printed together with the number of tunes that are left.
# All patterns are raw strings: a backslash-bracket in a normal string literal
# is an invalid escape sequence, which newer Python versions warn about.
drop_rules = [
    ([r'w:'], 'drop tunes with lyrics:'),
    ([r'z'], 'drop tunes with rests:'),
    ([r'!'], 'drop tunes with expression markings, codas etc.:'),
    ([r'[SOH]'], 'drop tunes with dal segno, coda, fermata:'),
    # NOTE(review): this also matches any other 2-5 characters enclosed in
    # square brackets, e.g. '[2d6|]' (an ending closed by '|]') - confirm that
    # dropping those tunes as well is intended.
    ([r'\[.{2,5}\]'], 'drop tunes with double stops:'),
    ([r'\[3', r'\[4'], 'drop tunes with third and fourth endings:'),
    ([r'[C-F],', r'[A-G],,'], 'drop tunes with notes lower than G,:'),
    ([r"[efgab]'", r"[a-g]''"], "drop tunes with notes higher than d':"),
    (
        [r"[A-G]'", r'[a-g],'],
        'drop tunes with capital letters and apostrophes; lowercase letters and commas:',
    ),
]

for patterns, label in drop_rules:
    for pattern in patterns:
        tunes = tunes[~tunes['abc'].str.contains(pattern, regex=True)]
    print(label, len(tunes))

In [None]:
### clean abc
# Regex patterns are raw strings: backslash-parenthesis / backslash-brace in a
# normal string literal is an invalid escape sequence, which newer Python
# versions warn about.

# remove up- and down-bows
tunes['abc'] = tunes['abc'].str.replace(r'[uv]', '', regex=True)

# remove chord symbols (shortest possible run between two double quotes)
tunes['abc'] = tunes['abc'].str.replace(r'".*?"', '', regex=True)
# any quote that survived had no partner
tunes = tunes[~tunes['abc'].str.contains('"', regex=False)]
print('drop tunes with mismatched quotation marks:', len(tunes))

# remove slurs, but protect triplet markings: delete every '(' that is not
# directly followed by '3', and every ')'. The negative lookahead does this in
# one pass and needs no temporary placeholder text inside the ABC string.
# NOTE(review): other tuplet markers such as '(2' or '(4' lose their '(' here
# and leave a bare digit behind - confirm such tunes do not occur or are
# handled elsewhere.
tunes['abc'] = tunes['abc'].str.replace(r'\((?!3)|\)', '', regex=True)


## remove ornaments
# remove turns, trills, accents, mordents, slides, rolls
tunes['abc'] = tunes['abc'].str.replace(r'[~TLMPJR]', '', regex=True)

# remove staccato (literal dot, therefore regex=False)
tunes['abc'] = tunes['abc'].str.replace('.', '', regex=False)

# remove grace notes
tunes['abc'] = tunes['abc'].str.replace(r'\{.*?\}', '', regex=True)
# Any brace still present is unmatched. The original check looked only for '}'
# (apparently the only kind present in the dataset); checking both braces also
# catches an unmatched '{' should one ever appear.
tunes = tunes[~tunes['abc'].str.contains(r'[{}]', regex=True)]
print('drop tunes with mismatched curly braces:', len(tunes))

In [None]:
def compare(setting_id: int):
    """Print the raw and the cleaned ABC for one setting, one after the other.

    Selects the rows whose ``setting_id`` column equals ``setting_id`` from the
    module-level ``raw_tunes`` and ``tunes`` frames and prints the matching
    ``abc`` values. Nothing is returned.
    """
    raw_match = raw_tunes.loc[raw_tunes['setting_id'] == setting_id, 'abc']
    print("raw:\n", raw_match)

    clean_match = tunes.loc[tunes['setting_id'] == setting_id, 'abc']
    print("clean:\n", clean_match)

In [None]:
# disabled scratch check: list the tunes whose abc still contains '/2'
# tunes.loc[tunes['abc'].str.contains('/2', regex=True)]

In [None]:
# Show a few random tunes for eyeballing the cleaned ABC.
# A fixed random_state keeps the displayed rows identical across
# "Restart & Run All"; change the value to look at different tunes.
# min() avoids the ValueError that sample() raises when fewer than
# 10 tunes are left.
samp = tunes.sample(n=min(10, len(tunes)), random_state=0)
samp

In [None]:
# custom tokens for tokenization - ordered so that larger tokens come before
# the shorter tokens they start with (e.g. 'G,' before 'G', '/8' before '/')

# a note token is: optional accidental + pitch + optional duration
accidentals = ['_', '=', '^', '']
notes = ['G,', 'A,', 'B,', 'C', 'D', 'E', 'F', 'G', 'A', 'B', "c'", "d'", 'c', 'd', 'e', 'f', 'g', 'a', 'b']
# fractions first, then the multipliers 2..16, and the empty duration last
durations = ['/8', '/4', '/2', '/'] + [str(length) for length in range(2, 17)] + ['']
note_combs = [
    ''.join((accidental, pitch, duration))
    for accidental in accidentals
    for pitch in notes
    for duration in durations
]

# barlines and repeats
barlines = [':||:', ':|:', '::', '|:', ':|', '[|', '|]', '[1', '[2', '|']
tuplets = ['(3']
duration_modifiers = ['>', '<']
ties = ['-']

# structural tokens first, then every note combination
custom_tokens = [*barlines, *tuplets, *duration_modifiers, *ties, *note_combs]
# NB - make sure to also add mode, meter values!