In [2]:
import csv

import pandas as pd

# Raise pandas' display limits (row count, column count, cell width) for inspection.
for _option, _limit in (
    ('display.max_rows', 2000),
    ('display.max_columns', 500),
    ('display.max_colwidth', 500),
):
    pd.set_option(_option, _limit)

# Load the tune settings from disk; later cells work on a copy of this frame.
raw_tunes = pd.read_json("tunes.json")

In [3]:
# Create a new dataframe for cleaning so that `raw_tunes` stays untouched.
tunes = raw_tunes.copy(deep=True)

print('initial length of tunes:', len(tunes))

# Remove line breaks (\n, \r) and spaces from the ABC text in one pass.
tunes['abc'] = tunes['abc'].str.replace(r'[\n\r ]', '', regex=True)

# Each rule is (message printed after the step, regex patterns to reject).
# A tune is dropped as soon as its ABC text matches any pattern of a rule.
# Patterns containing a backslash are raw strings so that Python does not try
# to interpret `\[` as a string escape.
# NOTE(review): the 'multiple voices' rule also removes every tune containing a
# lowercase 'v', so a later strip of 'v' (down-bow) has nothing left to remove.
DROP_RULES = [
    ('drop tunes with lyrics:', ['w:']),
    ('drop tunes with rests:', ['z']),
    ('drop tunes with multiple voices:', ['[vV]']),
    ('drop tunes with expression markings, codas etc.:', ['!']),
    ('drop tunes with dal segno, coda, fermata:', ['[SOH]']),
    ('drop tunes with double stops:', [r'\[.{2,5}\]']),
    ('drop tunes with third and fourth endings:', [r'\[3', r'\[4']),
    ('drop tunes with notes lower than G,:', ['[C-F],', '[A-G],,']),
    ("drop tunes with notes higher than d':", ["[efgab]'", "[a-g]''"]),
    (
        'drop tunes with capital letters and apostrophes; lowercase letters and commas:',
        ["[A-G]'", '[a-g],'],
    ),
    ('drop tunes with key changes:', ['K']),
]

for label, patterns in DROP_RULES:
    for pattern in patterns:
        tunes = tunes[~tunes['abc'].str.contains(pattern, regex=True)]
    # Report the number of tunes that survive this rule.
    print(label, len(tunes))

# Detach the result from the chain of boolean filters so that later cells can
# assign columns on `tunes` without pandas' chained-assignment warning.
tunes = tunes.copy()



initial length of tunes: 39154
drop tunes with lyrics: 39121
drop tunes with rests: 35137
drop tunes with multiple voices: 34899
drop tunes with expression markings, codas etc.: 34813
drop tunes with dal segno, coda, fermata: 34617
drop tunes with double stops: 33297
drop tunes with third and fourth endings: 33293
drop tunes with notes lower than G,: 33252
drop tunes with notes higher than d': 33141
drop tunes with capital letters and apostrophes; lowercase letters and commas: 33075
drop tunes with key changes: 32382


In [4]:
### clean abc

# `tunes` may be the result of boolean filtering; take an explicit copy so the
# column assignments below write to an independent frame. The same is done
# after each filter in this cell.
tunes = tunes.copy()

# remove up- and down-bows
tunes['abc'] = tunes['abc'].str.replace('[uv]', '', regex=True)

# remove chord symbols (shortest run between a pair of double quotes)
tunes['abc'] = tunes['abc'].str.replace('".*?"', '', regex=True)
# any quote still present had no partner, so the tune is discarded
tunes = tunes[~tunes['abc'].str.contains('"', regex=False)].copy()
print('drop tunes with mismatched quotation marks:', len(tunes))

# remove slurs, but protect triplet markings: the negative lookahead removes
# every '(' except one that is immediately followed by '3'
tunes['abc'] = tunes['abc'].str.replace(r'\((?!3)', '', regex=True)
tunes['abc'] = tunes['abc'].str.replace(')', '', regex=False)


## remove ornaments
# remove turns, trills, accents, mordents, slides, rolls
tunes['abc'] = tunes['abc'].str.replace('[~TLMPJR]', '', regex=True)

# remove staccato (literal dot, hence regex=False)
tunes['abc'] = tunes['abc'].str.replace('.', '', regex=False)

# remove grace notes (shortest run between '{' and '}')
tunes['abc'] = tunes['abc'].str.replace(r'\{.*?\}', '', regex=True)
# apparently, only unmatched closing braces exist in the dataset, not unmatched open braces
tunes = tunes[~tunes['abc'].str.contains('}', regex=False)].copy()
print('drop tunes with mismatched curly braces:', len(tunes))

drop tunes with mismatched quotation marks: 32380
drop tunes with mismatched curly braces: 32373


In [5]:
def compare(setting_id: int):
    """Print the raw and the cleaned ABC text of one setting.

    Looks up rows whose ``setting_id`` column equals ``setting_id`` in the
    module-level ``raw_tunes`` and ``tunes`` frames and prints their ``abc``
    values, raw first.
    """
    for heading, frame in (("raw:\n", raw_tunes), ("clean:\n", tunes)):
        matches = frame['setting_id'] == setting_id
        print(heading, frame.loc[matches, 'abc'])

In [6]:
# tunes.loc[tunes['abc'].str.contains('/2', regex=True)]

In [7]:
# Draw ten cleaned ABC strings for spot checks. The fixed seed makes the draw
# repeatable across kernel restarts.
SAMPLE_SEED = 42
samples = tunes['abc'].sample(n=10, random_state=SAMPLE_SEED).to_list()

In [8]:
# Vocabulary for tokenizing ABC text. Within each list the longer spellings are
# written before the shorter ones they contain.

# A note token is <accidental><pitch><duration>; the empty string stands for
# "no accidental" / "no explicit duration".
accidentals = ['_', '=', '^', '']
notes = ['G,', 'A,', 'B,'] + list('CDEFGAB') + ["c'", "d'"] + list('cdefgab')
durations = ['/8', '/4', '/2', '/', '3/2'] + [str(length) for length in range(2, 17)] + ['']
note_combs = [
    accidental + pitch + length
    for accidental in accidentals
    for pitch in notes
    for length in durations
]

# barlines, repeats and numbered endings
barlines = [':||:', ':|:', '::', '|:', ':|', '[|', '|]', '[1', '[2', '|', '1', '2']
tuplets = ['(3']
duration_modifiers = ['>', '<']
ties = ['-']

# Structural symbols first, then every note spelling.
custom_tokens = [*barlines, *tuplets, *duration_modifiers, *ties, *note_combs]
# NB - make sure to also add mode, meter values!



In [9]:
# Set form of the vocabulary; `tokenize` tests candidate substrings for membership in it.
token_set = set(custom_tokens)

def tokenize(notes, vocabulary=None):
    """Split an ABC string into space-separated vocabulary tokens.

    The string is scanned left to right. At each position the longest
    substring that is a member of the vocabulary is consumed (greedy longest
    match). The longest candidate length is derived from the vocabulary itself
    rather than hard-coded, so tokens of any length can be matched whole.

    Parameters
    ----------
    notes : str
        ABC text to tokenize.
    vocabulary : collection of str, optional
        Allowed tokens. Defaults to the module-level ``token_set``.

    Returns
    -------
    str
        The tokens joined by single spaces. If no token matches at some
        position, the tokens found so far followed by
        ``' tokenization error: '`` and the unconsumed remainder of ``notes``.
    """
    if vocabulary is None:
        vocabulary = token_set

    # Longest token in the vocabulary bounds the candidate substrings to try.
    longest = max(map(len, vocabulary), default=0)

    tokens = []
    start = 0
    while start < len(notes):
        # Never try a candidate that would run past the end of the string.
        for size in range(min(longest, len(notes) - start), 0, -1):
            candidate = notes[start:start + size]
            if candidate in vocabulary:
                tokens.append(candidate)
                start += size
                break
        else:
            # Nothing matched here: report in-band so callers can filter on 'error'.
            return ' '.join(tokens) + ' tokenization error: ' + notes[start:]

    return ' '.join(tokens)

In [10]:
# Tokenize every cleaned ABC string. `tokenize` is passed directly (no lambda
# wrapper) and `assign` builds a new frame instead of writing a column onto a
# frame that may be the result of boolean filtering.
tunes = tunes.assign(tokenized=tunes['abc'].apply(tokenize))

In [12]:
# Rows whose tokenized text carries the in-band 'error' marker.
has_error = tunes['tokenized'].str.contains('error', regex=True)
errors = tunes.loc[has_error]

In [20]:
# Inspect the columns after tokenization. `valid` is only created in a later
# cell, so look at `tunes` here to keep the notebook runnable top to bottom.
tunes.columns

Index(['tune_id', 'setting_id', 'name', 'type', 'meter', 'mode', 'abc', 'date',
       'username', 'tokenized'],
      dtype='object')

In [43]:
# One training line per tune: tab-separated header fields (R: type, M: meter,
# K: mode), then the token string, then a trailing tab. Built with vectorized
# string concatenation instead of a row-wise apply; `astype(str)` formats each
# value the same way an f-string would.
tunes = tunes.assign(
    combined=(
        'R: ' + tunes['type'].astype(str)
        + '\tM: ' + tunes['meter'].astype(str)
        + '\tK: ' + tunes['mode'].astype(str)
        + '\t' + tunes['tokenized'].astype(str)
        + '\t'
    )
)

In [44]:
# Keep only the tunes whose tokenized text does not carry the 'error' marker.
has_error = tunes['tokenized'].str.contains('error', regex=True)
valid = tunes.loc[~has_error]

In [40]:
# Show the first remaining row by position; a label lookup with [0] would raise
# KeyError if the row labelled 0 had been filtered out.
print(tunes['combined'].iloc[0])

R: polka	M: 2/4	K: Gmajor	|: d > g f e | d B A G | E/ F/ G E/ F/ G | B A G F | d > g f e | d B A G | E/ F/ G E/ F/ G | B A G2 :||: e > f g a | b a g f | e/ f/ g e/ f/ g | d/ c/ B/ A/ B A | e > f g a | b a g f | e/ f/ g e/ f/ g | a g g2 :|	


In [33]:
# Show the column labels of the error-free subset.
valid.columns

Index(['tune_id', 'setting_id', 'name', 'type', 'meter', 'mode', 'abc', 'date',
       'username', 'tokenized', 'combined'],
      dtype='object')

In [None]:
# Unexecuted draft: a row formatter that puts each header field (R, M, K) and the
# raw ABC text on its own line. The lambda is not bound to a name or applied anywhere.
lambda x: f"R:{x['type']}\n M:{x['meter']}\n K:{x['mode']}\n {x['abc']}\n"

In [45]:
# Write one combined line per valid tune, with no header and no index.
# Quoting is disabled so the text is written verbatim; with QUOTE_NONE and no
# escapechar the csv writer raises if a field contains the separator, so `sep`
# must be a character that never occurs in the data. '.' is used because the
# cleaning step strips every '.' from the ABC text.
# NOTE(review): assumes type/meter/mode never contain '.' - confirm.
valid['combined'].to_csv(
    path_or_buf='2021_07_20_tokenized_tunes.txt',
    sep='.',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# we should feed the tunes into the model in the following format:

"""
X: <setting_id>
R: <type>
M: <meter>
K: <mode>
<abc>
\n
"""