In [1]:
import pandas as pd

# Raise pandas' display limits so long abc strings and wide frames are shown
# without truncation when inspecting the data.
for option_name, limit in (
    ('display.max_rows', 2000),
    ('display.max_columns', 500),
    ('display.max_colwidth', 500),
):
    pd.set_option(option_name, limit)

# Untouched source data; later cells work on a copy of this frame.
raw_tunes = pd.read_json("tunes.json")

In [2]:
# Build the working frame for cleaning from a deep copy, so raw_tunes stays
# untouched and this cell gives the same result every time it is re-run.
tunes = raw_tunes.copy(deep=True)

print('initial length of tunes:', len(tunes))

# Remove line breaks (\n, \r) and spaces from the abc notation in one pass.
tunes['abc'] = tunes['abc'].str.replace(r'[\n\r ]', '', regex=True)

# Each rule is (progress message, regex patterns). A tune is dropped when its
# abc string matches ANY pattern of the rule; the message is printed with the
# number of tunes left once all patterns of that rule have been applied.
# Patterns containing backslashes are raw strings so that Python does not try
# to interpret `\[` as a (invalid) string escape.
# NOTE(review): these filters run before text in double quotes (chord symbols)
# is stripped in the later cleaning cell, so letters inside quoted text can
# also trigger a drop -- confirm this is intended.
drop_rules = [
    ('drop tunes with lyrics:', ['w:']),
    ('drop tunes with rests:', ['z']),
    ('drop tunes with expression markings, codas etc.:', ['!']),
    ('drop tunes with dal segno, coda, fermata:', ['[SOH]']),
    ('drop tunes with double stops:', [r'\[.{2,5}\]']),
    ('drop tunes with third and fourth endings:', [r'\[3', r'\[4']),
    ('drop tunes with notes lower than G,:', ['[C-F],', '[A-G],,']),
    ("drop tunes with notes higher than d':", ["[efgab]'", "[a-g]''"]),
    (
        'drop tunes with capital letters and apostrophes; lowercase letters and commas:',
        ["[A-G]'", '[a-g],'],
    ),
]

for message, patterns in drop_rules:
    for pattern in patterns:
        # Keep only the tunes whose abc does NOT contain the pattern.
        tunes = tunes[~tunes['abc'].str.contains(pattern, regex=True)]
    print(message, len(tunes))

initial length of tunes: 39154
drop tunes with lyrics: 39121
drop tunes with rests: 35137
drop tunes with expression markings, codas etc.: 35044
drop tunes with dal segno, coda, fermata: 34838
drop tunes with double stops: 33483
drop tunes with third and fourth endings: 33479
drop tunes with notes lower than G,: 33437
drop tunes with notes higher than d': 33322
drop tunes with capital letters and apostrophes; lowercase letters and commas: 33255


In [3]:
### clean abc

# `tunes` arrives here as the result of boolean-mask filtering. Take an
# explicit copy before assigning to its 'abc' column, and again after every
# filter below, so pandas does not raise SettingWithCopyWarning.
tunes = tunes.copy()

# remove up- and down-bows
tunes['abc'] = tunes['abc'].str.replace('[uv]', '', regex=True)

# remove chord symbols: the shortest run of text between two double quotes
tunes['abc'] = tunes['abc'].str.replace('".*?"', '', regex=True)
# any quote still present had no partner, so the tune is dropped
tunes = tunes[~tunes['abc'].str.contains('"', regex=False)].copy()
print('drop tunes with mismatched quotation marks:', len(tunes))

# remove slurs, but protect triplet markings: delete every ')' and every '('
# that is not immediately followed by '3' (the negative lookahead keeps '(3')
tunes['abc'] = tunes['abc'].str.replace(r'\((?!3)|\)', '', regex=True)


## remove ornaments
# remove turns, trills, accents, mordents, slides, rolls
tunes['abc'] = tunes['abc'].str.replace('[~TLMPJR]', '', regex=True)

# remove staccato ('.' is matched literally, not as a regex wildcard)
tunes['abc'] = tunes['abc'].str.replace('.', '', regex=False)

# remove grace notes: the shortest run of text between '{' and '}'
tunes['abc'] = tunes['abc'].str.replace(r'\{.*?\}', '', regex=True)
# Drop tunes that still contain a closing brace. Per the original author's
# note, only unmatched closing braces (no unmatched opening braces) appear to
# exist in the dataset, so only '}' is checked.
tunes = tunes[~tunes['abc'].str.contains('}', regex=False)].copy()
print('drop tunes with mismatched curly braces:', len(tunes))

drop tunes with mismatched quotation marks: 33253
drop tunes with mismatched curly braces: 33245


In [4]:
def compare(setting_id: int):
    """Print the raw and the cleaned ``abc`` entries of one setting.

    Selects the rows whose ``setting_id`` column equals ``setting_id`` from
    the notebook-level frames ``raw_tunes`` and ``tunes`` and prints the
    ``abc`` Series of each selection, raw version first. Returns nothing.
    """
    raw_abc = raw_tunes.loc[raw_tunes['setting_id'] == setting_id, 'abc']
    print("raw:\n", raw_abc)

    clean_abc = tunes.loc[tunes['setting_id'] == setting_id, 'abc']
    print("clean:\n", clean_abc)

In [5]:
# tunes.loc[tunes['abc'].str.contains('/2', regex=True)]

In [6]:
# Show ten cleaned tunes for a visual spot check. The fixed random_state
# makes the same rows appear on every Restart & Run All.
samp = tunes.sample(n=10, random_state=42)
samp

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username
9970,6934,6934,Duncan Gray,reel,4/4,Gmajor,DGFG|ABF2|G2A>c|1BGG2:|2BGGd||Bddc/B/|ccc2|cBAG|F/G/A/F/D2|dBcA|BGA2|G2A>c|BGG2|],2007-03-12 12:42:53,Falkbeer
22834,14630,26964,Marcus Hernon's Air,waltz,3/4,Gmajor,|:ef|g4ag|f4gf|e3/2dBA|B4BA|G2g2g/f/e|f3dcA|G4AF|G4:||:GB|c4BA|B/d/d-d2BA|G2g2fd|e4ga|b4b/a/g/b/|a3ged|e6-|e3def|g4ag|f3dgf|e3dBA|B4BA|G2g2g/f/e|f3dcA|G4AF|G4:|,2015-08-01 08:42:33,Stefan Thamm
38468,2337,2337,Will You Come Down To Limerick,slip jig,9/8,Gmajor,d|:cAGGAGGBd|cAGGABcAd|cAGGAGGAG|1cAAfedcAd:|2cAAfedcAG||ddggbfg2f|ddggfgabc'|bagafagfd|1cAAfedcAG:|2cAAfedcAd||,2004-01-02 18:10:56,gian marco
854,16306,30880,All In Good Company,jig,6/8,Dmajor,|:A|FEDFAd|fdAF2A|GFGBAG|F/G/AFEAG|FEDF2d|fdAF2A|GFGBA^G|Adcd2:||:e|fdAFAd|gecA2e|gecABG|F/G/AdF2e|fdAF2d|gecA2e|gecAec|d/e/fed2:|,2017-08-19 23:57:05,ceolachan
30031,2926,2926,Radstock,reel,4/4,Amixolydian,=c4E3=F|G2G2G4|A2AB=cBcd|e2e2A4|A2AB=cBcd|e2efg2fg|afdfegec|A2A2A4||g2g2gfed|cdefg2gf|e2a2a3g|e2a2a2ef|g2gagfed|cdefg2fg|afdfegec|A2A>GA4||,2004-04-29 20:34:06,fidicen
21347,1500,34951,"Liverpool, The",hornpipe,4/4,Dmajor,FEFAdfaf|gefddcBA|GBGBFAFA|EFGAGFED|FEFAefaf|gefddcBA|fafdegec|d2d2d4:||:dffdceec|BddBBAGF|GBGBFAFA|EFGAGFED|FEFAdfaf|gefddcBA|fafdegec|d2d2d4:|,2019-05-28 22:34:25,Andy Hornby
25202,19733,38980,Mr. Bushby Maitland's,reel,4/4,Dmajor,A|FDAFd2dA|d2dfeEEA|FDAFd2dB|cdegfddA|FDAFd2dA|d2dfeEEA|FDAFdABG|FAdgfdd|]g|f>edefgaf|gfedcAeg|fedefgaf|ecdbaddg|fdfgfgaf|g>fedcAeg|fdgbafge|fdecdDD|],2020-09-12 13:22:58,Tadhg
31585,1189,14466,Rory O'More,jig,6/8,Gmajor,|:e|dGGBGG|dGGG2e|dcBBAG|FAAAA/B/c|dGGBGG|dGGedc|BB/c/ddef|gGGG2:||:f|gfeedB|cBAG2F|EFGGAB|Beddef|g2fedB|cBAGAF|[1E2FGAB|ed^cde:|[2EGGGG/A/B|ed^cd2|],2005-11-20 17:57:56,ceolachan
34897,9942,20141,Superfly,reel,4/4,Bminor,|:Bff2gfeg|f2fgfedf|e2efedce|edc/B/ABAFA|Bff2gfeg|f2fgfedf|e2efedcd|edc/B/AB2BA:||:Bcdef2Bc|defBg2Bc|defBa2fa|gfededc/B/A|[1Bcdef2Bc|defBg2Bc|defaa2Bc|defdgfed:|[2Bcdef2Bc|defBg2Bc|defaa2Bc|gfededc/B/A||,2009-10-02 06:10:40,bogman
7953,174,39427,"Curlews, The",reel,4/4,Bminor,|:fBB2FBB2|fecea3e|(3fgaedcAA2|cAcea2ba|fBB2Bcde|fecea3f|g3ef3e|dcBAFBB2:||:BdcdBFF2|DFF2Bcdf|ecBcAEE2|cEE2ABcA|BdcdBFF2|DFF2Bcdf|dcBAf2ec|dcBAFBB2:|,2020-10-29 21:54:53,JACKB


In [8]:
# Token vocabulary for the tokenizer. Lists are ordered so that a longer token
# appears before any shorter token that is a prefix of it (e.g. 'G,' before
# 'G', "c'" before 'c', '/8' before '/', and the empty accidental/duration
# last).
# NOTE(review): the sampled tunes contain forms such as `e3/2`, `|1` and `|2`
# that have no single token here -- confirm how the tokenizer should handle them.

# accidental prefixes; '' means "no accidental"
accidentals = ['_', '=', '^', '']

# pitches from G, up to d', comma/apostrophe variants before the bare letters
notes = (
    [letter + ',' for letter in 'GAB']
    + list('CDEFGAB')
    + ["c'", "d'"]
    + list('cdefgab')
)

# fractions first, then the multipliers 2..16, then '' for the default length
durations = ['/8', '/4', '/2', '/'] + [str(count) for count in range(2, 17)] + ['']

# every accidental + note + duration combination, accidental varying slowest
note_combs = [
    ''.join((accidental, pitch, length))
    for accidental in accidentals
    for pitch in notes
    for length in durations
]

# barlines and repeats
barlines = [':||:', ':|:', '::', '|:', ':|', '[|', '|]', '[1', '[2', '|']
tuplets = ['(3']
duration_modifiers = ['>', '<']
ties = ['-']

custom_tokens = [*barlines, *tuplets, *duration_modifiers, *ties, *note_combs]
# NB - make sure to also add mode, meter values!



1534


In [None]:
# we should feed the tunes into the model in the following format:

"""
X: <setting_id>
R: <type>
M: <meter>
K: <mode>
<abc>
\n
"""