In [1]:
import pandas as pd

pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 4000)

In [2]:
raw_tunes = pd.read_json("tunes.json")
# raw_tunes

In [53]:
print(len(raw_tunes))

# create new dataframe for cleaning tunes
tunes = raw_tunes.copy(deep=True)

### drop rows
# ...that include lyrics
tunes = tunes[~tunes.abc.str.contains('w:')]
print(len(tunes))

# ...that include rests
tunes = tunes[~tunes.abc.str.contains('z')]
print(len(tunes))

# ...that include double stops
tunes = tunes[~tunes.abc.str.contains('\[.{2,4}\]', regex=True)]
print(len(tunes))


39154
39121
35137
34225


In [54]:
### clean abc

# remove line breaks
tunes['abc'] = tunes['abc'].str.replace('\n', '', regex=True)
tunes['abc'] = tunes['abc'].str.replace('\r', '', regex=True)

# remove spaces
tunes['abc'] = tunes['abc'].str.replace(' ', '', regex=True)

# remove up- and down-bows
tunes['abc'] = tunes['abc'].str.replace('u', '', regex=True)
tunes['abc'] = tunes['abc'].str.replace('v', '', regex=True)


## remove ornaments
# remove turns
tunes['abc'] = tunes['abc'].str.replace('~', '', regex=True)

# remove grace notes
tunes['abc'] = tunes['abc'].str.replace('\{.*?\}', '', regex=True)
tunes = tunes[~tunes.abc.str.contains('\}', regex=True)] # drop tunes with mismatched curly braces (apparently, only unmatched closing braces exist in the dataset)
print(len(tunes))

# remove chord symbols
tunes['abc'] = tunes['abc'].str.replace('".*?"', '', regex=True)
tunes = tunes[~tunes.abc.str.contains('"', regex=True)] # drop tunes with mismatched quotation marks
print(len(tunes))

# remove slurs - or maybe just drop?
# tunes['abc'] = tunes['abc'].str.replace('\(', '', regex=True)
# tunes['abc'] = tunes['abc'].str.replace('\)', '', regex=True)
# tunes['abc'] = tunes['abc'].str.replace('\((^3)', '$1', regex=True)
# tunes['abc'] = tunes['abc'].str.replace('\)', '', regex=True)

# tunes['abc'] = tunes['abc'].str.replace('\((?:[^3])', '', regex=True)
# tunes['abc'] = tunes['abc'].str.replace('\)', '', regex=True)


34216
34214


In [32]:
samp = tunes.sample(n=10)
samp

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username
4573,10353,10546,Bridgett's,reel,4/4,Adorian,A2a2a2ba|g2ede2dB|G2g2g2ag|e2dBd2BG|A2a2a2ba|g2ede2eg|edBde2de|B2A2A2E2:||:A2ABcBcd|e2dcB2d2|G2GABABc|d2cBc2e2|A2ABcBcd|e2dcB2G2|g2agedBd|e2dBB2A2:|,2010-06-22 11:23:06,"Tall, Dark, and Mysterious"
5099,4685,4685,"Cage, The",hornpipe,4/4,Gmajor,"|:dc|BdcBAGBA|GDDED2EF|GDADBDcB|AGFGADdc|BdcBAGBA|GEAGFEDC|B,DGDEcAF|A2G2G2:||:(3def|gGBdgbag|fagfe2(3gfe|dDFAdfed|cdgBADGA|BdcBAGBA|GEAGFEDC|B,DGDEcAF|A2G2G2:|",2005-07-06 21:11:14,Dr. Dow
31107,4966,17352,"Road To Corofin, The",reel,4/4,Dmajor,"d2AdBAFE|DB,A,B,DEFA|d2Adfded|cAGBBAFA|d/d/dfdefag|f/f/fedcAag|f/f/fedcAGE|EDCED2A/B/c:||d2Adfdef|gbedcAGE|EDFAdfed|cAGEEDFA|DEFAdfaf|gbedceag|f/g/fedcAGE|EDCED2A/B/c:||",2005-11-25 12:57:08,Will Harmon
26541,1729,33566,"Noon Lassies, The",reel,4/4,Gmajor,|:B2BABcdB|A2AGAcBA|B2BABcdB|AGABG3A:|Bd3e2dB|AGABGED2|Bd3e2dB|AGABG3A|Bd3ed3|gd3ABAG|A2AGABcA|dBABG3A||,2018-11-21 00:56:34,Matt Saunders
36611,6501,18197,"Traditional, The",jig,6/8,Gmajor,|:B/c/|dBGGFG|Bcd=fdB|cA=FFEF|C=FAc2B/c/|dBGGFG|BB/c/dfga|gfdcAF|GAGG2:||:B/c/|dgggfg|bagagf|cde=fef|ag=fag^f|d2ggfg|Bcdfga|g>fdcAF|GAGG2:|,2008-03-19 07:38:26,ceolachan
12060,2549,39820,"Flowers Of Edinburgh, The",reel,4/4,Gmajor,"|:GE|DB,DEG3A|BABdcBAG|FGFEDB,FG|AFBFEAGE|DB,DEG3A|BABdefge|dcBAGFGA|B2G2GB:||:ef|g3fgbag|f3efagf|edefgfed|Beede2ge|dBGBd3B|edefgfed|dcBAGFGA|B2GFG2:|",2020-12-10 09:57:35,Fernando Durbán Galnares
36958,7964,25050,Trollpolska,waltz,3/4,Fmajor,D2|G2GDG2B2|d6Bc|d2dBd2de|c2cBA2AB|c2cde2dc|BcBAG2FG|A2AFD2EF|G6:|,2014-10-30 13:25:25,maki
36751,8818,8818,"Trip To Dublin, The",reel,4/4,Adorian,|:cB|ABcdefec|dABGE2G2|ABcdefed|cAAGA2cB|ABcdefec|dABGE2G2|ecdBcABG|cAAGA2:||:cd|ecc2Gcec|dBB2GBdB|cAAGABAG|E2EDE2cd|ecc2Gcec|dBB2GBdB|1ABcdefed|cAAGA2:|2ecdBcABG|cAAGA2||,2008-08-16 17:27:27,ethical blend
1560,5081,28259,April's Fool,jig,6/8,Gmajor,|:F|GABGAB|E3EDE|GABd2B|deBdBA|GABGAB|E3EDE|GgedBA|AGFG2:|A|B2ddBd|e3g2g|agedBd|edBAGA|B2ddBd|e3g2g|agedBA|AGFG2A||B2ddBd|e3g2g|agedBd|edBAGA|GABGAB|E3EDE|GgedBA|AGFG2|],2016-02-11 14:15:04,GaryAMartin
1001,3451,3451,An Amaid,reel,4/4,Dminor,D2FDGDAd|D2FDGBAF|D2FDGDAc|1BGAFDCFE:|2BGAFGfge|||:afgagece|bagbagfe|BcdAdAGF|1DEFGEedf:|2DEFGE4||,2004-08-27 18:39:21,Trinil


In [6]:
def compare(setting_id: int):
    print("raw:\n", raw_tunes.loc[raw_tunes['setting_id'] == setting_id]['abc'])
    print("clean:\n", tunes.loc[tunes['setting_id'] == setting_id]['abc'])
    

In [18]:
compare(19903)

raw:
 27041    e2 b bab | e2 b bab | d2 b bag | fga agf |\re2 b bab | e2 b bab | d2 b bag | fgf e2 :|\rE2 B, G2 B, | DB,A, A,B,D | E2 B, G2 B, | A,G,A, B,2 E |\rBAF G2 B, | C2 G ECA, | EG,B, E2 B, } G,A,F E2 :|\rE2 G BAG | F2 G AGF | EB,E GEG | AGA B2 G |\rB3 G3 | EGE ^A,3 | B,EG BAG | FGF E2 :|
Name: abc, dtype: object
clean:
 27041    e2bbab|e2bbab|d2bbag|fgaagf|e2bbab|e2bbab|d2bbag|fgfe2:|E2B,G2B,|DB,A,A,B,D|E2B,G2B,|A,G,A,B,2E|BAFG2B,|C2GECA,|EG,B,E2B,}G,A,FE2:|E2GBAG|F2GAGF|EB,EGEG|AGAB2G|B3G3|EGE^A,3|B,EGBAG|FGFE2:|
Name: abc, dtype: object


In [23]:
tunes.loc[tunes['abc'].str.contains('\{.*?\}', '', regex=True)]
# raw_tunes.loc[raw_tunes['abc'].str.contains('\{.*?\}', '', regex=True)]
tunes.loc[tunes['abc'].str.contains('\}', '', regex=True)]

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username


In [25]:
compare(38954)

raw:
 6846    |:"G" D3 E DE |"G" B6 |"G" d3 B ed |"G" B6 |\r\n"D" A3 B AB |"D7" c2 A2 FD |"C" E2 D2 E2 |"G" B6 |\r\n"G" D3 E DE |"G" B6 |"G" d3 B ed |"G" B6 |"\r\n"D7" AB dc BA |"D7" c2 B2 A2 |"D7" Bd dc AF |"G" G6 :||\r\n|:"G" dB ed Bd |"G" gd ed Bd |"G" dc cB BA |"D" A6 |\r\n"D" a2- ag fe |"D7" de c2 d2 |"D7" dB Be dc |"G" B6 |\r\n"G" dB ed Bd |"G" gd ed Bd |"G" dc cB BA |"D" A6 |\r\n"D" ab ag fe |"D7" de ed cA |"G" Bd dc AF |"G" G6 :||
Name: abc, dtype: object
clean:
 6846    |:D3EDE|B6|d3Bed|B6|A3BAB|c2A2FD|E2D2E2|B6|D3EDE|B6|d3Bed|B6|D7D7D7GGGGDDD7D7GGGGDDD7GG"G6:||
Name: abc, dtype: object


In [29]:
tunes.loc[tunes['setting_id'] == 28582]
tunes.loc[tunes['abc'].str.contains('"')]

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username


In [55]:
raw_tunes.loc[raw_tunes['abc'].str.contains('\(')]
print('---')
tunes.loc[tunes['abc'].str.contains('\)')]

---


Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username
3,15326,28582,'S Ann An Ìle,strathspey,4/4,Gmajor,"D2|:[G,2G2]B>dc>AB>G|E2A>GF<DD>F|[G,2G2]B>dc>AB>G|[1E>AF<DG2G>D:|[2E>AF<DG2(G>E)|D>DD>B,D>DD>(B|d>B)A>BG<EE>G|D>DD>B,D>DD>(B|[1d>B)A<BG2G>E:|[2d>B)A<BG2||",2016-04-03 09:15:08,DonaldK
14,18139,35288,128 South,reel,4/4,Dmajor,"|:DE|F3FEDFA|B3BA2dc|B3AEDFA|GFEDB,2DC|D3EFDdc|B3dA2F/E/D|B,2GAE3D|D6:||:DA|d3cAGFG|A3FEDFA|B3BADGF|(E4E)Ddc|B3AEDde|f3abff/e/d|B3AE3D|D6:|",2019-07-13 21:06:08,Ian Varley
26,16424,32371,2-Rivisjenkka,march,4/4,Eminor,".B,!tento!BE>F(3(E/F/E/)DE>F|(3(E/F/E/D)/D/E/G/B/c/d/c/B/c/B/B/F/G/|.B,!tento!BE>F(3E/F/E/DE/G/B/c/|[1d/c/B/c/B/B/B,/D/F/G/(EE/)E/F/G/:|[2d/c/B/c/B/B/B,/D/F/G/EEg/f/]||:e.dF(e/d/)B/G/F/B,/E/A/G/F/|(3(E/F/E/D)/D/E/G/B/(c/.d/)(c/B/)c/B/B/g/f/|(ed/)F/e>dB/G/F/(B,/.E/)(G/B/)c/|1d/c/B/(c/B/)(B/B,)/D/F/G/EEg/f/:|2d/c/B/(c/B/)(B/B,/)D/(F/G/)EE(F/G/)|]",2018-04-30 13:58:03,pbsinclair42
34,2711,31745,250 To Vigo,reel,4/4,Bminor,fBBABdF(G|G)BdBAcec|fBBABdF(G|G)ABdcBBg|fBBABdF(G|G)BdBAcec|fBBABdF(G|G)Bfedefg|affedec(d|d)ecded(3)cBA|affedec(d|d)faffeef|affedec(d|d)ecded(3)cBA|fBBABfc(d|1d)fecdcBA:|2d)fecdcBd||cFFcdGGd|cFFedBBd|cFFcdGGd|cfecdcBd|cFFcdGGd|cFFedBBd|cFFcdGGd|cfecdefg||affdcaB(c|c)aBcdcBA|affdcaB(c|c)aaffeef|affdcaB(c|c)aBcdcBA|fBBABfc(d|d)fecdcBA||,2018-01-24 21:47:08,Rachael
35,15717,29529,"28th Of January, The",reel,4/4,Adorian,"A,A,CEDDFD|A,A,CED2EG|ABcBAGEG|ABcBAGED|A,A,CEDDFD|A,A,CED2EG|Acea(3gfeBA|EGA2A4:||:ea2(aa)bag|egaba3(g|g)fgabagf|efgabagf|e2a2abag|egaba2ag|degfedBG|EGA2A4:|",2016-12-18 16:58:20,callison
67,6970,18552,"72nd Highlanders' Farewell To Aberdeen, The",march,4/4,Dmajor,A<B|A2d2d2(ef)|g>fe<df4|(g2B2)B2(cd)|e>fe<cA2A<B|(A2d2)d2(ef)|g>fe<df4|(g2B2)f>ed<c|d4d2:|f<g|a2A2A2((3AB=c)|B2G2A2A<B|(A2d2)A2d2|e>dc<BA2f<g|a2A2A2((3AB=c)|B2G2A2A<B|(A2d2)f>ed<c|d4d2:|,2007-03-20 04:30:02,DonaldK
77,19648,38794,9 Years Later,jig,6/8,Fmajor,|:AAAAGA|FFFABc|fag-gfd|dcAcde|ffffec|dcAcde|1fAcfag-|gfddcB:|2fcagf(e|ee)Acde|||:ffffec|dfgaba-|afcbag-|gfaecB|A-Af-fec|[1dddcde|fcagf(e|ee)Acde:|2ddddcB|A-Ad-dcA|G-G(ccc)B|],2020-08-18 15:24:40,SCRFiddle
87,9866,9866,A Bhriogais Uallach,three-two,3/2,Amixolydian,|:AAABA2ABAGd2|deeec2BcdcB2|deeeA2ABAGd2|eAA2(G2G)ABAA2:||:aeeeeddeeeg2|gaaaf2efgfe2|deABGAAABde2|gAA2(G2G)ABdA2):|,2009-09-04 23:52:13,malcombpiper
90,10876,20516,A Birthday,polka,2/4,Gmajor,dGge|de/d/cB|ADAd|BGGD|BGge|de/d/cB|ADAB|1G3D:|2(G2G/)A/B/A/||cece|dd/c/dd/e/|ff/e/fa|g2ee|cece|dd/c/dd/e/|fa/g/fB|1c3c/G/:|2c2Ge||dGge|de/d/cB|ADAd|BGGD|BGge|de/d/cB|ADAB|1G3D:|2(G2G/)A/B/A/||,2010-12-01 13:13:29,birlibirdie
94,18819,36848,A Bit Of Sweed,hornpipe,4/4,Dminor,P:AD>EF>GA>d^c>d|B>d^c>d(A3A/)A/|B>AG>FE>DE>F|G>AG>FE>DF>E|D>EF>GA>d^c>d|B>d^c>d(A3A/)A/|B>AG>FE>DE>F|1G>AF>ED4:|2G>AF>ED2A2|]P:B|:B2B>AB>de>f|g>fe>fd2(3ddd|^c>Ac>eA'>gf>e|f>ge>fd>fe>d|B2B>AB>de>f|g>fe>fd2(3ddd|^c>Ac>eA'>gf>e|1d2f2d2A2:|2d2f2d4|],2019-12-17 19:09:02,JayN


In [40]:
# tunes.loc[tunes['abc'].str.contains('\[')]
# tunes.loc[tunes['abc'].str.contains('\]')]
tunes.loc[tunes['abc'].str.contains('.', regex=False)]

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username
26,16424,32371,2-Rivisjenkka,march,4/4,Eminor,".B,!tento!BE>F(3(E/F/E/DE>F|(3(E/F/E/D/D/E/G/B/c/d/c/B/c/B/B/F/G/|.B,!tento!BE>F(3E/F/E/DE/G/B/c/|[1d/c/B/c/B/B/B,/D/F/G/(EE/E/F/G/:|[2d/c/B/c/B/B/B,/D/F/G/EEg/f/]||:e.dF(e/d/B/G/F/B,/E/A/G/F/|(3(E/F/E/D/D/E/G/B/(c/.d/(c/B/c/B/B/g/f/|(ed/F/e>dB/G/F/(B,/.E/(G/B/c/|1d/c/B/(c/B/(B/B,/D/F/G/EEg/f/:|2d/c/B/(c/B/(B/B,/D/(F/G/EE(F/G/|]",2018-04-30 13:58:03,pbsinclair42
123,14855,27437,A Cloudy Morning,reel,4/4,Dmajor,(3.d.d.d(dABdAF|AFAgfdBc|(3.d.d.d(dABdAF|GFEFGABc|dcdABdAF|ABdefdef|gfgefedf|edcedAFA||abafafdf|gefdedBd|afdfabaf|edefd2(df|afdfabaf|gbfaedBe|(3.d.d.d(dABdAF|ABdefdd2||,2015-10-28 02:22:28,Moxhe
169,750,29347,A Fig For A Kiss,slip jig,9/8,Edorian,|:.E2BBGEBGE|FADAFdAFD|EFGAB=cBAG|1BeBAGFE2D:|2BeBAGFE2B|||:eBegegbge|dAdfdfafd|1eBegegbge|gfeBe^de2B:|2e/2f/2gfed=cBAG|BeBAGFE3||,2016-11-19 18:48:11,G.Ryckeboer
210,14361,26313,A L'envers,slip jig,9/8,Edorian,E>FG>BA2|G>AB>GE>D|E>FG>BA2|1G>AB>GE2:|2G>AB>GE>B||c2A>cB>A|G>FE>FG>B|c2A>cB>A|1G>FG>BA>B:|2G>FG>BA2||AA.e2.A2B2-|B>GA>Bc>dc2|A2AA.e2.A2|B>GA>Bc>dcB:||,2015-04-15 19:26:06,Boris Rorsvort
231,2152,2152,A Merry Christmas,jig,6/8,Gmajor,|:D|GABAGE|cegdBG|GABAGE|FDDDEF|GABAGE|cegdBG|EFG(A<dc|BGGG2:||:B|d^cedef|gfedBG|Bcdefg|(gbgafd|g/2a/2b.agfe|dBgdBG|EFG(A<dc|BGGG2:|,2003-11-14 12:06:36,ponad
267,422,422,A Parcel Of Land,reel,4/4,Gmajor,G3Bd4|efgagedB|AE(3EEEABcd|fd(3dddfdcA|BGGGDGGG|efgagedB|AcagfdcA|1FGAFGFED:|2AGFAGABd|||:(3.g.a.gfagedc|BcdBGABd|eA(3AAAeAeg|fd(3dddfdcA|dggfgedc|BggfgdBd|ceagfdcA|1AGFAGABd:|2FGAFG4||,2001-12-06 19:17:40,radriano
311,19701,38910,A Sligo Air,waltz,3/4,Amajor,AF|E2F2A2|A2B2cA|d3dB/c/d|e2c2AA|A3BBA|B3Bcd|e2dcBA|F2A2AF|E2EcEF|.AAABcA|B2A2A2|1A3B:|2A3ABc||d3edB|c2A2F2|E2c2EF|A4.AA|A2c2e2|B3Bcd|e2dcBA|F2A2AF|E2EcEF|.AAABcA|B2A2A2|1A3ABc:|2A4|,2020-09-05 08:59:22,gian marco
384,833,38465,A Trip To The Cottage,jig,6/8,Dmajor,g|ecccBc|dBBBAB|eAAAAB|cBBBcd|.e2ccBc|dBBBAB|eAABAB|cAAA3:||d3AFA|DFAB2A|d3ede|fedB2A|dcdAFA|DFAB2A|d2fefg|fdcd3:||,2020-07-09 19:21:31,Rover
691,11210,28115,Airaidh Nam Badan,jig,6/8,Fmajor,f2(f//e//d//c//|cA(G/F/F>G(A/B/|c>dccAF|A2(G/F/(GAc|d3f2f//(f//e//d//|cA(G/F/F>.G.A/.B/|c>dccAF|A2(G/F/(GAc|(d3c2||(A/B/|cdf(g>fg|a2(g/f/fdc|(d>cd(cdf|g3d(f//e//f//g//|a.f.fg.d.d|f.c.cd>ef|A2(G/F/(GAc|(d3c2||(A/B/|cdf(g>fg|a2(g/f/fdc|(d>cd(cdf|g3d(f//e//f//g//|a.f.fg.d.d|fcA/c/d>ef|A2(G/F/(GAc|1d3:|2(d3c3|],2016-01-22 01:46:22,JoJofidhlear
744,7642,21143,Albert Farmer's Bonfire,barndance,4/4,Dmajor,|:A>d|f2f>dA2F>A|G>AG>FE2E2|e2(3fedc>AB>c|dAfdA2de|f2-f>dA2(3FGA|G2GFE4|e>fe>dc2(3ABc|[1d2A2D2:|[2d2f2d2|||:(3bag|f2f>df2A>f|g2g>fe2e2|g>f(3fedc>A(3ABc|[1d>fA>df>Aa>g|f2-f>df>Aa>f|g>fe>dc4|gfedcABc|d2d2d2:|[2d>fA>dF>AA>d|f2f>dA2F2|G>AG>FE2E2|c>ee>dcABc|.d2.A2.D2|],2013-03-10 10:57:14,ceolachan


In [36]:
tunes.loc[tunes['abc'].str.contains('\{')]
tunes.loc[tunes['abc'].str.contains('\}')]

Unnamed: 0,tune_id,setting_id,name,type,meter,mode,abc,date,username
