In [1]:
import pandas as pd
import glob
import random

In [2]:
filenames = glob.glob("*.abc")

clean_tunes = []
for filename in filenames:
    with open(filename, "rb") as f:
        binary = f.read()
        raw_abc = binary.decode("utf-8", "replace")
        raw_tunes = raw_abc.split("X:")
        raw_tunes.pop(0) # remove header
        raw_tunes = [tune.strip() for tune in raw_tunes]
        # remove explicit newlines
        raw_tunes = [tune.replace("/\r", "\r").replace("!\r", "\r").strip("!") for tune in raw_tunes]
        raw_tunes = [tune.splitlines()[1:] for tune in raw_tunes] # [1:] to remove tune number
        
        for tune in raw_tunes:
            # gather metadata about meter, type and key
            tune_meter, tune_type, tune_key = "", "", ""
            for line in tune:
                if line[:2] == "M:":
                    tune_meter = line
                elif line[:2] == "R:":
                    tune_type = line
                elif line[:2] == "K:":
                    tune_key = line
            
            tune_meter = tune_meter.strip()
            tune_type = tune_type.strip()
            tune_key = tune_key.strip()
            
            # drop all lines containing metadata, leaving only the abc
            tune = [line for line in tune if ":" not in line[:2]]
            just_abc = " ".join(tune)
            
            # question: should we drop the tune_type field, since there's a lot of variation?
            clean_tune = "\n".join((tune_meter, tune_type, tune_key, just_abc))
            
            clean_tunes.append(clean_tune)

In [3]:
sample = random.sample(clean_tunes, 10)
for tune in sample:
    print(tune, "\n\n")

M:4/4
R:Reel
K:G
edeg A2gf|edef g2fg|edeg A2ag|egdB BAA2|\ edeg A2gf|edef g2fg|eage degd|egdB BAA2:||\ aAA2 a2ge|dBde g2fg|aAA2 a2ge|dBeg BAA2|\ aAA2 a2ge|dBde g2fg|eage degd|egdB BAA2:||\ 


M:4/4
R:Reel
K:G
GBde dBB2 | ABcd BGFG | E2cE D2BD | dBB2 AFEF | GBde dBB2 | ABcd BGFG |\ E2cE D2BD | dBAB G4 :: g3d edBd | dgfd edBd | G3B dGBd |\ gedB A4 | d3e dBB2 | ABcd BGFG | E2cE D2BD | dBAB G4 :| 


M:2/2
R:Reel
K:A
AGAc B2cd|ecdB cABG|EAA2 BEed|cABG AECE|\ (3ABA Ac B2cd|eAae cABG|EAA2 BEed|1 cABG A2CE:|2 cABG a2ce||\ a2(3aga b3g|f2ga bgeg|abaf (3gag eg|fagb a2ce|\ aec'a gabg|(3fgf ga bgeg|aec'a fdBd|cABG A2 ce:|| 


M:C
R:Hornpipe
K:F
cB|A2F2 FAGF| EG B2 BdcB| Acde fcdB| A2 F2 F2 cB| A2F2 FAGF|! EG B2 BdcB |  Acde fcdB| A2 F2 F2:|  BA | Bcde fcdB A2 F2 F2|| 


M:3/4

K:C
E2 |A,2 A,2 C2|E2- D2 C2|E-C- B,4-|B,4 B,2|\ D2 D2 F2|A2 G2 F2|A-F- E4-|E4 E2| A2 G2 F2|E4 E2|G2 F2 E2|D-A- A2 F2|\ A2 A,2 C2|E2- C2 B,2|C-B,- A,4-|A,4 || 


M:6/8
R:jig
K:G
D|GAB d2 B|dba g2d|ege ede|gdB AGE| GAB d2 B|db

In [None]:
 """
known issues:

may need to add rule to filter out % symbols
the data is still a bit messy, but putting it through ~/cleaning.ipynb might make it good enough
the keys for many of the tunes are abbreviated, i.e. "G" or "Gmaj" rather than "Gmajor" etc.
 """