In [3]:
import re
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
output_notebook()

## Walkthrough of the Tunesformer Repo

### The data

The data is an amalgamation of ABC notation files from a variety of sources and is hosted on Hugging Face as the
`sander-wood/irishman` dataset.

In [4]:
from datasets import load_dataset

# Keep the raw DatasetDict under its own name so that `irishman` only ever
# refers to the pandas DataFrame used by the rest of the notebook.
irishman_splits = load_dataset("sander-wood/irishman")
print(f"Consists of {irishman_splits['train'].shape[0]} training rows and {irishman_splits['validation'].shape[0]} validation rows")
# Only the training split is analysed in the cells that follow.
irishman = irishman_splits['train'].to_pandas()

Consists of 214122 training rows and 2162 validation rows


In [34]:
# Pattern for the ABC key field, e.g. "K:D", "K:Emin", "K:Bb" or "K:F#m".
# '#' is accepted next to word characters so sharp keys are not cut short
# ("K:F#m" used to be captured as just "F").
key = r"(K:(?P<key>[\w#]+))"
# Pattern for a numeric ABC meter field such as "M:6/8".
time_sig = r"(M:(?P<time_sig>[\d]+\/[\d]+))"


def _split_key(key_text):
    """Split a non-empty key string into ``(tonic, mode_suffix)``.

    The tonic is the first character plus a directly following accidental
    ('#' or 'b'), so "Bbmaj" -> ("Bb", "maj") and "Ador" -> ("A", "dor").
    """
    # Compare against a tuple (not the string '#b') so an empty slice never matches.
    tonic_len = 2 if key_text[1:2] in ('#', 'b') else 1
    return key_text[:tonic_len], key_text[tonic_len:]


def extract_key(abc):
    """Return the text of the first ``K:`` field found in ``abc``.

    Args:
        abc: a tune in ABC notation, as a string.

    Returns:
        The captured key string (for example "D", "Emin", "F#m" or "none"),
        or None when no ``K:`` field is found.
    """
    m = re.search(key, abc)
    return m.group('key') if m else None


def extract_tonic(abc):
    """Return the tonic of the tune's key, including any accidental.

    Args:
        abc: a tune in ABC notation, as a string.

    Returns:
        The tonic such as "D", "Bb" or "F#"; None when there is no ``K:``
        field or the key is the literal "none".
    """
    key_text = extract_key(abc)
    if not key_text or key_text == 'none':
        return None
    return _split_key(key_text)[0]


def extract_mode(abc):
    """Classify the tune's key as 'major', 'minor' or 'modal'.

    The mode suffix is compared case-insensitively on its first three
    letters, so "maj"/"major" (or no suffix) give 'major' and
    "m"/"min"/"minor" give 'minor'.  Every other suffix (e.g. "dor", "mix")
    is reported as 'modal'.

    Args:
        abc: a tune in ABC notation, as a string.

    Returns:
        'major', 'minor' or 'modal'; None when there is no ``K:`` field or
        the key is the literal "none".
    """
    key_text = extract_key(abc)
    if not key_text or key_text == 'none':
        return None
    # Drop the tonic (and its accidental) before looking at the mode.
    mode = _split_key(key_text)[1].lower()[:3]
    if mode in ('', 'maj'):
        return 'major'
    if mode in ('m', 'min'):
        return 'minor'
    return 'modal'
    

def extract_time_sig(abc):
    """Return the first meter matched by the ``time_sig`` pattern in ``abc``.

    Args:
        abc: a tune in ABC notation, as a string.

    Returns:
        The captured "<digits>/<digits>" text, or None when the pattern
        does not match anywhere in ``abc``.
    """
    found = re.search(time_sig, abc)
    if found is None:
        return None
    return found.group('time_sig')

# Derive one column per header feature from the raw ABC text.
abc_text = irishman['abc notation']
for column, extractor in (
    ('key', extract_key),
    ('time_sig', extract_time_sig),
    ('tonic', extract_tonic),
    ('mode', extract_mode),
):
    irishman[column] = abc_text.apply(extractor)

# Combined label of the raw key text and its major/minor/modal class.
key_column = irishman['key']
mode_column = irishman['mode']
irishman['key_mode'] = key_column + '_' + mode_column

There are roughly 2,500 songs (2,508 in the training split) whose key field is `none`, i.e. without a key indication.  

In [35]:
# Boolean mask of rows whose extracted key is the literal string 'none'.
missing_keys = irishman['key'].eq('none')
missing_keys.sum()

2508

In [42]:
# Count rows per (tonic, mode) pair and attach a combined label column.
keys = (
    irishman
    .groupby(['tonic', 'mode'])
    .size()
    .reset_index(name='count')
    .assign(key_mode=lambda counts: counts['tonic'] + counts['mode'])
)

In [53]:

# Exclude any row labelled 'none', then order the rows by ascending count.
keys = keys.loc[keys['key_mode'].ne('none')].sort_values('count')

# Horizontal bar chart: one bar per key_mode label, bar length = count.
p = figure(
    y_range=keys['key_mode'],
    height=500,
    title="Key Counts ",
    toolbar_location=None,
    tools="",
)
p.hbar(
    y=keys['key_mode'],
    right=keys['count'],
    height=0.9,
    fill_color='gray',
    line_color='pink',
)

# Anchor the count axis at zero.
p.x_range.start = 0

show(p)

There are some unusual time signatures in the dataset that are very likely errors. For example, 9/81,
10/16, and 10/4 look like mistakes. The cell below lists every time signature that occurs fewer than 20 times.

In [72]:
# Number of rows per extracted time signature.
sigs = irishman.groupby('time_sig').size().reset_index(name='count')
# Signatures seen fewer than 20 times.
rare_sigs = sigs.loc[sigs['count'] < 20, 'time_sig']
for position, signature in enumerate(rare_sigs):
    # Entries are comma-terminated; every 20th one ends the row with a newline instead.
    terminator = "," if position % 20 < 19 else "\n"
    print(signature, end=terminator)

1/2,1/4,1/8,10/16,10/4,10/8,11/16,11/8,12/16,12/6,13/16,13/28,13/4,13/8,14/8,15/16,15/8,17/8,18/16,18/4
18/8,2/1,2/3,21/4,22/16,22/8,23/4,26/8,28/4,3/16,3/3,3/6,32/44,4/3,43/44,432/444,45/44,46/8,5/16,5/2
6/16,6/5,6/6,6/86,6/9,63/84,7/4,8/16,8/2,8/4,9/12,9/3,9/6,9/81,