In [30]:
import re
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
output_notebook()
import torch
import os
import time
import json
import torch
import random
from utils import *
from config import *
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, get_scheduler
import config

In [78]:
# One sample per visible GPU.  torch.cuda.device_count() is 0 on a CPU-only
# machine, so clamp to 1 to keep the batch size usable there.
batch_size = max(1, torch.cuda.device_count())
print(f"Using batch size {batch_size}")

# Tokenizer class provided by the project's utils module.
patchilizer = Patchilizer()

# This cell always ends up on the CPU: it used to pick "cuda" when available
# and then unconditionally overwrite that choice with "cpu", so the dead
# CUDA branch has been removed.  Edit this line to opt into a GPU.
device = "cpu"
print(f"Using device {device}")

Using batch size 1
Using device cpu


In [79]:
# GPT-2 configuration for the patch-level decoder: depth, generation length
# and position table are taken from the project constants; the vocabulary
# has a single entry.
patch_config = GPT2Config(
    vocab_size=1,
    num_hidden_layers=PATCH_NUM_LAYERS,
    max_length=PATCH_LENGTH,
    max_position_embeddings=PATCH_LENGTH,
)

# GPT-2 configuration for the character-level decoder: positions are sized by
# PATCH_SIZE and the vocabulary has 128 entries.
char_config = GPT2Config(
    vocab_size=128,
    num_hidden_layers=CHAR_NUM_LAYERS,
    max_length=PATCH_SIZE,
    max_position_embeddings=PATCH_SIZE,
)

# Build the two-level model from both configurations.
model = TunesFormer(patch_config, char_config, share_weights=SHARE_WEIGHTS)

The character-level decoder has 3 GPT-2 blocks and a linear head that converts the 768-dimensional embeddings
into 128 logits (one per symbol of the 128-character vocabulary).

In [33]:
# Print the module tree of the character-level decoder.
print(model.char_level_decoder)

CharLevelDecoder(
  (base): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(128, 768)
      (wpe): Embedding(32, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-2): 3 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=128, bias=False)
  )
)


The "patch level decoder" has 9 GPT-2 blocks. Its input is 4096-dimensional,
corresponding to "patches" or "measures" that have (up to) 32 symbols, each chosen from a 128-character vocabulary
(32 × 128 = 4096). A linear layer embeds each patch in 768 dimensions.

In [34]:
# Print the module tree of the patch-level decoder.
print(model.patch_level_decoder)

PatchLevelDecoder(
  (patch_embedding): Linear(in_features=4096, out_features=768, bias=True)
  (base): GPT2Model(
    (wte): Embedding(1, 768)
    (wpe): Embedding(128, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-8): 9 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)


## Walkthrough of the Tunesformer Repo

For reference, there is a lot of information about ABC notation at the [abc notation home page](https://abcnotation.com/).  Let's load a copy of "Whiskey Before Breakfast" to work with.

In [35]:
# Read the example tune as text.  The encoding is pinned to UTF-8 so that the
# result does not depend on the platform's locale default.
# NOTE(review): assumes the .abc file is UTF-8/ASCII -- confirm.
with open("abcs/whiskey-before-breakfast.abc", encoding="utf-8") as f:
    whiskey = f.read()

### The data

The data comes from an amalgamation of abc files from a variety of sources and is stored on huggingface in a dataset called
`irishman`.

In [36]:
import datasets
import fsspec
import transformers

# Report the installed versions of fsspec, datasets and transformers, one per line.
for library in (fsspec, datasets, transformers):
    print(library.__version__)

2023.10.0
2.16.1
4.32.1


In [37]:
from datasets import load_dataset

# Load the "irishman" dataset by its Hugging Face identifier.
irishman = load_dataset("sander-wood/irishman")

# Row counts of the two splits.
n_train = irishman['train'].shape[0]
n_validation = irishman['validation'].shape[0]
print(f"Consists of {n_train} training rows and {n_validation} validation rows")

# Training split as a DataFrame for the exploratory analysis below.
irishman_df = pd.DataFrame(irishman['train'])

Consists of 214122 training rows and 2162 validation rows


In [38]:
# Pattern for the key field, e.g. "K:D", "K:Emin", "K:Bb" or "K:F#m".  '#' is
# listed explicitly because \w does not match it; without it a sharp key such
# as "F#m" would be cut off after the tonic letter.
key = r"(K:(?P<key>[\w#]+))"
# Pattern for a numeric meter field such as "M:4/4".
time_sig = r"(M:(?P<time_sig>[\d]+\/[\d]+))"

# Mode spellings (compared lower-cased) that count as plain major / minor.
_MAJOR_MODES = frozenset({"", "maj", "major"})
_MINOR_MODES = frozenset({"min", "minor"})


def extract_key(abc):
    """Return the text of the first ``K:`` field found in ``abc``.

    Returns ``None`` when no key field is present.  The literal value
    ``'none'`` is returned unchanged.
    """
    m = re.search(key, abc)
    return m.group('key') if m else None


def _split_key(abc):
    """Split the key of ``abc`` into a ``(tonic, mode_suffix)`` pair.

    Returns ``None`` when there is no key field or the key is ``'none'``.
    """
    name = extract_key(abc)
    if not name or name == 'none':
        return None
    # A '#' or 'b' directly after the tonic letter is an accidental and
    # belongs to the tonic ("Bb", "F#"), not to the mode.
    cut = 2 if name[1:2] in ('#', 'b') else 1
    return name[:cut], name[cut:]


def extract_tonic(abc):
    """Return the tonic of the key (letter plus optional '#'/'b'), or ``None``."""
    parts = _split_key(abc)
    return parts[0] if parts else None


def extract_mode(abc):
    """Classify the key of ``abc`` as ``'major'``, ``'minor'`` or ``'modal'``.

    A bare tonic, "maj" and "major" count as major; "m", "min" and "minor"
    count as minor; every other suffix is reported as ``'modal'``.  Returns
    ``None`` when there is no key field or the key is ``'none'``.
    """
    parts = _split_key(abc)
    if parts is None:
        return None
    mode = parts[1]
    lowered = mode.lower()
    if lowered in _MAJOR_MODES:
        return 'major'
    # The one-letter minor marker is matched case-sensitively.
    if mode == 'm' or lowered in _MINOR_MODES:
        return 'minor'
    return 'modal'


def extract_time_sig(abc):
    """Return the first numeric time signature (e.g. ``'6/8'``) in ``abc``, or ``None``."""
    m = re.search(time_sig, abc)
    return m.group('time_sig') if m else None

# Derive one metadata column per extractor from the raw ABC text.
abc_text = irishman_df['abc notation']
for column, extractor in (
    ('key', extract_key),
    ('time_sig', extract_time_sig),
    ('tonic', extract_tonic),
    ('mode', extract_mode),
):
    irishman_df[column] = abc_text.apply(extractor)

# Combined "<key>_<mode>" label.
irishman_df['key_mode'] = irishman_df['key'] + '_' + irishman_df['mode']

There are 2508 songs whose key is given as `none`, i.e. without a key indication.

In [39]:
# Boolean mask of tunes whose key field is the literal string 'none'; the
# sum of the mask is the number of such tunes.
missing_keys = irishman_df['key'].eq('none')
missing_keys.sum()

2508

In [40]:
# Number of tunes for every (tonic, mode) combination.
tune_counts = (
    irishman_df
    .groupby(['tonic', 'mode'])
    .size()
    .reset_index(name='count')
)
# Add a label column made of tonic immediately followed by mode.
keys = pd.DataFrame(tune_counts).assign(
    key_mode=lambda frame: frame['tonic'] + frame['mode']
)

In [41]:

# Drop any row labelled 'none', then sort by ascending count.
keys = keys.loc[keys['key_mode'] != 'none'].sort_values(by='count', ascending=True)

# Horizontal bar chart with one bar per key/mode label.
p = figure(
    y_range=keys['key_mode'],
    height=500,
    title="Key Counts ",
    toolbar_location=None,
    tools="",
)
p.hbar(
    y=keys['key_mode'],
    right=keys['count'],
    height=0.9,
    fill_color='gray',
    line_color='pink',
)

# Start the count axis at zero.
p.x_range.start = 0
#p.xaxis.major_label_orientation = "vertical"

show(p)

There are some unusual time signatures in the dataset, most of which look like errors. For example, 9/81,
10/16, or 432/444 (!) seem likely to be mistakes. The cell below lists every signature that occurs in fewer than 20 tunes. For now we won't worry about this.

In [42]:
# Count tunes per time signature, then list the rare ones (fewer than 20
# tunes) as comma-separated rows of 20 entries.
sigs = pd.DataFrame(irishman_df.groupby('time_sig').size().reset_index(name='count'))
rare_sigs = sigs.loc[sigs['count'] < 20, 'time_sig']
for position, signature in enumerate(rare_sigs):
    # Every 20th entry ends the row; all others are followed by a comma.
    terminator = "," if position % 20 < 19 else "\n"
    print(f"{signature}", end=terminator)

1/2,1/4,1/8,10/16,10/4,10/8,11/16,11/8,12/16,12/6,13/16,13/28,13/4,13/8,14/8,15/16,15/8,17/8,18/16,18/4
18/8,2/1,2/3,21/4,22/16,22/8,23/4,26/8,28/4,3/16,3/3,3/6,32/44,4/3,43/44,432/444,45/44,46/8,5/16,5/2
6/16,6/5,6/6,6/86,6/9,63/84,7/4,8/16,8/2,8/4,9/12,9/3,9/6,9/81,

## Tokenization

The library has a class called "Patchilizer" which functions as the tokenizer.  I think it's called "patchilizer" because it operates on the bar level.  The idea here is that you can think of the data as having a hierarchical structure made up of notes within bars. 

At its root the tokenization is on the character level.

In [43]:
from utils import Patchilizer

# Tokenizer instance.
P = Patchilizer()
# Patch encoding of the example tune.
encoded = P.encode(whiskey)

# Show how the tokenizer splits the raw ABC text into bars.
bars = P.split_bars(whiskey)
for bar_number, bar_text in enumerate(bars):
    print(f"Bar {bar_number}: {bar_text}")


Bar 0: 

X:1
T:Whiskey Before Breakfast
L:1/8
M:4/4
Z:abc-transcription Josh Larios <hades@elsewhere.org>, 2014.01.13
B:Complete Tractor, p.210
N:The bluegrassers all play that E minor chord in measure 11, but most old time backup players just play A.
K:D
|:
Bar 1:  "D"DE FG A2 AA |
Bar 2:  AB AG FE DF |
Bar 3:  "G"G2 BG "D"F2 AF |
Bar 4:  "A"ED EF EC B,A, |
Bar 5: 
"D"DE FG A2 AA |
Bar 6:  AB AG FE DF |
Bar 7:  "G"G2 BG "D"F2 AF |
Bar 8:  "A"ED EF "D"D2 A2 ::
Bar 9: 
"D"A2 d2 d2 dd |
Bar 10:  f2 d2 B2 A2 |
Bar 11:  "Em (A)"e2 ef e2 ef |
Bar 12:  "A7" gf ed cB Ac |
Bar 13: 
"D"d2 fd "A"c2 ec |
Bar 14:  "G"Bc dB "D"AF ED |
Bar 15:  "G"G2 BG "D"F2 AF |
Bar 16:  "A"ED EF "D"D2-D2 :|


The encoding into "patches" puts each control (header) line into its own patch, and then collects each measure into a patch; an initial bar line (if any) goes into the first music patch, and each patch ends with a '|' or related delimiter.

In [44]:
# Render each patch as its characters joined by '-'.  Token ids 0-2 are
# skipped (presumably the special padding/bos/eos ids -- TODO confirm).
for patch_number, patch in enumerate(encoded):
    visible_chars = (chr(token) for token in patch if token > 2)
    print(f"Patch {patch_number}: {'-'.join(visible_chars)}")

Patch 0: X-:-1-

Patch 1: T-:-W-h-i-s-k-e-y- -B-e-f-o-r-e- -B-r-e-a-k-f-a-s-t-

Patch 2: L-:-1-/-8-

Patch 3: M-:-4-/-4-

Patch 4: Z-:-a-b-c---t-r-a-n-s-c-r-i-p-t-i-o-n- -J-o-s-h- -L-a-r-i-o-s
Patch 5: B-:-C-o-m-p-l-e-t-e- -T-r-a-c-t-o-r-,- -p-.-2-1-0-

Patch 6: N-:-T-h-e- -b-l-u-e-g-r-a-s-s-e-r-s- -a-l-l- -p-l-a-y- -t-h-a
Patch 7: K-:-D-

Patch 8: |-:- -"-D-"-D-E- -F-G- -A-2- -A-A- -|
Patch 9:  -A-B- -A-G- -F-E- -D-F- -|
Patch 10:  -"-G-"-G-2- -B-G- -"-D-"-F-2- -A-F- -|
Patch 11:  -"-A-"-E-D- -E-F- -E-C- -B-,-A-,- -|
Patch 12: 
-"-D-"-D-E- -F-G- -A-2- -A-A- -|
Patch 13:  -A-B- -A-G- -F-E- -D-F- -|
Patch 14:  -"-G-"-G-2- -B-G- -"-D-"-F-2- -A-F- -|
Patch 15:  -"-A-"-E-D- -E-F- -"-D-"-D-2- -A-2- -:-:
Patch 16: 
-"-D-"-A-2- -d-2- -d-2- -d-d- -|
Patch 17:  -f-2- -d-2- -B-2- -A-2- -|
Patch 18:  -"-E-m- -(-A-)-"-e-2- -e-f- -e-2- -e-f- -|
Patch 19:  -"-A-7-"- -g-f- -e-d- -c-B- -A-c- -|
Patch 20: 
-"-D-"-d-2- -f-d- -"-A-"-c-2- -e-c- -|
Patch 21:  -"-G-"-B-c- -d-B- -"-D-"-A-F- -E-D- -|
Patch 22

## Training Data

The training data comes from the `irishman['train']` dataset.  This is a Hugging Face `Dataset` whose rows are dictionaries.
The dictionaries have two keys: `abc notation` and `control code`.



In [45]:
# Display the training split (its repr lists the features and the row count).
irishman['train']

Dataset({
    features: ['abc notation', 'control code'],
    num_rows: 214122
})

In [46]:
# First training example: its control code followed by its ABC text.
first_tune = irishman['train'][0]
print(f"Control code:\n{first_tune['control code']}\nABC:\n{first_tune['abc notation']}")

Control code:
S:2
B:5
E:5
B:6

ABC:
X:1
L:1/8
M:4/4
K:Emin
|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | 
 defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||


The data passed to the training loop is encoded using the "patchilizer".  It appears that the first line of each
ABC tune (the `X:` reference-number field) is dropped and the control code is prepended in its place.

In [47]:
P = Patchilizer()
item = irishman['train'][0]

# Keep everything after the first line of the ABC text (the first line is
# dropped) and append it directly to the control code.
abc_body = item['abc notation'].partition('\n')[2]
text = item['control code'] + abc_body
print(text)

S:2
B:5
E:5
B:6
L:1/8
M:4/4
K:Emin
|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | 
 defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||


The input is an m x PATCH_SIZE torch tensor, where m is the number of patches (one per header line or bar, plus
the two special patches). In our case PATCH_SIZE=32. A special initial and a special final patch are attached
(the initial one is [bos, bos, ..., bos, eos] and the final one is [bos, eos, eos, ..., eos]).

In [154]:
# Encode the text with the special patches requested, then turn the result
# into a tensor.
patch_ids = P.encode(text, add_special_patches=True)
input_patch = torch.tensor(patch_ids)

# Shape once a leading axis of size 1 is added.
print(input_patch[None].shape)


torch.Size([1, 20, 32])


The batch size is set to 1 (assuming a single GPU; in fact the batch size is the number of GPUs available). Although the code fiddles
around quite a bit with sizes, what actually happens is that a tune with M patches is encoded into an M x PATCH_SIZE = M x 32
tensor, this gets "unsqueezed" into a 1 x M x PATCH_SIZE tensor, and that gets supplied to the model.

The first level of the model, the `patch_level_decoder`, returns a 1 x M x 768 (1 x M x embedding_dimension) tensor of patch embeddings.

This is (part of) the input to the `char_level_decoder`.


In [157]:
# Add a leading axis, run the patch-level decoder, and keep the entry it
# returns under "last_hidden_state".
batched_patch = input_patch.unsqueeze(0)
decoder_output = model.patch_level_decoder(batched_patch)
embedding = decoder_output["last_hidden_state"]
print(embedding.shape)

torch.Size([1, 20, 768])
