# Experiments with AI Guru and MMM_API mix

The following notebook contains code for:
- using miditok.MMM to create tokenizer
- using AI Guru's solution to tokenize database (and create tokenizer config)
- using AI Guru's tokenizer with the original model to try to create a MIDI file (unsuccessfully)

### Training the tokenizer and tokenizing the database 

In [2]:
import torch
# Location of the TorchScript archive of the original model, relative to the notebook.
MODEL_PATH = "../models/model.pt"
# map_location="cpu" lets the archive load on machines that lack the device it was
# saved from; the generation cell moves the model to the chosen device explicitly.
# eval() disables training-only behaviour (e.g. dropout) before sampling from the model.
model = torch.jit.load(MODEL_PATH, map_location="cpu").eval()

ModuleNotFoundError: No module named 'torch'

In [3]:
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

ModuleNotFoundError: No module named 'transformers'

Loading the Hugging Face token

In [None]:
import configparser

# Settings file that stores the Hugging Face token under the [TOKENS] section.
LOCAL_CONFIG_PATH = '../local_config.cfg'

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it parsed,
# so an empty result means the config file is missing or unreadable. Fail here with a
# clear message instead of a confusing NoSectionError on the next line.
if not config.read(LOCAL_CONFIG_PATH):
    raise FileNotFoundError(f"Could not read config file: {LOCAL_CONFIG_PATH}")

tokens = dict(config.items('TOKENS'))
hf_token = tokens["hf_token"]

Training the miditok.MMM tokenizer and pushing it to the Hugging Face Hub

In [4]:
import miditok
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path

# MIDI corpus used for training, relative to the notebook directory (the same
# location the dataset-creator cell uses) instead of a user-specific absolute path.
MIDI_DIR = Path("../data/external/Jazz Midi")

# Named `tokenizer_config` so it does not clobber the configparser object called `config`.
tokenizer_config = miditok.TokenizerConfig()
tokenizer_config.additional_params = { "base_tokenizer" : 'MIDILike' }

tokenizer = miditok.MMM(tokenizer_config)

midi_paths = list(MIDI_DIR.glob("**/*.mid"))
# Fail early with a readable message if the corpus is missing.
if not midi_paths:
    raise FileNotFoundError(f"No .mid files found under {MIDI_DIR.resolve()}")

tokenizer.train(vocab_size=512, files_paths=midi_paths)
tokenizer.save_params(Path("models", "tokenizer.json"))

# Upload the trained tokenizer to a private Hub repository; needs `hf_token` from the cell above.
tokenizer.push_to_hub("juleczka/orchestrify_tokenizer", private=True, token=hf_token)

ModuleNotFoundError: No module named 'miditok'

Dividing dataset into chunks, creating collator and dataloader

In [None]:
# Maximum token sequence length, shared by the file splitter and the dataset.
MAX_SEQ_LEN = 1024

# Output directory for the split MIDI chunks, relative to the notebook directory
# instead of a user-specific absolute path.
dataset_chunks_dir = Path("../data/processed")
split_files_for_training(
    files_paths=midi_paths,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=MAX_SEQ_LEN,
)

# Create a Dataset, a DataLoader and a collator to train a model
dataset = DatasetMIDI(
    files_paths=list(dataset_chunks_dir.glob("**/*.mid")),
    tokenizer=tokenizer,
    max_seq_len=MAX_SEQ_LEN,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
# copy_inputs_as_labels=True is passed so the collator also supplies labels built from the inputs.
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator)

Using AI Guru version

In [None]:
!pip3 install music21 > /dev/null 2>&1

In [None]:
import os
import datasetcreatorconfig
import datasetcreator

In [5]:
# Run AI Guru's dataset creator on the Jazz MIDI folder.
# NOTE(review): presumably this is what writes the `jsb_mmmtrack/` files
# (tokenizer.json, token_sequences_*.txt) that later cells read - confirm against datasetcreator.
dataset_creator_config = datasetcreatorconfig.JSBDatasetCreatorTrackConfig()
dataset_creator = datasetcreator.DatasetCreator(dataset_creator_config)
# overwrite=False is forwarded to the creator; its exact effect is defined in datasetcreator.
dataset_creator.create(datasets_path='../data/external/Jazz Midi', overwrite=False)


NameError: name 'datasetcreatorconfig' is not defined

In [None]:
from transformers import PreTrainedTokenizerFast

In [6]:
from torch.utils.data.dataset import Dataset
import random
import numpy as np

class TokenSequenceDataset(Dataset):
    """In-memory dataset of fixed-length token-id sequences read from text files.

    Every non-empty line of every input file is encoded with ``tokenizer.encode``.
    Lines that contain the ``[UNK]`` id or that encode to more than ``block_size``
    ids are skipped; the remaining ones are right-padded with the ``[PAD]`` id up
    to ``block_size``.

    Args:
        tokenizer: Object exposing ``encode(str) -> list[int]`` and ``decode``.
        dataset_paths: Iterable of paths to text files, one token sequence per line.
        block_size: Length of every returned example.
        simulate: If true, keep only 10 randomly chosen lines (quick dry run).

    Attributes:
        examples: List of dicts with ``input_ids`` and ``labels`` tensors
            (``torch.long``, shape ``(block_size,)``, identical content).
        stats: Dict with the statistics gathered while reading the files:
            ``tokens_count``, ``encoded_lengths``, ``unknown_tokens``,
            ``unknown_tokens_set`` (unique, in first-seen order),
            ``unknown_token_lines_count`` and ``too_long_lines_count``.

    Raises:
        AssertionError: If one of ``dataset_paths`` is not an existing file.
    """

    def __init__(self, tokenizer, dataset_paths, block_size, simulate=False):

        pad_token_id = tokenizer.encode("[PAD]")[0]
        unk_token_id = tokenizer.encode("[UNK]")[0]

        # Read all lines from all files; the context manager closes each file promptly.
        lines = []
        for dataset_path in dataset_paths:
            assert os.path.isfile(dataset_path), f"Input file path {dataset_path} not found"
            with open(dataset_path, "r") as dataset_file:
                lines += dataset_file.readlines()

        # In simulation just use a few samples.
        if simulate:
            random.shuffle(lines)
            lines = lines[:10]

        # Turn lines into training examples. Also gather some statistics.
        self.examples = []
        unknown_tokens = []
        tokens_count = 0
        unknown_token_lines_count = 0
        too_long_lines_count = 0
        encoded_lengths = []
        for line in lines:

            # Skip empty lines.
            line = line.strip()
            if line == "":
                continue

            # Encode the line.
            encoded_line = tokenizer.encode(line)
            encoded_lengths.append(len(encoded_line))
            tokens_count += len(encoded_line)

            # Record the first unknown token of the line, then skip the line.
            if unk_token_id in encoded_line:
                index = encoded_line.index(unk_token_id)
                words = line.split()
                # Prefer the original text of the offending token; fall back to the
                # decoded id when ids and whitespace-separated words do not line up.
                if index < len(words):
                    token = words[index]
                else:
                    token = tokenizer.decode(encoded_line[index])
                unknown_tokens.append(token)
                unknown_token_lines_count += 1
                continue

            # Skip sequence if it is too long.
            if len(encoded_line) > block_size:
                too_long_lines_count += 1
                continue

            # Right-pad with the pad id up to block_size.
            padded = torch.full((block_size,), pad_token_id, dtype=torch.long)
            padded[:len(encoded_line)] = torch.tensor(encoded_line, dtype=torch.long)

            # NOTE(review): labels keep the pad id at padded positions, so a loss that
            # does not mask that id will also be computed on padding - confirm intended.
            self.examples.append({
                "input_ids": padded,
                "labels": padded.clone(),
            })

        # Keep the statistics available instead of discarding them.
        self.stats = {
            "tokens_count": tokens_count,
            "encoded_lengths": encoded_lengths,
            "unknown_tokens": unknown_tokens,
            "unknown_tokens_set": list(dict.fromkeys(unknown_tokens)),
            "unknown_token_lines_count": unknown_token_lines_count,
            "too_long_lines_count": too_long_lines_count,
        }

    def __len__(self):
        """Return the number of kept examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict (``input_ids``, ``labels``) at index ``i``."""
        return self.examples[i]

ModuleNotFoundError: No module named 'torch'

### Using the pretrained tokenizer

In [None]:
output_path = './models_2'

# Folder with AI Guru's tokenizer config and token sequence files.
JSB_DATASET_DIR = '../data/external/Jazz Midi/jsb_mmmtrack'
# Fixed example length, shared by the datasets and the padding collator.
BLOCK_SIZE = 768

tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'{JSB_DATASET_DIR}/tokenizer.json')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


# NOTE(review): `training_args` is not consumed by any Trainer in the cells shown here.
training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    num_train_epochs=10,
    # `per_gpu_train_batch_size` is deprecated in favour of the per-device argument.
    per_device_train_batch_size=16,
    save_steps=1_000,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="steps",
    logging_dir=os.path.join(output_path, "logs"),
    load_best_model_at_end=True,
    save_strategy="steps"
)

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=BLOCK_SIZE
)

dataset_train = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=[f'{JSB_DATASET_DIR}/token_sequences_train.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)

dataset_valid = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=[f'{JSB_DATASET_DIR}/token_sequences_valid.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)


In [None]:
# Batches of 16 validation examples, collated with the padding collator.
# Note: this rebinds `dataloader`, replacing the miditok-based loader created earlier.
dataloader = DataLoader(dataset_valid, batch_size=16, collate_fn=data_collator)

### Attempt to generate midi using original model

The code below uses AI Guru's tokenization with the original model. It succeeds in producing an output; however, the tokens are nonsense: after rendering them back to MIDI, we get one second of silence.

In [None]:
# see model's structure - it takes 2 arguments
# Print the model's code to see its structure - it takes 2 arguments.
print(model.code)

In [None]:
# take one batch from loader
# Take the first batch from the loader and look at the shape of its token ids.
batch = next(iter(dataloader))
input_ids = batch['input_ids']
# Last expression of the cell: displayed as the cell output.
input_ids.shape

In [None]:
# Initialize `argument_2` with empty tensors
# The dimensions and data types here are based on your model's expected format
# Take one batch of token ids as the prompt.
batch = next(iter(dataloader))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The prompt must live on the same device as the model and the cache tensors;
# otherwise the forward pass fails with a device mismatch when CUDA is available.
input_ids = batch['input_ids'].to(device)

batch_size = input_ids.size(0)
# TODO: confirm the real architecture values. These are guesses that merely
# produce cache tensors of the size the model accepts as its second argument.
num_layers = 6          # Six layers, assuming GPT-2 small
num_heads = 8
hidden_dim = 512
head_dim = hidden_dim // num_heads
sequence_length = input_ids.size(1)

# Initialize the model's second argument: one (past_key, past_value) pair of
# all-zero tensors per layer.
# NOTE(review): the zero cache spans `sequence_length` positions and the prompt is a
# full padded block, so the first sampled token is read from the last (likely [PAD])
# position - this may contribute to the nonsense output; verify against MMM_API.
cache_shape = (batch_size, num_heads, sequence_length, head_dim)
past_key_values = tuple(
    (
        torch.zeros(cache_shape, dtype=torch.float32, device=device),  # past_key
        torch.zeros(cache_shape, dtype=torch.float32, device=device),  # past_value
    )
    for _ in range(num_layers)
)

generated_sequence = [[] for _ in range(batch_size)]

# Define generation parameters
max_length = 50
temperature = 1.11835682 # just anything to push forward - taken from one of API tests

with torch.no_grad():
    for _ in range(max_length):
        # Pass the current input and past states to the model
        # from MMM_API:
        # auto outputs = model->forward(inputs).toTuple();
        # logits = outputs->elements()[0].toTensor().index(
        #   {torch::indexing::Slice(),-1,torch::indexing::Slice()});
        # past_key_values = outputs->elements()[1];
        logits, past_key_values = model(input_ids, past_key_values)
        logits = logits[:, -1, :]
        # auto probs = (logits / param->temperature()).softmax(1);
        # auto next_tokens = probs.multinomial(1);
        probs = (logits / temperature).softmax(dim=1)
        next_tokens = probs.multinomial(1)
        # inputs.clear();
        # inputs.push_back( next_tokens );
        # inputs.push_back( past_key_values );
        input_ids = next_tokens

        # Append the sampled id of every batch element to its own sequence.
        for sequence, token_id in zip(generated_sequence, next_tokens[:, 0].tolist()):
            sequence.append(token_id)


# Print the generated token IDs
print("Generated sequence:", generated_sequence)


In [7]:
# Decode every generated id sequence back into text with the tokenizer.
tracks = [tokenizer.decode(token_ids) for token_ids in generated_sequence]

NameError: name 'generated_sequence' is not defined

In [None]:
!pip3 install note_seq

In [8]:
# Durations in seconds at 120 quarter notes per minute (60 / 120 s per quarter note).
# One bar is counted as 4 quarter notes, a 16th note as a quarter of a quarter note.
BAR_LENGTH_120BPM = 60 / 120 * 4.0
NOTE_LENGTH_16TH_120BPM = 60 / 120 * 0.25

import note_seq

def empty_note_sequence(qpm=120.0, total_time=0.0):
    """Create a NoteSequence that contains no notes.

    Args:
        qpm: Tempo stored in the sequence's single tempo entry.
        total_time: Value assigned to the sequence's ``total_time`` field.

    Returns:
        A new ``NoteSequence`` with one tempo entry, ``ticks_per_quarter`` set to
        ``note_seq.constants.STANDARD_PPQ`` and the given ``total_time``.
    """
    sequence = note_seq.protobuf.music_pb2.NoteSequence()
    tempo = sequence.tempos.add()
    tempo.qpm = qpm
    sequence.total_time = total_time
    sequence.ticks_per_quarter = note_seq.constants.STANDARD_PPQ
    return sequence

def token_sequence_to_note_sequence(token_sequence, use_program=True, use_drums=True):
    """Convert a sequence of text tokens into a NoteSequence.

    Recognised tokens: ``PIECE_START``, ``PIECE_END``, ``TRACK_START``,
    ``TRACK_END``, ``INST=<int|DRUMS>``, ``BAR_START``, ``BAR_END``,
    ``NOTE_ON=<pitch>``, ``NOTE_OFF=<pitch>``, ``TIME_DELTA=<n>``,
    ``DENSITY=<...>`` and ``[PAD]``. Timing is computed at 120 BPM using
    ``BAR_LENGTH_120BPM`` and ``NOTE_LENGTH_16TH_120BPM``.

    Args:
        token_sequence: Either a whitespace-separated string of tokens or an
            iterable of token strings.
        use_program: If true, a numeric ``INST`` token also sets the program of
            the following notes and clears the drum flag.
        use_drums: If true, ``INST=DRUMS`` marks the following notes as drums
            (instrument 0, program 0). If false, ``INST=DRUMS`` only selects
            instrument 0 and leaves program and drum flag untouched.

    Returns:
        The NoteSequence built by ``empty_note_sequence()`` with one note per
        ``NOTE_ON`` token. Processing stops at ``PIECE_END``.

    Raises:
        AssertionError: If an unrecognised token is encountered.
    """
    if isinstance(token_sequence, str):
        token_sequence = token_sequence.split()

    note_sequence = empty_note_sequence()

    # Decoder state. Everything is initialised up front so that sequences which
    # do not begin with TRACK_START / INST / BAR_START (e.g. sampled model
    # output) no longer fail on unbound local variables.
    current_program = 1
    current_instrument = 0
    current_is_drum = False
    current_bar_index = 0
    current_time = 0.0
    current_notes = {}

    for token in token_sequence:

        if token == "PIECE_START":
            continue
        elif token == "PIECE_END":
            print("The end.")
            break
        elif token == "TRACK_START":
            # Every track starts again at the first bar.
            current_bar_index = 0
        elif token == "TRACK_END":
            continue
        elif token.startswith("INST"):
            instrument_label = token.split("=")[-1]
            if instrument_label == "DRUMS":
                # Always a valid integer instrument, so later NOTE_ON tokens
                # cannot fail on int("DRUMS") when drums are disabled.
                current_instrument = 0
                if use_drums:
                    current_program = 0
                    current_is_drum = True
            else:
                current_instrument = int(instrument_label)
                if use_program:
                    current_program = current_instrument
                    current_is_drum = False
        elif token == "BAR_START":
            # Bars are laid out back to back; notes are tracked per bar.
            current_time = current_bar_index * BAR_LENGTH_120BPM
            current_notes = {}
        elif token == "BAR_END":
            current_bar_index += 1
        elif token.startswith("NOTE_ON"):
            pitch = int(token.split("=")[-1])
            note = note_sequence.notes.add()
            note.start_time = current_time
            # Provisional length of four 16th notes; a matching NOTE_OFF overrides it.
            note.end_time = current_time + 4 * NOTE_LENGTH_16TH_120BPM
            note.pitch = pitch
            note.instrument = int(current_instrument)
            note.program = current_program
            note.velocity = 80
            note.is_drum = current_is_drum
            current_notes[pitch] = note
        elif token.startswith("NOTE_OFF"):
            pitch = int(token.split("=")[-1])
            # A NOTE_OFF without a sounding note of that pitch is ignored.
            if pitch in current_notes:
                current_notes[pitch].end_time = current_time
        elif token.startswith("TIME_DELTA"):
            # The delta is expressed in 16th notes.
            current_time += float(token.split("=")[-1]) * NOTE_LENGTH_16TH_120BPM
        elif token.startswith("DENSITY=") or token == "[PAD]":
            continue
        else:
            # Raised explicitly (same exception type as before) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(token)

    return note_sequence

ModuleNotFoundError: No module named 'note_seq'

In [None]:
# Display the decoded token strings, one per generated sequence.
tracks

In [None]:
# `tracks` is a list with one decoded token string per batch element, while
# token_sequence_to_note_sequence expects a single token sequence. Passing the whole
# list would treat each full string as one token, so convert the tracks one by one.
# NOTE(review): assumes tokenizer.decode returns whitespace-separated tokens - confirm.
songs = [token_sequence_to_note_sequence(track) for track in tracks]
# Keep `song` bound to the first generated sequence.
song = songs[0]