In [1]:
import torch
# Location of the serialized TorchScript model (a path relative to the working directory).
MODEL_PATH = "../models/model.pt"
# Load the TorchScript module; `model` is reused by the inspection and generation cells below.
model = torch.jit.load(MODEL_PATH)

In [2]:
from tokenizers import Tokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

In [3]:
import configparser

# Read the Hugging Face access token from a local config file so the secret
# is not written into the notebook itself.
LOCAL_CONFIG_PATH = '../local_config.cfg'

config = configparser.RawConfigParser()
# `read()` silently skips files it cannot open and returns the list of files it
# actually parsed; check it so a missing file fails here with a clear message
# instead of as a NoSectionError on the next statement.
if not config.read(LOCAL_CONFIG_PATH):
    raise FileNotFoundError(f"Config file not found or unreadable: {LOCAL_CONFIG_PATH}")

# Expected layout: a [TOKENS] section that contains an `hf_token` entry.
tokens = dict(config.items('TOKENS'))
hf_token = tokens["hf_token"]

In [4]:
import miditok
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path

# Locations of the raw MIDI corpus and of the chunked training files.
# NOTE(review): these are machine-specific absolute paths; consider deriving
# them from a configurable project root.
MIDI_DIR = Path("/home/julia/WIMU/Orchestrify/data/external/Jazz Midi")
dataset_chunks_dir = Path("/home/julia/WIMU/Orchestrify/data/processed")
# Maximum sequence length, kept in one place so the chunking step and the
# dataset below always use the same value.
MAX_SEQ_LEN = 1024

# Configure an MMM tokenizer with MIDILike as its base tokenizer.
# A dedicated name is used so that the configparser object named `config`
# is not rebound to an unrelated type.
tokenizer_config = miditok.TokenizerConfig()
tokenizer_config.additional_params = {"base_tokenizer": 'MIDILike'}

tokenizer = miditok.MMM(tokenizer_config)

# Train the tokenizer with Byte Pair Encoding (BPE)
midi_paths = list(MIDI_DIR.glob("**/*.mid"))
tokenizer.train(vocab_size=512, files_paths=midi_paths)
tokenizer.save_params(Path("models", "tokenizer.json"))
# And pushing it to the Hugging Face hub (you can download it back with .from_pretrained)
tokenizer.push_to_hub("juleczka/orchestrify_tokenizer", private=True, token=hf_token)

# Split MIDIs into smaller chunks for training
split_files_for_training(
    files_paths=midi_paths,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=MAX_SEQ_LEN,
)

# Create a Dataset, a DataLoader and a collator to train a model
dataset = DatasetMIDI(
    files_paths=list(dataset_chunks_dir.glob("**/*.mid")),
    tokenizer=tokenizer,
    max_seq_len=MAX_SEQ_LEN,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator)

  tokenizer.train(vocab_size=512, files_paths=midi_paths)
  tokenizer.save_params(Path("models", "tokenizer.json"))
No files have been modified since last commit. Skipping to prevent empty commit.
Splitting music files (/home/julia/WIMU/Orchestrify/data/processed): 100%|██████████| 934/934 [00:14<00:00, 65.24it/s]


In [5]:
!pip3 install music21



In [6]:
import os
import datasetcreatorconfig
import datasetcreator

In [7]:
# Build the token-sequence dataset from the Jazz MIDI folder with the project's
# DatasetCreator, configured by JSBDatasetCreatorTrackConfig.
# NOTE(review): presumably this writes the `jsb_mmmtrack` tokenizer and
# token-sequence files that later cells read, and `overwrite=False` keeps any
# existing output — TODO confirm against datasetcreator.
dataset_creator_config = datasetcreatorconfig.JSBDatasetCreatorTrackConfig()
dataset_creator = datasetcreator.DatasetCreator(dataset_creator_config)
dataset_creator.create(datasets_path='../data/external/Jazz Midi', overwrite=False)


In [8]:

from transformers import PreTrainedTokenizerFast

In [9]:
from torch.utils.data.dataset import Dataset
import random
import numpy as np

class TokenSequenceDataset(Dataset):
    """Fixed-length dataset built from text files of token sequences.

    Every non-empty line of every input file is treated as one sequence of
    whitespace-separated tokens. A line is encoded with ``tokenizer`` and
    right-padded with the ``[PAD]`` id up to ``block_size``. Lines whose
    encoding contains the ``[UNK]`` id, or that encode to more than
    ``block_size`` ids, are skipped (they are not truncated).

    Args:
        tokenizer: Object with an ``encode(str)`` method returning a list of
            token ids. The first id of ``encode("[PAD]")`` / ``encode("[UNK]")``
            is used as the padding / unknown id.
        dataset_paths: Iterable of paths to text files, one sequence per line.
        block_size: Length of every produced example.
        simulate: When true, only 10 randomly chosen lines are used.

    Attributes:
        examples: List of dicts with ``"input_ids"`` and ``"labels"``; both
            are ``torch.long`` tensors of length ``block_size`` holding the
            same padded ids.
        stats: Dict of loading statistics (``tokens_count``,
            ``encoded_lengths``, ``unknown_tokens`` as token -> number of
            skipped lines, ``unknown_token_lines_count``,
            ``too_long_lines_count``).

    Raises:
        AssertionError: If one of ``dataset_paths`` is not an existing file.
    """

    def __init__(self, tokenizer, dataset_paths, block_size, simulate=False):

        pad_token_id = tokenizer.encode("[PAD]")[0]
        unk_token_id = tokenizer.encode("[UNK]")[0]

        # Read all lines from all files, closing each file as soon as it is read.
        lines = []
        for dataset_path in dataset_paths:
            assert os.path.isfile(dataset_path), f"Input file path {dataset_path} not found"
            with open(dataset_path, "r") as dataset_file:
                lines += dataset_file.readlines()

        # In simulation just use a few samples.
        if simulate:
            random.shuffle(lines)
            lines = lines[:10]

        # Turn lines into training examples. Also gather some statistics.
        self.examples = []
        unknown_tokens = {}
        tokens_count = 0
        unknown_token_lines_count = 0
        too_long_lines_count = 0
        encoded_lengths = []
        for line in lines:

            # Skip empty lines.
            line = line.strip()
            if line == "":
                continue

            # Encode the line.
            encoded_line = tokenizer.encode(line)
            encoded_lengths.append(len(encoded_line))
            tokens_count += len(encoded_line)

            # Record the first unknown token of the line, then skip the line.
            if unk_token_id in encoded_line:
                index = encoded_line.index(unk_token_id)
                words = line.split()
                # The encoding is not guaranteed to align one-to-one with the
                # whitespace-separated words, so guard the lookup.
                token = words[index] if index < len(words) else "[UNK]"
                unknown_tokens[token] = unknown_tokens.get(token, 0) + 1
                unknown_token_lines_count += 1
                continue

            # Skip sequence if it is too long.
            if len(encoded_line) > block_size:
                too_long_lines_count += 1
                continue

            # Right-pad to block_size (longer lines were skipped above).
            padded = np.full((block_size,), pad_token_id, dtype=np.longlong)
            padded[:len(encoded_line)] = encoded_line

            # NOTE(review): labels are an exact copy of input_ids, padding
            # included — confirm the training loss is meant to cover padding.
            self.examples.append({
                "input_ids": torch.tensor(padded, dtype=torch.long),
                "labels": torch.tensor(padded, dtype=torch.long),
            })

        # Keep the statistics available instead of discarding them.
        self.stats = {
            "tokens_count": tokens_count,
            "encoded_lengths": encoded_lengths,
            "unknown_tokens": unknown_tokens,
            "unknown_token_lines_count": unknown_token_lines_count,
            "too_long_lines_count": too_long_lines_count,
        }

    def __len__(self):
        """Return the number of examples that were kept."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict stored at position ``i``."""
        return self.examples[i]

In [10]:

output_path = './models_2'
# Example length shared by the datasets and the padding collator below.
BLOCK_SIZE = 768

# Load the fast tokenizer from the jsb_mmmtrack folder. This rebinds
# `tokenizer`, which earlier in the notebook referred to the miditok tokenizer.
# tokenizer = Tokenizer.from_file('../data/external/Jazz Midi/jsb_mmmtrack/tokenizer.json')
tokenizer = PreTrainedTokenizerFast(tokenizer_file='../data/external/Jazz Midi/jsb_mmmtrack/tokenizer.json')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    num_train_epochs=10,
    # `per_gpu_train_batch_size` is deprecated in favour of the per-device argument.
    per_device_train_batch_size=16,
    save_steps=1_000,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="steps",
    logging_dir=os.path.join(output_path, "logs"),
    load_best_model_at_end=True,
    save_strategy="steps"
)

# Pads every batch to BLOCK_SIZE.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=BLOCK_SIZE
)

dataset_train = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=['../data/external/Jazz Midi/jsb_mmmtrack/token_sequences_train.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)

dataset_valid = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=['../data/external/Jazz Midi/jsb_mmmtrack/token_sequences_valid.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)

# Training with the TorchScript model is currently disabled; the attempt is
# kept below for reference.
# def patched_get(self, obj, cls):
#     return self.__getattribute__("forward")
# torch.jit._script._CachedForward.__get__ = patched_get


# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=dataset_train,
#     eval_dataset=dataset_valid
# )

In [11]:
# Batch the validation examples for the generation cell. This rebinds
# `dataloader`, replacing the MIDI DataLoader created in the tokenizer cell.
dataloader = DataLoader(dataset_valid, batch_size=16, collate_fn=data_collator)

In [12]:
from transformers import AdamW

In [13]:
# Print the TorchScript source of the loaded model to see which inputs its forward() expects.
print(model.code)

def forward(self,
    input_ids: Tensor,
    argument_2: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]]:
  lm_head = self.lm_head
  transformer = self.transformer
  _0, _1, _2, _3, _4, _5, = argument_2
  _6, past_value, = _0
  _7, past_value0, = _1
  _8, past_value1, = _2
  _9, past_value2, = _3
  _10, past_value3, = _4
  _11, past_value4, = _5
  _12 = (transformer).forward(input_ids, _6, past_value, _7, past_value0, _8, past_value1, _9, past_value2, _10, past_value3, _11, past_value4, )
  _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, = _12
  _26 = (lm_head).forward(_13, )
  _27 = ((_14, _15), (_16, _17), (_18, _19), (_20, _21), (_22, _23), (_24, _25))
  return (_26, _27)



In [22]:
# Take one validation batch as the prompt and sample a continuation from the
# TorchScript model, whose forward() takes (input_ids, past key/value pairs).
batch = next(iter(dataloader))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The prompt must live on the same device as the model and the cache tensors.
input_ids = batch['input_ids'].to(device)

batch_size = input_ids.size(0)
num_heads = 8            # Set based on model's actual configuration
hidden_dim = 512         # Total hidden size; the per-head size is hidden_dim // num_heads
num_layers = 6           # One (key, value) pair per layer, matching the six pairs in forward()
sequence_length = input_ids.size(1)

# Initial value for `argument_2`: one zero-filled (past_key, past_value) pair per layer.
# NOTE(review): this is a zero-filled cache of length `sequence_length`, not an
# empty one — confirm this is the shape the traced model expects.
cache_shape = (batch_size, num_heads, sequence_length, hidden_dim // num_heads)
past_key_values = tuple(
    (
        torch.zeros(cache_shape, dtype=torch.float32, device=device),  # past_key
        torch.zeros(cache_shape, dtype=torch.float32, device=device),  # past_value
    )
    for _ in range(num_layers)
)

# One list of sampled token ids per batch element.
generated_sequence = [[] for _ in range(batch_size)]

# Define generation parameters
max_length = 50  # Adjust based on desired output length
end_token_id = 1  # Replace with the correct end token ID (currently unused: no early stop)
temperature = 0.5 # just anything to push forward

# The loop follows the sampling steps quoted from MMM_API in the original
# notebook: forward pass, keep the last position's logits, temperature softmax,
# multinomial sampling, then feed the sampled tokens plus the returned cache back in.
with torch.no_grad():
    for _ in range(max_length):
        logits, past_key_values = model(input_ids, past_key_values)
        # NOTE(review): prompts are right-padded by the dataset, so the last
        # position of the first step may be padding — confirm this is intended.
        logits = logits[:, -1, :]
        probs = (logits / temperature).softmax(dim=1)
        next_tokens = probs.multinomial(1)
        input_ids = next_tokens

        # Append each batch element's sampled id to its own sequence.
        for sequence, token_id in zip(generated_sequence, next_tokens[:, 0].tolist()):
            sequence.append(token_id)


# Print the generated token IDs
print("Generated sequence:", generated_sequence)


Generated sequence: [[42, 43, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 42, 42, 42, 42, 42, 106, 42, 69, 42, 45, 42, 86, 42, 72, 42, 45, 42, 72, 42, 45, 42, 98, 43, 42, 45, 58, 43, 42, 45, 68], [42, 42, 42, 42, 42, 43, 43, 42, 42, 42, 43, 43, 43, 43, 42, 43, 42, 121, 42, 42, 45, 43, 42, 106, 43, 42, 72, 42, 45, 42, 72, 42, 92, 42, 45, 42, 45, 72, 42, 84, 45, 45, 71, 42, 45, 64, 45, 45, 74, 42], [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 42, 45, 42, 84, 42, 43, 43, 42, 84, 42, 43, 42, 45, 43, 42, 45, 42, 45, 42, 45, 43, 42, 84, 42, 45, 121, 43, 42, 72, 42, 73, 42, 45, 64, 45, 66, 45, 45, 45], [42, 42, 42, 42, 42, 42, 43, 42, 42, 43, 42, 42, 42, 72, 42, 72, 42, 45, 42, 72, 42, 42, 42, 63, 42, 45, 43, 42, 45, 43, 42, 72, 43, 42, 106, 42, 72, 42, 45, 45, 45, 43, 42, 45, 64, 45, 45, 45, 45, 45], [42, 42, 42, 42, 45, 42, 42, 42, 43, 42, 42, 72, 42, 45, 42, 72, 42, 84, 42, 45, 42, 45, 42, 45, 43, 42, 72, 42, 45, 42, 45, 43, 42, 71, 42, 45, 45, 42, 84, 42, 118, 

In [34]:
# Decode every generated id sequence back into its token string.
tracks = [tokenizer.decode(token_ids) for token_ids in generated_sequence]

In [31]:
!pip3 install note_seq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting note_seq
  Downloading note_seq-0.0.5-py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting absl-py
  Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Installing collected packages: pydub, absl-py, note_seq
Successfully installed absl-py-2.1.0 note_seq-0.0.5 pydub-0.25.1


In [32]:
# Length in seconds of one bar of four quarter-note beats at 120 BPM (4.0 * 60 / 120 = 2.0).
BAR_LENGTH_120BPM = 4.0 * 60 / 120
# Length in seconds of one sixteenth note (a quarter of a beat) at 120 BPM (0.25 * 60 / 120 = 0.125).
NOTE_LENGTH_16TH_120BPM = 0.25 * 60 / 120

import note_seq

def empty_note_sequence(qpm=120.0, total_time=0.0):
    """Create a NoteSequence that contains no notes.

    Args:
        qpm: Tempo stored in the single tempo entry that is added.
        total_time: Value assigned to the sequence's ``total_time`` field.

    Returns:
        A new ``NoteSequence`` with one tempo entry, ``ticks_per_quarter`` set
        to ``note_seq.constants.STANDARD_PPQ`` and the given ``total_time``.
    """
    sequence = note_seq.protobuf.music_pb2.NoteSequence()
    sequence.ticks_per_quarter = note_seq.constants.STANDARD_PPQ
    sequence.total_time = total_time
    tempo = sequence.tempos.add()
    tempo.qpm = qpm
    return sequence

def token_sequence_to_note_sequence(token_sequence, use_program=True, use_drums=True):
    """Convert a token sequence into a NoteSequence.

    Recognised tokens: ``PIECE_START``, ``PIECE_END``, ``TRACK_START``,
    ``TRACK_END``, ``INST=<program|DRUMS>``, ``BAR_START``, ``BAR_END``,
    ``NOTE_ON=<pitch>``, ``NOTE_OFF=<pitch>``, ``TIME_DELTA=<sixteenths>``,
    ``DENSITY=<n>`` and ``[PAD]``. Processing stops at ``PIECE_END``.

    Timing state starts at bar 0 / time 0.0 with no open notes and instrument
    0, so sequences that emit notes before any ``TRACK_START``, ``BAR_START``
    or ``INST=`` token (as sampled model output can) are still converted
    instead of failing on unbound state.

    Args:
        token_sequence: Either a single whitespace-separated string of tokens
            or an iterable of token strings.
        use_program: When true, a non-drum ``INST=`` token sets the program
            used for the following notes.
        use_drums: When true, ``INST=DRUMS`` switches the following notes to
            a drum track.

    Returns:
        The populated ``NoteSequence`` (tempo and PPQ from ``empty_note_sequence``).

    Raises:
        AssertionError: If an unrecognised token is encountered; the token is
            the exception message.
    """
    if isinstance(token_sequence, str):
        token_sequence = token_sequence.split()

    note_sequence = empty_note_sequence()
    current_program = 1
    current_is_drum = False
    # State that the original code only bound when specific tokens appeared.
    current_instrument = 0
    current_bar_index = 0
    current_time = 0.0
    current_notes = {}
    for token in token_sequence:

        if token == "PIECE_END":
            print("The end.")
            break
        elif token == "TRACK_START":
            # Each track starts again from the first bar.
            current_bar_index = 0
        elif token.startswith("INST"):
            current_instrument = token.split("=")[-1]
            if current_instrument != "DRUMS" and use_program:
                current_instrument = int(current_instrument)
                current_program = int(current_instrument)
                current_is_drum = False
            # NOTE(review): with use_drums=False an "INST=DRUMS" token leaves
            # current_instrument as the string "DRUMS", and the int() call in
            # the NOTE_ON branch then raises ValueError.
            if current_instrument == "DRUMS" and use_drums:
                current_instrument = 0
                current_program = 0
                current_is_drum = True
        elif token == "BAR_START":
            # Jump to the bar's start time and forget notes opened in earlier bars.
            current_time = current_bar_index * BAR_LENGTH_120BPM
            current_notes = {}
        elif token == "BAR_END":
            current_bar_index += 1
        elif token.startswith("NOTE_ON"):
            pitch = int(token.split("=")[-1])
            note = note_sequence.notes.add()
            note.start_time = current_time
            # Default duration of four sixteenths; a later NOTE_OFF overrides it.
            note.end_time = current_time + 4 * NOTE_LENGTH_16TH_120BPM
            note.pitch = pitch
            note.instrument = int(current_instrument)
            note.program = current_program
            note.velocity = 80
            note.is_drum = current_is_drum
            current_notes[pitch] = note
        elif token.startswith("NOTE_OFF"):
            pitch = int(token.split("=")[-1])
            # A NOTE_OFF without a matching open note in this bar is ignored.
            if pitch in current_notes:
                current_notes[pitch].end_time = current_time
        elif token.startswith("TIME_DELTA"):
            current_time += float(token.split("=")[-1]) * NOTE_LENGTH_16TH_120BPM
        elif token in ("PIECE_START", "TRACK_END", "[PAD]") or token.startswith("DENSITY="):
            # Structural / padding tokens that carry no note information.
            pass
        else:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError(token)

    return note_sequence

In [35]:
# Display the decoded token strings, one per generated sequence.
tracks

['INST=0 INST=1 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=1 INST=0 INST=0 INST=0 INST=0 INST=0 NOTE_ON=41 INST=0 NOTE_ON=74 INST=0 INST=3 INST=0 NOTE_OFF=45 INST=0 NOTE_OFF=49 INST=0 INST=3 INST=0 NOTE_OFF=49 INST=0 INST=3 INST=0 TIME_DELTA=12.0 INST=1 INST=0 INST=3 NOTE_OFF=52 INST=1 INST=0 INST=3 NOTE_OFF=74',
 'INST=0 INST=0 INST=0 INST=0 INST=0 INST=1 INST=1 INST=0 INST=0 INST=0 INST=1 INST=1 INST=1 INST=1 INST=0 INST=1 INST=0 NOTE_OFF=36 INST=0 INST=0 INST=3 INST=1 INST=0 NOTE_ON=41 INST=1 INST=0 NOTE_OFF=49 INST=0 INST=3 INST=0 NOTE_OFF=49 INST=0 NOTE_OFF=79 INST=0 INST=3 INST=0 INST=3 NOTE_OFF=49 INST=0 NOTE_OFF=77 INST=3 INST=3 DENSITY=2 INST=0 INST=3 NOTE_OFF=50 INST=3 INST=3 NOTE_OFF=48 INST=0',
 'INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=0 INST=1 INST=0 INST=3 INST=0 NOTE_OFF=77 INST=0 INST=1 INST=1 INST=0 NOTE_OFF=77 INST=0 INST=1 INST=0 INST=3 INST=1 INST=0 INST=3 IN

In [36]:
# NOTE(review): `tracks` is a list of decoded strings, and the converter only
# splits its argument when it is a single str, so each whole string is handled
# as one token here. Presumably a single track (e.g. tracks[0]) was intended —
# TODO confirm.
song = token_sequence_to_note_sequence(tracks)

In [38]:
# Play back the converted note sequence.
note_seq.play_sequence(song)

  synthesized /= np.abs(synthesized).max()
  return scaled.astype("<h").tobytes(), nchan
