In [None]:
import torch
# Location of the serialized model, relative to the notebook's working directory.
MODEL_PATH = "../models/model.pt"
# Load the TorchScript module; later cells call it as `model(input_ids, argument_2)`.
model = torch.jit.load(MODEL_PATH)

In [None]:
from tokenizers import Tokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

In [None]:
import configparser

# Read the Hugging Face access token from a local config file so that the
# secret never has to be written into the notebook itself.
config = configparser.RawConfigParser()

# RawConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed. Without this check a missing file only
# surfaces later as a confusing NoSectionError for 'TOKENS'.
if not config.read('../local_config.cfg'):
    raise FileNotFoundError(
        "Could not read '../local_config.cfg'; create it with a [TOKENS] "
        "section containing an 'hf_token' entry."
    )

# All options of the [TOKENS] section as a plain dict (option names are lower-cased
# by configparser).
tokens = dict(config.items('TOKENS'))
hf_token = tokens["hf_token"]

In [None]:
import miditok
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path

# Creating a multitrack tokenizer configuration, read the doc to explore other parameters
# config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
# tokenizer = REMI(config)

# NOTE: this rebinds `config`, which previously held the RawConfigParser used to
# read `hf_token`; the token itself was already extracted into `hf_token`.
config = miditok.TokenizerConfig()
# Select 'MIDILike' as the base tokenizer for MMM via the config's additional parameters.
config.additional_params = { "base_tokenizer" : 'MIDILike' }

tokenizer = miditok.MMM(config)

# Train the tokenizer with Byte Pair Encoding (BPE)
# Recursively collect every `.mid` file below the Jazz Midi folder.
# NOTE(review): absolute, machine-specific path — this cell will not run on another
# machine without editing it.
midi_paths = list(Path("/home/julia/WIMU/Orchestrify/data/external/Jazz Midi").glob("**/*.mid"))
tokenizer.train(vocab_size=512, files_paths=midi_paths)
# NOTE(review): "models/tokenizer.json" is resolved against the current working
# directory, whereas the model is loaded from "../models/model.pt" — confirm that
# two different `models` folders are intended.
tokenizer.save_params(Path("models", "tokenizer.json"))
# And pushing it to the Hugging Face hub (you can download it back with .from_pretrained)
# The repository is created/updated as private, authenticated with `hf_token`.
tokenizer.push_to_hub("juleczka/orchestrify_tokenizer", private=True, token=hf_token)

# Split MIDIs into smaller chunks for training
# Chunks are written below `dataset_chunks_dir`; the same max_seq_len (1024) is
# used again when the dataset is built below.
dataset_chunks_dir = Path("/home/julia/WIMU/Orchestrify/data/processed")
split_files_for_training(
    files_paths=midi_paths,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=1024,
)

# Create a Dataset, a DataLoader and a collator to train a model
# The dataset reads the chunked files produced above, passing the tokenizer's
# BOS/EOS token ids.
dataset = DatasetMIDI(
    files_paths=list(dataset_chunks_dir.glob("**/*.mid")),
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
# The collator is given the pad token id and asked to copy the inputs as labels.
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
# NOTE: `tokenizer` and `dataloader` are both rebound by later cells
# (PreTrainedTokenizerFast and the validation DataLoader respectively).
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator)

  tokenizer.train(vocab_size=512, files_paths=midi_paths)
  tokenizer.save_params(Path("models", "tokenizer.json"))
No files have been modified since last commit. Skipping to prevent empty commit.
Splitting music files (/home/julia/WIMU/Orchestrify/data/processed): 100%|██████████| 934/934 [00:13<00:00, 68.25it/s]


In [None]:
!pip3 install music21



In [None]:
import os
import datasetcreatorconfig
import datasetcreator

In [None]:
# Build the project's dataset creator with the JSB "track" configuration and run
# it on the Jazz Midi folder without overwriting existing results.
# NOTE(review): presumably this is what produces the `jsb_mmmtrack/tokenizer.json`
# and `token_sequences_*.txt` files read further down — confirm against
# the `datasetcreator` module.
dataset_creator_config = datasetcreatorconfig.JSBDatasetCreatorTrackConfig()
dataset_creator = datasetcreator.DatasetCreator(dataset_creator_config)
dataset_creator.create(datasets_path='../data/external/Jazz Midi', overwrite=False)


In [None]:

from transformers import PreTrainedTokenizerFast

In [None]:
from torch.utils.data.dataset import Dataset
import random
import numpy as np

class TokenSequenceDataset(Dataset):
    """Fixed-length language-modelling dataset built from text files.

    Every non-empty line of every input file is treated as one sequence. A line
    is encoded with ``tokenizer.encode`` and right-padded with the id of
    ``"[PAD]"`` up to ``block_size``. Lines that contain the id of ``"[UNK]"``
    or that encode to more than ``block_size`` ids are skipped (not truncated).

    Attributes:
        examples: list of ``{"input_ids": LongTensor, "labels": LongTensor}``
            dicts, each tensor of shape ``(block_size,)``.
        statistics: dict with the counters gathered while building the dataset
            (``tokens_count``, ``encoded_lengths``, ``unknown_tokens``,
            ``unknown_tokens_set``, ``unknown_token_lines_count``,
            ``too_long_lines_count``).
    """

    def __init__(self, tokenizer, dataset_paths, block_size, simulate=False):
        """Read, encode and pad all sequences.

        Args:
            tokenizer: object whose ``encode(str)`` returns a list of int ids.
            dataset_paths: iterable of paths to text files, one sequence per line.
            block_size: length every example is padded to; longer lines are skipped.
            simulate: when true, keep only 10 randomly chosen lines.

        Raises:
            AssertionError: if one of ``dataset_paths`` is not an existing file.
        """
        pad_token_id = tokenizer.encode("[PAD]")[0]
        unk_token_id = tokenizer.encode("[UNK]")[0]

        # Read all lines from all files, closing each file as soon as it is read.
        lines = []
        for dataset_path in dataset_paths:
            assert os.path.isfile(dataset_path), f"Input file path {dataset_path} not found"
            with open(dataset_path, "r") as dataset_file:
                lines += dataset_file.readlines()

        # In simulation just use a few samples.
        if simulate:
            random.shuffle(lines)
            lines = lines[:10]

        # Turn lines into training examples. Also gather some statistics.
        self.examples = []
        unknown_tokens = []
        tokens_count = 0
        too_long_lines_count = 0
        encoded_lengths = []
        for line in lines:

            # Skip empty lines.
            line = line.strip()
            if line == "":
                continue

            # Encode the line.
            encoded_line = tokenizer.encode(line)
            encoded_lengths.append(len(encoded_line))
            tokens_count += len(encoded_line)

            # Remember which token was unknown, then skip the line.
            if unk_token_id in encoded_line:
                unknown_tokens.append(
                    self._find_unknown_token(line, encoded_line, unk_token_id)
                )
                continue

            # Skip the sequence if it is too long.
            if len(encoded_line) > block_size:
                too_long_lines_count += 1
                continue

            # Right-pad to exactly block_size ids.
            padded = np.full((block_size,), pad_token_id, dtype=np.int64)
            padded[:len(encoded_line)] = encoded_line

            # NOTE(review): labels are a plain copy of the inputs, so padding
            # positions carry the pad id rather than an ignore index; any loss
            # computed on them will include the padding.
            input_ids = torch.tensor(padded, dtype=torch.long)
            self.examples.append({
                "input_ids": input_ids,
                "labels": input_ids.clone(),
            })

        # Keep the gathered numbers available instead of discarding them.
        self.statistics = {
            "tokens_count": tokens_count,
            "encoded_lengths": encoded_lengths,
            "unknown_tokens": unknown_tokens,
            "unknown_tokens_set": set(unknown_tokens),
            "unknown_token_lines_count": len(unknown_tokens),
            "too_long_lines_count": too_long_lines_count,
        }

    @staticmethod
    def _find_unknown_token(line, encoded_line, unk_token_id):
        """Return the whitespace-separated word that was encoded as unknown.

        The position of the first unknown id is used as an index into
        ``line.split()``. That only lines up when the tokenizer emits one id per
        word; if the index is out of range the whole line is returned instead of
        raising ``IndexError``.
        """
        position = encoded_line.index(unk_token_id)
        words = line.split()
        if position < len(words):
            return words[position]
        return line

    def __len__(self):
        """Number of examples that survived filtering."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``{"input_ids", "labels"}`` dict of example ``i``."""
        return self.examples[i]

In [None]:

# Directory that receives checkpoints and logs.
output_path = './models_2'

# Every example is padded to this many token ids; the dataset and the collator
# must agree on it, so it is defined once.
BLOCK_SIZE = 768

# NOTE: this rebinds `tokenizer`, which an earlier cell bound to the miditok MMM
# tokenizer; from here on it is the Hugging Face fast tokenizer.
# tokenizer = Tokenizer.from_file('../data/external/Jazz Midi/jsb_mmmtrack/tokenizer.json')
tokenizer = PreTrainedTokenizerFast(tokenizer_file='../data/external/Jazz Midi/jsb_mmmtrack/tokenizer.json')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    num_train_epochs=10,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is its documented replacement.
    per_device_train_batch_size=16,
    save_steps=1_000,
    save_total_limit=2,
    prediction_loss_only=False,
    logging_strategy="steps",
    logging_dir=os.path.join(output_path, "logs"),
    load_best_model_at_end=True,
    save_strategy="steps"
)

# Pads every batch to BLOCK_SIZE. The datasets below already pad each example to
# the same length themselves.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=BLOCK_SIZE
)

dataset_train = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=['../data/external/Jazz Midi/jsb_mmmtrack/token_sequences_train.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)

dataset_valid = TokenSequenceDataset(
    tokenizer=tokenizer,
    dataset_paths=['../data/external/Jazz Midi/jsb_mmmtrack/token_sequences_valid.txt'],
    block_size=BLOCK_SIZE,
    simulate=False
)

# Disabled experiment: running the TorchScript model through the HF Trainer.
# def patched_get(self, obj, cls):
#     return self.__getattribute__("forward")
# torch.jit._script._CachedForward.__get__ = patched_get


# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=dataset_train,
#     eval_dataset=dataset_valid
# )

In [None]:
# Batches of 16 examples from the validation split, collated with `data_collator`.
# NOTE: this rebinds `dataloader`, which an earlier cell bound to the miditok
# DatasetMIDI loader; the generation and training cells below iterate this one.
dataloader = DataLoader(dataset_valid, batch_size=16, collate_fn=data_collator)

In [None]:
from transformers import AdamW

In [None]:
# Show the TorchScript source of the model's forward method. Its signature
# (see the output below) takes `input_ids` plus `argument_2`, a tuple of six
# (Tensor, Tensor) pairs, and returns the logits together with a tuple of the same shape.
print(model.code)

def forward(self,
    input_ids: Tensor,
    argument_2: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]]:
  lm_head = self.lm_head
  transformer = self.transformer
  _0, _1, _2, _3, _4, _5, = argument_2
  _6, past_value, = _0
  _7, past_value0, = _1
  _8, past_value1, = _2
  _9, past_value2, = _3
  _10, past_value3, = _4
  _11, past_value4, = _5
  _12 = (transformer).forward(input_ids, _6, past_value, _7, past_value0, _8, past_value1, _9, past_value2, _10, past_value3, _11, past_value4, )
  _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, = _12
  _26 = (lm_head).forward(_13, )
  _27 = ((_14, _15), (_16, _17), (_18, _19), (_20, _21), (_22, _23), (_24, _25))
  return (_26, _27)



In [None]:
# Autoregressive generation with the TorchScript model, primed with one batch
# from `dataloader`. The model's forward needs an explicit tuple of past
# (key, value) pairs, so a zero-filled one is built first.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

batch = next(iter(dataloader))
# The inputs must live on the same device as the model.
input_ids = batch['input_ids'].to(device)

batch_size = input_ids.size(0)
num_heads = 8            # TODO confirm against the model's actual configuration
hidden_dim = 512         # Full hidden size; the per-head size is hidden_dim // num_heads
sequence_length = input_ids.size(1)

# One (past_key, past_value) pair per layer; six pairs match the model's signature.
# NOTE(review): the past is zero-filled and as long as the prompt itself. The
# model will attend to these dummy positions and they shift the position
# offset — confirm that this is acceptable for the checkpoint (a much shorter
# dummy past may be what is wanted).
past_shape = (batch_size, num_heads, sequence_length, hidden_dim // num_heads)
argument_2 = tuple(
    (
        torch.zeros(past_shape, dtype=torch.float32, device=device),  # past_key
        torch.zeros(past_shape, dtype=torch.float32, device=device),  # past_value
    )
    for _ in range(6)
)

# Generation loop
generated_sequence = []

# Define generation parameters
max_length = 50    # Adjust based on desired output length
end_token_id = 1   # TODO replace with the correct end token ID
do_sample = False  # False: greedy argmax; True: sample from the softmax distribution

with torch.no_grad():
    for _ in range(max_length):
        # Pass the current input and the cached past states to the model.
        output_logits, argument_2 = model(input_ids, argument_2)

        # Logits of the last position, for every sequence in the batch.
        last_logits = output_logits[:, -1, :]
        if do_sample:
            probs = torch.softmax(last_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1)
        else:
            next_tokens = torch.argmax(last_logits, dim=-1, keepdim=True)

        # Only the first sequence of the batch is recorded.
        next_token_id = next_tokens[0, 0]
        generated_sequence.append(next_token_id.item())

        # Stop if the end token is generated.
        if next_token_id.item() == end_token_id:
            break

        # Feed the chosen tokens back in. The past already covers everything seen
        # so far, so the next input is just one new token per sequence, which keeps
        # the batch dimension consistent with the cached past.
        input_ids = next_tokens

# Print the generated token IDs
print("Generated sequence:", generated_sequence)


0


: 

In [None]:
# Set up for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# reproduces the default of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

num_tracks = 6       # Number of (key, value) pairs in the Tuple structure of argument_2
hidden_size = 768    # TODO replace with your model's hidden state size
num_heads = 8        # TODO confirm against the model's actual configuration
sequence_length = 1  # Length of the dummy past handed to the model
# NOTE(review): the generation cell uses hidden_dim = 512 for the same model;
# only one of the two values can match the checkpoint — confirm.


def make_empty_past(batch_size):
    """Build a zero-filled past for `batch_size` sequences, on `device`.

    Returns a tuple of `num_tracks` (key, value) pairs, each tensor shaped
    (batch_size, num_heads, sequence_length, hidden_size // num_heads).
    """
    shape = (batch_size, num_heads, sequence_length, hidden_size // num_heads)
    return tuple(
        (
            torch.zeros(shape, dtype=torch.float32, device=device),
            torch.zeros(shape, dtype=torch.float32, device=device),
        )
        for _ in range(num_tracks)
    )


# Training loop
# NOTE(review): `dataloader` currently wraps the validation split; build a loader
# over `dataset_train` if the intent is to train on the training data.
num_epochs = 3
for epoch in range(num_epochs):
    epoch_loss = 0.0
    batch_count = 0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()

        # Every batch starts from a fresh past: the past returned by the model
        # belongs to this batch only and must not leak into the next one.
        argument_2 = make_empty_past(inputs.size(0))
        logits, _present = model(inputs, argument_2)

        # The model returns raw logits, so the next-token shift is done here:
        # position t predicts the token at position t + 1.
        # NOTE(review): padding positions are part of the labels and therefore
        # contribute to the loss.
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        loss = torch.nn.functional.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        )
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    mean_loss = epoch_loss / max(batch_count, 1)
    print(f"Epoch {epoch + 1}, Loss: {mean_loss}")


dict_keys(['input_ids', 'labels', 'attention_mask'])




RuntimeError: forward() Expected a value of type 'Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]' for argument 'argument_2' but instead found type 'Tensor'.
Position: 2
Value: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
Declaration: forward(__torch__.transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel self, Tensor input_ids, ((Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor)) argument_2) -> ((Tensor, ((Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor), (Tensor, Tensor))))
Cast error details: Object tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]) had a different number of elements than type Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]