In [1]:
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DatasetJSON, DataCollator, split_midis_for_training
from miditok.data_augmentation import augment_midi_dataset
from miditok.utils import get_midi_programs
from torch.utils.data import DataLoader
from pathlib import Path
from symusic import Score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the REMI tokenizer from an explicit configuration object.
config = TokenizerConfig(
    num_velocities=16,
    use_chords=True,
    use_programs=True,
)
tokenizer = REMI(config)

## Playing with tokenizer

In [10]:
# Print a few tokenizer properties, one "name value" pair per line.
for attr_name in ('io_format', 'is_multi_voc', 'one_token_stream'):
    print(attr_name, getattr(tokenizer, attr_name))

io_format ('T',)
is_multi_voc False
one_token_stream True


In [24]:
len(tokenizer.vocab)

409

In [None]:
# Preview the first entries only; displaying the whole vocabulary floods the notebook.
list(tokenizer.vocab.items())[:20]

In [28]:
len(tokenizer.vocab_bpe)

30000

In [3]:
# Collect every .mid file under the data directory.
# Sorted so that index- and slice-based selections further down (midi_paths[0], midi_paths[:100], ...)
# pick the same files on every run; the order produced by glob depends on the filesystem.
midi_paths = sorted(Path("/home/lklimkiewicz/priv/midi/data").glob("**/*.mid"))

In [4]:
# Load the first discovered MIDI file as a symusic Score for the experiments below.
midi = Score(midi_paths[0])

In [7]:
midi

Score(ttype=Tick, tpq=480, begin=0, end=135216, tracks=8, notes=2176, time_sig=1, key_sig=1, markers=0, lyrics=0)

In [10]:
midi.note_num()

2176

In [8]:
dir(midi)

['__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'adjust_time',
 'clip',
 'copy',
 'dump_abc',
 'dump_midi',
 'dumps_abc',
 'dumps_midi',
 'empty',
 'end',
 'from_abc',
 'from_file',
 'from_midi',
 'key_signatures',
 'lyrics',
 'markers',
 'note_num',
 'pianoroll',
 'resample',
 'shift_pitch',
 'shift_time',
 'shift_velocity',
 'sort',
 'start',
 'tempos',
 'ticks_per_quarter',
 'time_signatures',
 'to',
 'tpq',
 'tracks',
 'ttype']

In [87]:
get_midi_programs(midi)

[(48, False),
 (58, False),
 (6, False),
 (79, False),
 (79, False),
 (15, False),
 (48, False),
 (42, False)]

In [30]:
# Tokenize the loaded score by calling the tokenizer directly;
# the result's .ids, .tokens and .events are inspected in the following cells.
tokens = tokenizer(midi)

In [None]:
# Show only the beginning of the token list; the full sequence has thousands of entries.
tokens.tokens[:50]

In [32]:
len(tokens.ids), len(tokens.tokens), len(tokens.events)

(3016, 9870, 9870)

In [27]:
# Train BPE with a target vocabulary of 30,000 entries on the first 100 MIDI files.
bpe_training_files = midi_paths[:100]
tokenizer.learn_bpe(vocab_size=30_000, files_paths=bpe_training_files)

  tokenizer.learn_bpe(vocab_size=30000, files_paths=midi_paths[:100])







## Playing

In [None]:
midi.dump_midi('out.mid')

In [None]:
!timidity out.mid

## Loading and saving

In [34]:
# Save the tokenizer under the relative directory 'tokenizer'
# (the next cell loads it back from './tokenizer').
tokenizer.save_pretrained('tokenizer')

In [35]:
# Load a tokenizer back from the directory written by the save cell.
# The result is not assigned, so it is only displayed as the cell output.
REMI.from_pretrained('./tokenizer')

config.json not found in /home/lklimkiewicz/priv/midi/src/tokenizer


30000 tokens with ('T',) io format(one token stream), with BPE

## Augmentation

In [None]:
# Augment one dataset folder with pitch, velocity and duration offsets,
# writing the result to a separate output directory.
augmentation_source = Path('/home/lklimkiewicz/priv/midi/data/vgmusic/3DO 3DO')
augmentation_target = Path('./augmented_dataset')
augment_midi_dataset(
    augmentation_source,
    pitch_offsets=[-12, 12],
    velocity_offsets=[-4, 5],
    duration_offsets=[-0.5, 1],
    out_path=augmentation_target,
)

## Splitting into chunks

In [68]:
!timidity "/home/lklimkiewicz/priv/midi/data/khinsider/Commodore/Gauntlet/main theme.mid"

Playing /home/lklimkiewicz/priv/midi/data/khinsider/Commodore/Gauntlet/main theme.mid
MIDI file: /home/lklimkiewicz/priv/midi/data/khinsider/Commodore/Gauntlet/main theme.mid
Format: 1  Tracks: 5  Divisions: 960
Sequence: Track 0
Track name: Gauntlet (C64) Main Theme. Original by: Ben Daglish
Track name: Sequenced by: Bull (Jonathan Konrad)
Track name: Write to me at: jonojk@hotmail.com (use 'vgmusic' as subject)
Track name: I hope you enjoy this classic song!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Playing time: ~25 seconds
Notes cut: 0
Notes lost totally: 0


In [81]:
# Count how many of the first 10,000 files load and contain at least one time signature.
# Files that fail are still skipped (best effort), but they are recorded in `unreadable`
# so the failures can be inspected instead of disappearing silently.
ok = 0
unreadable = []  # (path, error description) for every file that raised
for path in midi_paths[:10000]:
    try:
        has_time_signature = len(Score(path).time_signatures) > 0
    except Exception as err:
        unreadable.append((path, repr(err)))
        continue
    if has_time_signature:
        ok += 1
ok

8573

In [None]:
# Split the first six MIDI files into chunks for training (max_seq_len=100),
# saving them under ./chunks_for_training.
chunks_dir = Path('./chunks_for_training')
split_midis_for_training(
    files_paths=midi_paths[:6],
    tokenizer=tokenizer,
    save_dir=chunks_dir,
    max_seq_len=100,
)

## Offline tokenization

In [None]:
def midi_valid(midi) -> bool:
    """Return True when every time signature of ``midi`` has a numerator of 4.

    The time signatures are read from ``midi.time_signatures``, the attribute
    exposed by the symusic ``Score`` objects used in this notebook. Objects
    that only provide ``time_signature_changes`` (the attribute the previous
    version of this function read) are still accepted.

    Args:
        midi: object exposing ``time_signatures`` or ``time_signature_changes``,
            an iterable of items with a ``numerator`` attribute.

    Returns:
        False if any time signature has a numerator other than 4 (i.e. is not
        4/*, 4 beats per bar); True otherwise, including when there are none.

    Raises:
        AttributeError: if ``midi`` provides neither attribute.
    """
    time_signatures = getattr(midi, "time_signatures", None)
    if time_signatures is None:
        # Fall back to the alternative attribute name; raises if it is missing too.
        time_signatures = midi.time_signature_changes
    return all(ts.numerator == 4 for ts in time_signatures)

# Tokenize every file of the folder and write the results to ./tokenized_dataset.
# The filter is passed by keyword so it is bound to `validation_fn`
# and not to whichever parameter happens to be third positionally.
tokenizer.tokenize_dataset(
    Path("/home/lklimkiewicz/priv/midi/data/vgmusic/3DO 3DO"),
    './tokenized_dataset',
    validation_fn=midi_valid,
)

In [85]:
# Build a dataset from JSON files found under the data directory.
# NOTE(review): the offline tokenization cell writes to './tokenized_dataset';
# confirm that the JSON token files are really expected under this data path.
json_paths = list(Path("/home/lklimkiewicz/priv/midi/data").glob("**/*.json"))
dataset = DatasetJSON(json_paths, max_seq_len=1024)

## Online dataset generation

In [43]:
# Dataset built directly from the MIDI paths and the tokenizer.
dataset = DatasetMIDI(
    files_paths=midi_paths,
    tokenizer=tokenizer,
    max_seq_len=1024,
    # Use the matching special tokens: BOS marks the beginning, EOS the end.
    # (Previously PAD was passed as BOS and BOS as EOS.)
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
# The collator receives the PAD token id.
collator = DataCollator(tokenizer["PAD_None"])
data_loader = DataLoader(
    dataset=dataset,
    collate_fn=collator,
    batch_size=32
)

In [None]:
# Sanity check: print the first batch produced by the loader, then stop iterating.
for batch in data_loader:
    print(batch)
    break