In [None]:
from miditok import REMI, TokenizerConfig
from symusic import Score
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.pre_tokenizers import Split

from tqdm import tqdm
import os
from tokenizers import decoders, processors
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers
import os
from tokenizers.pre_tokenizers import CharDelimiterSplit
from tokenizers.processors import TemplateProcessing
from transformers import AutoTokenizer

# Load a previously saved Hugging Face tokenizer from a local directory.
# NOTE(review): `w_tokenizer` is not referenced again in this cell — confirm it is still needed.
w_tokenizer = AutoTokenizer.from_pretrained("/tmp2/b11902010/dmir_lab/mdlm/tokenizers/tokenizer_test")

# MidiTok configuration for the REMI tokenizer used to decode token strings into a score.
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "num_velocities": 32,
    "special_tokens": ["[PAD]", "[BOS]", "[EOS]", "[MASK]"],  # Adjusted for Hugging Face compatibility
    "use_chords": True,
    "chord_maps": {
        "+" : (0, 4, 8), "/o7" : (0, 3, 6, 10), "7" : (0, 4, 7, 10),
        "M" : (0, 4, 7), "M7" : (0, 4, 7, 11), "m" : (0, 3, 7),
        "m7" : (0, 3, 7, 10), "o" : (0, 3, 6), "o7" : (0, 3, 6, 9),
        "sus2" : (0, 2, 7), "sus4" : (0, 5, 7)
    },
    "chord_tokens_with_root_note": True,
    "use_rests": False,
    "use_tempos": False,
    "use_time_signatures": False,
    "use_programs": False,
    "num_tempos": 32,  # Number of tempo bins
    "tempo_range": (50, 180),  # (min, max)
}

config = TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = REMI(config)


folder = 'uncond_good'
# Output files are named "<idx % mod>_<idx // mod>.mid" (zero-padded to 3 and 2 digits).
mod = 13
folder_path = "/tmp2/b11902010/dmir_lab/mdlm/outputs/samples/testing/midi/"

# Create the output directory if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Tokens that are dropped from each line before looking the rest up in the REMI vocabulary
ignored_tokens = {"[BOS]", "[UNK]", "[PAD]", "[EOS]", "[MASK]"}

# The file must be opened for reading: an append-mode ("a") handle is not
# readable, so iterating over it raises io.UnsupportedOperation.
with open("/tmp2/b11902010/dmir_lab/mdlm/outputs/samples/testing/text", "r") as file:
    # Each line is one sample: a whitespace-separated sequence of token strings
    for idx, line in enumerate(file):
        data = line.strip().split()  # Split the line into tokens
        filtered_data = [x for x in data if x not in ignored_tokens]
        # A token string missing from the REMI vocabulary raises KeyError here
        token_ids = [tokenizer.vocab[x] for x in filtered_data]
        # The id sequence is wrapped in a list before being handed to decode()
        score = tokenizer.decode([token_ids])

        name = str(idx % mod).zfill(3)
        r = str(idx // mod).zfill(2)
        # os.path.join avoids the doubled slash from folder_path's trailing "/"
        score.dump_midi(os.path.join(folder_path, f'{name}_{r}.mid'))


In [2]:
import pickle

# Deserialize the dictionary pickle (opened as a binary stream) into `data`
with open('/tmp2/b11902010/dmir_lab/diffusion_compose_and_embellish/dictionary_new.pkl', mode='rb') as pickle_handle:
    data = pickle.load(pickle_handle)

# Show the complete unpickled object for inspection
print(data)


({'[PAD]': 0, '[NONE]': 1, 'Bar_None': 2, 'Beat_0': 3, 'Beat_1': 4, 'Beat_10': 5, 'Beat_11': 6, 'Beat_12': 7, 'Beat_13': 8, 'Beat_14': 9, 'Beat_15': 10, 'Beat_2': 11, 'Beat_3': 12, 'Beat_4': 13, 'Beat_5': 14, 'Beat_6': 15, 'Beat_7': 16, 'Beat_8': 17, 'Beat_9': 18, 'Chord_A#_+': 19, 'Chord_A#_/o7': 20, 'Chord_A#_7': 21, 'Chord_A#_M': 22, 'Chord_A#_M7': 23, 'Chord_A#_m': 24, 'Chord_A#_m7': 25, 'Chord_A#_o': 26, 'Chord_A#_o7': 27, 'Chord_A#_sus2': 28, 'Chord_A#_sus4': 29, 'Chord_A_+': 30, 'Chord_A_/o7': 31, 'Chord_A_7': 32, 'Chord_A_M': 33, 'Chord_A_M7': 34, 'Chord_A_m': 35, 'Chord_A_m7': 36, 'Chord_A_o': 37, 'Chord_A_o7': 38, 'Chord_A_sus2': 39, 'Chord_A_sus4': 40, 'Chord_B_+': 41, 'Chord_B_/o7': 42, 'Chord_B_7': 43, 'Chord_B_M': 44, 'Chord_B_M7': 45, 'Chord_B_m': 46, 'Chord_B_m7': 47, 'Chord_B_o': 48, 'Chord_B_o7': 49, 'Chord_B_sus2': 50, 'Chord_B_sus4': 51, 'Chord_C#_+': 52, 'Chord_C#_/o7': 53, 'Chord_C#_7': 54, 'Chord_C#_M': 55, 'Chord_C#_M7': 56, 'Chord_C#_m': 57, 'Chord_C#_m7': 58, 

In [23]:
from tokenizers import Tokenizer, models
from transformers import PreTrainedTokenizerFast
import pickle
import os
import transformers

# Load the vocabulary; the pickle unpacks into a (token_to_id, id_to_token) pair.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('/tmp2/b11902010/dmir_lab/diffusion_compose_and_embellish/dictionary_new.pkl', 'rb') as file:
    data = pickle.load(file)

token_to_id, id_to_token = data

# Special tokens, keyed by the role name expected by `add_special_tokens`
special_tokens = {
    'pad_token': "[PAD]",
    'mask_token': "[MASK]",
    'bos_token': "[BOS]",
    'eos_token': "[EOS]"
}

# Add special tokens to the vocab if they're not present.
# Iterate over the token strings (the dict values): iterating over the dict
# itself yields the role names ('pad_token', ...), which would be inserted
# into the vocabulary instead of the actual tokens.
for token in special_tokens.values():
    if token not in token_to_id:
        token_id = max(token_to_id.values(), default=-1) + 1
        token_to_id[token] = token_id
        id_to_token[token_id] = token

# Create the tokenizer with a WordLevel model.
# NOTE(review): '[UNK]' is declared as the unknown token but is never added to
# the vocabulary here — confirm dictionary_new.pkl already contains it.
# NOTE(review): no pre-tokenizer is configured, so presumably inputs are looked
# up as a single word — TODO confirm callers pass one token (or pre-split input).
tokenizer = Tokenizer(models.WordLevel(unk_token='[UNK]', vocab=token_to_id))

# Save directory
save_dir = '/tmp2/b11902010/dmir_lab/diffusion_compose_and_embellish/tokenizers/cne_tokenizer'
os.makedirs(save_dir, exist_ok=True)

# Save the tokenizer JSON
tokenizer.save(os.path.join(save_dir, 'tokenizer.json'))

# Load with PreTrainedTokenizerFast
tokenizer_fast = PreTrainedTokenizerFast.from_pretrained(save_dir)

# Register the special tokens in the tokenizer configuration
print(special_tokens)
tokenizer_fast.add_special_tokens(special_tokens)

# Save the tokenizer with special tokens info
tokenizer_fast.save_pretrained(save_dir)

# Load again to verify
tokenizer_loaded = transformers.AutoTokenizer.from_pretrained(save_dir)

# Test the tokenizer with a round trip on a single vocabulary entry
encoded = tokenizer_loaded("Chord_A#_M7")
decoded = tokenizer_loaded.decode(encoded['input_ids'])
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

# Check special tokens
print("Special tokens:", tokenizer_loaded.special_tokens_map)


{'pad_token': '[PAD]', 'mask_token': '[MASK]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}
Encoded: {'input_ids': [26], 'token_type_ids': [0], 'attention_mask': [1]}
Decoded: Chord_A#_M7
Special tokens: {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'pad_token': '[PAD]', 'mask_token': '[MASK]'}
