In [21]:
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator

```bibtex
@inproceedings{pop909-ismir2020,
    author = {Ziyu Wang* and Ke Chen* and Junyan Jiang and Yiyi Zhang and Maoran Xu and Shuqi Dai and Guxian Bin and Gus Xia},
    title = {POP909: A Pop-song Dataset for Music Arrangement Generation},
    booktitle = {Proceedings of 21st International Conference on Music Information Retrieval, {ISMIR}},
    year = {2020}
}

In [22]:
FILE_PATH = glob.glob("POP909/*/*.mid", recursive=True)[:100]
TEST_FILE_PATH = glob.glob("POP909/*/*.mid", recursive=True)[100:200]

tokenizer = REMI()
train_dataset = DatasetMIDI(
    files_paths=FILE_PATH,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
    pre_tokenize = True
)
test_dataset = DatasetMIDI(
    files_paths=TEST_FILE_PATH,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
    pre_tokenize = True
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collator)

Pre-tokenizing: 100%|██████████| 100/100 [00:02<00:00, 42.71it/s]
Pre-tokenizing: 100%|██████████| 100/100 [00:01<00:00, 58.03it/s]


# Task 1

In [23]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim, 
            num_heads=1,
            batch_first=True
        )
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

        self.dropout = nn.Dropout(0.1)
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim * 2)
        self.fc2 = nn.Linear(hidden_dim*2, vocab_size)

        self._init_weights()
    
    def _init_weights(self):
        # Initialize attention weights with smaller values
        for name, param in self.attention.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param, gain=0.1)
        
        # Initialize FC layers
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.zeros_(self.fc2.bias)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.ln1(out)
        
        attn_out, _ = self.attention(out, out, out)
        out = 0.7 * out + 0.3 * attn_out
        out = self.ln2(out)

        out = F.relu(self.fc1(out))
        out = self.fc2(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

In [24]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.0001, device='mps'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 4.4840 | Val Loss: 4.0342
Epoch 2/20 | Train Loss: 3.6950 | Val Loss: 3.4718
Epoch 3/20 | Train Loss: 3.0479 | Val Loss: 2.8478
Epoch 4/20 | Train Loss: 2.5783 | Val Loss: 2.5294
Epoch 5/20 | Train Loss: 2.3482 | Val Loss: 2.3935
Epoch 6/20 | Train Loss: 2.2234 | Val Loss: 2.3053
Epoch 7/20 | Train Loss: 2.1310 | Val Loss: 2.2354
Epoch 8/20 | Train Loss: 2.0712 | Val Loss: 2.2015
Epoch 9/20 | Train Loss: 2.0223 | Val Loss: 2.1731
Epoch 10/20 | Train Loss: 1.9846 | Val Loss: 2.1371
Epoch 11/20 | Train Loss: 1.9519 | Val Loss: 2.1182
Epoch 12/20 | Train Loss: 1.9193 | Val Loss: 2.0949
Epoch 13/20 | Train Loss: 1.8840 | Val Loss: 2.0878
Epoch 14/20 | Train Loss: 1.8624 | Val Loss: 2.0535
Epoch 15/20 | Train Loss: 1.8306 | Val Loss: 2.0536
Epoch 16/20 | Train Loss: 1.8102 | Val Loss: 2.0353
Epoch 17/20 | Train Loss: 1.7832 | Val Loss: 2.0293
Epoch 18/20 | Train Loss: 1.7656 | Val Loss: 2.0109
Epoch 19/20 | Train Loss: 1.7394 | Val Loss: 1.9998
Epoch 20/20 | Train L

In [25]:
def sample(model, start_token, max_length=100, temperature=1.0, device='mps'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 115, 161, 4, 200, 35, 109, 138, 204, 30, 111, 138, 42, 103, 133, 196, 40, 107, 147, 153, 33, 109, 104, 143, 204, 32, 32, 107, 131, 215, 44, 109, 133, 155, 40, 107, 137, 44, 104, 136, 35, 103, 133, 44, 102, 133, 198, 44, 100, 133, 206, 104, 133, 210, 42, 103, 131, 214, 32, 106, 131, 218, 44, 107, 106, 136, 135, 49, 106, 132, 194, 37, 108, 132, 198, 40, 105, 126, 50, 126, 53, 126, 56, 126, 216, 54, 126, 56, 126, 41, 121, 126, 125, 54, 121, 127, 126, 53, 122, 127, 4, 189, 56, 121, 125, 191, 53, 122, 133, 193, 46, 122, 129, 195, 121, 129, 199, 49, 121, 129, 203, 48, 121, 133, 207, 46, 122, 129, 211, 44, 121, 129, 215, 51, 119, 164, 129, 219, 47, 119, 128, 4, 189, 49, 120, 128, 191, 44, 120, 129, 49, 121, 128, 198, 44, 122, 128, 128, 49, 120, 128, 60, 121, 128, 198, 51, 122, 136, 202, 51, 117, 127, 204, 53, 123, 127, 206, 51, 122, 127, 210, 58, 117, 127, 214, 53, 118, 127, 161, 123, 127, 66, 124, 127, 124, 124, 135, 124, 123, 136, 216, 124, 132, 124, 134, 124, 

In [26]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth

output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"rnn.mid")
fs.midi_to_audio("rnn.mid", "rnn.wav")

  output_score = tokenizer.tokens_to_midi([generated_sequence])


FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'rnn.wav'..


In [27]:
display(Audio("rnn.wav"))

In [28]:
token_list = []
for tokens in generated_sequence:
    token_list.append(tokenizer.token_id_type(tokens))
print(token_list)

['BOS', 'Velocity', 'Duration', 'Bar', 'Position', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Duration', 'Pitch', 'Velocity', 'Velocity', 'Duration', 'Position', 'Pitch', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Position', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Velocity', 'Duration', 'Duration', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Position', 'Pitch', 'Velocity', 'Duration', 'Pitch', 'Duration', 'Pitch', 'Duration', 'Pitch', 'Duration', 'Position', 'Pitch', 'Duration', 'Pitch', 'Duration', 'Pitch', 'Vel

# Task 2 