In [1]:
import pandas as pd
import numpy as np
import ast
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [2]:
# Load the MIDI metadata table and keep only rows whose `duration` lies in
# [120, 300], both ends inclusive (rows with a missing duration are dropped).
# NOTE(review): the bounds look like seconds (2-5 minutes) -- confirm the unit.
#
# `.copy()` makes the filtered frame independent of the frame read from disk,
# so assigning columns on `df` in later cells does not raise pandas'
# SettingWithCopyWarning.
df = pd.read_csv('midi_df_v2.csv')
df = df[df["duration"].between(120, 300)].copy()

In [3]:
# `token_sequence` is parsed from its string form into a Python list. Only
# values that are still strings are parsed, so re-running this cell is a no-op
# instead of a ValueError from ast.literal_eval on an already-parsed list.
# `assign` builds a new frame rather than writing into a filtered slice.
df = df.assign(
    token_sequence=df["token_sequence"].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
)
# Drop very short pieces. Sequences shorter than
# context_length + target_length (128) pass this filter but contribute no
# training pairs below.
df = df[df["token_sequence"].apply(len) > 100]

context_length = 64
target_length = 64

inputs, targets = [], []

# Slide over each sequence in steps of `target_length`, pairing the
# `context_length` tokens before position i with the `target_length` tokens
# starting at i.
# NOTE(review): target position j is the token `target_length` steps after
# input position j, not the immediately following token. Confirm this matches
# how the model's per-position outputs are consumed at generation time.
for seq in df["token_sequence"]:
    # The `+ 1` keeps the final complete window: i == len(seq) - target_length
    # is still a valid start because seq[i:i + target_length] is full length.
    for i in range(context_length, len(seq) - target_length + 1, target_length):
        inputs.append(seq[i - context_length:i])
        targets.append(seq[i:i + target_length])

In [4]:
class MusicDataset(Dataset):
    """Map-style dataset of (input, target) token-id sequence pairs.

    Every sequence is converted to a ``torch.long`` tensor once, at
    construction time, so indexing only looks up pre-built tensors.

    Args:
        inputs: iterable of integer sequences used as model inputs.
        targets: iterable of integer sequences used as training targets.
    """

    def __init__(self, inputs, targets):
        def as_long_tensor(tokens):
            return torch.tensor(tokens, dtype=torch.long)

        self.inputs = list(map(as_long_tensor, inputs))
        self.targets = list(map(as_long_tensor, targets))

    def __len__(self):
        """Number of stored pairs, taken from the inputs list."""
        n_pairs = len(self.inputs)
        return n_pairs

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors stored at position ``idx``."""
        source = self.inputs[idx]
        target = self.targets[idx]
        return source, target

# Wrap the windowed sequences and serve them as mini-batches of 32 pairs,
# reshuffled on every pass over the data.
dataset = MusicDataset(inputs=inputs, targets=targets)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [5]:
class MusicTransformer(nn.Module):
    """Transformer encoder over token ids with a learned positional table.

    Token embeddings are summed with the learned positional embeddings, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks (batch-first), and
    projected to one logit per vocabulary entry at every position. No
    attention mask is applied, so each position attends to the whole input.

    Args:
        vocab_size: number of distinct token ids (size of the embedding table
            and of the output logits).
        d_model: embedding / hidden width.
        nhead: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        max_len: longest sequence the positional table can cover.
    """

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=4, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # One learned vector per position, broadcast over the batch dimension.
        self.pos_encoder = nn.Parameter(torch.randn(1, max_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            src: integer tensor of token ids, indexed as ``(batch, seq_len)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the positional table length.
        """
        seq_len = src.size(1)
        max_len = self.pos_encoder.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # shape-broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        x = self.embedding(src) + self.pos_encoder[:, :seq_len]
        x = self.transformer(x)
        return self.decoder(x)


In [6]:
# The vocabulary spans ids 0..max_id, so the table needs max_id + 1 entries
# for the largest token id present in the data to be a valid index.
vocab_size = 1 + max(max(seq) for seq in df["token_sequence"])
model = MusicTransformer(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0
    for x, y in dataloader:
        x = x.to(device)
        y = y.to(device)
        output = model(x)
        # Merge the batch and position axes so each position is scored as an
        # independent classification over the vocabulary.
        loss = criterion(output.flatten(0, 1), y.flatten())
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch.
        optimizer.zero_grad()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}: Loss {total_loss/len(dataloader):.4f}")


Epoch 1: Loss 3892.6068
Epoch 2: Loss 3752.0176
Epoch 3: Loss 3677.2236
Epoch 4: Loss 3642.9436
Epoch 5: Loss 3622.3220
Epoch 6: Loss 3607.7288
Epoch 7: Loss 3595.3531
Epoch 8: Loss 3585.3006
Epoch 9: Loss 3575.5687
Epoch 10: Loss 3566.8915


In [None]:
def generate(model, seed_seq, max_length=128, context_length=64):
    """Greedily extend ``seed_seq`` by ``max_length`` tokens.

    At every step the last ``context_length`` tokens are fed to ``model`` and
    the argmax of the logits at the final position is appended as the next
    token. The model is switched to eval mode and left in that mode.

    Args:
        model: module mapping a ``(1, seq_len)`` long tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        seed_seq: non-empty sequence of integer token ids; it is not modified.
        max_length: number of new tokens to append.
        context_length: maximum number of trailing tokens fed to the model at
            each step.

    Returns:
        A new list holding the seed tokens followed by the generated tokens.

    Raises:
        ValueError: if ``seed_seq`` is empty or ``context_length`` is below 1.
    """
    if context_length < 1:
        raise ValueError("context_length must be at least 1")
    if len(seed_seq) == 0:
        raise ValueError("seed_seq must contain at least one token")

    # Run on whatever device the model's weights live on instead of relying on
    # a notebook-level `device` variable being defined and in sync.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    model.eval()
    generated = list(seed_seq)  # copy so the caller's seed is left untouched
    with torch.no_grad():
        for _ in range(max_length):
            window = torch.tensor(
                [generated[-context_length:]], dtype=torch.long, device=model_device
            )
            logits = model(window)
            next_token = logits[0, -1].argmax().item()
            generated.append(next_token)
    return generated
