In [1]:
import os
import numpy as np
import pretty_midi
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [2]:
# -----------------------------
# Data Preprocessing Functions
# -----------------------------

def midi_to_piano_roll(midi_file, fps=100):
    """
    Load a MIDI file and return it as a binary on/off piano roll.
    Args:
        midi_file (str): Path to the MIDI file.
        fps (int): Sampling rate of the roll, in frames per second.
    Returns:
        numpy.ndarray: float32 array laid out as (time_steps, pitches);
        a cell is 1.0 where the sampled roll is positive and 0.0 elsewhere.
    """
    # pretty_midi hands the roll back pitch-major, so flip it to time-major.
    pitch_major = pretty_midi.PrettyMIDI(midi_file).get_piano_roll(fs=fps)
    time_major = pitch_major.T
    # Drop the velocity information: any positive value counts as "note on".
    return np.greater(time_major, 0).astype(np.float32)

def pad_or_crop(piano_roll, max_time_steps):
    """
    Force a piano roll to exactly ``max_time_steps`` rows.

    Rolls that are too long are truncated at the end; rolls that are too
    short (or already the right length) are extended with trailing zero rows.
    The second axis is never altered.
    """
    n_frames = piano_roll.shape[0]

    # Too long: keep only the leading frames.
    if n_frames > max_time_steps:
        return piano_roll[:max_time_steps, :]

    # Otherwise append however many zero frames are missing (possibly none).
    missing_frames = max_time_steps - n_frames
    pad_widths = ((0, missing_frames), (0, 0))
    return np.pad(piano_roll, pad_widths, mode='constant')


In [3]:
# -----------------------------
# Custom Dataset for MIDI Data
# -----------------------------

class MIDIDataset(Dataset):
    """
    Denoising dataset built from the MIDI files found directly in a folder.

    Each item is a ``(noisy, clean)`` pair of float32 tensors. ``clean`` is the
    binary piano roll of one file, padded or cropped to ``max_time_steps``
    frames; ``noisy`` is the same roll with Gaussian noise added and clipped
    back into [0, 1].

    Args:
        midi_dir (str): Folder to scan (not recursive).
        max_time_steps (int): Number of frames every returned roll has.
        fps (int): Frames per second used when sampling the piano roll.
    """

    # File-name suffixes recognised as MIDI; matched case-insensitively.
    MIDI_EXTENSIONS = ('.mid', '.midi')

    def __init__(self, midi_dir, max_time_steps=128, fps=100):
        # os.listdir returns entries in arbitrary order, so sort to make
        # index -> file stable between runs and machines.
        self.midi_files = sorted(
            os.path.join(midi_dir, f)
            for f in os.listdir(midi_dir)
            if f.lower().endswith(self.MIDI_EXTENSIONS)
        )
        self.max_time_steps = max_time_steps
        self.fps = fps

    def __len__(self):
        """Number of MIDI files discovered in the folder."""
        return len(self.midi_files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return its ``(noisy, clean)`` tensor pair."""
        piano_roll = midi_to_piano_roll(self.midi_files[idx], fps=self.fps)
        piano_roll = pad_or_crop(piano_roll, self.max_time_steps)

        # Add noise to the input for training; fresh noise is drawn on every
        # access, so the same index yields a different noisy input each time.
        noisy_piano_roll = piano_roll + 0.2 * np.random.randn(*piano_roll.shape)
        noisy_piano_roll = np.clip(noisy_piano_roll, 0, 1)  # Keep values in [0, 1]

        return torch.tensor(noisy_piano_roll, dtype=torch.float32), torch.tensor(piano_roll, dtype=torch.float32)


In [4]:
# -----------------------------
# U-Net Model Definition
# -----------------------------

class UNet(nn.Module):
    """
    Two-level U-Net operating on 2-D maps.

    The encoder halves the spatial size twice with max pooling, the decoder
    doubles it twice with transposed convolutions, and each decoder stage is
    concatenated with the encoder feature map of the same level. The output
    has the same spatial size as the input and ``output_channels`` channels;
    no activation is applied after the final 1x1 convolution.

    Inputs whose height/width are not multiples of 4 are supported: pooling
    rounds odd sizes down, so the upsampled map is zero-padded back to the
    size of its skip connection before concatenation. Each spatial dimension
    must be at least 4 so that two poolings leave a non-empty map.

    Args:
        input_channels (int): Channels of the input tensor.
        output_channels (int): Channels of the produced tensor.
    """

    def __init__(self, input_channels=1, output_channels=1):
        super().__init__()

        # Encoder
        self.enc1 = self._conv_block(input_channels, 64)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.enc2 = self._conv_block(64, 128)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        # Bottleneck
        self.bottleneck = self._conv_block(128, 256)

        # Decoder (conv blocks take twice the channels: upsampled + skip)
        self.up1 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.dec1 = self._conv_block(256, 128)
        self.up2 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.dec2 = self._conv_block(128, 64)

        # Final output layer
        self.final = nn.Conv2d(64, output_channels, kernel_size=1)

    def _conv_block(self, in_channels, out_channels):
        """Two size-preserving 3x3 convolutions, each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True)
        )

    @staticmethod
    def _match_size(upsampled, skip):
        """Zero-pad ``upsampled`` on the bottom/right to the spatial size of ``skip``."""
        pad_h = skip.size(-2) - upsampled.size(-2)
        pad_w = skip.size(-1) - upsampled.size(-1)
        if pad_h == 0 and pad_w == 0:
            # Sizes already agree (input divisible by 4): nothing to do.
            return upsampled
        # F.pad lists the last dimension first: (left, right, top, bottom).
        return nn.functional.pad(upsampled, (0, pad_w, 0, pad_h))

    def forward(self, x):
        """Run the network on ``x`` and return a map of the same spatial size."""
        # Encoder
        enc1 = self.enc1(x)
        enc2 = self.enc2(self.pool1(enc1))

        # Bottleneck
        bottleneck = self.bottleneck(self.pool2(enc2))

        # Decoder
        up1 = self._match_size(self.up1(bottleneck), enc2)
        dec1 = self.dec1(torch.cat([up1, enc2], dim=1))
        up2 = self._match_size(self.up2(dec1), enc1)
        dec2 = self.dec2(torch.cat([up2, enc1], dim=1))

        # Final output
        output = self.final(dec2)
        return output


In [5]:

# -----------------------------
# Training Loop
# -----------------------------

def train_unet(model, dataloader, num_epochs=10, lr=0.001):
    """
    Train ``model`` as a denoiser with Adam and binary cross-entropy on logits.

    The model is moved (in place) to the GPU when CUDA is available, otherwise
    it stays on the CPU. After every epoch the mean batch loss is printed.

    Args:
        model (nn.Module): Network mapping a (batch, 1, H, W) tensor to raw
            logits of the same shape.
        dataloader: Iterable yielding ``(noisy, clean)`` tensor pairs without
            a channel axis; the channel axis is inserted here. It does not
            need to implement ``__len__``.
        num_epochs (int): Number of passes over ``dataloader``.
        lr (float): Adam learning rate.

    Raises:
        ValueError: If ``dataloader`` yields no batches during an epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        # Count batches ourselves instead of calling len(dataloader): this
        # also works for iterables without __len__ and exposes empty loaders.
        num_batches = 0

        for noisy_piano_roll, clean_piano_roll in dataloader:
            noisy_piano_roll = noisy_piano_roll.unsqueeze(1).to(device)  # Add channel dim
            clean_piano_roll = clean_piano_roll.unsqueeze(1).to(device)

            optimizer.zero_grad()
            output = model(noisy_piano_roll)
            loss = criterion(output, clean_piano_roll)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / num_batches:.4f}")


In [6]:
# -----------------------------
# Main Script
# -----------------------------

if __name__ == "__main__":
    # Configuration
    midi_dir = "../archive"
    batch_size = 16
    max_time_steps = 128
    num_epochs = 20
    seed = 42

    # Seed both RNGs so a fresh "Restart & Run All" repeats the same run:
    # NumPy generates the input noise, PyTorch the initial weights and the
    # shuffling order.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Dataset and DataLoader
    dataset = MIDIDataset(midi_dir, max_time_steps=max_time_steps)
    if len(dataset) == 0:
        # Fail here with a readable message rather than inside the sampler.
        raise ValueError(f"No MIDI files found in {midi_dir!r}")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Model
    model = UNet(input_channels=1, output_channels=1)

    # Train
    train_unet(model, dataloader, num_epochs=num_epochs)

    # Save Model (weights only; rebuild UNet and call load_state_dict to restore)
    torch.save(model.state_dict(), "unet_midi_generator.pth")



Epoch 1/20, Loss: 0.1640
Epoch 2/20, Loss: 0.0302
Epoch 3/20, Loss: 0.0095
Epoch 4/20, Loss: 0.0015
Epoch 5/20, Loss: 0.0012
Epoch 6/20, Loss: 0.0011
Epoch 7/20, Loss: 0.0010
Epoch 8/20, Loss: 0.0009
Epoch 9/20, Loss: 0.0008
Epoch 10/20, Loss: 0.0007
Epoch 11/20, Loss: 0.0006
Epoch 12/20, Loss: 0.0006
Epoch 13/20, Loss: 0.0005
Epoch 14/20, Loss: 0.0004
Epoch 15/20, Loss: 0.0004
Epoch 16/20, Loss: 0.0003
Epoch 17/20, Loss: 0.0003
Epoch 18/20, Loss: 0.0003
Epoch 19/20, Loss: 0.0002
Epoch 20/20, Loss: 0.0003


In [7]:
# -----------------------------
# Ringtone Generation
# -----------------------------

def generate_ringtone_unet(model, max_time_steps, device):
    """
    Generate a new binary piano roll by running the model on Gaussian noise.

    The network is trained with ``BCEWithLogitsLoss`` and has no output
    activation, so its raw outputs are logits. They are passed through a
    sigmoid before the 0.5 threshold is applied; thresholding the logits
    themselves at 0.5 would silently require a probability of about 0.62.

    Args:
        model (nn.Module): Trained network; it must already live on ``device``.
        max_time_steps (int): Number of frames to generate.
        device (torch.device): Device on which the random input is created.
    Returns:
        numpy.ndarray: float32 array of zeros and ones. For a single-channel
        model the shape is (max_time_steps, 128).
    """
    model.eval()
    random_input = torch.randn(1, 1, max_time_steps, 128).to(device)  # Random input tensor

    with torch.no_grad():
        logits = model(random_input)

    probabilities = torch.sigmoid(logits)
    # Drop only the batch axis and a singleton channel axis. A bare squeeze()
    # would also remove the time axis when max_time_steps == 1.
    output_piano_roll = probabilities.squeeze(0).squeeze(0).cpu().numpy()
    return (output_piano_roll > 0.5).astype(np.float32)  # Binary thresholding

def piano_roll_to_midi(piano_roll, output_path, fps=100):
    """
    Convert a piano roll array back to a MIDI file.

    Every contiguous run of active frames in a pitch column becomes its own
    note, so rests between notes and repeated notes are preserved rather than
    being merged into one long note per pitch.

    Args:
        piano_roll (numpy.ndarray): Piano roll array (time_steps, pitches);
            values above 0.5 count as "note on".
        output_path (str): Path to save the MIDI file.
        fps (int): Frames per second of the roll; frame ``i`` starts at
            ``i / fps`` seconds.
    """
    midi_data = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  # Default to Acoustic Grand Piano

    active = np.asarray(piano_roll) > 0.5  # Ensure binary
    for pitch in range(active.shape[1]):
        # Surround the column with "off" frames so every run has both a rising
        # and a falling edge, including runs touching the first/last frame.
        padded = np.concatenate(([False], active[:, pitch], [False])).astype(np.int8)
        edges = np.diff(padded)
        onsets = np.flatnonzero(edges == 1)    # index of the first active frame of each run
        offsets = np.flatnonzero(edges == -1)  # index one past the last active frame of each run

        for first_frame, end_frame in zip(onsets, offsets):
            note = pretty_midi.Note(
                velocity=100,
                pitch=pitch,
                start=float(first_frame) / fps,
                end=float(end_frame) / fps,
            )
            instrument.notes.append(note)

    midi_data.instruments.append(instrument)
    midi_data.write(output_path)
    print(f"Saved generated ringtone to {output_path}")

In [8]:
# Generate a Ringtone
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE: relies on `model` and `max_time_steps` created by the training cell.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
ringtone_piano_roll = generate_ringtone_unet(
    model=model,
    max_time_steps=max_time_steps,
    device=device,
)

# Save as MIDI
piano_roll_to_midi(
    piano_roll=ringtone_piano_roll,
    output_path="generated_ringtone_unet.mid",
)

Saved generated ringtone to generated_ringtone_unet.mid
