In [2]:
import os
import pretty_midi
import numpy as np
from sklearn.preprocessing import LabelBinarizer

import torch
from torch import nn

In [31]:
def preprocess_midi_notes(midi):
  midi_data = pretty_midi.PrettyMIDI(midi)
  notes = []
  start_times = []
  end_times = []
  for instrument in midi_data.instruments:
      for note in instrument.notes:
          notes.append(note.pitch)
          start_times.append(note.start)
          end_times.append(note.end)
  return notes, start_times, end_times

def preprocess_midi_into_piano_roll(midi, segment_length=2000):
    midi_data = pretty_midi.PrettyMIDI(midi)
    piano_roll = midi_data.get_piano_roll(fs=100)

    # Normalize
    piano_roll = (piano_roll - np.min(piano_roll)) / (np.max(piano_roll) - np.min(piano_roll))

    # Cut the piano roll into segments
    segments = []
    for start in range(0, piano_roll.shape[1] - segment_length + 1, segment_length):
        segment = piano_roll[:, start:start+segment_length]
        # Add extra dimension to represent a single channel for CNN training
        segment = segment[np.newaxis, :, :]
        segments.append(segment)

    # If the last segment is shorter than segment_length, pad it
    if piano_roll.shape[1] % segment_length != 0:
        segment = piano_roll[:, -(piano_roll.shape[1] % segment_length):]
        segment = np.pad(segment, ((0, 0), (0, segment_length - segment.shape[1])))
        segment = segment[np.newaxis, :, :]
        segments.append(segment)

    return segments


In [4]:
raw_data_folder = 'data/AAI511_final_project/'

In [28]:
example_piano_roll = preprocess_midi_into_piano_roll(raw_data_folder + 'train/chopin/chopin051.mid')

In [29]:
example_piano_roll.shape

(128, 85000)

In [32]:
def preprocess_data_in_directory(base_dir):
    X = []
    y = []

    # to show progression
    total_files = sum([len(files) for r, d, files in os.walk(base_dir)])
    processed_files = 0

    composers = os.listdir(base_dir)
    for composer in composers:
        composer_dir = os.path.join(base_dir, composer)
        if os.path.isdir(composer_dir):
            for file in os.listdir(composer_dir):
                if file.endswith('.mid'):
                    file_path = os.path.join(composer_dir, file)
                    try:
                        piano_roll_segments = preprocess_midi_into_piano_roll(file_path)
                        for segment in piano_roll_segments:
                            X.append(segment)
                            y.append(composer)
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")
                    processed_files += 1
                    if processed_files % (total_files // 10) == 0:
                      print(f"Processed {processed_files / total_files * 100}% of files")
    return np.array(X), np.array(y)

In [33]:
X_train, y_train = preprocess_data_in_directory(raw_data_folder + 'train')
X_test, y_test = preprocess_data_in_directory(raw_data_folder + 'test')
X_dev, y_dev = preprocess_data_in_directory(raw_data_folder + 'dev')



Processed 9.762532981530343% of files
Processed 19.525065963060687% of files
Processed 29.287598944591032% of files
Processed 39.050131926121374% of files
Processed 48.812664907651715% of files
Processed 58.575197889182064% of files
Processed 68.33773087071239% of files
Processed 78.10026385224275% of files
Processed 87.86279683377309% of files
Processed 8.88888888888889% of files
Processed 17.77777777777778% of files
Processed 26.666666666666668% of files
Processed 35.55555555555556% of files
Processed 44.44444444444444% of files
Processed 53.333333333333336% of files
Processed 62.22222222222222% of files
Processed 71.11111111111111% of files
Processed 8.88888888888889% of files
Processed 17.77777777777778% of files
Processed 26.666666666666668% of files
Processed 35.55555555555556% of files
Processed 44.44444444444444% of files
Processed 53.333333333333336% of files
Processed 62.22222222222222% of files
Processed 71.11111111111111% of files


In [34]:
lb = LabelBinarizer()

y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
y_dev = lb.transform(y_dev)

In [35]:
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()
X_dev = torch.from_numpy(X_dev).float()
y_dev = torch.from_numpy(y_dev).float()

In [36]:
torch.save(X_train, raw_data_folder + 'X_train.pt')
torch.save(y_train, raw_data_folder + 'y_train.pt')
torch.save(X_test, raw_data_folder + 'X_test.pt')
torch.save(y_test, raw_data_folder + 'y_test.pt')
torch.save(X_dev, raw_data_folder + 'X_dev.pt')
torch.save(y_dev, raw_data_folder + 'y_dev.pt')

In [5]:
X_train = torch.load(raw_data_folder + 'X_train.pt')
y_train = torch.load(raw_data_folder + 'y_train.pt')
X_test = torch.load(raw_data_folder + 'X_test.pt')
y_test = torch.load(raw_data_folder + 'y_test.pt')
X_dev = torch.load(raw_data_folder + 'X_dev.pt')
y_dev = torch.load(raw_data_folder + 'y_dev.pt')

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train.to(device)
y_train.to(device)
X_test.to(device)
y_test.to(device)
X_dev.to(device)
y_dev.to(device)

tensor([[0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

In [7]:
class CNN_LSTM(nn.Module):
    def __init__(self, cnn_output_size, hidden_size, num_layers, num_classes):
        super(CNN_LSTM, self).__init__()

        # The CNN part
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # The LSTM part
        self.lstm = nn.LSTM(cnn_output_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Apply CNN
        out = self.cnn(x)

        # Reshape the output from the CNN to be 2D
        out = out.view(out.size(0), -1, cnn_output_size)

        # Apply LSTM
        out, _ = self.lstm(out)

        # Only take the output from the final timestep
        out = out[:, -1, :]

        # Apply fully connected layer
        out = self.fc(out)

        return out


In [8]:
from torch.utils.data import Dataset, DataLoader

class MIDIDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [9]:
# Create the Dataset
train_dataset = MIDIDataset(X_train, y_train)
test_dataset = MIDIDataset(X_test, y_test)

# Define the batch size
batch_size = 32

# Create the DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [12]:
# Define training function
def train(model, train_loader, test_loader, num_epochs=10, lr=0.001, weight_decay=0.01):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    train_losses = []
    test_losses = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * inputs.size(0)

        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        with torch.no_grad():
            model.eval()
            test_loss = 0.0

            for inputs, labels in test_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item() * inputs.size(0)

            test_loss /= len(test_loader.dataset)
            test_losses.append(test_loss)

        print(f'Epoch {epoch + 1}, train loss: {train_loss:.4f}, test loss: {test_loss:.4f}')

    return train_losses, test_losses

In [10]:
X_train.shape

torch.Size([6131, 1, 128, 2000])

In [11]:
cnn_output_size = 16
hidden_size = 68
num_layers = 1
num_classes = y_train.shape[1]

In [13]:
model = CNN_LSTM(cnn_output_size, hidden_size, num_layers, num_classes)

In [None]:
train_losses, test_losses = train(model, train_loader, test_loader)