In [2]:
import os
import pretty_midi
import numpy as np
from sklearn.preprocessing import LabelEncoder

from scipy.io import wavfile
from python_speech_features import mfcc

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator

[2023-07-19 10:22:24,843] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-07-19 10:22:25.166156: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def preprocess_midi_notes(midi):
  midi_data = pretty_midi.PrettyMIDI(midi)
  notes = []
  start_times = []
  end_times = []
  for instrument in midi_data.instruments:
      for note in instrument.notes:
          notes.append(note.pitch)
          start_times.append(note.start)
          end_times.append(note.end)
  return notes, start_times, end_times

def preprocess_midi_into_piano_roll(midi, segment_length=2000):
    midi_data = pretty_midi.PrettyMIDI(midi)
    piano_roll = midi_data.get_piano_roll(fs=100)

    # Normalize
    piano_roll = (piano_roll - np.min(piano_roll)) / (np.max(piano_roll) - np.min(piano_roll))

    # Cut the piano roll into segments
    segments = []
    for start in range(0, piano_roll.shape[1] - segment_length + 1, segment_length):
        segment = piano_roll[:, start:start+segment_length]
        # Add extra dimension to represent a single channel for CNN training
        segment = segment[np.newaxis, :, :]
        segments.append(segment)

    # If the last segment is shorter than segment_length, pad it
    if piano_roll.shape[1] % segment_length != 0:
        segment = piano_roll[:, -(piano_roll.shape[1] % segment_length):]
        segment = np.pad(segment, ((0, 0), (0, segment_length - segment.shape[1])))
        segment = segment[np.newaxis, :, :]
        segments.append(segment)

    return segments


In [4]:
raw_data_folder = 'data/AAI511_final_project/'

In [5]:
example_piano_roll = pretty_midi.PrettyMIDI(raw_data_folder + 'train/chopin/chopin049.mid').get_piano_roll(fs=100)

In [24]:
def preprocess_midi_into_mfcc(midi_file, num_cepstral=13, segment_length=2000):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    audio_data = midi_data.synthesize()
    wavfile.write("temp.wav", 44100, audio_data.astype(np.float32))

    rate, audio = wavfile.read("temp.wav")
    mfcc_features = mfcc(audio, samplerate=rate, numcep=num_cepstral, winlen=0.025, winstep=0.01, nfft=1103)

    segments = []
    for start in range(0, mfcc_features.shape[0] - segment_length + 1, segment_length):
        segment = mfcc_features[start:start+segment_length, :]
        segments.append(segment)

    if mfcc_features.shape[0] % segment_length != 0:
        segment = mfcc_features[-(mfcc_features.shape[0] % segment_length):, :]
        segment = np.pad(segment, ((0, segment_length - segment.shape[0]), (0, 0)))
        segments.append(segment)

    os.remove("temp.wav")

    return segments

In [6]:
def preprocess_data_in_directory(base_dir, preprocess_type='piano'):
    X = []
    y = []

    # to show progression
    total_files = sum([len(files) for r, d, files in os.walk(base_dir)])
    processed_files = 0

    composers = os.listdir(base_dir)
    for composer in composers:
        composer_dir = os.path.join(base_dir, composer)
        if os.path.isdir(composer_dir):
            for file in os.listdir(composer_dir):
                if file.endswith('.mid'):
                    file_path = os.path.join(composer_dir, file)
                    try:
                        if preprocess_type == 'piano':
                          piano_roll_segments = preprocess_midi_into_piano_roll(file_path)
                          for segment in piano_roll_segments:
                            X.append(segment)
                            y.append(composer)
                        else:
                          mfcc_segments = preprocess_midi_into_mfcc(file_path)
                          for segment in mfcc_segments:
                            X.append(segment)
                            y.append(composer)
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")
                    processed_files += 1
                    if processed_files % (total_files // 10) == 0:
                      print(f"Processed {processed_files / total_files * 100}% of files")
    return np.array(X), np.array(y)

In [25]:
X_train, y_train = preprocess_data_in_directory(raw_data_folder + 'train', preprocess_type='mfcc')
X_test, y_test = preprocess_data_in_directory(raw_data_folder + 'test', preprocess_type='mfcc')
X_dev, y_dev = preprocess_data_in_directory(raw_data_folder + 'dev', preprocess_type='mfcc')

Processed 9.762532981530343% of files
Processed 19.525065963060687% of files
Processed 29.287598944591032% of files
Processed 39.050131926121374% of files
Processed 48.812664907651715% of files
Processed 58.575197889182064% of files
Processed 68.33773087071239% of files
Processed 78.10026385224275% of files
Processed 87.86279683377309% of files
Processed 8.88888888888889% of files
Processed 17.77777777777778% of files
Processed 26.666666666666668% of files
Processed 35.55555555555556% of files
Processed 44.44444444444444% of files
Processed 53.333333333333336% of files
Processed 62.22222222222222% of files
Processed 71.11111111111111% of files
Processed 8.88888888888889% of files
Processed 17.77777777777778% of files
Processed 26.666666666666668% of files
Processed 35.55555555555556% of files
Processed 44.44444444444444% of files
Processed 53.333333333333336% of files
Processed 62.22222222222222% of files
Processed 71.11111111111111% of files


In [26]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_dev = le.transform(y_dev)

In [27]:
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()
X_dev = torch.from_numpy(X_dev).float()
y_dev = torch.from_numpy(y_dev).float()

In [28]:
torch.save(X_train, raw_data_folder + 'X_train_mfcc.pt')
torch.save(y_train, raw_data_folder + 'y_train_mfcc.pt')
torch.save(X_test, raw_data_folder + 'X_test_mfcc.pt')
torch.save(y_test, raw_data_folder + 'y_test_mfcc.pt')
torch.save(X_dev, raw_data_folder + 'X_dev_mfcc.pt')
torch.save(y_dev, raw_data_folder + 'y_dev_mfcc.pt')

In [4]:
# X_train = torch.load(raw_data_folder + 'X_train.pt')
# y_train = torch.load(raw_data_folder + 'y_train.pt')
# X_test = torch.load(raw_data_folder + 'X_test.pt')
# y_test = torch.load(raw_data_folder + 'y_test.pt')
# X_dev = torch.load(raw_data_folder + 'X_dev.pt')
# y_dev = torch.load(raw_data_folder + 'y_dev.pt')

X_train = torch.load(raw_data_folder + 'X_train_mfcc.pt')
y_train = torch.load(raw_data_folder + 'y_train_mfcc.pt')
X_test = torch.load(raw_data_folder + 'X_test_mfcc.pt')
y_test = torch.load(raw_data_folder + 'y_test_mfcc.pt')
X_dev = torch.load(raw_data_folder + 'X_dev_mfcc.pt')
y_dev = torch.load(raw_data_folder + 'y_dev_mfcc.pt')

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)
X_dev = X_dev.to(device)
y_dev = y_dev.to(device)

In [38]:
class CNN_LSTM(nn.Module):
    def __init__(self, cnn_output_size, hidden_size, num_layers, num_classes):
        super(CNN_LSTM, self).__init__()

        # The CNN part
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # The LSTM part
        self.lstm = nn.LSTM(cnn_output_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Apply CNN
        out = self.cnn(x)

        # Reshape the output from the CNN to be 2D
        out = out.view(out.size(0), -1, cnn_output_size)

        # Apply LSTM
        out, _ = self.lstm(out)

        # Only take the output from the final timestep
        out = out[:, -1, :]

        # Apply fully connected layer
        out = self.fc(out)

        return out

class LSTM(nn.Module):
      def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()

        # The LSTM part
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, num_classes)

      def forward(self, x):
          # Apply LSTM
          out, _ = self.lstm(x)

          # Only take the output from the final timestep
          out = out[:, -1, :]

          # Apply fully connected layer
          out = self.fc(out)

          return out

In [35]:
class MIDIDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [42]:
# Create the Dataset
train_dataset = MIDIDataset(X_train, y_train)
test_dataset = MIDIDataset(X_test, y_test)

# Define the batch size
batch_size = 16

# Create the DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [43]:
# Define training function
def train(model, optimizer,
          train_loader, test_loader, num_epochs=400, print_interval=10):
    criterion = nn.CrossEntropyLoss()
    train_losses = []
    test_losses = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for batch_idx, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            labels = labels.long()
            loss = criterion(outputs, labels)
            accelerator.backward(loss)

            optimizer.step()
            train_loss += loss.item() * inputs.size(0)


        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)

        with torch.no_grad():
            model.eval()
            test_loss = 0.0

            for inputs, labels in test_loader:
                outputs = model(inputs)
                labels = labels.long()
                loss = criterion(outputs, labels)
                test_loss += loss.item() * inputs.size(0)

            test_loss /= len(test_loader.dataset)
            test_losses.append(test_loss)

        print(f'Epoch {epoch}, train loss: {train_loss:.4f}, test loss: {test_loss:.4f}')

    return train_losses, test_losses

In [16]:
X_train.shape

torch.Size([6129, 1, 128, 2000])

In [34]:
X_train.shape

torch.Size([6148, 2000, 13])

In [15]:
y_train

tensor([7., 7., 7.,  ..., 8., 8., 8.], device='cuda:0')

In [44]:
cnn_output_size = 256
hidden_size = 512
input_size = 13
num_layers = 1
lr=0.001
weight_decay=0.01
num_classes = len(torch.unique(y_train))

In [45]:
# model = CNN_LSTM(cnn_output_size, hidden_size, num_layers, num_classes)
model = LSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(device)

In [46]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [47]:
accelerator = Accelerator()
model, optimizer, train_loader, test_loader = accelerator.prepare(
    model, optimizer, train_loader, test_loader
)

In [31]:
torch.cuda.empty_cache()

In [None]:
train_losses, test_losses = train(model, optimizer, train_loader, test_loader)

Epoch 0, train loss: 1.9717, test loss: 1.8303
Epoch 1, train loss: 1.7740, test loss: 1.7204
Epoch 2, train loss: 1.7337, test loss: 1.7176
Epoch 3, train loss: 1.6893, test loss: 1.6871
Epoch 4, train loss: 1.6720, test loss: 1.6818
Epoch 5, train loss: 1.6592, test loss: 1.6788
Epoch 6, train loss: 1.6366, test loss: 1.6687
Epoch 7, train loss: 1.6332, test loss: 1.6427
Epoch 8, train loss: 1.6265, test loss: 1.6932
Epoch 9, train loss: 1.6076, test loss: 1.6948
Epoch 10, train loss: 1.6049, test loss: 1.6576
Epoch 11, train loss: 1.5866, test loss: 1.6242
Epoch 12, train loss: 1.5731, test loss: 1.5855
Epoch 13, train loss: 1.5609, test loss: 1.6023
Epoch 14, train loss: 1.5432, test loss: 1.6064
Epoch 15, train loss: 1.5320, test loss: 1.6218
Epoch 16, train loss: 1.5165, test loss: 1.5691
Epoch 17, train loss: 1.4932, test loss: 1.5567
Epoch 18, train loss: 1.4718, test loss: 1.5346
Epoch 19, train loss: 1.4804, test loss: 1.5005
Epoch 20, train loss: 1.5016, test loss: 1.5992
Ep