<a href="https://colab.research.google.com/github/josephfriedel/AAI-510-TEAM-03/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
import pretty_midi


from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking
from imblearn.under_sampling import RandomUnderSampler

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
import pretty_midi
import librosa


In [None]:
# Relative locations of the per-composer MIDI folders.
# The order is significant: the loading cell enumerates composer_dirs, so an
# entry's position in this list becomes its integer class label.
composer_dirs = [
    'music_data/midiclassics/Bach',
    'music_data/midiclassics/Beethoven',
    'music_data/midiclassics/Chopin',
    'music_data/midiclassics/Mozart',
]

# Expose each entry under its own name as well.
bach_dir, beethoven_dir, chopin_dir, mozart_dir = composer_dirs

In [None]:
#jfXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx

import os
import shutil
import random

# Absolute, machine-specific locations of the four composer folders.
# NOTE(review): this rebinds bach_dir & co., but composer_dirs (built in an
# earlier cell) still holds the relative paths -- confirm both refer to the
# same folders on the machine running the notebook.
bach_dir, beethoven_dir, chopin_dir, mozart_dir = (
    r'C:\Users\josep\git\git2\AAI-511-final-project\music_data\midiclassics\Bach',
    r'C:\Users\josep\git\git2\AAI-511-final-project\music_data\midiclassics\Beethoven',
    r'C:\Users\josep\git\git2\AAI-511-final-project\music_data\midiclassics\Chopin',
    r'C:\Users\josep\git\git2\AAI-511-final-project\music_data\midiclassics\Mozart',
)

# Folder -> file count. The counts are hard-coded here, not read from disk.
directories = dict(zip(
    (bach_dir, beethoven_dir, chopin_dir, mozart_dir),
    (925, 212, 136, 257),
))

# The largest recorded count is the size every folder is balanced up to.
max_size = max(directories.values())

# Function to balance a directory by increasing the number of files
def balance_directory(directory, target_size):
    """Duplicate random files in *directory* until it holds *target_size* files.

    Only regular files directly inside *directory* are counted and used as
    copy sources; sub-folders are not descended into. Sources are drawn from
    the files present when the function is called, so copies are never
    copied again within one call. Each copy is named
    ``<base>_copy_<n><ext>`` and ``n`` is advanced until the name is unused,
    so an existing file is never overwritten.

    The function does nothing when the folder already holds at least
    *target_size* files. If the target cannot be reached (no files to copy,
    or every remaining source fails with ``PermissionError``) it prints a
    message and returns instead of raising or looping forever.

    Args:
        directory: Path of the folder to grow.
        target_size: Number of files the folder should contain afterwards.

    Returns:
        None. The folder is modified on disk.
    """
    entries = os.listdir(directory)
    candidates = [f for f in entries if os.path.isfile(os.path.join(directory, f))]
    # Track every name in the folder (files and sub-folders) to avoid clashes.
    taken_names = set(entries)
    current_size = len(candidates)
    copy_index = current_size

    while current_size < target_size and candidates:
        file_to_duplicate = random.choice(candidates)
        base, extension = os.path.splitext(file_to_duplicate)

        # Skip over names left behind by an earlier run of this function.
        new_file = f"{base}_copy_{copy_index}{extension}"
        while new_file in taken_names:
            copy_index += 1
            new_file = f"{base}_copy_{copy_index}{extension}"

        try:
            shutil.copy(os.path.join(directory, file_to_duplicate), os.path.join(directory, new_file))
        except PermissionError as e:
            print(f"PermissionError: {e}. Skipping {file_to_duplicate}")
            # Retire the failing source so a persistent error cannot spin
            # the loop forever.
            candidates.remove(file_to_duplicate)
            continue

        taken_names.add(new_file)
        current_size += 1
        copy_index += 1

    if current_size < target_size:
        print(f"Could not balance {directory}: {current_size} of {target_size} files")

# Grow every listed folder up to the size of the largest one.
# NOTE(review): this copies files on disk before any train/test split is
# made, so duplicates of one piece may land on both sides of a later split --
# confirm that is acceptable.
for dir_name in directories:
    balance_directory(directory=dir_name, target_size=max_size)

In [None]:
def extract_features(midi_file):
    """Read one MIDI file and summarise its notes.

    Notes are collected instrument by instrument, in the order pretty_midi
    exposes them, so the three returned vectors are index-aligned: element
    ``i`` of each vector describes the same note.

    Args:
        midi_file: Path (or other input accepted by
            ``pretty_midi.PrettyMIDI``) of the file to read.

    Returns:
        dict with the keys
        ``avg_pitch`` / ``std_pitch``, ``avg_duration`` / ``std_duration``,
        ``avg_velocity`` / ``std_velocity`` (NumPy mean and standard
        deviation over all notes) and ``note_vector``, ``duration_vector``,
        ``velocity_vector`` (plain lists, one entry per note). A duration is
        ``note.end - note.start`` (seconds, per the pretty_midi docs).

    Raises:
        ValueError: If the file contains no notes. Without this check the
            statistics would silently come back as NaN.
        Exception: Whatever ``pretty_midi.PrettyMIDI`` raises for an
            unreadable file is propagated unchanged.
    """
    midi_data = pretty_midi.PrettyMIDI(midi_file)

    # Flatten once instead of walking every instrument three times.
    notes = [note for instrument in midi_data.instruments for note in instrument.notes]
    if not notes:
        raise ValueError(f"No notes found in {midi_file}")

    pitches = [note.pitch for note in notes]
    durations = [note.end - note.start for note in notes]
    velocities = [note.velocity for note in notes]

    # Per-file summary statistics plus the raw per-note sequences.
    features = {
        'avg_pitch': np.mean(pitches),
        'std_pitch': np.std(pitches),
        'avg_duration': np.mean(durations),
        'std_duration': np.std(durations),
        'avg_velocity': np.mean(velocities),
        'std_velocity': np.std(velocities),
        'note_vector': pitches,
        'duration_vector': durations,
        'velocity_vector': velocities,
    }

    return features


def load_data(directory, current_label, data, success_files, exception_files):
    """Extract features from every MIDI file under *directory*.

    The folder is walked recursively with ``os.walk``. Files whose name ends
    in ``.mid`` or ``.midi`` (compared case-insensitively, so ``.MID`` is
    included) are passed to ``extract_features``; everything else is
    skipped. Progress is printed for each processed file.

    Args:
        directory: Root folder to walk.
        current_label: Value stored under the ``'composer'`` key of every
            feature dict produced from this folder.
        data: List that receives one feature dict per successfully
            processed file (mutated in place).
        success_files: List that receives the path of each successfully
            processed file (mutated in place).
        exception_files: List that receives the path of each file whose
            processing raised (mutated in place).

    Returns:
        None. Results are reported through the three list arguments.
    """
    print(f'Loading {directory} for {current_label}...')
    midi_suffixes = ('.mid', '.midi')

    # root: folder currently visited; files: names of the plain files in it.
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Compare in lower case so upper-case extensions are not missed.
            if not file.lower().endswith(midi_suffixes):
                continue

            file_path = os.path.join(root, file)
            print(f'Processing {file}...')
            print(f'File path: {file_path}')
            try:
                features = extract_features(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not stop the load.
                print(f"Error processing {file_path}: {e}")
                exception_files.append(file_path)
            else:
                features['composer'] = current_label
                data.append(features)
                success_files.append(file_path)

def create_dataframe(data, max_sequence_length = 100):
    """Build the feature table and pad the per-note sequences to one length.

    Args:
        data: List of feature dicts, each holding a ``'composer'`` label and
            the ``'note_vector'``, ``'duration_vector'`` and
            ``'velocity_vector'`` sequences.
        max_sequence_length: Length every sequence is padded or cut to.

    Returns:
        pandas.DataFrame with one row per dict. The integer labels 0-3 in
        ``'composer'`` are replaced by the composer names (any other label is
        kept as it is), and each ``*_vector`` cell holds a NumPy array of
        length *max_sequence_length*.

    Notes:
        Shorter sequences are zero-padded at the end (``padding='post'``).
        Longer ones are cut with Keras' default ``truncating='pre'``, i.e.
        the last *max_sequence_length* items are kept.
    """
    df = pd.DataFrame(data)

    # Translate integer labels to names; labels outside the table pass
    # through untouched instead of turning into NaN.
    composer_names = {0: 'Bach', 1: 'Beethoven', 2: 'Chopin', 3: 'Mozart'}
    df['composer'] = df['composer'].map(lambda label: composer_names.get(label, label))

    # pad_sequences casts to int32 by default, which would truncate the
    # fractional durations, so the duration column is padded as float32.
    sequence_dtypes = {
        'note_vector': 'int32',
        'duration_vector': 'float32',
        'velocity_vector': 'int32',
    }
    for column, dtype in sequence_dtypes.items():
        padded = pad_sequences(
            df[column].tolist(),
            maxlen=max_sequence_length,
            dtype=dtype,
            padding='post',
        )
        # One fixed-length array per row.
        df[column] = list(padded)
    return df

# Accumulators that load_data fills in place.
data, success_files, exception_files = [], [], []

# Each folder's position in composer_dirs is used as its integer label.
for current_label, composer_dir in enumerate(composer_dirs):
    load_data(
        directory=composer_dir,
        current_label=current_label,
        data=data,
        success_files=success_files,
        exception_files=exception_files,
    )

# Assemble the per-file feature dicts into a single table.
df = create_dataframe(data)

# Summary of how the load went.
print(f"Success files: {len(success_files)}")
print(f"Exception files: {len(exception_files)}")
print(f"Data shape: {df.shape}")

# Paths of the files that could not be processed, one per line.
for file in exception_files:
    print(file)

# Notebook preview of the first rows.
display(df.head())

In [None]:
#jfXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxxxx

# Separate the label column from the feature columns.
y = df['composer']
X = df.drop(columns=['composer'])

# Convert the composer labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# No resampling happens here (RandomUnderSampler is not used): the
# "resampled" names simply refer to the full, unchanged data.
X_resampled, y_resampled = X, y_encoded

# Wrap both parts in pandas objects for easier handling.
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.Series(y_resampled, name='composer')

# Show how many samples each class has.
print(y_resampled_df.value_counts())

In [None]:
# Two stacked LSTM layers followed by a dense softmax classifier.
# NOTE(review): X_train is not defined in the cells shown here -- confirm it
# is created (as a 3-D array) before this cell runs.
model = Sequential()
# The first LSTM returns the whole sequence so the second LSTM can consume it.
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
# One output unit per class known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Integer class ids are used as targets, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])