In [1]:
import pandas as pd
import librosa
import numpy as np
import joblib
import os

In [2]:
# # Load ground truth annotations from CSV file
# ground_truth_path = "../data/csv/en001a.csv"
# ground_truth = pd.read_csv(ground_truth_path)

# # Load audio file
# audio_path = "../data/wav/en001a.wav"
# audio_data, sr = librosa.load(audio_path)

folder_path = 'C:/Users/Admin/OneDrive - Singapore Management University/Desktop/data mining/project/test/data/wav'
audio_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.wav')]

csv_path = 'C:/Users/Admin/OneDrive - Singapore Management University/Desktop/data mining/project/test/data/csv'
csv_files = [os.path.join(csv_path, file) for file in os.listdir(csv_path) if file.endswith('.csv')]
# audio_files

# ground_truth = pd.DataFrame()
# for file in csv_files:
#     temp = pd.read_csv(file)
#     ground_truth = ground_truth.append(temp, ignore_index=True)
# ground_truth

In [3]:
# Define a function to extract features from an audio segment
def extract_features(audio_data, sr, onset, offset):
    # trim
    y_trimmed, _ = librosa.effects.trim(audio_data)
    # Extract the audio segment
    segment = y_trimmed[int(onset * sr):int(offset * sr)]
    # Extract features (e.g., MFCCs)
    features = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
    return features.T  # Transpose to have shape (n_frames, n_mfcc)

In [4]:
features = []
pitch_values = []

for audio_file, csv_file in zip(audio_files[:6],csv_files[:6]):
    signal, sr = librosa.load(audio_file)
    
    ground_truth = pd.read_csv(csv_file)

    for _, row in ground_truth.iterrows():
        onset, offset, pitch = row['start'], row['end'], row['pitch']
        segment_features = extract_features(signal, sr, onset, offset)
        features.append(segment_features)
        pitch_values.extend([pitch] * segment_features.shape[0])

features = np.vstack(features)
pitch_values = np.array(pitch_values)




In [8]:
# # Extract features according to ground truth annotations
# features = []
# pitch_values = []
# for _, row in ground_truth.iterrows():
#     onset, offset, pitch = row['start'], row['end'], row['pitch']
#     segment_features = extract_features(audio_data, onset, offset)
#     features.append(segment_features)
#     pitch_values.extend([pitch] * segment_features.shape[0])

# # Convert features and pitch_values to numpy arrays
# features = np.vstack(features)
# pitch_values = np.array(pitch_values)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import save_model
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adadelta

In [10]:
features.shape

(17967, 13)

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, pitch_values, test_size=0.2, random_state=42)

# Scale the features
s_scaler = StandardScaler()
X_train_scaled = s_scaler.fit_transform(X_train)
X_test_scaled = s_scaler.transform(X_test)

# Reshape y_train to 2D array for fitting the scaler
y_train_reshaped = y_train.reshape(-1, 1)

# Fit the MinMaxScaler on the y_train_reshaped
scaler = MinMaxScaler(feature_range=(0, 127)) # MIDI notes range from 0 to 127
scaler.fit(y_train_reshaped)

# Reshape features for LSTM input (assuming LSTM expects input shape of (n_samples, n_timesteps, n_features))
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

In [12]:
X_train_reshaped.shape

(14373, 1, 13)

In [13]:
# Define the LSTM model
model = Sequential([
    LSTM(128, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True),
    # LSTM(128, input_shape=(X_train_reshaped.shape[1], ), return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(128, activation='tanh'),
    Dense(1)  # Output layer for pitch prediction
])
# Adadelta optimizer
# optimizer = Adadelta(learning_rate=1.0)
# Compile the model
# model.compile(optimizer=optimizer, loss='mse')
model.compile(optimizer='SGD', loss='mse')

# Train the model
model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test), epochs=35, batch_size=5)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x2be537de980>

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Make predictions on the test set
y_pred = model.predict(X_test_reshaped).flatten()
y_pred = [int(i) for i in y_pred]

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)

# Calculate mean absolute error
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


Mean Squared Error: 5.2982749026154705
Mean Absolute Error: 1.6800222593210907


In [15]:
print(y_pred)
print(y_test[:10])

[66, 66, 62, 62, 66, 66, 62, 69, 64, 64, 68, 65, 66, 67, 67, 67, 70, 61, 67, 66, 66, 72, 72, 67, 66, 69, 70, 66, 67, 65, 66, 62, 66, 62, 64, 68, 64, 66, 66, 66, 66, 65, 69, 66, 67, 67, 64, 67, 68, 66, 72, 71, 65, 66, 66, 67, 61, 61, 67, 65, 67, 67, 70, 60, 67, 67, 67, 67, 63, 69, 66, 67, 68, 61, 66, 69, 68, 64, 64, 65, 65, 68, 66, 66, 66, 68, 71, 66, 65, 68, 63, 65, 65, 64, 71, 67, 67, 71, 66, 61, 64, 65, 66, 68, 66, 66, 66, 61, 69, 68, 67, 64, 64, 66, 64, 69, 68, 66, 66, 68, 64, 68, 69, 65, 60, 67, 66, 70, 63, 62, 62, 64, 66, 66, 66, 61, 66, 64, 63, 62, 65, 65, 63, 61, 66, 63, 66, 69, 67, 70, 59, 66, 72, 62, 70, 66, 65, 58, 60, 67, 65, 66, 70, 66, 66, 69, 72, 69, 68, 71, 63, 66, 70, 72, 61, 70, 62, 70, 67, 62, 63, 60, 66, 60, 67, 66, 63, 61, 67, 67, 69, 66, 70, 67, 69, 65, 63, 60, 67, 71, 69, 66, 66, 62, 65, 70, 64, 66, 67, 64, 65, 67, 66, 64, 66, 67, 69, 68, 66, 64, 66, 61, 63, 66, 67, 58, 63, 68, 69, 65, 62, 66, 68, 69, 61, 65, 65, 67, 64, 68, 65, 64, 70, 68, 67, 65, 69, 66, 62, 69,

In [16]:
# Save the model
save_model(model, 'model.h5')

In [17]:
joblib.dump(s_scaler, 'scaler.pkl')

['scaler.pkl']