In [2]:
import pandas as pd
import librosa
import numpy as np

In [3]:
# Input locations: the annotation table and the recording it describes
ground_truth_path, audio_path = "../data/csv/en001a.csv", "../data/wav/en001a.wav"

# Read the waveform; librosa.load hands back the samples together with the
# sampling rate they were loaded at
audio_data, sr = librosa.load(audio_path)

# Read the ground-truth annotations into a DataFrame
ground_truth = pd.read_csv(ground_truth_path)

In [4]:
# Define a function to extract features from an audio segment
def extract_features(audio_data, onset, offset, sample_rate=None, n_mfcc=13):
    """Compute per-frame MFCCs for one time-delimited slice of a waveform.

    Parameters
    ----------
    audio_data : array-like
        Waveform samples; the segment is taken by slicing along the first axis.
    onset, offset : float
        Segment boundaries in seconds. Each is multiplied by the sampling rate
        and truncated with ``int()`` to obtain the sample index.
    sample_rate : int or float, optional
        Sampling rate of ``audio_data``. When omitted, the notebook-level ``sr``
        is used, so existing three-argument calls behave as before.
    n_mfcc : int, default 13
        Number of MFCC coefficients requested from librosa.

    Returns
    -------
    numpy.ndarray
        MFCC matrix transposed to shape ``(n_frames, n_mfcc)``.
    """
    if sample_rate is None:
        # Backward-compatible fallback to the rate returned when the audio was loaded
        sample_rate = sr

    # Convert the time boundaries to sample indices and cut the segment out
    first_sample = int(onset * sample_rate)
    last_sample = int(offset * sample_rate)
    segment = audio_data[first_sample:last_sample]

    # librosa returns (n_mfcc, n_frames); transpose so each row is one frame
    mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.T

In [6]:
# Walk the annotation table and pair each segment's MFCC frames with its pitch
per_segment = []
for _, note in ground_truth.iterrows():
    start, end, label = note['start'], note['end'], note['pitch']
    per_segment.append((extract_features(audio_data, start, end), label))

# Stack all frames into one (total_frames, n_mfcc) matrix
features = np.vstack([frames for frames, _ in per_segment])

# Repeat each segment's pitch once per frame so labels line up row-for-row
# with the stacked feature matrix
pitch_values = np.array([
    label
    for frames, label in per_segment
    for _ in range(frames.shape[0])
])


In [7]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

In [22]:
# Seed TensorFlow's global random generator so weight initialisation and
# dropout start from the same state on every run (the split below uses the
# same seed), making the reported metrics repeatable.
SEED = 42
tf.random.set_seed(SEED)

# Split the data into training and test sets
# NOTE(review): every row of `features` is a single MFCC frame, so this random
# row-level split can place frames of one annotated segment on both sides of
# the split; the test error may be optimistic — consider splitting by segment.
X_train, X_test, y_train, y_test = train_test_split(
    features, pitch_values, test_size=0.2, random_state=SEED
)

# Scale the features; the scaler statistics are fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape features for LSTM input: (n_samples, n_timesteps, n_features),
# with exactly one timestep per sample
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Define the LSTM model: two stacked LSTM layers with dropout, a dense hidden
# layer, and a single linear unit for the pitch regression target
model = Sequential([
    LSTM(128, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(128, activation='tanh'),
    Dense(1)  # Output layer for pitch prediction
])

# Compile the model
model.compile(optimizer='SGD', loss='mse')

# Train the model
# NOTE(review): the test split is also passed as validation data, so it is not
# a fully independent hold-out if it is later used to tune the model.
model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test), epochs=35, batch_size=5)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x1b49fe62530>

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Run the trained model over the held-out inputs, then collapse its output to
# a 1-D vector so it can be compared element-wise with y_test
raw_predictions = model.predict(X_test_reshaped)
y_pred = raw_predictions.flatten()

# Score the predictions: squared error first, absolute error second
mse, mae = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error)
)

# Report both metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


Mean Squared Error: 1.5721085534938455
Mean Absolute Error: 0.9053322510285811


In [21]:
# Show the first ten predictions above the first ten true pitches, one per line
print(y_pred[:10], y_test[:10], sep="\n")

[63.310265 66.140205 63.411278 68.35091  66.401215 66.12492  69.02496
 67.69756  63.823586 67.90542 ]
[63 66 61 68 66 66 70 68 65 66]
