In [1]:
import os
import librosa
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Function to load a model or other objects from a file
def load_pickle(filename):
    """Deserialize and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Load the pickled artifacts used at inference time. The paths are relative,
# so they resolve against the current working directory.
# NOTE(review): judging by the file names these are an SVM classifier, the
# feature scaler and the label encoder fitted alongside it - confirm against
# the training notebook.
trained_model = load_pickle('../notebooks/models/audio_augmented_model/svm_model.pkl')
scaler = load_pickle('../notebooks/models/audio_augmented_model/scaler_svm.pkl')
label_encoder = load_pickle('../notebooks/models/audio_augmented_model/label_encoder_svm.pkl')

# Number of columns every MFCC matrix is zero-padded or truncated to in
# preprocess_audio() before it is flattened and scaled.
max_length = 100

# Extract MFCC Features and Chop audio
# Preemphasis filter for high frequency
def preemphasis_filter(signal, alpha=0.97):
    """Apply a first-order pre-emphasis filter to ``signal``.

    The first sample is passed through unchanged; every later sample ``n``
    becomes ``signal[n] - alpha * signal[n - 1]``.

    Args:
        signal: NumPy array of samples. It must hold at least one sample,
            because ``signal[0]`` is read unconditionally.
        alpha: Weight of the previous sample that gets subtracted.

    Returns:
        A new flat array; for a 1-D input it has the same length as
        ``signal``.
    """
    first_sample = np.ravel(signal[0])
    differenced = np.ravel(signal[1:] - alpha * signal[:-1])
    return np.concatenate((first_sample, differenced))

# Split the signal into overlapping frames; frame_length and frame_stride are sample counts, not milliseconds (mfcc() passes 512 and 256 by default)
def frame_signal(signal, frame_length, frame_stride):
    """Slice ``signal`` into overlapping frames, zero-padding the tail.

    Args:
        signal: 1-D sequence of samples.
        frame_length: Number of samples per frame (truncated with ``int``).
        frame_stride: Number of samples between consecutive frame starts
            (truncated with ``int``).

    Returns:
        A 2-D array of shape ``(num_frames, frame_length)`` where
        ``num_frames = ceil(abs(len(signal) - frame_length) / frame_stride)``.
        Row ``i`` holds the padded signal from sample ``i * frame_stride``.

    NOTE(review): the padded signal is ``frame_stride`` samples longer than
    the span the returned frames cover, so when the signal is longer than one
    frame its trailing samples end up in no frame, and a signal of exactly
    ``frame_length`` samples produces zero frames. Changing the frame count
    alters the extracted features, so confirm how the persisted model was
    trained before adjusting it.
    """
    total_samples = len(signal)
    hop = int(frame_stride)
    width = int(frame_length)
    frame_count = int(np.ceil(float(np.abs(total_samples - width)) / hop))

    # Append zeros so that every index computed below is in range.
    zero_tail = np.zeros(frame_count * hop + width - total_samples)
    padded = np.append(signal, zero_tail)

    # Entry (i, j) of the index grid is i * hop + j: frame start plus offset.
    frame_starts = hop * np.arange(frame_count)
    sample_offsets = np.arange(width)
    grid = frame_starts[:, np.newaxis] + sample_offsets[np.newaxis, :]

    return padded[grid.astype(np.int32, copy=False)]
    
# Apply Hamming Windows and Compute the power spectrum
def power_spectrum(frames, NFFT=512):
    """Window each frame with a Hamming window and return its power spectrum.

    Args:
        frames: 2-D array-like with one frame per row. It is not modified.
        NFFT: FFT length handed to ``np.fft.rfft``, which zero-pads or
            truncates each frame to this many samples.

    Returns:
        Array of shape ``(num_frames, NFFT // 2 + 1)`` holding
        ``|rfft(frame * hamming)| ** 2 / NFFT`` for every frame.
    """
    frames = np.asarray(frames)
    # Multiply into a new array instead of using ``*=``: the in-place form
    # overwrote the caller's data and raised a casting error for integer
    # frames.
    windowed = frames * np.hamming(frames.shape[1])
    magnitude = np.absolute(np.fft.rfft(windowed, NFFT))
    return (1.0 / NFFT) * (magnitude ** 2)

# Apply Mel Filterbank
def mel_filterbank(spectrum, num_filters=40, sampling_rate=22050, n_fft=512):
    """Project a power spectrum onto a mel filterbank built by librosa.

    Args:
        spectrum: 2-D array whose second axis is multiplied against the
            frequency axis of the filterbank.
            NOTE(review): presumably (num_frames, n_fft // 2 + 1), matching
            power_spectrum() called with the same FFT size - confirm.
        num_filters: Number of mel bands, passed to librosa as ``n_mels``.
        sampling_rate: Sample rate in Hz, passed to librosa as ``sr``.
        n_fft: FFT size the spectrum was computed with.

    Returns:
        The transpose of ``filterbank . spectrum.T``: one row per row of
        ``spectrum`` and one column per mel filter.
    """
    filter_weights = librosa.filters.mel(
        sr=sampling_rate, n_fft=n_fft, n_mels=num_filters
    )
    projected = np.dot(filter_weights, spectrum.T)
    return projected.T

# Compute MFCCS
def mfcc(signal, sampling_rate=22050, frame_length=512, frame_shift=256, num_mfcc=13, n_mels=40):
    """Compute MFCC-style features for a raw audio signal.

    Pipeline: pre-emphasis -> framing -> Hamming-windowed power spectrum
    (512-point FFT) -> mel filterbank -> dB scaling -> librosa's MFCC
    transform.

    Args:
        signal: 1-D NumPy array of audio samples.
        sampling_rate: Sample rate in Hz, used to build the mel filterbank.
        frame_length: Frame size in samples, forwarded to frame_signal().
        frame_shift: Hop between frames in samples, forwarded to
            frame_signal().
        num_mfcc: Number of coefficients requested from librosa (``n_mfcc``).
        n_mels: Number of mel bands in the filterbank.

    Returns:
        The array returned by ``librosa.feature.mfcc``.

    NOTE(review): mel_filterbank() returns one row per frame, while
    librosa.feature.mfcc documents ``S`` as (n_mels, frames). The matrix
    therefore appears to be passed transposed, which would give an output of
    shape (num_mfcc, n_mels) rather than (num_mfcc, frames). The behaviour is
    deliberately left unchanged here: the persisted scaler and model were
    presumably fit on features from this same code - confirm against the
    training pipeline before fixing.
    """
    pre_emphasized = preemphasis_filter(signal)
    frames = frame_signal(pre_emphasized, frame_length, frame_shift)
    power = power_spectrum(frames)
    mel_energies = mel_filterbank(
        power, num_filters=n_mels, sampling_rate=sampling_rate, n_fft=512
    )
    log_mel = librosa.power_to_db(mel_energies)
    return librosa.feature.mfcc(S=log_mel, n_mfcc=num_mfcc, n_mels=n_mels)
# Function to preprocess audio files using your provided code
def preprocess_audio(audio_path):
    """Turn an audio file into a scaled feature row.

    The file is loaded at its native sample rate (``sr=None``), converted
    with mfcc(), forced to exactly ``max_length`` columns, flattened and
    passed through the module-level ``scaler``.

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        The output of ``scaler.transform`` for the single flattened feature
        vector.
    """
    samples, native_rate = librosa.load(audio_path, sr=None)
    features = mfcc(samples, native_rate)

    column_count = features.shape[1]
    if column_count < max_length:
        # Too short: zero-pad on the right of the second axis.
        missing = max_length - column_count
        fixed_width = np.pad(features, ((0, 0), (0, missing)), mode='constant')
    else:
        # Long enough: keep the first max_length columns (all of them when
        # the width already matches).
        fixed_width = features[:, :max_length]

    flat_vector = fixed_width.flatten()
    return scaler.transform([flat_vector])

# Function to make predictions on preprocessed audio
def predict_audio(preprocessed_audio):
    """Classify one preprocessed sample and return its decoded label.

    Args:
        preprocessed_audio: Feature rows as produced by preprocess_audio().

    Returns:
        The first prediction of the module-level ``trained_model``, mapped
        back through ``label_encoder.inverse_transform``.
    """
    encoded_prediction = trained_model.predict(preprocessed_audio)
    decoded_labels = label_encoder.inverse_transform(encoded_prediction)
    return decoded_labels[0]


# Example run: preprocess a single audio file, classify it and print the
# decoded label. The path is relative to the current working directory.
audio_path = '../data/indiv_test/435806317_25534324009486266_6473560724221715299_n.wav'  # Replace with your audio file path
preprocessed_audio = preprocess_audio(audio_path)
predicted_label = predict_audio(preprocessed_audio)
print(f"Predicted label: {predicted_label}")

Predicted label: discomfort
