# Converting Raw .wav files to MFCC

Instead of utilizing the raw audio files as our features we are instead going to utilize Mel-Frequency Cepstral Coefficients. Thye represent features extracted from an audio signal that capture how humans perceive sound frequencies. This makes them effective
for distinguishing between audio patterns.

Read more about the technical explanation here: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum

Read more about the librosa documentation here for feature extraction: https://librosa.org/doc/0.11.0/generated/librosa.feature.mfcc.html 

In [22]:
import numpy as np
import pandas as pd
import librosa as lb
from glob import glob
import os

In [11]:
audio_dir = "../data/raw/"  # directory containing audio files
num_mfcc = 30  # number of MFCC to extract
n_segments = 1  # how many segements to split audio files
sample_rate = 16000  # sampling rate

In [12]:
file_paths = glob(audio_dir + "**/*.wav", recursive=True)
print(f"Total files: {len(file_paths)}")

Total files: 30000


In [13]:
def splitSignal(signal, n_segments):
    """
    Splits an 1D audio signal into a defined number of equal segments.
    Data augmentation technique that treats each individual segment as a seperate sample.
    """

    signal_length = len(signal)
    segment_length = int(np.ceil(signal_length / n_segments))  # size that each segment needs to be
    segment_list = []

    for i in range(n_segments):
        start_index = i * segment_length  # starting index of current segment
        end_index = min((i + 1) * segment_length, signal_length)  # end index of current segment
        segment = signal[start_index:end_index]  # grab the segment
        segment_list.append(segment)  # add it to the list of segments

    return segment_list

In [None]:
features = []

for fpath in file_paths:
    audio_data, _ = lb.load(fpath, sr=sample_rate)
    segments = splitSignal(audio_data, n_segments)

    speaker_id = os.path.basename(fpath).split('_')[0]
    target = int(os.path.basename(fpath).split('_')[1])

    for seg in segments:
        mfcc = lb.feature.mfcc(y=seg, sr=sample_rate, n_mfcc=num_mfcc)
        mfcc_avg = np.mean(lb.amplitude_to_db(np.abs(mfcc), ref=np.max), axis=1)
        features.append([speaker_id, target] + mfcc_avg.tolist())

    print(f"Completed {fpath}.")

In [15]:
columns = ["speaker", "digit"] + [f"MFCC_{i}" for i in range(num_mfcc)]
df_mfcc = pd.DataFrame(features, columns=columns)

In [21]:
df_mfcc.tail()

Unnamed: 0,speaker,digit,MFCC_0,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,...,MFCC_20,MFCC_21,MFCC_22,MFCC_23,MFCC_24,MFCC_25,MFCC_26,MFCC_27,MFCC_28,MFCC_29
29995,5,25,-2.77107,-22.803801,-31.509413,-29.957558,-35.038284,-36.968578,-40.163132,-41.606991,...,-48.690201,-44.63995,-49.214695,-48.594898,-51.443996,-45.519501,-51.83025,-48.507618,-49.27108,-48.037495
29996,5,25,-3.094936,-21.266586,-34.223949,-35.108227,-37.199162,-41.358482,-40.740501,-36.990124,...,-49.181156,-46.477318,-50.299721,-53.109825,-52.669922,-48.034004,-51.03891,-47.789875,-47.041237,-48.043873
29997,7,25,-2.424269,-19.968473,-27.698462,-35.202866,-37.850842,-38.347851,-41.119381,-41.168095,...,-46.940624,-45.713348,-50.274548,-46.413177,-48.301434,-49.298492,-47.686218,-45.711559,-49.174328,-47.739471
29998,6,25,-3.133085,-27.264418,-27.33766,-38.449894,-34.045048,-36.607265,-39.404686,-35.576519,...,-46.908993,-43.107845,-54.680386,-48.957413,-51.627159,-41.228485,-50.615047,-47.040798,-51.364323,-49.768005
29999,6,25,-1.716212,-24.961514,-24.442923,-34.30777,-30.245928,-34.460892,-43.622475,-37.395226,...,-47.809151,-39.013443,-53.325287,-43.413662,-49.169521,-45.970791,-53.27211,-48.566704,-44.426937,-43.724277


In [23]:
df_mfcc.to_pickle("../data/processed/mfcc_data.pkl")