In [1]:
import glob
import os
import re
from collections import Counter

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Option 1: spectrogram conversion

The code below converts all the available record files into spectrograms: a 2D representation of the audio files. It saves all images into a separate folder. Spectrograms can be used for speech classification; however, they are less optimal because they also cover frequencies that human ears cannot detect. They are more easily influenced by noise and are not used as the standard input for speech recognition.

The code below takes quite a while to run (circa 15min for me) because it transforms all the available record files.

In [None]:
# Convert every WAV recording in `directory` into a spectrogram PNG stored in
# `output_directory` (one image per recording, same base name).

# change the paths to match your own
directory = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\records\\records" #this is the place for all the records
output_directory = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\spectrograms"

# exist_ok=True keeps the cell safe to re-run once the folder exists
os.makedirs(output_directory, exist_ok=True)


wav_files = glob.glob(os.path.join(directory, "*.wav"))
print(f"Number of WAV files: {len(wav_files)}")

for wav_file in wav_files:

    # sr=None keeps the native sampling rate of the recording (no resampling),
    # the same way the mel-spectrogram conversion loads its audio
    y, sr = librosa.load(wav_file, sr=None)

    # A spectrogram is the magnitude of the short-time Fourier transform;
    # convert it to decibels relative to the loudest bin so quiet detail stays visible.
    # (waveshow would only draw the time-domain waveform, not a spectrogram.)
    magnitude = np.abs(librosa.stft(y))
    S_dB = librosa.amplitude_to_db(magnitude, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        librosa.display.specshow(S_dB, sr=sr, x_axis="time", y_axis="hz", ax=ax)

        # splitext swaps only the real extension, whatever its letter case
        stem, _ = os.path.splitext(os.path.basename(wav_file))
        png_file = os.path.join(output_directory, stem + ".png")
        fig.savefig(png_file)
    finally:
        # always release the figure, otherwise ~1000 open figures exhaust memory
        plt.close(fig)
print("PNG files created:")
for png_file in os.listdir(output_directory):
    if png_file.endswith(".png"):
        print(png_file)

Number of WAV files: 1011
PNG files created:
HC10a1.png
HC10a1_clean.png
HC10a1_LF.png
HC10a2.png
HC10a2_clean.png
HC10a2_LF.png
HC10i1.png
HC10i1_clean.png
HC10i1_LF.png
HC10i2.png
HC10i2_clean.png
HC10i2_LF.png
HC11a1.png
HC11a1_clean.png
HC11a1_LF.png
HC11a2.png
HC11a2_clean.png
HC11a2_LF.png
HC11i1.png
HC11i1_clean.png
HC11i1_LF.png
HC11i2.png
HC11i2_clean.png
HC11i2_LF.png
HC12a1.png
HC12a1_clean.png
HC12a1_LF.png
HC12a2.png
HC12a2_clean.png
HC12a2_LF.png
HC12i1.png
HC12i1_clean.png
HC12i1_LF.png
HC12i2.png
HC12i2_clean.png
HC12i2_LF.png
HC13a1.png
HC13a1_clean.png
HC13a1_LF.png
HC13a2.png
HC13a2_clean.png
HC13a2_LF.png
HC13i1.png
HC13i1_clean.png
HC13i1_LF.png
HC13i2.png
HC13i2_clean.png
HC13i2_LF.png
HC14a1.png
HC14a1_clean.png
HC14a1_LF.png
HC14a2.png
HC14a2_clean.png
HC14a2_LF.png
HC14i1.png
HC14i1_clean.png
HC14i1_LF.png
HC14i2.png
HC14i2_clean.png
HC14i2_LF.png
HC15a1.png
HC15a1_clean.png
HC15a1_LF.png
HC15a2.png
HC15a2_clean.png
HC15a2_LF.png
HC15i1.png
HC15i1_clean.png
HC1

What we've done so far is turn all the audio files into pictures. Now I will filter them to keep only the regular recordings. The `_clean` files are recordings with the background noise removed; the `_LF` files are samples of the glottal pulse used for the synthesis. I went with the non-clean recordings because, if this ever needs to be used in a clinical setting (and that's what we want to argue about), the sound will not be perfectly clean.

In [None]:
# Count the plain-recording PNGs per diagnostic group and per vowel
# to check whether the dataset is balanced.

# folder holding the PNG images rendered from the recordings
directory = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\spectrograms"

# accepts names such as HC1a1.png: group, subject number, vowel, repetition digit;
# anything with an extra suffix before ".png" is rejected by the anchors
pattern = re.compile(r"^(HC|PD|MSA|PSP)(\d+)([ai])(\d)\.png$")

# pair every directory entry with its regex match and keep only the hits
candidates = ((entry, pattern.match(entry)) for entry in os.listdir(directory))
hits = [(entry, found) for entry, found in candidates if found]

matched_files = [entry for entry, _ in hits]

# capture group 1 is the diagnostic group, capture group 3 is the vowel
group_counter = Counter(found.group(1) for _, found in hits)
vowel_counter = Counter(found.group(3) for _, found in hits)

# report
print("Summary of matched spectrogram files:")
print(f"Total matched files: {len(matched_files)}\n")

print("Group counts:")
for label in ("HC", "PD", "MSA", "PSP"):
    print(f"{label}: {group_counter[label]}")

parkinsonism_total = group_counter["MSA"] + group_counter["PSP"]
print("Parkinsonism (MSA+PSP): ", parkinsonism_total)

print("\nVowel counts:")
print(f"a (/A/): {vowel_counter['a']}")
print(f"i (/I/): {vowel_counter['i']}")


Summary of matched spectrogram files:
Total matched files: 337

Group counts:
HC: 88
PD: 93
MSA: 84
PSP: 72
Parkinsonism (MSA+PSP):  156

Vowel counts:
a (/A/): 170
i (/I/): 167


# Option 2 (preferred): Melspectrograms

Melspectrograms are scaled spectrograms. Spectrograms plot time against frequency, while melspectrograms plot time against the mel scale. The mel scale is meant to mimic human hearing, in that it has more resolution in low frequencies (e.g. for vowels) and less resolution in high frequencies (e.g. noise). Melspectrograms are the standard tool used for audio classification and in speech models.

The code below first checks the name of each file and filters the records to cut down on runtime. In my case, it keeps only the basic non-clean files. If you want it to select a different file type, you need to edit the regex pattern.

I also considered applying a Fourier transform to the audio files, but librosa already performs a Short-Time Fourier Transform while converting to melspectrograms.

In [None]:
# Convert the plain recordings (no _clean / _LF suffix) into mel-spectrogram PNGs,
# one borderless image per recording, stored in `output_directory`.

#directory with all records
directory = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\records\\records"
output_directory = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\melspectrograms"

# exist_ok=True keeps the cell safe to re-run once the folder exists
os.makedirs(output_directory, exist_ok=True)

# this pattern matches .wav files like HC1a1.wav, getting rid of _clean and _LF
pattern = re.compile(r"^(HC|PD|MSA|PSP)(\d+)([ai])(\d)\.wav$", re.IGNORECASE)

wav_files = glob.glob(os.path.join(directory, "*.wav"))
print(f"Total WAV files found: {len(wav_files)}")

matched_files = []

for wav_file in wav_files:
    filename = os.path.basename(wav_file)
    if pattern.match(filename): # if you delete this if statement, you can also convert all available files
        matched_files.append(filename)
        print("matched a file, continuing") #this statement is just here for my sanity to show the program is doing something and not crashing

        # load audio file at its native sampling rate (sr=None disables resampling)
        y, sr = librosa.load(wav_file, sr=None)

        # generate mel spectrogram and convert power to decibels
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_dB = librosa.power_to_db(S, ref=np.max)

        # plot and save
        fig, ax = plt.subplots(figsize=(10, 4))
        try:
            librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', ax=ax)
            ax.axis('off')  # clean output

            # The pattern is case-insensitive, so the extension may be ".WAV";
            # splitext replaces it regardless of case, where str.replace(".wav", ...)
            # would leave the name unchanged and savefig would be asked to write a .WAV file.
            stem, _ = os.path.splitext(filename)
            png_file = os.path.join(output_directory, stem + ".png")
            fig.savefig(png_file, bbox_inches='tight', pad_inches=0)
        finally:
            # always release the figure, even when plotting or saving fails
            plt.close(fig)

print(f"Mel spectrogram PNGs created for {len(matched_files)} files.")


Total WAV files found: 1011
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
matched a file, continuing
