# Feature Extraction

We will mainly work with four kinds of features: MFCCs, zero-crossing rate, spectral centroid, and spectrograms.

We start off with some data preprocessing.

## Creating labels.csv

In [None]:
import pandas as pd
import os
import re
import inflect

# Folder containing the corpus TSV metadata files
folder_path = r'C:\Users\jonec\Documents\SUTD\T6\AI\Voice dataset\cv-corpus-4'

# TSV files to merge into a single label table
tsv_files = ['dev.tsv', 'invalidated.tsv', 'other.tsv', 'test.tsv', 'train.tsv', 'validated.tsv']

# Read every TSV first and concatenate once at the end: calling pd.concat
# inside the loop copies all previously accumulated rows on each iteration.
tsv_frames = []
for file_name in tsv_files:
    file_path = os.path.join(folder_path, file_name)
    temp_df = pd.read_csv(file_path, sep='\t')
    tsv_frames.append(temp_df)
combined_df = pd.concat(tsv_frames, ignore_index=True)

# Drop the metadata columns that are not needed for the labels
combined_df.drop(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent'], axis=1, inplace=True)

# Inflect engine used by normalize_text to spell out numbers
p = inflect.engine()

def normalize_text(text):
    """Normalize a transcript to lowercase ASCII letters and spaces.

    Standalone digit runs are spelled out with the module-level inflect
    engine ``p``, the text is lowercased, and every character other than
    ``a``-``z`` and the space is removed.

    Args:
        text: The raw sentence. Non-string values (e.g. NaN or None) are
            accepted.

    Returns:
        The normalized sentence, or "" when ``text`` is not a string.
    """
    # Missing sentences arrive as non-string values; map them to an empty label
    if not isinstance(text, str):
        return ""

    def _spell_out(match):
        # inflect hyphenates compound numbers ("twenty-one"). Turn the hyphen
        # into a space so the character filter below does not fuse the parts
        # into a single token such as "twentyone".
        return p.number_to_words(match.group()).replace('-', ' ')

    # Convert numbers to words
    text = re.sub(r'\b\d+\b', _spell_out, text)

    # Convert to lowercase
    text = text.lower()

    # Keep only spaces and lowercase letters (numbers are now in word form)
    return re.sub(r'[^a-z ]', '', text)

# Destination of the combined label file (it is written to disk in a later cell)
combined_csv_path = os.path.join(folder_path, 'labels.csv')

# Normalize every transcript; normalize_text turns non-string values into ""
combined_df['sentence'] = combined_df['sentence'].map(normalize_text)


In [None]:
# Display the combined label table. To inspect a single clip's row instead, use:
# combined_df[combined_df["path"] == 'common_voice_en_10.mp3']
combined_df

In [None]:
# Write the normalized label table to combined_csv_path, omitting the DataFrame index
combined_df.to_csv(combined_csv_path, index=False)

In [None]:
base_folder = r'C:\Users\jonec\Documents\SUTD\T6\AI\Voice dataset\cv-corpus-4'
path_substring = "common_voice_en_10.mp3"
clips_folder = os.path.join(base_folder, 'clips')

# Look for the first clip whose file name contains path_substring.
# NOTE: this cell only locates and reports the clip; nothing is played back here.
for filename in os.listdir(clips_folder):
    if path_substring in filename:
        file_path = os.path.join(clips_folder, filename)
        print(f"Found and playing file: {file_path}")
        break
else:
    # The loop finished without a break, i.e. no file name matched.
    print(f"No file matching '{path_substring}' found in {clips_folder}")

## Extract the MFCCs and store them in an HDF5 file

If you are processing the entire corpus, I do not recommend running the code below in a Jupyter notebook; create a .py file and run that instead. The Python script (feature_extraction.py) also covers feature extraction with mel spectrograms, zero-crossing rate, and spectral centroid.

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf
import h5py
import numpy as np

def process_and_store_audio_files_with_labels(folder_path, hdf5_path, labels_dict, sampling_rate=16000, hop_length=512, n_mfcc=13):
    """Extract MFCCs for every labelled MP3 in a folder and store them in an HDF5 file.

    For each processed clip a group named after the file is created, holding
    an "mfccs" dataset and a UTF-8 encoded "label" dataset. The output file
    is opened in 'w' mode, so an existing file at ``hdf5_path`` is replaced.

    Args:
        folder_path: Directory whose entries are scanned for ``.mp3`` files.
        hdf5_path: Path of the HDF5 file to write.
        labels_dict: Mapping from file name to label text. Entries whose
            value is not a string are skipped.
        sampling_rate: Sampling rate passed to ``librosa.load`` and
            ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.
        n_mfcc: Number of MFCCs passed to ``librosa.feature.mfcc``.

    Returns:
        None. Progress and the final processed/skipped counts are printed.
    """
    with h5py.File(hdf5_path, 'w') as hdf5_file:
        processed_files = 0
        skipped_files = 0
        # enumerate gives a progress index that advances for every directory
        # entry; processed + skipped would stall on entries that are ignored.
        for file_index, filename in enumerate(os.listdir(folder_path)):
            print(f"Processing {file_index}-th file: {filename}")

            # Only MP3 clips that appear in the label table are of interest
            if not filename.endswith('.mp3') or filename not in labels_dict:
                continue

            label = labels_dict[filename]
            if not isinstance(label, str):
                # A labelled clip without usable text (e.g. a NaN placeholder)
                # is counted so the final summary reflects it.
                print(f"Skipping {filename} due to error: label is not a string")
                skipped_files += 1
                continue

            file_path = os.path.join(folder_path, filename)
            try:
                signal, _ = librosa.load(file_path, sr=sampling_rate)
                mfccs = librosa.feature.mfcc(y=signal, sr=sampling_rate, n_mfcc=n_mfcc, hop_length=hop_length)
            except Exception as e:
                # Best effort: one unreadable clip must not abort the whole run
                print(f"Skipping {filename} due to error: {e}")
                skipped_files += 1
                continue

            # One group per audio file, holding its features and its label
            grp = hdf5_file.create_group(filename)
            grp.create_dataset("mfccs", data=mfccs)
            grp.create_dataset("label", data=label.encode('utf-8'))

            print(f"Processed and stored MFCCs and label for {filename}")
            processed_files += 1
        print(f"Processed {processed_files} files")
        print(f"Skipped {skipped_files} files")

# --- Locations ---
# Label table created earlier in this notebook
label_file_path = r'C:\Users\jonec\Documents\SUTD\T6\AI\Voice dataset\cv-corpus-4\labels.csv'
# Folder holding the MP3 clips
folder_path = r'C:\Users\jonec\Documents\SUTD\T6\AI\Voice dataset\cv-corpus-4\clips'
# Output HDF5 file for the extracted features
hdf5_path = 'mfccs_dataset.h5'

# --- Feature extraction settings ---
sampling_rate = 16000
hop_length = 512
n_mfcc = 13

# Build a lookup from clip file name ("path" column) to its sentence
labels_df = pd.read_csv(label_file_path)
labels_dict = dict(zip(labels_df['path'], labels_df['sentence']))

# Extract the MFCCs of every labelled clip and write them to the HDF5 file
process_and_store_audio_files_with_labels(
    folder_path,
    hdf5_path,
    labels_dict,
    sampling_rate,
    hop_length,
    n_mfcc,
)

## (DEBUGGING BLOCK) Inspect the h5 file content 

In [None]:
def list_contents_in_hdf5(hdf5_path):
    """Print each top-level group of an HDF5 file and, for every dataset in it,
    the dataset's name, shape and dtype.

    Args:
        hdf5_path: Path of the HDF5 file to open read-only.
    """
    separator = "---" * 10
    with h5py.File(hdf5_path, 'r') as hdf5_file:
        print(f"Contents in {hdf5_path}:")
        # Walk the top-level groups, then the datasets inside each group
        for group_name, group in hdf5_file.items():
            print(f"Group name: {group_name}")
            for dataset_name, dataset in group.items():
                print(f"  Dataset name: {dataset_name}")
                print(f"  Shape: {dataset.shape}")
                print(f"  Datatype: {dataset.dtype}")
            print(separator)

# HDF5 file to inspect
hdf5_path = 'mfccs_dataset.h5'

# Print every group together with the name, shape and dtype of its datasets
list_contents_in_hdf5(hdf5_path)

In [None]:
# Debug functions 

# If you want to get labels for a specific audio
def get_label_for_audio(hdf5_path, audio_filename):
    """Return the label text stored for one audio file.

    Args:
        hdf5_path: Path of the HDF5 file to open read-only.
        audio_filename: Name of the group to look up.

    Returns:
        The content of the group's 'label' dataset decoded as UTF-8.
    """
    with h5py.File(hdf5_path, 'r') as hdf5_file:
        # group -> 'label' dataset -> its value ([()] reads the whole dataset)
        raw_label = hdf5_file[audio_filename]['label'][()]
    # The value has already been read into memory, so the file may be closed
    return raw_label.decode('utf-8')

# HDF5 file to read the label from
hdf5_path = 'mfccs_dataset.h5'
# Name of the group to look up in the HDF5 file
audio_filename = 'common_voice_en_1.mp3'

# Retrieve and print the label stored for that group
label_text = get_label_for_audio(hdf5_path, audio_filename)
print(f"Label for {audio_filename}: {label_text}")


# If you want to get the length of the hdf5 file
def get_length_hdf5(hdf5_path):
    """Return the number of top-level entries in an HDF5 file.

    Args:
        hdf5_path: Path of the HDF5 file to open read-only.

    Returns:
        The count of members directly under the file's root.
    """
    with h5py.File(hdf5_path, 'r') as hdf5_file:
        # The length of the file object is its number of top-level members
        return len(hdf5_file)

# HDF5 file whose top-level entries are counted
hdf5_path = 'mfccs_dataset.h5'

# Count the top-level entries in the HDF5 file and report the total
group_count = get_length_hdf5(hdf5_path)
print(f"The HDF5 file contains {group_count} groups (audio files).")

