# Preprocessing

This file makes folders and processes the TIMIT dataset to filterbanks

In [1]:
import re
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.signal import get_window
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
# Change to change output folder
OUTPUT_PATH = Path('./MFCCS2')

TRAIN_PATH = Path('./TRAIN_WAV')
TEST_PATH = Path('./TEST_WAV')

# All of the sub directories, such as TRAIN_PATH/DR1
TRAIN_PATHS = [x for x in TRAIN_PATH.iterdir()]
TEST_PATHS = [x for x in TEST_PATH.iterdir()]

It would have been possible to use grub or something similar here.
But I figured that out quite late.

In [3]:
# All of the different accents/people talking
TRAIN_ACCENTS = [[x for x in xs.iterdir()] for xs in TRAIN_PATHS]
TEST_ACCENTS = [[x for x in xs.iterdir()] for xs in TEST_PATHS]

In [4]:
pat = r'^([A-Z0-9]+)\.[A-Z]+$'

def get_files(pathss):
    """
        Returns all of the files in a folder and
        all of the file-groupings in a path.
        For example if we have a folder with multiple files
        such as path/test.png, path/test.txt etc.
        Then this will return path/test
    """
    FILES = []
    for paths in pathss:
        for xs in paths:
            for x in xs.iterdir():
                FILES.append(x)
        #TRAIN_FILES.append([[x for x in xs.iterdir()] for xs in paths])
    FILES_2 = []
    matches = []

    for x in FILES:
        # I might be able to use x.stem insead of this
        # Path has .match for regex matching also. Might be quicker and better
        match = re.search(pat, x.name)
        if match == None or match.group(1) in matches:
            continue
        FILES_2.append(x.with_suffix(''))
        matches.append(match.group(1))
    return FILES, FILES_2 

# TRAIN_FILES here is the files while TRAIN_FILES_GROUPINGS are the file groupings
(TRAIN_FILES_ALL, TRAIN_FILES_GROUPINGS) = get_files(TRAIN_ACCENTS)
(TEST_FILES_ALL, TEST_FILES_GROUPINGS) = get_files(TEST_ACCENTS)
print(TRAIN_FILES_GROUPINGS[:5])
print(TRAIN_FILES_ALL[:5])

[PosixPath('TRAIN_WAV/DR7/MMDG0/SX160'), PosixPath('TRAIN_WAV/DR7/MMDG0/SA1'), PosixPath('TRAIN_WAV/DR7/MMDG0/SA2'), PosixPath('TRAIN_WAV/DR7/MMDG0/SI2035'), PosixPath('TRAIN_WAV/DR7/MMDG0/SX340')]
[PosixPath('TRAIN_WAV/DR7/MMDG0/SX160.PHN'), PosixPath('TRAIN_WAV/DR7/MMDG0/SA1.TXT'), PosixPath('TRAIN_WAV/DR7/MMDG0/SA1.WAV.wav'), PosixPath('TRAIN_WAV/DR7/MMDG0/SA2.PHN'), PosixPath('TRAIN_WAV/DR7/MMDG0/SX70.WAV.wav')]


In [5]:
#from Lee & Hon, 1989
phone_transfers = {
    "ao": "aa",
    "ax": "ah",
    "ax-h": "ah",
    "axr": "er",
    "hv": "hh",
    "ix": "ih",
    "el": "l",
    "em": "m",
    "en": "n",
    "nx": "n",
    "eng": "ng",
    "zh": "sh",
    "ux": "uw",
    "pcl": "sil",
    "tcl": "sil",
    "kcl": "sil",
    "bcl": "sil",
    "dcl": "sil",
    "gcl": "sil",
    "h#": "sil",
    "pau": "sil",
    "epi": "sil"
}

In [6]:
def get_phn(file, i):
    """
        Gets the phone of file file at time i
    """
    phns = pd.read_csv(Path(file).with_suffix("").with_suffix('.PHN'), sep = " " , header=None)
    phn = phns[2][0]
    length = int(phns[1][0]) - int(phns[0][0])
    for idx, x in enumerate(phns[0]):
        #print(i, idx, x, phns[2][idx])
        if i >= x:
            phn = phns[2][idx]
            length = int(phns[1][idx]) - int(phns[0][idx])
            continue
        return phn, length
        break
    else:
        return phn, int(phns[1][idx - 1]) - int(phns[0][idx - 1])
            
def normalize(arr):
    # from : https://stackoverflow.com/questions/31152967/normalise-2d-numpy-array-zero-mean-unit-variance
    return (arr - arr.mean(axis=0)) / arr.std(axis=0)

In [7]:
def wavfile_to_mfccs(path, labels_file, idx="TRAIN_WAV"):
    #(sampling_rate, _sig) = scipy.io.wavfile.read(path)
    file = path
    (_sig, sampling_rate) = librosa.load(path, sr = 16000)

    #window = int(400 * 2.8 / 2)
    window = int(320)
    for i in range(0, len(_sig) - 150, window):
        for mult in range(2):
            i = int(i + mult * (window / 2))
            if len(_sig) - i < window + 1:
                break
                
            sig = _sig[i: i + window]  

            mfcc_feat = mfcc(sig, sampling_rate, nfilt = 40, numcep=40, winlen=0.0025, winstep=0.0013)

            mfccs = mfcc_feat.T


            _delta = delta(mfccs, 1)
            _delta_delta = delta(_delta, 1)
    
            finished = np.hstack([mfccs, _delta, _delta_delta])
            label = get_phn(path, i)[0]
        
            if label in phone_transfers.keys():
                    label = phone_transfers[label]
            if (label == "q"): 
                continue
        
            yield finished, label
        

In [8]:
import gc

def read_wavfiles_to_mfccs(files):
    # I had some problems with the fact that when converting the audio to images
    # the neural net get incredibly bad results for some reason.
    # So parts of this code is from 
    # https://github.com/dtjchen/spoken-command-processo3
    print(len(files))
    
    X, y = [], []
    divider = len(files)
    print("Starting converting...")
    for i, file in enumerate(files):
        if (i % int(divider/20) == 0): print(f"{i / divider :.2f}")
        for mfccs, label in wavfile_to_mfccs(file.with_suffix(".WAV.wav"), file.with_suffix(".PHN")):
            if (label != "q"):
                X.append(mfccs)
                y.append(label)
        gc.collect()
    print("Done")
    return X, y

In [12]:

X, y = read_wavfiles_to_mfccs(TRAIN_FILES_GROUPINGS)

np.save(OUTPUT_PATH / "X_train", X)
np.save(OUTPUT_PATH / "y_train", y)

X, y= read_wavfiles_to_mfccs(TEST_FILES_GROUPINGS)

np.save(OUTPUT_PATH / "X_test", X)
np.save(OUTPUT_PATH / "y_test", y)

1718
Starting converting...
0.00
0.05
0.10
0.15
0.20
0.25
0.30
0.35
0.40
0.45
0.49
0.54
0.59
0.64
0.69
0.74
0.79
0.84
0.89
0.94
0.99
Done
626
Starting converting...
0.00
0.05
0.10
0.15
0.20
0.25
0.30
0.35
0.40
0.45
0.50
0.54
0.59
0.64
0.69
0.74
0.79
0.84
0.89
0.94
0.99
Done


### Image section
Just as an example, here is some old code to show how these functions used to work.
This code is not used anymore, but is more of relic of the past which shows how this used to work before I coped some of the code above.

In [7]:
def make_directories(files, index = "TRAIN_WAV"):
    """Makes directories for all of the needed file groupings"""
    for i, file in enumerate(files):
        print(x.parts)
        index = file.parts.index(index)
        t = Path(OUTPUT_PATH).joinpath(*file.parts[index + 1:])
        t.mkdir(parents=True, exist_ok=True)

In [9]:
TRAIN_FILES_3 = []
TEST_FILES_3 = []
phones = [
    "aa",
    "ae",
    "ah",
    "ah",
    "ay",
    "aw",
    "b",
    "ch",
    "d",
    "dh",
    "dx",
    "eh",
    "er",
    "ey",
    "f",
    "g",
    "hh",
    "ih",
    "k",
    "iy",
    "jh",
    "l",
    "m",
    "n",
    "ng",
    "oy",
    "ow",
    "r",
    "s",
    "sh",
    "t",
    "th",
    "uw",
    "uh",
    "p",
    "v",
    "w",
    "y",
    "z",
    "sil"
]
OUTPUT_PATH = Path('./AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN')
# Makes paths for the output path followed by the phoneme
for i in phones:
    TRAIN_FILES_3.append(OUTPUT_PATH / i)
OUTPUT_PATH = Path('./AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TEST')
# Makes paths for the output path followed by the phoneme
for i in phones:
    TEST_FILES_3.append(OUTPUT_PATH / i)
    
# Makes all of the paths for the paths above
for d in TRAIN_FILES_3:
    d.mkdir(parents=True, exist_ok=True)
for d in TEST_FILES_3:
    d.mkdir(parents=True, exist_ok=True)

[PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/aa'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ae'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ah'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ah'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ay')]
[PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/aa'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ae'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ah'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ah'), PosixPath('AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN/ay')]


In [14]:

def write_filter_bank_to_disk(path, idx="TRAIN_WAV"):
    file = path
    (rate, _sig) = scipy.io.wavfile.read(path)  # File assumed to be in the same directory
    
    # The sliding windows length
    window = 800
    for i in range(0, len(_sig) - 150, window):
        for mult in range(2):
            i = int(i + mult * (window / 2))
            if len(_sig) - i < window + 1:
                break

            # Clears and closes the images as this is supposed to help with ram
            # But it still filled up most of ram running this
            plt.clf()
            plt.close()
                
            sig = _sig[i: i + window]  
            fbank_feat = logfbank(sig,rate, nfilt=40, winlen=0.025, winstep=550000*1e-9*3*1.1)

            filterbank_features = fbank_feat.T

            _delta = delta(filterbank_features, 1)
            _delta_delta = delta(_delta, 1)
            finished = np.hstack((filterbank_features, _delta))
            finished = np.hstack((finished, _delta_delta))
            
            index = file.parts.index(idx)
            out_path = Path(OUTPUT_PATH)

            out_path.mkdir(parents=True, exist_ok=True)
            
            phone = str(get_phn(path, i))
            if phone in ['q']:
                break
            if phone in phone_transfers.keys():
                phone = phone_transfers[phone]
            
            plt.imsave(f'{out_path / (phone + "/" + str(Path(file.parts[-1]).with_suffix("").with_suffix("")) + str(i) + phone)}.png', finished)

In [None]:

OUTPUT_PATH = Path('./AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TRAIN')

# Goes through all of the train wavs and converts them to images and saves them
for i, files in enumerate(TRAIN_FILES_2):
    if i % 10 == 0:
        print(i/len(TRAIN_FILES_2))
    write_filter_bank_to_disk(files.with_suffix(".WAV.wav"))
    
OUTPUT_PATH = Path('./AUDIO_FIX_FILTER_BANKS_DELTA_DELTA_FOLDERS/TEST')

# Same as above but with test files
for i, files in enumerate(TEST_FILES_2):
    if i % 100 == 0:
        print(i/len(TEST_FILES_2))
    write_filter_bank_to_disk(files.with_suffix(".WAV.wav"), "TEST_WAV")

starting train
0.0
0.005820721769499418
0.011641443538998836
0.017462165308498253
0.023282887077997673
0.02910360884749709
0.034924330616996506
0.04074505238649592
0.046565774155995346
0.05238649592549476
0.05820721769499418
0.0640279394644936
0.06984866123399301
0.07566938300349244
0.08149010477299184
0.08731082654249127
0.09313154831199069
0.0989522700814901
0.10477299185098952
0.11059371362048893
0.11641443538998836
0.12223515715948778
0.1280558789289872
0.13387660069848661
0.13969732246798602
0.14551804423748546
0.15133876600698487
0.15715948777648428
0.1629802095459837
0.16880093131548313
0.17462165308498254
0.18044237485448195
0.18626309662398138
0.1920838183934808
0.1979045401629802
0.20372526193247964
0.20954598370197905
0.21536670547147846
0.22118742724097787
0.2270081490104773
0.23282887077997672
0.23864959254947612
0.24447031431897556
0.25029103608847497
0.2561117578579744
0.2619324796274738
0.26775320139697323
0.27357392316647267
0.27939464493597205
0.2852153667054715
0.291