This final milestone notebook is for processing the data using a NVIDIA GPU and then feeding the images to the CNN rather than using the processed data from the tablet. More specifically, the MFCC algorithm will be run in CUDA. Any other algorithm involved in the process such as filtering, downsampling, trimming, and image generation is done in Python. 

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os

import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import Audio
from scipy import signal
import build.pybind_modules.dsp_module as cu
import build.pybind_modules.matrix_module as myMatrix
from math import ceil, isnan
from time import time

from PIL import Image

In [7]:
# Parameters 
MODEL_NAME = 'audio_mnist'
EPOCHS = 20
BATCH_SIZE = 64

# Parameters used on tablet 
VOICED_THRESHOLD = 20000000
FRAME_SETBACK = 2
FS = 48000
DOWN_SAMPLED_FS = 8000
NFFT = 256
NOVERLAP = -1
NFILT = 40
NUM_CEPS = 13
NN_DATA_COLS = 28
NN_DATA_ROWS = 12
PREEMPHASIS_B = 0.97
PIXEL_WIDTH = 400
PIXEL_HEIGHT = 300

In [8]:
""" helper functions used in sample processing before feed the samples to the CNN """
def soundDataToFloat(SD):
    "Converts integer representation back into librosa-friendly floats, given a numpy array SD"
    return np.array([ np.float32(s/32768.0) for s in SD])

def soundDataToInt16(SD):
    return np.array( [np.int16(s*32768.0) for s in SD] )

def createButter(N, Wn, fs):
    num, den = signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=fs)
    return np.array([num[i]/den[i] for i in range(len(num))])

def createFIR(num_taps, cut_off, fs):
    return signal.firwin(num_taps, cut_off, fs=fs)

def displayFIR(filt):
    coef_str = "{" 
    for val in filt: 
        coef_str += str(val) + ", " 
    coef_str = coef_str[:-2] 
    coef_str += "};" 
    print("FIR a Coefficients")
    print(coef_str) 

def applyFIR(samples, filt):
    circBuf = np.zeros(len(filt))
    circBufIdx = 0
    filteredSamples = np.zeros(len(samples))
    num_taps = len(filt)

    for i in range(len(samples)):
        circBuf[circBufIdx] = samples[i]
        curr_val = 0

        for n in range(num_taps):
            curr_val += filt[n] * circBuf[ (((circBufIdx - n) % num_taps) + num_taps) % num_taps]

        filteredSamples[i] = curr_val 
        circBufIdx = (circBufIdx + 1) % num_taps

    return filteredSamples

def frameVoiced(frame, threshold):
    isVoiced = False
    sum = 0
    for i in range(len(frame)):
        sum += abs(frame[i])**2

    if (sum > threshold):
        isVoiced = 1
    
    return isVoiced

def trimSamples(samples, frameSize, nfft, noverlap, threshold, frame_setback):
    num_samples = len(samples)
    if (noverlap < 0):
        noverlap = int(nfft/2)

    step = nfft - noverlap

    numFrames = ceil(num_samples / step)

    while ((numFrames - 1)*step + (nfft - 1) >= num_samples):
        numFrames -= 1

    first_frame = 0
    for i in range(numFrames):
        if frameVoiced(soundDataToInt16(samples[i*step:i*step + nfft]), threshold=threshold):
            first_frame = i
            break
    
    first_frame -= frame_setback
    if first_frame < 0:
        first_frame = 0
    last_frame = first_frame + frameSize
    num_trimmed_samples = ((last_frame - 1)*step + nfft) - (first_frame*step)
    trimmed_samples = np.zeros(num_trimmed_samples)

    for i in range(num_trimmed_samples):
        if (first_frame*step + i >= len(samples)):
            break
        trimmed_samples[i] = samples[first_frame*step + i]

    return trimmed_samples

def createImage(data, pixel_width, pixel_height, data_rows, data_cols, filename=None):
    def hex_to_rgb(val):
        mask = 0x0000FF
        b = mask & val
        g = mask & (val >> 8)
        r = mask & (val >> 16)
        
        return (r,g,b)

    viridis_pallete = [
        0x440154,
        0x481567,
        0x482677,
        0x453771,
        0x404788,
        0x39568C,
        0x33638D,
        0x2D708E,
        0x287D8E,
        0x238A8D,
        0x1F968B,
        0x20A387,
        0x29AF7F,
        0x3CBB75,
        0x55C667,
        0x73D055,
        0x95D840,
        0xB8DE29,
        0xDCE319,
        0xFDE725
    ]
    viridis_pallete_rgb = [hex_to_rgb(x) for x in viridis_pallete]
    viridis_size = len(viridis_pallete_rgb)

    max_val = None
    min_val = None
    for y in range(data_rows):
        for x in range(data_cols):
            sample = data[y,x]
            if isnan(sample):
                sample = 0
                data[y,x] = sample
            if max_val == None or sample > max_val:
                max_val = sample
            if min_val == None or sample < min_val:
                min_val = sample

    max_val -= min_val
    data = (data-min_val) / max_val

    """ canvas for holding rgb image from the range 0 to 255 """
    canvas = np.zeros((pixel_height, pixel_width, 3), dtype=np.uint8)
    horizontal_step = int(pixel_width / data_cols)
    vertical_step = int(pixel_height / data_rows)

    # attempting to only manipulate green in rgb

    # step counters that are 1 indexed
    horizontal_count = 1
    vertical_count = 1
    for pixel_row in range(pixel_height):
        if (pixel_row >= vertical_count * vertical_step) and (vertical_count < data_rows):
            vertical_count += 1

        horizontal_count = 1
        for pixel_col in range(pixel_width):
            if (pixel_col >= horizontal_count * horizontal_step) and (horizontal_count < data_cols):
                horizontal_count += 1
            
            # 0 index
            x_idx = horizontal_count - 1
            y_idx = data_rows - vertical_count

            # determine green value from data
            percent = data[y_idx, x_idx]

            viridis_idx = int((viridis_size-1) * percent)
            curr_color = viridis_pallete_rgb[viridis_idx]

            red_val = curr_color[0]
            green_val = curr_color[1]
            blue_val = curr_color[2]
            
            canvas[pixel_row, pixel_col, 0] = red_val
            canvas[pixel_row, pixel_col, 1] = green_val
            canvas[pixel_row, pixel_col, 2] = blue_val

    im = Image.fromarray(canvas)
    if filename is not None:
        im.save(filename) # if you wanted to save the image

    return im

In [9]:
# creating a filter
filt = createFIR(51, 3500, fs=FS)

## Jorge's Dataset

In [10]:
path = os.getcwd() + "/MFCC_Images"
write_path = os.getcwd() + "/MFCC_Images"
labeled_directories = os.listdir(path)
total_processes = 1
total_time = 0
print_once = 1
for i in range(len(labeled_directories)):
    label = labeled_directories[i]
    curr_label = int(label)
    # print(curr_label)

    image_directory_path = f"{path}/{label}/raw_wav_audio"
    write_directory_path = f"{write_path}/{label}/gpu_images"
    raw_wav_directory = os.listdir(image_directory_path)

    if not os.path.exists(write_directory_path):
        os.makedirs(write_directory_path)
    
    for j in range(len(raw_wav_directory)):
        curr_raw_wav = raw_wav_directory[j]
        curr_raw_wav_path = f"{image_directory_path}/{curr_raw_wav}"
        raw_fs, raw_wav_audio = wavfile.read(curr_raw_wav_path)

        start_process_time = time()

        sig = soundDataToFloat(raw_wav_audio)

        filtered_sig = applyFIR(sig, filt)

        filtered_sig = np.array([filtered_sig[i] for i in range(0, len(filtered_sig), 6)])

        filtered_sig = trimSamples(filtered_sig, NN_DATA_COLS, NFFT, NOVERLAP, VOICED_THRESHOLD, FRAME_SETBACK)

        curr_processed = np.array(myMatrix.MFCC_Matrix(list(filtered_sig), DOWN_SAMPLED_FS, NFFT, NOVERLAP, 2, PREEMPHASIS_B, NFILT, NUM_CEPS), copy=False)[1:,:]

        end_process_time = (time() - start_process_time)*1000 # record how long the process took in ms

        write_wav_file_name = f"{write_directory_path}/{curr_label}_{j}_gpu_image.png"

        createImage(curr_processed, PIXEL_WIDTH, PIXEL_HEIGHT, NN_DATA_ROWS, NN_DATA_COLS, filename=write_wav_file_name)

        if print_once:
            print_once = 0
            print("First time: {} ms".format(end_process_time))

        total_time += end_process_time
        total_processes += 1

total_processes -= 1
Average_Process_Time = total_time / total_processes

print("GPU took on average {} ms to process each file for {} files".format(Average_Process_Time, total_processes))




First time: 2220.151901245117 ms
GPU took on average 2176.17045266061 ms to process each file for 601 files


## Jonathan's Dataset

In [11]:
path = os.getcwd() + "/Jonathan_MFCC_Images"
write_path = os.getcwd() + "/Jonathan_MFCC_Images"
labeled_directories = os.listdir(path)
total_processes = 1
total_time = 0
print_once = 1
for i in range(len(labeled_directories)):
    label = labeled_directories[i]
    curr_label = int(label)
    # print(curr_label)

    image_directory_path = f"{path}/{label}/raw_wav_audio"
    write_directory_path = f"{write_path}/{label}/gpu_images"
    raw_wav_directory = os.listdir(image_directory_path)

    if not os.path.exists(write_directory_path):
        os.makedirs(write_directory_path)
    
    for j in range(len(raw_wav_directory)):
        curr_raw_wav = raw_wav_directory[j]
        curr_raw_wav_path = f"{image_directory_path}/{curr_raw_wav}"
        raw_fs, raw_wav_audio = wavfile.read(curr_raw_wav_path)

        start_process_time = time()

        sig = soundDataToFloat(raw_wav_audio)

        filtered_sig = applyFIR(sig, filt)

        filtered_sig = np.array([filtered_sig[i] for i in range(0, len(filtered_sig), 6)])

        filtered_sig = trimSamples(filtered_sig, NN_DATA_COLS, NFFT, NOVERLAP, VOICED_THRESHOLD, FRAME_SETBACK)

        curr_processed = np.array(myMatrix.MFCC_Matrix(list(filtered_sig), DOWN_SAMPLED_FS, NFFT, NOVERLAP, 2, PREEMPHASIS_B, NFILT, NUM_CEPS), copy=False)[1:,:]

        end_process_time = (time() - start_process_time)*1000 # record how long the process took in ms

        write_wav_file_name = f"{write_directory_path}/{curr_label}_{j}_gpu_image.png"

        createImage(curr_processed, PIXEL_WIDTH, PIXEL_HEIGHT, NN_DATA_ROWS, NN_DATA_COLS, filename=write_wav_file_name)

        if print_once:
            print_once = 0
            print("First time: {} ms".format(end_process_time))

        total_time += end_process_time
        total_processes += 1

total_processes -= 1
Average_Process_Time = total_time / total_processes

print("GPU took on average {} ms to process each file for {} files".format(Average_Process_Time, total_processes))




First time: 2253.852605819702 ms


  raw_fs, raw_wav_audio = wavfile.read(curr_raw_wav_path)
  raw_fs, raw_wav_audio = wavfile.read(curr_raw_wav_path)


GPU took on average 2186.432551404197 ms to process each file for 613 files


## Max's Dataset

In [12]:
path = os.getcwd() + "/Max_MFCC_Images"
write_path = os.getcwd() + "/Max_MFCC_Images"
labeled_directories = os.listdir(path)
total_processes = 1
total_time = 0
print_once = 1
for i in range(len(labeled_directories)):
    label = labeled_directories[i]
    curr_label = int(label)
    # print(curr_label)

    image_directory_path = f"{path}/{label}/raw_wav_audio"
    write_directory_path = f"{write_path}/{label}/gpu_images"
    raw_wav_directory = os.listdir(image_directory_path)

    if not os.path.exists(write_directory_path):
        os.makedirs(write_directory_path)
    
    for j in range(len(raw_wav_directory)):
        curr_raw_wav = raw_wav_directory[j]
        curr_raw_wav_path = f"{image_directory_path}/{curr_raw_wav}"
        raw_fs, raw_wav_audio = wavfile.read(curr_raw_wav_path)

        start_process_time = time()

        sig = soundDataToFloat(raw_wav_audio)

        filtered_sig = applyFIR(sig, filt)

        filtered_sig = np.array([filtered_sig[i] for i in range(0, len(filtered_sig), 6)])

        filtered_sig = trimSamples(filtered_sig, NN_DATA_COLS, NFFT, NOVERLAP, VOICED_THRESHOLD, FRAME_SETBACK)

        curr_processed = np.array(myMatrix.MFCC_Matrix(list(filtered_sig), DOWN_SAMPLED_FS, NFFT, NOVERLAP, 2, PREEMPHASIS_B, NFILT, NUM_CEPS), copy=False)[1:,:]

        end_process_time = (time() - start_process_time)*1000 # record how long the process took in ms

        write_wav_file_name = f"{write_directory_path}/{curr_label}_{j}_gpu_image.png"

        createImage(curr_processed, PIXEL_WIDTH, PIXEL_HEIGHT, NN_DATA_ROWS, NN_DATA_COLS, filename=write_wav_file_name)

        if print_once:
            print_once = 0
            print("First time: {} ms".format(end_process_time))

        total_time += end_process_time
        total_processes += 1

total_processes -= 1
Average_Process_Time = total_time / total_processes

print("GPU took on average {} ms to process each file for {} files".format(Average_Process_Time, total_processes))




First time: 2274.200439453125 ms
GPU took on average 2193.07440381995 ms to process each file for 434 files
