In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import os

In [2]:
def create_train_test_df(path_train='../data/new/train/',
                         path_test='../data/new/test/'):
    """Build one DataFrame listing every audio file of the train and test sets.

    Each row describes one file found directly inside ``path_train`` or
    ``path_test``:

    * ``language``   - the first two characters of the file name,
    * ``audio_path`` - the directory joined with the file name,
    * ``set``        - ``'train'`` or ``'test'`` depending on the directory.

    Parameters
    ----------
    path_train : str
        Directory holding the training files.
    path_test : str
        Directory holding the test files. The previous version pointed this
        at the *train* directory, so the "test" rows were just the training
        files relabelled.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows. Each part keeps its own 0-based
        index, exactly as ``pd.concat`` produces it.

    Raises
    ------
    FileNotFoundError
        If either directory does not exist (raised by ``os.listdir``).
    """
    frames = []
    for set_name, directory in (('train', path_train), ('test', path_test)):
        # os.listdir returns entries in arbitrary order; sort so that the
        # resulting frame is identical from run to run.
        names = sorted(os.listdir(directory))
        frames.append(pd.DataFrame(data={
            'language': [name[:2] for name in names],
            'audio_path': [os.path.join(directory, name) for name in names],
            'set': [set_name] * len(names),
        }))

    return pd.concat(frames)

In [3]:
def create_sql_table(df):
    """Write ``df`` to the MySQL table ``languages.language_audio_sets``.

    Connects as ``root`` to the MySQL server on ``localhost:3306`` using the
    password stored in the ``MySQLpwd`` environment variable, then replaces
    the table ``language_audio_sets`` in schema ``languages`` with the
    contents of ``df`` (the DataFrame index is not written).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.

    Raises
    ------
    RuntimeError
        If the ``MySQLpwd`` environment variable is not set. Previously this
        surfaced as a ``TypeError`` from concatenating ``None`` to a string.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote_plus

    pw = os.getenv('MySQLpwd')
    if pw is None:
        raise RuntimeError(
            "Environment variable 'MySQLpwd' is not set; "
            "cannot connect to MySQL.")

    # Characters such as '@', '/' or ':' in the password would otherwise be
    # parsed as part of the URL structure, so escape them.
    connection_string = (
        'mysql+pymysql://root:' + quote_plus(pw) + '@localhost:3306/')
    engine = create_engine(connection_string)

    # `schema` is passed by keyword rather than by position so the call does
    # not depend on the positional order of to_sql's optional arguments.
    df.to_sql(
        'language_audio_sets',
        engine,
        schema='languages',
        if_exists='replace',
        index=False)

In [4]:
# Build the combined train/test file listing.
df = create_train_test_df()
# Writing the listing to MySQL is disabled; uncomment to (re)create the table.
#create_sql_table(df)

FileNotFoundError: [Errno 2] No such file or directory: '../data/new/train/'

In [None]:
# (rows, columns) of the combined train/test listing.
df.shape

## Generate features for samples

In [None]:
import glob
import os
import time

import numpy as np
import soundfile as sf

import common

In [None]:
# Seed passed as `random_state` when the files of a fold are shuffled.
SEED = 42

# Geometry of one sample. process_audio asserts that every filter-bank array
# has shape (WIDTH, FB_HEIGHT); INPUT_SHAPE is the (height, width, depth)
# layout that normalize_fb produces after swapping axes and adding a channel.
FB_HEIGHT = 40  # filter banks
WIDTH = 1000  # frames per sample; presumably FRAGMENT_DURATION s at a 10 ms stride - TODO confirm
COLOR_DEPTH = 1  # size of the trailing channel axis added by normalize_fb
INPUT_SHAPE = (FB_HEIGHT, WIDTH, COLOR_DEPTH)

# dtype the feature arrays are cast to and asserted against.
DATA_TYPE = 'float32'
# Key under which the feature array is read back from the .npz files.
DATA_KEY = 'data'

# Every fold needs one uid per (language, gender) pair from these lists.
LANGUAGES = ['en', 'de', 'es']
GENDERS = ['m', 'f']

# NOTE(review): not referenced in the visible cells. Presumably the positions
# of language and gender in the '_'-separated file name - confirm.
LANGUAGE_INDEX = 0
GENDER_INDEX = 1

# NOTE(review): not referenced in the visible cells; meaning unknown from here.
THRESHOLD = 0.8

# NOTE(review): not referenced in the visible cells.
FRAGMENT_DURATION = 10  # seconds

# Root directory of the dataset; the 'train' and 'test' subdirectories are
# joined onto it by the cells below.
DATASET_DIST = '../data/spoken_language_audio_dataset/'

In [None]:
def generate_fb_and_mfcc(
        signal,
        sample_rate,
        pre_emphasis=0.97,
        frame_size=0.025,
        frame_stride=0.01,
        nfft=512,
        nfilt=40):
    """Compute log mel filter-bank energies for a mono audio signal.

    Despite the name, only filter banks are returned; no MFCCs are computed.

    Pipeline: pre-emphasis -> framing with zero padding -> Hamming window ->
    power spectrum -> triangular mel filter bank -> ``20 * log10``.

    Parameters
    ----------
    signal : array_like, shape (n_samples,)
        Mono audio samples.
    sample_rate : int or float
        Sampling rate of ``signal`` in Hz.
    pre_emphasis : float, optional
        Pre-emphasis filter coefficient.
    frame_size : float, optional
        Frame length in seconds.
    frame_stride : float, optional
        Step between consecutive frames in seconds.
    nfft : int, optional
        FFT length. ``np.fft.rfft`` crops each frame to ``nfft`` samples when
        the frame is longer (e.g. 551-sample frames at 22050 Hz with the
        default 512) and zero-pads it when it is shorter.
    nfilt : int, optional
        Number of mel filters, i.e. number of output columns.

    Returns
    -------
    numpy.ndarray, shape (num_frames, nfilt)
        Filter-bank energies in dB. With the default arguments the result is
        numerically identical to the previous hard-coded implementation.

    Raises
    ------
    ValueError
        If ``signal`` is empty or not one-dimensional. Multichannel input was
        previously flattened silently by ``np.append``, producing garbage.
    """
    signal = np.asarray(signal)
    if signal.ndim != 1 or signal.size == 0:
        raise ValueError(
            "signal must be a non-empty one-dimensional array, got shape "
            "{shape}".format(shape=signal.shape))

    # Pre-Emphasis
    emphasized_signal = np.append(
        signal[0],
        signal[1:] - pre_emphasis * signal[:-1])

    # Framing: convert from seconds to samples
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)

    # Make sure that we have at least 1 frame. Without the max(), a signal
    # exactly one frame long yielded zero frames.
    num_frames = max(1, int(
        np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)))

    # Pad the signal so that all frames have an equal number of samples
    # without truncating any samples from the original signal
    pad_signal_length = num_frames * frame_step + frame_length
    pad_signal = np.append(
        emphasized_signal,
        np.zeros(pad_signal_length - signal_length))

    # Row i holds the sample indices of frame i.
    indices = (
        np.arange(frame_length)[np.newaxis, :] +
        (np.arange(num_frames) * frame_step)[:, np.newaxis]
    )
    frames = pad_signal[indices]

    # Window
    frames *= np.hamming(frame_length)

    # Magnitude of the FFT, then Power Spectrum
    mag_frames = np.absolute(np.fft.rfft(frames, nfft))
    pow_frames = (1.0 / nfft) * (mag_frames ** 2)

    # Filter Banks: nfilt triangles equally spaced on the mel scale between
    # 0 Hz and the Nyquist frequency
    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Hz -> Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)             # Mel -> Hz

    # FFT bin of each filter edge (named `bins` so the builtin `bin` is not
    # shadowed)
    bins = np.floor((nfft + 1) * hz_points / sample_rate)

    fbank = np.zeros((nfilt, nfft // 2 + 1))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bins[m - 1])   # left
        f_m = int(bins[m])             # center
        f_m_plus = int(bins[m + 1])    # right

        # Rising slope, then falling slope of the triangle
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
    filter_banks = np.dot(pow_frames, fbank.T)

    # Numerical Stability: avoid log10(0)
    filter_banks = np.where(
        filter_banks == 0,
        np.finfo(float).eps,
        filter_banks)

    # dB
    filter_banks = 20 * np.log10(filter_banks)

    return filter_banks

In [None]:
def process_audio(input_dir, debug=False):
    """Compute and store filter-bank features for every FLAC file in a folder.

    For each ``*.flac`` file directly inside ``input_dir`` the audio is read,
    converted with ``generate_fb_and_mfcc`` and cast to ``DATA_TYPE``. The
    result is saved next to the source file as ``<name>.fb.npz``, with the
    array stored under the key ``DATA_KEY``.

    Parameters
    ----------
    input_dir : str
        Directory that contains the ``.flac`` files.
    debug : bool, optional
        When true, process only the first file, print how long it took,
        write the features to ``fb_image.png`` and return.

    Raises
    ------
    AssertionError
        If a file is empty, is not sampled at 22050 Hz, or yields features
        whose dtype/shape differ from ``DATA_TYPE`` / ``(WIDTH, FB_HEIGHT)``.
    """
    files = []

    extensions = ['*.flac']
    for extension in extensions:
        files.extend(glob.glob(os.path.join(input_dir, extension)))

    for file in files:
        print(file)

        # Per-file timer; the debug branch used to read an undefined `start`.
        start = time.time()

        signal, sample_rate = sf.read(file)
        assert len(signal) > 0
        assert sample_rate == 22050

        fb = generate_fb_and_mfcc(signal, sample_rate)
        fb = fb.astype(DATA_TYPE, copy=False)

        assert fb.dtype == DATA_TYPE
        assert fb.shape == (WIDTH, FB_HEIGHT)

        # .npz extension is added automatically
        file_without_ext = os.path.splitext(file)[0]

        # Store under DATA_KEY, the same key generate_fold reads back.
        np.savez_compressed(file_without_ext + '.fb', **{DATA_KEY: fb})

        if debug:
            end = time.time()
            print("It took [s]: ", end - start)

            # data is casted to uint8, i.e. (0, 255)
            import imageio
            imageio.imwrite('fb_image.png', fb)

            # Stop after the first file. `exit(0)` would shut down the
            # notebook kernel, so just return.
            return

In [None]:
#process_audio(os.path.join(DATASET_DIST, 'test'))
#process_audio(os.path.join(DATASET_DIST, 'train'))

## Normalize features and build folds

In [None]:
def has_uids(uids):
    """Report whether every (language, gender) bucket still contains a uid.

    ``uids`` is indexed as ``uids[language][gender]`` for each entry of
    ``LANGUAGES`` and ``GENDERS``. Returns ``False`` as soon as one bucket
    has length zero, ``True`` otherwise.
    """
    return all(
        len(uids[lang_code][gender_code]) != 0
        for lang_code in LANGUAGES
        for gender_code in GENDERS
    )


def generate_fold(
        uids,
        input_dir,
        input_ext,
        output_dir,
        group,
        fold_index,
        input_shape,
        normalize,
        output_shape):
    """Build one fold: a memory-mapped feature file plus a metadata file.

    One uid is popped from ``uids[language][gender]`` for every combination
    of ``LANGUAGES`` and ``GENDERS`` (so ``uids`` is mutated). All files in
    ``input_dir`` whose name contains one of those uids and ends with
    ``input_ext`` are collected, put in a deterministic shuffled order,
    normalized, and written to:

    * ``<output_dir>/<group>_data.fold<fold_index>.npy`` - raw ``np.memmap``
      of dtype ``DATA_TYPE`` and shape ``(n_files,) + output_shape``;
    * ``<output_dir>/<group>_metadata.fold<fold_index>.npy`` - ``np.save`` of
      ``(language, gender, filename)`` tuples, in the same order.

    Parameters
    ----------
    uids : mapping
        ``uids[language][gender]`` must support ``pop()``.
    input_dir : str
        Directory holding the feature files.
    input_ext : str
        File-name suffix of the feature files (e.g. ``'.fb.npz'``).
    output_dir : str
        Existing directory the fold files are written to.
    group : str
        Prefix of the output file names (e.g. ``'train'``).
    fold_index : int
        Number embedded in the output file names.
    input_shape : tuple
        Shape every loaded array must have.
    normalize : callable
        Maps a loaded array to an array of shape ``output_shape``.
    output_shape : tuple
        Shape of one normalized sample.

    Raises
    ------
    AssertionError
        If a loaded array does not have ``input_shape`` / ``DATA_TYPE``.
    """
    # pull one uid for each (language, gender) pair
    fold_uids = []
    for language in LANGUAGES:
        for gender in GENDERS:
            fold_uids.append(uids[language][gender].pop())

    # find files for given uids
    fold_files = []
    for fold_uid in fold_uids:
        pattern = '*{uid}*{extension}'.format(
            uid=fold_uid,
            extension=input_ext)
        # `glob` is imported as a module in this notebook, so the function
        # is `glob.glob`; calling the module itself raised TypeError.
        fold_files.extend(glob.glob(os.path.join(input_dir, pattern)))

    # Sort first so the seeded shuffle starts from the same order each run.
    # `shuffle` was never imported here; a seeded index permutation from
    # numpy gives the same kind of reproducible shuffle.
    # NOTE(review): presumably sklearn.utils.shuffle was intended - confirm
    # if the exact historical ordering matters.
    fold_files = sorted(fold_files)
    order = np.arange(len(fold_files))
    np.random.RandomState(SEED).shuffle(order)
    fold_files = [fold_files[position] for position in order]

    metadata = []

    # create a file array
    data_name = "{group}_data.fold{index}.npy".format(
        group=group, index=fold_index)
    features = np.memmap(
        os.path.join(output_dir, data_name),
        dtype=DATA_TYPE,
        mode='w+',
        shape=(len(fold_files),) + output_shape)

    # append data to a file array
    # append metadata to an array
    for index, fold_file in enumerate(fold_files):
        print(fold_file)

        # File names are expected to start with '<language>_<gender>_'.
        filename = common.get_filename(fold_file)
        name_parts = filename.split('_')
        language = name_parts[0]
        gender = name_parts[1]

        data = np.load(fold_file)[DATA_KEY]
        assert data.shape == input_shape
        assert data.dtype == DATA_TYPE

        features[index] = normalize(data)
        metadata.append((language, gender, filename))

    assert len(metadata) == len(fold_files)

    # store metadata in a file
    metadata_name = "{group}_metadata.fold{index}.npy".format(
        group=group,
        index=fold_index)
    np.save(
        os.path.join(output_dir, metadata_name),
        metadata)

    # flush changes to a disk
    features.flush()
    del features


def generate_folds(
        input_dir,
        input_ext,
        output_dir,
        group,
        input_shape,
        normalize,
        output_shape):
    """Generate numbered folds for ``group`` until some uid bucket runs empty.

    Creates ``output_dir`` if needed, groups the feature files found in
    ``input_dir`` by uid with ``common.group_uids``, then calls
    ``generate_fold`` with fold indices 1, 2, ... for as long as ``has_uids``
    reports a uid left in every (language, gender) bucket.

    Parameters
    ----------
    input_dir : str
        Directory holding the feature files.
    input_ext : str
        File-name suffix of the feature files (e.g. ``'.fb.npz'``).
    output_dir : str
        Directory the fold files are written to; created when missing.
    group : str
        Prefix of the output file names, also used in progress messages.
    input_shape : tuple
        Expected shape of every stored feature array.
    normalize : callable
        Per-sample normalization passed through to ``generate_fold``.
    output_shape : tuple
        Shape of one normalized sample.
    """
    # exist_ok avoids the race between checking and creating the directory.
    os.makedirs(output_dir, exist_ok=True)

    # `glob` is imported as a module in this notebook, so the function is
    # `glob.glob`; calling the module itself raised TypeError.
    files = glob.glob(os.path.join(input_dir, '*' + input_ext))

    uids = common.group_uids(files)

    fold_index = 1
    while has_uids(uids):
        print("[{group}] Fold {index}".format(group=group, index=fold_index))

        # generate_fold pops one uid per bucket, which is what eventually
        # makes has_uids return False.
        generate_fold(
            uids,
            input_dir,
            input_ext,
            output_dir,
            group,
            fold_index,
            input_shape,
            normalize,
            output_shape)

        fold_index += 1


def normalize_fb(spectrogram):
    """Normalize a filter-bank array and reshape it to a 3-D tensor.

    Steps: mean and variance normalization with ``speechpy.processing.cmvn``,
    min-max scaling of the whole array to the range [0, 1], swapping of the
    two axes, and appending a trailing axis of size ``COLOR_DEPTH``.

    Parameters
    ----------
    spectrogram : numpy.ndarray
        Two-dimensional feature array of dtype ``DATA_TYPE``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(spectrogram.shape[1], spectrogram.shape[0],
        COLOR_DEPTH)`` with minimum 0.0 and maximum 1.0.

    Raises
    ------
    ValueError
        If the array has no dynamic range after mean/variance normalization
        (all values equal, or NaN). This used to produce NaNs from a division
        by zero and then fail the max/min assertions.
    AssertionError
        If the result is not of dtype ``DATA_TYPE`` or is not scaled to
        exactly [0, 1].
    """
    # speechpy is not imported by any cell of this notebook, so the function
    # failed with NameError on a fresh kernel; import it where it is used.
    import speechpy

    # Mean and Variance Normalization
    spectrogram = speechpy.processing.cmvn(
        spectrogram,
        variance_normalization=True)

    # MinMax Scaler, scale values between (0,1)
    lowest = np.min(spectrogram)
    value_range = np.max(spectrogram) - lowest
    if not value_range > 0:
        raise ValueError(
            "cannot min-max scale a spectrogram without dynamic range")
    normalized = (spectrogram - lowest) / value_range

    # Swap the two axes: (rows, cols) -> (cols, rows)
    normalized = np.swapaxes(normalized, 0, 1)

    # Reshape, tensor 3d
    (height, width) = normalized.shape
    normalized = normalized.reshape(height, width, COLOR_DEPTH)

    assert normalized.dtype == DATA_TYPE
    assert np.max(normalized) == 1.0
    assert np.min(normalized) == 0.0

    return normalized

In [None]:
# fb
# Build the folds for both dataset splits and report the total wall time.
# `start` was never defined before, so the final print raised NameError.
start = time.time()

for group in ('test', 'train'):
    generate_folds(
        os.path.join(DATASET_DIST, group),
        '.fb.npz',
        output_dir='build/folds',
        group=group,
        input_shape=(WIDTH, FB_HEIGHT),
        normalize=normalize_fb,
        output_shape=(FB_HEIGHT, WIDTH, COLOR_DEPTH)
    )

end = time.time()
print("It took [s]: ", end - start)