In [1]:
from scipy.io import wavfile
import numpy as np
from scipy import signal
# from librosa import feature, effects, load
from sklearn.preprocessing import minmax_scale
import os.path
import pickle


def extract(wav_file, nfft=64, window_length=0.1, mel=False, flatten=True, augment=False, noise=False):
    """Split a WAV file into half-overlapping 16384-sample windows and stack them.

    Parameters
    ----------
    wav_file : str or file-like
        Passed straight to ``scipy.io.wavfile.read``.
    nfft : int
        ``n_fft`` for ``librosa.feature.mfcc``; the MFCC hop is ``round(nfft / 2)``.
        Only used when ``mel`` is true.
    window_length : float
        Currently unused: the window size is fixed at 16384 samples.
    mel : bool
        If true, each window is replaced by its MFCC matrix (requires librosa).
    flatten : bool
        If true, each window's feature array is flattened before being stored.
    augment : bool
        Only honoured when ``flatten`` is false: additionally stores the window
        pitch-shifted by +4 and -4 steps (requires librosa).
    noise : bool
        Only honoured when ``flatten`` is false: additionally stores the window
        with additive N(0, 1) noise and with multiplicative N(1, 0.1) noise.

    Returns
    -------
    numpy.ndarray
        ``np.stack`` of every collected feature array, in window order.

    Raises
    ------
    ValueError
        If the recording is shorter than one window.
    """
    # librosa is needed only on the optional paths, so import it lazily there;
    # previously these names were never imported and those paths hit NameError.
    if mel:
        from librosa import feature
    if augment and not flatten:
        from librosa import effects

    rate, frames = wavfile.read(wav_file)
    window = 16384  # fixed size; the round(window_length * rate) variant is disabled
    hop = window // 2
    feat = []

    # "+ 1" keeps the final full window when the signal length lines up exactly
    # with the hop grid (e.g. a file that is exactly one window long).
    for i in range(0, len(frames) - window + 1, hop):
        if mel:
            # NOTE(review): this slice is one sample shorter than the raw branch below.
            pxx = np.array(feature.mfcc(y=frames[i:i + window - 1],
                                        sr=rate,
                                        n_fft=nfft,
                                        hop_length=round(nfft / 2),
                                        fmax=8000))
        else:
            pxx = np.array(frames[i:i + window])
        if flatten:
            feat.append(pxx.flatten())
        else:
            feat.append(pxx)
            # TODO: experiments with augmentation
            if augment:
                feat.append(effects.pitch_shift(pxx, sr=rate, n_steps=4.0))
                feat.append(effects.pitch_shift(pxx, sr=rate, n_steps=-4.0))
            if noise:
                feat.append(pxx + np.random.normal(0, 1, len(pxx)))
                feat.append(pxx * np.random.normal(1, 0.1, len(pxx)))

    if not feat:
        raise ValueError("{!r} holds {} samples, fewer than one {}-sample window".format(
            wav_file, len(frames), window))
    return np.stack(feat)


def extract_features(file_wet, file_dry, mel=False, flatten=True, scaling=False, categorical=True, augment=False, noise=False):
    """Build (features, labels) for a wet/dry recording pair, cached on disk.

    Windows from ``file_wet`` are labelled 1 and windows from ``file_dry`` are
    labelled 0; wet samples come first in the returned arrays.

    Parameters
    ----------
    file_wet, file_dry : str
        Paths of the two WAV files, forwarded to ``extract``.
    mel, flatten, augment, noise : bool
        Forwarded unchanged to ``extract``.
    scaling : bool
        Apply ``minmax_scale`` to the features. Only takes effect when
        ``flatten`` is also true.
    categorical : bool
        One-hot encode the labels into 2 classes with Keras ``to_categorical``.

    Returns
    -------
    tuple
        ``(features, labels)``.

    Notes
    -----
    The result is pickled in the current working directory under a name derived
    from the two paths only. NOTE(review): the option flags are not part of that
    name, so a call with different flags silently reuses the earlier cache.
    """
    # Turn both path separators into "_" to get a flat file name. The
    # replacements must accumulate; restarting from the original path on every
    # iteration would keep only the last character's replacement.
    fw, fd = file_wet, file_dry
    for char in "\\/":
        fw = fw.replace(char, "_")
        fd = fd.replace(char, "_")
    pickle_file = fw + "-" + fd + ".pkl"

    if os.path.exists(pickle_file):
        print("Using pickle file", pickle_file)
        # SECURITY: unpickling executes arbitrary code; only load cache files
        # this function wrote itself.
        with open(pickle_file, "rb") as f:
            features, labels = pickle.load(f)
        return features, labels

    features_wet = extract(file_wet, mel=mel, flatten=flatten, augment=augment, noise=noise)
    features_dry = extract(file_dry, mel=mel, flatten=flatten, augment=augment, noise=noise)
    print(features_dry, features_dry.shape)
    labels_wet = np.ones(features_wet.shape[0])
    labels_dry = np.zeros(features_dry.shape[0])
    features = np.concatenate((features_wet, features_dry))
    labels = np.concatenate((labels_wet, labels_dry))
    if categorical:
        # Imported here so Keras is only loaded when one-hot labels are requested.
        from keras.utils import to_categorical
        labels = to_categorical(labels, 2)
    if scaling and flatten:
        features = minmax_scale(features)
    with open(pickle_file, "wb") as f:
        pickle.dump((features, labels), f, protocol=4)
    return features, labels


def get_last(path, type):
    """Return the lexicographically greatest file of the requested kind.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to the glob pattern (e.g. ``"models/cnn/"``).
    type : str
        ``"weights"`` selects ``*.h5`` files, ``"model"`` selects ``*.yaml`` files.

    Returns
    -------
    str or None
        The greatest matching path by string comparison, or ``None`` when
        nothing matches.

    Raises
    ------
    ValueError
        If ``type`` is neither ``"weights"`` nor ``"model"``.
    """
    import glob

    patterns = {"weights": "*.h5", "model": "*.yaml"}
    if type not in patterns:
        raise ValueError("unknown type {!r}; expected 'weights' or 'model'".format(type))
    # max() already picks the last name in sort order, so no explicit sort is needed.
    return max(glob.glob(path + patterns[type]), default=None)


In [2]:
from feature_extraction import extract_features, get_last
from sklearn.metrics import recall_score, accuracy_score
from sklearn.utils import shuffle
import numpy as np
from sklearn.model_selection import train_test_split
from time import time
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout, Dense, Flatten, LSTM
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, Callback
from keras import optimizers, regularizers
from keras.utils import to_categorical
from datetime import datetime
import os

# Timestamp taken once at cell execution; reused in the log, model and weights file names.
dt = datetime.strftime(datetime.now(), "%d-%m-%Y.%H-%M")

class TestCallback(Callback):
    """Evaluate the model on a fixed dataset after every epoch and log it to CSV.

    Each epoch appends one row ``number,epoch,loss,acc,train_acc`` to
    ``models/cnn/log.<dt>.csv``, where ``dt`` is the module-level timestamp.
    """

    def __init__(self, test_data, number):
        """
        Parameters
        ----------
        test_data : tuple
            ``(x, y)`` pair handed to ``model.evaluate``.
        number : int
            Identifier written in the first CSV column to tell datasets apart.
        """
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.test_data = test_data
        self.number = number

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``test_data`` and append the result row for ``epoch``."""
        logs = logs or {}  # avoid a shared mutable default argument
        x, y = self.test_data
        loss, acc = self.model.evaluate(x, y, verbose=0)
        # The training-accuracy key is "acc" or "accuracy" depending on the Keras version.
        train_acc = logs.get("acc", logs.get("accuracy"))
        log_filename = "models/cnn/log." + dt + ".csv"
        with open(log_filename, "a") as log:
            log.write("{},{},{},{},{}\n".format(self.number, epoch, loss, acc, train_acc))

def def_model_cnn_blstm(input_shape):
    """Build, compile and print the time-distributed CNN + bidirectional LSTM classifier.

    The network is a stack of TimeDistributed Conv1D layers (kernel size 64,
    "same" padding, tanh), each followed by max pooling of 2 and, for the first
    four stages, dropout of 0.4. A final Conv1D is globally average-pooled, fed
    to a Bidirectional LSTM(256) and a 2-way softmax.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape, passed as ``input_shape`` to the first layer.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy, Adam, accuracy).
    """
    conv_opts = dict(padding="same", activation='tanh')
    # (filters, dropout rate or None) for every pooled stage after the first one.
    pooled_stages = [
        (64, 0.4),
        (64, 0.4),
        (128, 0.4),
        (128, None),
        (128, None),
        (256, None),
    ]

    net = Sequential()

    # First stage carries the input shape.
    net.add(TimeDistributed(Conv1D(filters=64, kernel_size=64, strides=1, **conv_opts),
                            input_shape=input_shape))
    net.add(TimeDistributed(MaxPooling1D(2)))
    net.add(TimeDistributed(Dropout(0.4)))

    for n_filters, drop_rate in pooled_stages:
        net.add(TimeDistributed(Conv1D(n_filters, 64, **conv_opts)))
        net.add(TimeDistributed(MaxPooling1D(2)))
        if drop_rate is not None:
            net.add(TimeDistributed(Dropout(drop_rate)))

    # Last convolution is collapsed over its length instead of being pooled by 2.
    net.add(TimeDistributed(Conv1D(256, 64, **conv_opts)))
    net.add(TimeDistributed(GlobalAveragePooling1D()))

    # Recurrent head and classifier.
    net.add(Bidirectional(LSTM(256)))
    net.add(Dense(2, activation='softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    net.summary()
    return net


def train():
    """Extract the datasets, build the model and train it for 75 epochs.

    Resumes from the newest ``*.h5`` file in ``models/cnn/`` when one exists.
    Writes the model architecture as YAML before training, a weights checkpoint
    after every epoch, and the final weights afterwards. Set 1 is used as the
    validation data; the training set and sets 1-3 are each evaluated after
    every epoch through ``TestCallback``.
    """
    X_train, X_1, X_2, X_3, y_train, y_1, y_2, y_3 = ex_feat()
    start = time()
    print("\nTraining model...")
    model = def_model_cnn_blstm(X_train.shape[1:])

    # Checkpoints, the YAML dump and the TestCallback CSV log are all written
    # here, so make sure the directory exists before anything opens a file in it.
    os.makedirs("models/cnn", exist_ok=True)

    weights = get_last("models/cnn/", "weights")
    if weights is not None:
        model.load_weights(weights)
    print("Using weights:", weights)
    print("Dataset shape:", X_train.shape)

    # tbCallback = TensorBoard(histogram_freq=1, write_grads=True, write_graph=False)  # Tensorboard callback
    # esCallback = EarlyStopping(monitor="val_loss", min_delta=0.01, patience=5, verbose=1)  # early stopping callback
    mcCallback = ModelCheckpoint("models/cnn/weights.{epoch:02d}-{val_acc:.4f}.h5", monitor='val_acc', verbose=0,
                                 save_best_only=False, save_weights_only=True,
                                 mode='auto', period=1)  # saving weights every epoch
    # The training set gets id 0 so its CSV rows are distinguishable from set 3.
    testCallback0 = TestCallback((X_train, y_train), 0)
    testCallback1 = TestCallback((X_1, y_1), 1)
    testCallback2 = TestCallback((X_2, y_2), 2)
    testCallback3 = TestCallback((X_3, y_3), 3)

    # Persist the architecture under the same timestamp as the logs and weights.
    model_filename = "models/cnn/model." + dt + ".yaml"
    with open(model_filename, "w") as model_yaml:
        model_yaml.write(model.to_yaml())

    model.fit(X_train, y_train, validation_data=(X_1, y_1),
              batch_size=128, epochs=75, verbose=1,
              callbacks=[mcCallback, testCallback0, testCallback1, testCallback2, testCallback3])  # , esCallback])

    weights_filename = "models/cnn/" + dt + ".h5"
    model.save_weights(weights_filename)
    end = time()
    training_time = end - start
    print("\nTook %.3f sec." % training_time)


def ex_feat():
    """Load the training set and three evaluation sets, reshaped for the model.

    Every wet/dry pair goes through ``extract_features`` with ``mel=False``,
    ``flatten=False``, ``scaling=True`` and ``categorical=True``. Each feature
    array then gets a singleton axis inserted at position 1 and another
    appended at position 3.

    Returns
    -------
    tuple
        ``(X_train, X_1, X_2, X_3, y_train, y_1, y_2, y_3)``.
    """
    def to_model_input(windows):
        # expand_dims(axis=1) -> reshape to (n, 1, shape[2]) -> expand_dims(axis=3)
        windows = np.expand_dims(windows, axis=1)
        windows = windows.reshape((windows.shape[0], 1, int(windows.shape[2])))
        return np.expand_dims(windows, axis=3)

    t_begin = time()
    print("\nExtracting features...")

    # (wet, dry) file pairs: the training set first, then evaluation sets 1-3.
    file_pairs = (
        ("yt_data/wet_0.wav", "yt_data/dry_0.wav"),
        ("dataset/wet1/audio_mono.wav", "dataset/dry1/audio_mono.wav"),
        ("dataset/wet2/audio_mono.wav", "dataset/dry2/audio_mono.wav"),
        ("dataset/wet3/audio_mono.wav", "dataset/dry3/audio_mono.wav"),
    )

    raw_inputs, targets = [], []
    for wet_path, dry_path in file_pairs:
        feats, labs = extract_features(wet_path, dry_path,
                                       mel=False, flatten=False, scaling=True, categorical=True)
        raw_inputs.append(feats)
        targets.append(labs)

    # Reshape only after every pair has been loaded.
    inputs = [to_model_input(arr) for arr in raw_inputs]

    t_end = time()
    print("Took %.3f sec." % (t_end - t_begin))

    return tuple(inputs) + tuple(targets)


Using TensorFlow backend.


In [3]:
# Run feature extraction followed by model training; the output below is from this call.
train()


Extracting features...
Using pickle file yt_data_wet_0.wav-yt_data_dry_0.wav.pkl
Using pickle file dataset_wet1_audio_mono.wav-dataset_dry1_audio_mono.wav.pkl
Using pickle file dataset_wet2_audio_mono.wav-dataset_dry2_audio_mono.wav.pkl
Using pickle file dataset_wet3_audio_mono.wav-dataset_dry3_audio_mono.wav.pkl
Took 17.509 sec.

Training model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 1, 16384, 64)      4160      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 1, 8192, 64)       0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 1, 8192, 64)       0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 1, 8192, 64)       262208    
______________________________________________________

KeyboardInterrupt: 