In [2]:
%pylab inline
import librosa
import librosa.filters
import math
import numpy as np
from scipy import signal

import glob
import os

from scipy.io import wavfile
from scipy import interpolate
import pyworld, pysptk
from nnmnkwii.metrics import melcd
from fastdtw import fastdtw

import matplotlib.pyplot as plt
from IPython.display import Audio

import soundfile as sf

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Directories holding the wav files of the two speakers, given relative to the
# notebook's working directory.
# NOTE(review): neither path contains "~", so os.path.expanduser returns it
# unchanged here.
base1 = os.path.expanduser("../../dataset/wav_data/uemura_normal/")
base2 = os.path.expanduser("../../dataset/wav_data/tsuchiya_normal/")

class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes.

    ``d.key`` and ``d['key']`` refer to the same value because the instance
    attribute namespace is the mapping itself.
    """

    def __init__(self, *args, **kwargs):
        # Accepts exactly what dict() accepts (mapping, iterable of pairs, kwargs).
        super().__init__(*args, **kwargs)
        # Point the attribute storage at the dict so both views stay in sync.
        self.__dict__ = self

# from https://github.com/keithito/tacotron/blob/08989cc3553b3a916a31f565e4f20e34bf19172f/hparams.py
# Audio analysis/synthesis settings shared by the helper functions below.
hparams = AttrDict(
    # Audio:
    num_mels=80,          # number of mel bands produced by melspectrogram
    num_freq=1025,        # linear-frequency bins; n_fft = (num_freq - 1) * 2
    sample_rate=24000,    # rate load_wav resamples to
    frame_length_ms=50,   # STFT window length in milliseconds
    frame_shift_ms=12.5,  # STFT hop in milliseconds
    preemphasis=0.97,     # coefficient of the pre-emphasis filter
    min_level_db=-100,    # dB floor used by _normalize / _denormalize
    ref_level_db=20,      # reference level subtracted in spectrogram()

    # Griffin-Lim inversion. inv_spectrogram and _griffin_lim read these two
    # keys; without them those functions fail with AttributeError. The values
    # are the ones used in the upstream hparams.py referenced above.
    griffin_lim_iters=60,  # number of phase-refinement iterations
    power=1.5,             # exponent applied to magnitudes before Griffin-Lim
)

## functions

In [4]:
# from https://github.com/keithito/tacotron/blob/53a840c030a1899a0510da4965d96b53a29d6679/util/audio.py

def load_wav(path):
    '''Reads an audio file and keeps only its non-silent intervals.

    The file is loaded through librosa at hparams.sample_rate. The intervals
    reported by librosa.effects.split (top_db=25) are then joined back
    together with librosa.effects.remix.

    Args:
        path: location of the audio file.

    Returns:
        The remixed waveform.
    '''
    samples, _ = librosa.core.load(path, sr=hparams.sample_rate)
    kept_intervals = librosa.effects.split(samples, top_db=25)
    return librosa.effects.remix(samples, intervals=kept_intervals)


def save_wav(wav, path, sample_rate=None):
    '''Peak-normalises a waveform and writes it as a 24-bit PCM file.

    Args:
        wav: 1-D (or frames x channels) array of samples. It is NOT modified;
            scaling happens on a copy.
        path: destination file.
        sample_rate: rate written to the file header. Defaults to
            hparams.sample_rate, the rate load_wav produces, so a saved file
            plays back at the speed it was loaded at.
    '''
    if sample_rate is None:
        sample_rate = hparams.sample_rate

    samples = np.asarray(wav, dtype=np.float64)
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    # The 0.01 floor keeps near-silent input from being blown up to full scale.
    scaled = samples * (32767 / max(0.01, peak))
    # Hand soundfile int16 samples: float input is treated as full scale at
    # +/-1.0, so floats in the +/-32767 range would be clipped.
    sf.write(path, scaled.astype(np.int16), sample_rate, 'PCM_24')


def preemphasis(x):
    '''Applies the FIR filter y[n] = x[n] - k * x[n-1] with k = hparams.preemphasis.'''
    numerator = [1, -hparams.preemphasis]
    denominator = [1]
    return signal.lfilter(numerator, denominator, x)


def inv_preemphasis(x):
    '''Undoes preemphasis(): the same coefficients with numerator and denominator swapped.'''
    numerator = [1]
    denominator = [1, -hparams.preemphasis]
    return signal.lfilter(numerator, denominator, x)


def spectrogram(y):
    '''Converts a waveform to a normalised linear-frequency spectrogram.

    Steps: pre-emphasis, STFT magnitude, conversion to dB, subtraction of
    hparams.ref_level_db, then _normalize.
    '''
    emphasized = preemphasis(y)
    magnitude = np.abs(_stft(emphasized))
    level_db = _amp_to_db(magnitude) - hparams.ref_level_db
    return _normalize(level_db)


def inv_spectrogram(spectrogram):
    '''Converts spectrogram to waveform using librosa.

    Reverses spectrogram(): denormalise, add hparams.ref_level_db back,
    convert dB to amplitude, raise to hparams.power, recover phase with
    Griffin-Lim, and finally undo the pre-emphasis. hparams must provide
    `power` (and `griffin_lim_iters` for _griffin_lim).
    '''
    level_db = _denormalize(spectrogram) + hparams.ref_level_db
    magnitude = _db_to_amp(level_db)                      # Convert back to linear
    waveform = _griffin_lim(magnitude ** hparams.power)   # Reconstruct phase
    return inv_preemphasis(waveform)


def inv_spectrogram_tensorflow(spectrogram):
    '''Builds the TensorFlow graph that turns a spectrogram into a waveform.

    Unlike inv_spectrogram, the pre-emphasis is NOT inverted here; run
    inv_preemphasis on the evaluated output.

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    level_db = _denormalize_tensorflow(spectrogram) + hparams.ref_level_db
    magnitude = _db_to_amp_tensorflow(level_db)
    return _griffin_lim_tensorflow(tf.pow(magnitude, hparams.power))


def melspectrogram(y):
    '''Converts a waveform to a normalised mel spectrogram.

    Steps: pre-emphasis, STFT magnitude, projection onto the mel basis,
    conversion to dB, then _normalize.

    NOTE(review): unlike spectrogram(), hparams.ref_level_db is not
    subtracted here -- confirm that this difference is intended.
    '''
    magnitude = np.abs(_stft(preemphasis(y)))
    mel_db = _amp_to_db(_linear_to_mel(magnitude))
    return _normalize(mel_db)


def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
    '''Finds the sample index where trailing silence begins.

    A window of min_silence_sec seconds is slid over the waveform in steps of
    a quarter window. The first window whose absolute amplitude stays below
    the threshold marks the end point.

    Args:
        wav: 1-D array of samples.
        threshold_db: silence threshold in dB, converted with _db_to_amp.
        min_silence_sec: length of the silence window in seconds.

    Returns:
        Start of the first silent window plus one hop, or len(wav) when no
        silent window is found.
    '''
    window_length = int(hparams.sample_rate * min_silence_sec)
    # Keep the hop at least one sample so range() never receives a zero step.
    hop_length = max(1, int(window_length / 4))
    threshold = _db_to_amp(threshold_db)
    for x in range(hop_length, len(wav) - window_length, hop_length):
        # Compare the magnitude: a signed max would treat a window of large
        # negative samples as silent.
        if np.max(np.abs(wav[x:x + window_length])) < threshold:
            return x + hop_length
    return len(wav)


def _griffin_lim(S):
    '''librosa implementation of Griffin-Lim
    Based on https://github.com/librosa/librosa/issues/434

    Starts from random phases and, for hparams.griffin_lim_iters rounds,
    replaces them with the phases of the STFT of the current waveform
    estimate while keeping the magnitudes |S| fixed.

    Args:
        S: magnitude spectrogram in the layout _istft expects.

    Returns:
        The waveform estimate after the final iteration.
    '''
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # np.complex128 replaces the `np.complex` alias, which no longer exists in
    # current NumPy; the resulting dtype is the same.
    S_complex = np.abs(S).astype(np.complex128)
    y = _istft(S_complex * angles)
    for _ in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y)))
        y = _istft(S_complex * angles)
    return y


def _griffin_lim_tensorflow(S):
    '''TensorFlow implementation of Griffin-Lim
    Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    with tf.variable_scope('griffinlim'):
        # The TensorFlow stft/istft helpers work on batches, so wrap S in a batch of one.
        batched = tf.expand_dims(S, 0)
        target = tf.identity(tf.cast(batched, dtype=tf.complex64))
        waveform = _istft_tensorflow(target)
        for _ in range(hparams.griffin_lim_iters):
            estimate = _stft_tensorflow(waveform)
            # Divide by the magnitude (floored at 1e-8) to keep only the phase.
            magnitude = tf.cast(tf.maximum(1e-8, tf.abs(estimate)), tf.complex64)
            unit_phase = estimate / magnitude
            waveform = _istft_tensorflow(target * unit_phase)
        # Drop the batch dimension again.
        return tf.squeeze(waveform, 0)


def _stft(y):
    '''Runs librosa.stft on y with the sizes derived from hparams.'''
    fft_size, hop, window = _stft_parameters()
    return librosa.stft(y=y, n_fft=fft_size, hop_length=hop, win_length=window)


def _istft(y):
    '''Runs librosa.istft on y with the hop and window sizes derived from hparams.'''
    hop, window = _stft_parameters()[1:]
    return librosa.istft(y, hop_length=hop, win_length=window)


def _stft_tensorflow(signals):
    '''Calls tf.contrib.signal.stft with the sizes derived from hparams (pad_end=False).

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    fft_size, hop, window = _stft_parameters()
    return tf.contrib.signal.stft(signals, window, hop, fft_size, pad_end=False)


def _istft_tensorflow(stfts):
    '''Calls tf.contrib.signal.inverse_stft with the sizes derived from hparams.

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    fft_size, hop, window = _stft_parameters()
    return tf.contrib.signal.inverse_stft(stfts, window, hop, fft_size)


def _stft_parameters():
    '''Derives the STFT sizes from hparams.

    Returns:
        (n_fft, hop_length, win_length), the last two in samples.
    '''
    def _ms_to_samples(duration_ms):
        # Same arithmetic order as a direct `ms / 1000 * sample_rate`.
        return int(duration_ms / 1000 * hparams.sample_rate)

    return (
        (hparams.num_freq - 1) * 2,
        _ms_to_samples(hparams.frame_shift_ms),
        _ms_to_samples(hparams.frame_length_ms),
    )


# Conversions:

# Mel filterbank, built on first use and then reused. It is not rebuilt if
# hparams changes afterwards.
_mel_basis = None

def _linear_to_mel(spectrogram):
    '''Projects a linear-frequency spectrogram onto the cached mel basis.'''
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectrogram)

def _build_mel_basis():
    '''Builds the mel filterbank for the STFT size and rate set in hparams.

    Returns:
        The matrix produced by librosa.filters.mel for hparams.num_mels bands.
    '''
    n_fft = (hparams.num_freq - 1) * 2
    # Pass sr / n_fft by keyword: newer librosa releases no longer accept them
    # positionally, and the keyword form works on older releases as well.
    return librosa.filters.mel(
        sr=hparams.sample_rate, n_fft=n_fft, n_mels=hparams.num_mels)

def _amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))

def _db_to_amp(x):
    return np.power(10.0, x * 0.05)

def _db_to_amp_tensorflow(x):
    '''TensorFlow counterpart of _db_to_amp: raises 10 to the power x * 0.05.

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    base = tf.ones(tf.shape(x)) * 10.0
    return tf.pow(base, x * 0.05)

def _normalize(S):
    '''Maps dB values from [hparams.min_level_db, 0] onto [0, 1], clipping anything outside.'''
    floor_db = hparams.min_level_db
    scaled = (S - floor_db) / -floor_db
    return np.clip(scaled, 0, 1)

def _denormalize(S):
    '''Clips S to [0, 1] and maps it back to dB in [hparams.min_level_db, 0].'''
    floor_db = hparams.min_level_db
    clipped = np.clip(S, 0, 1)
    return (clipped * -floor_db) + floor_db

def _denormalize_tensorflow(S):
    '''TensorFlow counterpart of _denormalize.

    NOTE(review): relies on a module-level `tf`, which is not imported in the
    visible cells.
    '''
    floor_db = hparams.min_level_db
    clipped = tf.clip_by_value(S, 0, 1)
    return (clipped * -floor_db) + floor_db

In [7]:
# WORLD / mel-cepstrum analysis settings.
# The analysis rate has to equal the rate of the waveforms handed to
# collect_features. load_wav resamples everything to hparams.sample_rate, so
# the rate is taken from there instead of being hard-coded separately; a
# mismatch makes WORLD misread both the time and the frequency axis.
fs = hparams.sample_rate
fftlen = pyworld.get_cheaptrick_fft_size(fs)   # FFT size CheapTrick uses at this rate
alpha = pysptk.util.mcepalpha(fs)              # all-pass constant pysptk suggests for this rate
order = 25                                     # mel-cepstrum order
frame_period = 5                               # analysis frame shift in milliseconds
hop_length = int(fs * (frame_period * 0.001))  # the same frame shift in samples

def collect_features(x, fs):
    '''Extracts WORLD features from a waveform.

    F0 is estimated with DIO and refined with StoneMask; CheapTrick and D4C
    then produce the spectral envelope and the aperiodicity on the same time
    axis. The frame shift is the module-level `frame_period`.

    Args:
        x: waveform; cast to float64 before analysis.
        fs: sampling rate of x.

    Returns:
        (f0, spectral envelope transposed, aperiodicity). Only the spectral
        envelope is transposed; the aperiodicity is returned as WORLD
        produced it.
    '''
    waveform = x.astype(np.float64)
    coarse_f0, frame_times = pyworld.dio(waveform, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(waveform, coarse_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(waveform, refined_f0, frame_times, fs)
    aperiodicity = pyworld.d4c(waveform, refined_f0, frame_times, fs)
    return refined_f0, envelope.T, aperiodicity

def synth_wav(f0, sp, ap, fs):
    '''Synthesises a waveform from WORLD features.

    Args:
        f0: F0 contour as returned by collect_features.
        sp: spectral envelope in the transposed layout collect_features
            returns; it is transposed back before synthesis.
        ap: aperiodicity as returned by collect_features.
        fs: sampling rate the features were extracted at.

    Returns:
        The waveform produced by pyworld.synthesize.
    '''
    # The module is imported as `pyworld`; there is no `pw` alias in this
    # notebook. frame_period is passed explicitly so synthesis always uses the
    # same frame shift as the analysis in collect_features.
    return pyworld.synthesize(f0, sp.T, ap, fs, frame_period=frame_period)

## Extract features with WORLD

In [9]:
# Per-speaker WORLD features, one list entry per utterance.
# Suffix 1 -> files under base1, suffix 2 -> files under base2.
fre1, fre2 = [], []    # F0 contours
ap1, ap2 = [], []      # aperiodicities
spec1, spec2 = [], []  # spectral envelopes

NUM_UTTERANCES = 5  # utterances 001..005 are analysed for each speaker


def _find_utterance(base_dir, number):
    '''Returns the first file (in sorted order) under base_dir whose name
    contains the zero-padded three-digit utterance number.

    Raises:
        FileNotFoundError: when no file matches.
    '''
    pattern = os.path.join(base_dir, '*{0:03}*'.format(number))
    # glob returns matches in arbitrary order; sort so the choice is repeatable.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError('no file matches {!r}'.format(pattern))
    return matches[0]


for i in range(NUM_UTTERANCES):
    for base, f0_list, sp_list, ap_list in (
            (base1, fre1, spec1, ap1),
            (base2, fre2, spec2, ap2)):
        p = _find_utterance(base, i + 1)
        w = load_wav(p)
        f0, sp, ap = collect_features(w, fs)
        # Keep the results; previously they were overwritten on every pass
        # and the lists above stayed empty.
        f0_list.append(f0)
        sp_list.append(sp)
        ap_list.append(ap)
    


(787,)

## DNN

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import zscore

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

### dataset

In [2]:
# Split the paired features into train and test parts (one fifth held out).
# NOTE(review): feature1 / feature2 are not defined in any visible cell; they
# must exist in the kernel before this cell can run.
X_train, X_test, y_train, y_test = train_test_split(
    feature1, feature2, test_size=1/5, random_state=0) # random_state pins the random seed

# Join the per-item arrays of each split horizontally into one array.
# assumes every item is a 2-D array with the same number of rows -- TODO confirm
X_train = np.hstack(X_train)
y_train = np.hstack(y_train)
X_test = np.hstack(X_test)
y_test = np.hstack(y_test)

# One scalar mean and one scalar std taken over inputs AND targets of BOTH
# splits together.
# NOTE(review): the test data therefore contributes to the normalisation
# statistics -- confirm this is intended.
a = np.concatenate([X_train, X_test, y_train, y_test],1)
_std = a.std()
_mean = a.mean()

# Standardise all four arrays with those shared statistics and convert them to tensors.
X_train = torch.Tensor((X_train - _mean)/_std)
y_train = torch.Tensor((y_train - _mean)/_std)
X_test = torch.Tensor((X_test - _mean)/_std)
y_test = torch.Tensor((y_test - _mean)/_std)

# NOTE(review): TensorDataset indexes along the first dimension, while the
# hstack calls above grow the second one -- verify that the first dimension
# really is the sample axis.
ds_train = TensorDataset(X_train, y_train) # pair each input with its target
ds_test = TensorDataset(X_test, y_test) # same as above

# Training batches are shuffled; the test loader keeps the original order and,
# since no batch_size is given, uses DataLoader's default.
dataloader_train = DataLoader(ds_train,batch_size=1024, shuffle=True)
dataloader_test = DataLoader(ds_test, shuffle=False)


NameError: name 'mel1' is not defined