In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read audio and
# read/write model weights under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class UnsupportedWavFileException(Exception):
  """Error type for a WAV file that this notebook cannot handle."""

In [None]:
import glob
import librosa
import numpy as np

def load_spectrograms(dirs, winsize=2048, hopsize=None):
    """Load every ``*.wav`` file in ``dirs`` and compute its magnitude spectrogram.

    Files that cannot be loaded or transformed are reported and skipped as a
    whole, so the three returned collections always stay index-aligned.

    Args:
        dirs: Iterable of directory paths; each is searched (non-recursively)
            for ``*.wav`` files.
        winsize: STFT window length, also used as ``n_fft``.
        hopsize: STFT hop length. Defaults to a quarter of ``winsize``.

    Returns:
        A tuple ``(x_train, sr_all, S_all)``:
          * ``x_train`` - ``np.asarray`` of the magnitudes with a trailing
            channel axis added to each one. NOTE(review): stacking into one
            dense array presumably requires every clip to produce the same
            number of frames - confirm for the dataset in use.
          * ``sr_all`` - list of sample rates as returned by ``librosa.load``.
          * ``S_all`` - list of 2-D magnitude spectrograms, one per kept file.
    """
    if hopsize is None:
        hopsize = int(winsize / 4)

    x_all = []
    sr_all = []
    S_all = []

    for directory in dirs:
        # glob order depends on the filesystem; sort so that the index of a
        # clip is the same on every run.
        files = sorted(glob.glob(directory + "/*.wav"))
        for i, f in enumerate(files):
            try:
                y, sr = librosa.load(f)
                D = librosa.stft(y, n_fft=winsize, win_length=winsize, hop_length=hopsize)
                S = np.abs(D)
            except Exception as exc:
                # Best-effort loading: report the offending file and move on.
                # Nothing has been appended yet, so the lists remain aligned.
                print("skip", f, repr(exc))
                continue

            # Append only after every step succeeded for this file.
            sr_all.append(sr)
            x_all.append(np.expand_dims(S, axis=-1))
            S_all.append(S)
            print(i, f, len(S_all) - 1)

    x_train = np.asarray(x_all)
    return x_train, sr_all, S_all

In [None]:
# Directories on the mounted Drive that are searched for *.wav training clips.
dirs =["/content/drive/MyDrive/B4川原/wav/Sound Pool/Vol.2/TechnoTrance Vol.9/Drums"]

# x_train: stacked spectrograms fed to the model; sr_all: per-file sample rates;
# original_S: per-file magnitude spectrograms kept for later comparison.
x_train, sr_all, original_S = load_spectrograms(dirs)

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
# Size of the latent vector produced by the encoder.
encoded_dim = 16
tfd = tfp.distributions

# Latent prior: zero-mean, unit-scale Normal per dimension, with the last axis
# reinterpreted as the event dimension so it acts as one multivariate
# distribution over the whole latent vector.
_unit_normal = tfd.Normal(loc=tf.zeros(encoded_dim), scale=1)
prior = tfd.Independent(_unit_normal, reinterpreted_batch_ndims=1)

In [None]:
# Spectrogram geometry read from the training tensor: axis 1 is used as the
# height of the first kernel, axis 2 as the width of the input.
input_dim, seq_length = x_train.shape[1], x_train.shape[2]
hidden_dim = 256

# Size of the flat parameter vector the distribution layer expects.
_latent_param_size = tfp.layers.MultivariateNormalTriL.params_size(encoded_dim)

encoder = tf.keras.Sequential([
    # Collapse the whole first (height) axis in one full-height convolution.
    tf.keras.layers.Conv2D(
        hidden_dim, (input_dim, 1),
        input_shape=(input_dim, seq_length, 1),
        strides=1, padding="valid", activation="relu"),
    # Two non-overlapping convolutions along the width axis, reducing it by
    # factors of 37 and then 4.
    # NOTE(review): presumably seq_length is 37 * 4 = 148 so that the width
    # collapses to 1 - confirm against the dataset.
    tf.keras.layers.Conv2D(
        hidden_dim, (1, 37), strides=(1, 37),
        padding="valid", activation="relu"),
    tf.keras.layers.Conv2D(
        hidden_dim, (1, 4), strides=(1, 4),
        padding="valid", activation="relu"),
    tf.keras.layers.Flatten(),
    # Linear projection to the parameters of the latent distribution.
    tf.keras.layers.Dense(_latent_param_size, activation=None),
    # Distribution output; the KL term against `prior` is attached as an
    # activity regularizer with weight 0.001.
    tfp.layers.MultivariateNormalTriL(
        encoded_dim,
        activity_regularizer=tfp.layers.KLDivergenceRegularizer(
            prior, weight=0.001)),
])
encoder.summary()

In [None]:
# Decoder: mirrors the encoder, expanding a latent vector back to a
# single-channel map of shape (input_dim, 4 * 37, 1).
decoder = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_dim, input_dim=encoded_dim,
                          activation="relu"),
    # Treat the dense features as a 1x1 map with hidden_dim channels.
    tf.keras.layers.Reshape((1, 1, hidden_dim)),
    # Non-overlapping transposed convolutions grow the width 1 -> 4 -> 148.
    tf.keras.layers.Conv2DTranspose(
        hidden_dim, (1, 4), strides=(1, 4),
        padding="valid", activation="relu"),
    tf.keras.layers.Conv2DTranspose(
        hidden_dim, (1, 37), strides=(1, 37),
        padding="valid", activation="relu"),
    # Full-height kernel restores the first axis to input_dim; ReLU keeps the
    # output non-negative.
    tf.keras.layers.Conv2DTranspose(
        1, (input_dim, 1), strides=1,
        padding="valid", activation="relu"),
])
decoder.summary()

In [None]:
from keras.callbacks import ModelCheckpoint
import os

# One absolute location is used for creating the directory, loading existing
# weights and writing checkpoints, so the three can never point at different
# places when the working directory is not /content.
model_dir = '/content/drive/MyDrive/vae_model'
weight_path = os.path.join(model_dir, 'model.h5')
os.makedirs(model_dir, exist_ok=True)

# End-to-end autoencoder: encoder output is fed straight into the decoder.
vae = tf.keras.Model(encoder.inputs, decoder(encoder.outputs))

# Resume from a previous run when a checkpoint is already present.
if os.path.exists(weight_path):
    vae.load_weights(weight_path)

# `metrics` is passed as a list, which is the documented form.
vae.compile(optimizer="adam", loss="mse", metrics=["mse"])

# Keep only the checkpoint with the lowest training loss seen in this run.
model_checkpoint = ModelCheckpoint(
    filepath=weight_path,
    monitor='loss',
    save_best_only=True,
    verbose=1)

# Inputs double as targets (reconstruction objective).
history = vae.fit(x_train, x_train, epochs=500, batch_size=16, callbacks=[model_checkpoint])

In [None]:
from keras.callbacks import ModelCheckpoint
import os

# Rebuild the autoencoder and restore saved weights without training.
# The same absolute directory is used for makedirs and for loading, so they
# cannot diverge when the working directory is not /content.
model_dir = '/content/drive/MyDrive/vae_model'
weight_path = os.path.join(model_dir, 'model.h5')
os.makedirs(model_dir, exist_ok=True)

vae = tf.keras.Model(encoder.inputs, decoder(encoder.outputs))
# Fails loudly if no checkpoint exists: this cell is only meaningful after at
# least one training run has written the weights file.
vae.load_weights(weight_path)
# `metrics` is passed as a list, which is the documented form.
vae.compile(optimizer="adam", loss="mse", metrics=["mse"])


In [None]:
# Reconstruction loss/metric on the training data itself (inputs double as
# targets); no held-out set is evaluated here.
vae.evaluate(x_train,x_train)

In [None]:
# Encode every training clip; z[k] is the latent vector for x_train[k].
# NOTE(review): the encoder ends in a distribution layer, so these values are
# presumably samples rather than means and may differ between runs - confirm.
z = encoder.predict(x_train)

In [None]:
def decode_spectrograms(encoder, decoder, z):
    """Decode latent vectors into spectrograms.

    All latents are decoded in a single ``decoder.predict`` call instead of
    one call per vector. Every latent yields exactly one output, so
    ``result[k]`` always corresponds to ``z[k]``.

    Args:
        encoder: Unused; kept so existing calls keep working.
        decoder: Object exposing ``predict(batch)`` that maps a batch of
            latent vectors to a batch of decoded outputs.
        z: Array-like of latent vectors, one per entry along the first axis.

    Returns:
        A list with one array per latent vector: the decoder output for that
        vector with all size-1 axes removed. Empty list for empty ``z``.
    """
    latents = np.asarray(z)
    if len(latents) == 0:
        # Nothing to decode; avoid calling predict on an empty batch.
        return []

    decoded = decoder.predict(latents)
    # Squeeze per item so singleton axes are dropped from each output.
    return [np.squeeze(spec) for spec in decoded]

re_spec = decode_spectrograms(encoder, decoder, z)

In [None]:
def _cosine_similarity(a, b):
    """Cosine similarity of two arrays compared as flat vectors.

    Returns 0.0 when either array has zero norm (e.g. an all-zero
    reconstruction) instead of dividing by zero and producing NaN.
    """
    a_flat = np.ravel(a)
    b_flat = np.ravel(b)
    denom = np.linalg.norm(a_flat) * np.linalg.norm(b_flat)
    if denom == 0:
        return 0.0
    return np.dot(a_flat, b_flat) / denom


# Compare each original magnitude spectrogram with its reconstruction.
similarities = []
for i in range(len(original_S)):
    s1 = original_S[i]
    s2 = re_spec[i]
    cosine_similarity = _cosine_similarity(s1, s2)
    similarities.append(cosine_similarity)
    print(i)
    print("類似度: ", cosine_similarity)

In [None]:
import IPython
# Blend weight: a = 1.0 decodes clip i only, a = 0.0 decodes clip j only.
a = 0.5
# Indices into z of the two clips to blend. These are editable defaults; they
# replace a bare placeholder name that was never defined and therefore raised
# NameError whenever the cell was run. Requires at least two encoded clips.
i = 0
j = 1
# Linear interpolation in latent space, decoded as a batch of one.
my_x = decoder.predict(np.array([z[i]])*a + np.array([z[j]])* (1-a))
# Drop the size-1 axes so my_x is a plain 2-D array.
my_x = np.squeeze(my_x)

In [None]:
import matplotlib.pyplot as plt
# STFT framing for the inversion: same values as the defaults of
# load_spectrograms (window 2048, hop = window / 4).
winsize = 2048
hopsize = winsize // 4

# Estimate a waveform from the magnitude-only spectrogram (Griffin-Lim,
# 32 iterations).
y_inv = librosa.griffinlim(
    my_x,
    n_iter=32,
    hop_length=hopsize,
    win_length=winsize,
)
print(y_inv)
# Visual check of the decoded spectrogram.
plt.matshow(my_x)

In [None]:
# Play the reconstructed waveform inline, using the sample rate recorded for
# the first loaded file.
IPython.display.display(IPython.display.Audio(y_inv, rate=sr_all[0]))