In [1]:
from pydub import AudioSegment
from pydub.utils import mediainfo
from scipy import spatial
from glob import glob
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import datetime

In [2]:
num_dirs = 2
dirs = glob('../fma_small/*')
dirs = dirs[:num_dirs]
dirs = [str(dirs[i]) + str('/*.mp3') for i in range(len(dirs))]
dirs = [glob(dirs[i]) for i in range(len(dirs))]
path = []
for i in range(len(dirs)):
    path += dirs[i]

In [8]:
Fs = 44100
LENGTH = int(1.5*Fs)
INPUT_LEN = 30*Fs
INTERVAL = int(0.05*Fs) #in seconds, determines how many fingerprints each song will generate

def get_batch(path, k=None, shuffle=True):
    ## path: path that contains the label of the music
    ## k: batch size

    batch = []
    label = []
    
    music = [AudioSegment.from_file(files) for files in path]
    
    for i in range(len(music)):
        temp = np.array(music[i].get_array_of_samples())
        # check if the audio has two channels
        if len(temp) > INPUT_LEN: 
            temp = [temp[i] for i in range(0,len(temp),2)] #get left channel only
            
        # convert to fingerprints
        num_fp = int((len(temp)-LENGTH)/INTERVAL)+1
        for j in range(num_fp):
            segment = temp[j*INTERVAL:j*INTERVAL+LENGTH]
            segment = np.asarray(segment)          
            batch.append(segment)
        
        # discard last few elements
        # create label for the current music (modify so that only nearby segment have the same label)
        label += [int(path[0][17:23])] * num_fp
        print('num music: '+str(i))

    batch = np.asarray(batch)
    label = np.asarray(label)

    # shuffle all samples
    if shuffle:
        s = np.arange(batch.shape[0])
        np.random.shuffle(s)
        batch = batch[s]
        label = label[s]
    
    # create k-size batches
    if k != None:
        num_batch = int(batch.shape[0]/k)
        batch = batch[:num_batch*k]
        label = label[:num_batch*k]
        batch = batch.reshape(num_batch, k, LENGTH, 1)
        label = label.reshape(num_batch, k)
        batch = [batch[i]/np.amax(batch[i]) for i in range(batch.shape[0])]
        
    # normalize data
    batch = np.asarray(batch)
    label = np.asarray(label)
    batch = batch.astype(np.float32)
    batch = batch/np.amax(batch)

    return np.asarray(batch), np.asarray(label)

In [11]:
start = datetime.datetime.now()
train_data, train_label = get_batch(path[:1])
end = datetime.datetime.now()
print('time taken: '+str(end-start)) #4s without gpu

num music: 0
time taken: 0:00:04.809125


In [12]:
MARGIN = 0.2

# helper function for triplet loss
def dist(x,y):
    diffs = tf.expand_dims(x, axis=1) - tf.expand_dims(y, axis=0)
    return tf.sqrt(tf.reduce_sum(tf.square(diffs), axis=-1))

def triplet_loss(dists, labels):
    identity_mask = tf.equal(tf.expand_dims(labels, axis=1),
                             tf.expand_dims(labels, axis=0))
    negative_mask = tf.logical_not(identity_mask)
    positive_mask = tf.logical_xor(identity_mask,
                                   tf.eye(tf.shape(labels)[0], dtype=tf.bool))

    furthest_positive = tf.reduce_max(dists*tf.cast(positive_mask, tf.float32), axis=1)
    closest_negative = tf.map_fn(lambda x: tf.reduce_min(tf.boolean_mask(x[0], x[1])),
                                (dists, negative_mask), tf.float32)
    
    diff = furthest_positive - closest_negative
    
    return tf.maximum(diff + MARGIN, 0.0)

In [21]:
NUM_EMBEDDING = 64
SPLIT = 8

def model(features):
    # convolution layer 1 
    conv1 = tf.layers.conv1d(
        inputs=features,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 1
    pool1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=2, strides=2)
    
    # convolution layer 2 
    conv2 = tf.layers.conv1d(
        inputs=pool1,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 2
    pool2 = tf.layers.max_pooling1d(inputs=conv2, pool_size=2, strides=2)
    
    # convolution layer 3 
    conv3 = tf.layers.conv1d(
        inputs=pool2,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 3
    pool3 = tf.layers.max_pooling1d(inputs=conv3, pool_size=2, strides=2)
    
    # convolution layer 4 
    conv4 = tf.layers.conv1d(
        inputs=pool3,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 4
    pool4 = tf.layers.max_pooling1d(inputs=conv4, pool_size=2, strides=2)
    
    # convolution layer 3 
    conv5 = tf.layers.conv1d(
        inputs=pool4,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 3
    pool5 = tf.layers.max_pooling1d(inputs=conv5, pool_size=2, strides=2)
    
    # convolution layer 3 
    conv6 = tf.layers.conv1d(
        inputs=pool5,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # pooling layer 3
    pool6 = tf.layers.max_pooling1d(inputs=conv6, pool_size=2, strides=2)

    ### more convolution layers here...

    # flatten input: 16*64=1024
    flat = tf.layers.flatten(pool6)

    # divide and encode
    concatenate = []
    num_each_split = flat.shape[-1]//SPLIT
    units = NUM_EMBEDDING//SPLIT
    splits = tf.split(flat, SPLIT, 1)

    for i in range(SPLIT):
        divide = tf.layers.dense(splits[i], activation=tf.nn.relu, units=128)
        divide = tf.layers.dense(divide, activation=tf.nn.relu, units=units)
        concatenate.append(divide)
    
    embedding = tf.concat([elem for elem in concatenate], 1)
    # final embedding
    #embedding = tf.layers.dense(flat,
                                #activation=None,
                                #kernel_initializer=tf.truncated_normal_initializer,
                                #units=NUM_EMBEDDING)
    
    return embedding

In [22]:
input_data = tf.placeholder(tf.float32, [None, LENGTH, 1])
input_label = tf.placeholder(tf.int32, [None])

# loss function operations
embedded = model(input_data)
dists = dist(embedded, embedded)
loss = tf.reduce_mean(triplet_loss(dists, input_label))

# train operation
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
    loss=loss,
    global_step=tf.train.get_global_step())
init = tf.global_variables_initializer()

Tensor("max_pooling1d_12/Squeeze:0", shape=(?, 33075, 32), dtype=float32)
Tensor("max_pooling1d_13/Squeeze:0", shape=(?, 16537, 32), dtype=float32)
Tensor("max_pooling1d_14/Squeeze:0", shape=(?, 8268, 32), dtype=float32)
Tensor("max_pooling1d_15/Squeeze:0", shape=(?, 4134, 32), dtype=float32)
Tensor("max_pooling1d_16/Squeeze:0", shape=(?, 2067, 32), dtype=float32)
Tensor("max_pooling1d_17/Squeeze:0", shape=(?, 1033, 32), dtype=float32)
Tensor("flatten_3/Reshape:0", shape=(?, 33056), dtype=float32)


In [None]:
num_epoch = 1
num_path = 8
path_per_itr = len(path)//num_path
batch_size = 128
sess = tf.Session()
sess.run(init)
loss_hist = []
    
for i in range(num_epoch):
    # shuffle the path
    np.random.shuffle(path)
    for j in range(num_path):
        cur_path = path[path_per_itr*j:path_per_itr*(j+1)]
        print(cur_path)
        train_data, train_label = get_batch(cur_path, batch_size)
        for k in range(train_data.shape[0]):
            _, loss_val = sess.run([train_op, loss],
                                    feed_dict={input_data: train_data[k],
                                               input_label: train_label[k]})
            loss_hist.append(loss_val)
            print('batch num: '+str(k))
        print('iter num: '+str(j))
    print('num_epoch: '+str(i)+' loss: '+str(loss_hist[-1]))

['../fma_small/132/132791.mp3', '../fma_small/132/132118.mp3', '../fma_small/132/132568.mp3', '../fma_small/132/132456.mp3', '../fma_small/135/135337.mp3', '../fma_small/135/135341.mp3', '../fma_small/132/132773.mp3', '../fma_small/132/132787.mp3', '../fma_small/135/135371.mp3', '../fma_small/132/132965.mp3', '../fma_small/132/132794.mp3', '../fma_small/135/135336.mp3']
num music: 0
num music: 1
num music: 2
num music: 3
num music: 4
num music: 5
num music: 6
num music: 7
num music: 8
num music: 9
num music: 10
num music: 11
