- /raid/data/lmarkeeva/generated_sinus/wav_sinusoids - синусоиды в диапазоне от 16Гц до 25кГц
- /raid/data/lmarkeeva/generated_sinus/wav_notes - синусоиды, соответствующие нотам: https://ru.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BE%D1%82%D1%8B_%D0%BD%D0%B0%D1%81%D1%82%D1%80%D0%BE%D0%B9%D0%BA%D0%B8_%D1%84%D0%BE%D1%80%D1%82%D0%B5%D0%BF%D0%B8%D0%B0%D0%BD%D0%BE
- /raid/data/lmarkeeva/generated_sinus/wav_notes_splited - предыдущее, но поделенное на папки test и train

Формат имени:
<частота>-<громкость>.wav

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="6"

In [2]:
%matplotlib inline

import matplotlib.pyplot as plt

In [3]:
import numpy as np

def midi_to_pitch(d):
    return 2.0**((d-69.)/12.0)*440.0

def pitch_to_midi(f):
    return 69+12*np.log2(f/440.)

def gen_sin(freq, volume, duration, padding_duration=None, sr=16000):
    size = duration
    if padding_duration is not None:
        size += padding_duration
    
    signal = np.zeros(int(size*sr), dtype=np.float32)
    signal[:int(duration*sr)] = volume * np.sin(2*np.pi*np.arange(sr*duration)*freq/sr)

    return signal

In [4]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [6]:
import numpy as np
import tensorflow as tf
import random

LOW_LIMIT = 16
HIGH_LIMIT = 25000
eps = 1e-6

KEY = "key"
AUDIO = "signal_raw"
PITCH = "pitch"
VELOCITY = "velocity"
WAV = "wav"
VOLUMES = np.array([25., 50., 75., 100., 127.])/127.
NOTES_FREQ = [4186.01, 3951.07, 3729.31, 3520.0, 3322.44, 3135.96, 2959.96, 2793.83, 2637.02, 2489.02, 2349.32, 2217.46, 2093.0, 1975.53, 1864.66, 1760.0, 1661.22, 1567.98, 1479.98, 1396.91, 1318.51, 1244.51, 1174.66, 1108.73, 1046.5, 987.767, 932.328, 880.0, 830.609, 783.991, 739.989, 698.456, 659.255, 622.254, 587.33, 554.365, 523.251, 493.883, 466.164, 440.0, 415.305, 391.995, 369.994, 349.228, 329.628, 311.127, 293.665, 277.183, 261.626, 246.942, 233.082, 220.0, 207.652, 195.998, 184.997, 174.614, 164.814, 155.563, 146.832, 138.591, 130.813, 123.471, 116.541, 110.0, 103.826, 97.999, 92.499, 87.307, 82.407, 77.782, 73.416, 69.296, 65.406, 61.735, 58.27, 55.0, 51.913, 48.999, 46.249, 43.654, 41.203, 38.891, 36.708, 34.648, 32.703, 30.868, 29.135, 27.5]



def _bytes_features(value):
    return tf.train.Feature(bytes_list = tf.train.BytesList(value = [value]))

def _int64_features(value):
    return tf.train.Feature(int64_list = tf.train.Int64List(value = [value]))

def _floats_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _floats_features(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

class SinDataset(object):
    """Dataset object to help manage the TFRecord loading."""
    
    def __init__(self, record_path, is_training=True, num_threads=4, capacity=500, min_after=200):
        self.is_training = is_training
        self.record_path = record_path
        self.num_threads = num_threads
        self.capacity = capacity
        self.min_after = min_after

    @staticmethod
    def generate_dataset(tfrecord_path, duration, padding_duration=None, 
                         sr=16000, nsynch_velocity=False,
                         verbose=False, print_every=1000):
        with tf.python_io.TFRecordWriter(tfrecord_path) as writer:
#             freques = np.arange(LOW_LIMIT, HIGH_LIMIT)
#             np.random.shuffle(freques)
            freques = NOTES_FREQ
            i = -1
            for freq in freques:
                i+=1
                volumes = None
                if nsynch_velocity:
                    volumes = VOLUMES
                else:
                    volumes = [max(eps, random.random())]

                for volume in volumes:
                    key = "{:.3f}-{:.3f}".format(float(freq), volume).encode()
                    midi_picth = int(round(pitch_to_midi(float(freq))))
                    midi_volume = int(round(volume * 127.0))
                    signal = gen_sin(float(freq), volume, duration, padding_duration, sr)
                    raw_signal = signal.tobytes()

                    feature = { KEY: _bytes_features(key),
                                AUDIO: _bytes_features(raw_signal),
                                PITCH: _int64_features(midi_picth),
                                VELOCITY: _int64_features(midi_volume)}

                    example = tf.train.Example(features = tf.train.Features(feature = feature))
                    writer.write(example.SerializeToString())
                
                if verbose and (i+1) > 1 and (i+1) % print_every == 0:
                    print("Done {} records".format((i+1)*len(volumes)))
            print("Done: {}".format(freques*len(volumes)))
                    
    def get_first_n(self, n, sr=16000, duration=4.0):
        path_queue = tf.train.input_producer([self.record_path],
                                             num_epochs=None if self.is_training else 1,
                                             shuffle=self.is_training,
                                             capacity=n)
        reader = tf.TFRecordReader()
        _, serialized_examples = reader.read(path_queue)
        
        size = int(sr*duration)
        features = {
                    KEY: tf.FixedLenFeature([], dtype=tf.string),
                    AUDIO: tf.FixedLenFeature([], dtype=tf.string),
                    PITCH: tf.FixedLenFeature([1], dtype=tf.int64),
                    VELOCITY: tf.FixedLenFeature([1], dtype=tf.int64)
        }
        
        examples = tf.parse_single_example(serialized_examples, features)
        
        return examples
    
    def get_first_n_wavenet(self, n, sr=16000, 
                            duration=4.0, crop_len=64000,
                            test_mode=False):

        example = self.get_first_n(n)
        
        signal = tf.decode_raw(example['signal_raw'], tf.float32)
        key = tf.squeeze(example[KEY])
        pitch = tf.squeeze(example[PITCH])
        velocity = tf.squeeze(example[VELOCITY])
        
        if self.is_training:
          # random crop
            crop = tf.random_crop(signal, [crop_len])
            crop = tf.reshape(crop, [1, crop_len])

        else:
            # fixed center crop
            offset = (int(sr * duration) - crop_len) // 2  # 24320
            crop = tf.slice(wav, [offset], [crop_len])
            crop = tf.reshape(crop, [1, crop_len])
            
        if test_mode:
            crops, keys, pitchs, velocities = tf.train.batch( [crop, key, pitch, velocity],
                                                              n,
                                                              num_threads=1,
                                                              capacity=self.capacity * n)
        else:
            crops, keys, pitchs, velocities = tf.train.shuffle_batch([crop, key, pitch, velocity],
                                                                      n,
                                                                      num_threads=self.num_threads,
                                                                      capacity=self.capacity * batch_size,
                                                                      min_after_dequeue=self.min_after * batch_size)
               
        crops = tf.reshape(tf.cast(crops, tf.float32), [n, crop_len])
        pitchs = tf.cast(pitchs, tf.int32)
        velocities = tf.cast(velocities, tf.int32)
        return {"wav": crops, "pitch": pitchs, "velocity": velocities, "key": keys}        

In [7]:
tfrecord_path = "/raid/data/lmarkeeva/generated_sinus/notes.tfrecord"
num_records = 10000
duration = 3.0
padding_duration = 1.0
sr = 16000
verbose = True

test_data= SinDataset.generate_dataset(tfrecord_path, duration, padding_duration=padding_duration, 
                                       sr=sr, nsynch_velocity=True,
                                       verbose=verbose, print_every=1000)

Done: [4186.01, 3951.07, 3729.31, 3520.0, 3322.44, 3135.96, 2959.96, 2793.83, 2637.02, 2489.02, 2349.32, 2217.46, 2093.0, 1975.53, 1864.66, 1760.0, 1661.22, 1567.98, 1479.98, 1396.91, 1318.51, 1244.51, 1174.66, 1108.73, 1046.5, 987.767, 932.328, 880.0, 830.609, 783.991, 739.989, 698.456, 659.255, 622.254, 587.33, 554.365, 523.251, 493.883, 466.164, 440.0, 415.305, 391.995, 369.994, 349.228, 329.628, 311.127, 293.665, 277.183, 261.626, 246.942, 233.082, 220.0, 207.652, 195.998, 184.997, 174.614, 164.814, 155.563, 146.832, 138.591, 130.813, 123.471, 116.541, 110.0, 103.826, 97.999, 92.499, 87.307, 82.407, 77.782, 73.416, 69.296, 65.406, 61.735, 58.27, 55.0, 51.913, 48.999, 46.249, 43.654, 41.203, 38.891, 36.708, 34.648, 32.703, 30.868, 29.135, 27.5, 4186.01, 3951.07, 3729.31, 3520.0, 3322.44, 3135.96, 2959.96, 2793.83, 2637.02, 2489.02, 2349.32, 2217.46, 2093.0, 1975.53, 1864.66, 1760.0, 1661.22, 1567.98, 1479.98, 1396.91, 1318.51, 1244.51, 1174.66, 1108.73, 1046.5, 987.767, 932.328, 880

In [29]:
wavs=None
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord) 
    print("Read wavs")
    wavs = data["wav"].eval()
    keys = data["key"].eval()
    pitch = data["pitch"].eval()
    velocity = data["velocity"].eval()
    coord.request_stop()
    coord.join(threads)


Read wavs


In [8]:
import librosa


# reconstruct signals and annotation info from .tfrecord file
dir_for_saving = "/raid/data/lmarkeeva/generated_sinus/wav_notes"
record_iterator = tf.python_io.tf_record_iterator(path = tfrecord_path)

for string_record in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(string_record)
    
    signal = np.fromstring(example.features.feature['signal_raw'].bytes_list.value[0], dtype = np.float32)
    key = example.features.feature['key'].bytes_list.value[0].decode()
    
    with open(os.path.join(dir_for_saving, "{}.wav".format(key)), "wb") as f:
        librosa.output.write_wav(f, signal, sr=16000)

  if sys.path[0] == '':


In [9]:
notes = []
with open("notes.txt", "r", encoding="utf-8") as f:
    for line in f:
        notes.append(round(float(line.strip().split("\t")[-1]), 3))

In [10]:
print(notes)

[4186.01, 3951.07, 3729.31, 3520.0, 3322.44, 3135.96, 2959.96, 2793.83, 2637.02, 2489.02, 2349.32, 2217.46, 2093.0, 1975.53, 1864.66, 1760.0, 1661.22, 1567.98, 1479.98, 1396.91, 1318.51, 1244.51, 1174.66, 1108.73, 1046.5, 987.767, 932.328, 880.0, 830.609, 783.991, 739.989, 698.456, 659.255, 622.254, 587.33, 554.365, 523.251, 493.883, 466.164, 440.0, 415.305, 391.995, 369.994, 349.228, 329.628, 311.127, 293.665, 277.183, 261.626, 246.942, 233.082, 220.0, 207.652, 195.998, 184.997, 174.614, 164.814, 155.563, 146.832, 138.591, 130.813, 123.471, 116.541, 110.0, 103.826, 97.999, 92.499, 87.307, 82.407, 77.782, 73.416, 69.296, 65.406, 61.735, 58.27, 55.0, 51.913, 48.999, 46.249, 43.654, 41.203, 38.891, 36.708, 34.648, 32.703, 30.868, 29.135, 27.5]


In [11]:
from sklearn.model_selection import train_test_split

train_notes, test_notes = train_test_split(notes, test_size=0.2)

In [12]:
train_notes = set(train_notes)
test_notes = set(test_notes)
print(test_notes)

{1760.0, 34.648, 391.995, 1479.98, 41.203, 554.365, 2093.0, 2349.32, 3951.07, 48.999, 369.994, 29.135, 146.832, 1046.5, 246.942, 440.0, 349.228, 1661.22}


In [15]:
import os

from shutil import copyfile

dir_for_saving = "/raid/data/lmarkeeva/generated_sinus/wav_notes"
save_notes_train = "/raid/data/lmarkeeva/generated_sinus/wav_notes_splited/train"
save_notes_test = "/raid/data/lmarkeeva/generated_sinus/wav_notes_splited/test"

for file_name in os.listdir(dir_for_saving):
    freq = round(float(file_name.strip().split("-")[0]), 3)
    src = os.path.join(dir_for_saving, file_name)
    dst = None
    if freq in train_notes:
        dst = os.path.join(save_notes_train, file_name)
        
    if freq in test_notes:
        dst = os.path.join(save_notes_test, file_name)
    
    copyfile(src, dst)



In [25]:
import IPython
IPython.display.Audio("/raid/data/lmarkeeva/generated_sinus/wav_notes_splited/train/110.000-1.000.wav")