# Some auxiliary functions to unpack  LibriSpeech data

In [2]:
from glob import glob, iglob
import numpy as np
import tarfile
import os
import subprocess
import concurrent.futures
from itertools import repeat
import librosa

import matplotlib.pyplot as plt
from IPython.display import Audio
%matplotlib inline
%config InlineBackend.figure_format = 'jpg'

In [3]:
data_path = "/home/julia/DeepVoice_project/LibriSpeech"
train_path = data_path + '/train-clean-100'
test_path = data_path + '/test-clean'
print(data_path)

# колиество аудио файлов
print('\n Number of files in train, test')
print(len(glob("{}/**/*.flac".format(train_path), recursive = True)))
print(len(glob("{}/**/*.flac".format(test_path), recursive = True)))

# количество спекеров
print("\n Number of speakers in train, test")
print(len(list(os.walk(train_path)))),
print(len(list(os.walk(test_path))))

/home/julia/DeepVoice_project/LibriSpeech

 Number of files in train, test
28539
2620

 Number of speakers in train, test
837
128


In [2]:
def extract_tar_file(path, dst = './'):
    '''
    path : str
        Location to tar file
    dst : str
        Location to save tar file contents
    '''
    print('extracting from {}'.format(path))
    tarfile.open(path, 'r:gz').extractall(dst)
    print('extraction completed')

In [4]:
def get_dataset(path, convert_to_wav = False):
    '''
    path : str
        Location to the directory with data
    convert_to_wav : bool, optional
        True if we want to convert audiorecords to .wav format
    '''
    if not os.path.exists(path):
        print("Path {} doesn't exist".format(path))
        
    else:
        wavs = glob('{}/**/*.wav'.format(path), recursive=True)  
        if convert_to_wav:
            if len(wavs) == 0:
                flacs = glob('{}/**/*.flac'.format(path), recursive=True)
                for f in flacs:
                    subprocess.check_call(['ffmpeg', '-i', f, '-f', 'wav', '-y', '%s.wav' % f])
                wavs = glob('{}/**/*.wav'.format(path), recursive=True)
            else:
                print('WARNING: Found existing wave files.  Not converting!')
            
        dataset = []
        for wav_i in wavs:
            id_i, chapter_i, utter_i = wav_i.split('/')[-3:]
            dataset.append({
                'name': wav_i,
                'id': id_i,
                'chapter': chapter_i,
                'utterance': utter_i.split('-')[-1].strip('.wav')
            })
        if len(wavs) == 0:
            print('LibriSpeech is a FLAC dataset.  Consider rerunning this \\'
                  'command with convert_to_wav=True, to use ffmpeg to\\'
                  'convert the flac files to wave files first. This requires\\'
                  'the use of ffmpeg and so this should be installed first.')
        return dataset


In [3]:
def flac_to_wav(path):
    if not os.path.exists(path):
        print("Path {} doesn't exist".format(path))
        
    else:
        wavs = glob('{}/**/*.wav'.format(path), recursive=True)  
        if len(wavs) == 0:
            flacs = glob('{}/**/*.flac'.format(path), recursive=True)
            for f in flacs:
                subprocess.check_call(['ffmpeg', '-i', f, '-f', 'wav', '-y', '%s.wav' % f[:-5]])
            wavs = glob('{}/**/*.wav'.format(path), recursive=True)
        else:
            print('WARNING: Found existing wave files.  Not converting!')
    

In [11]:
data_path = "/home/julia/DeepVoice_project/LibriSpeech_small"
train_path = data_path + '/train-clean-100'
test_path = data_path + '/test-clean'

In [12]:
pathes = [data_path + '/train-clean-100', data_path + '/test-clean']

for path in pathes:
    print("Converting files from {} ...".format(path))
    flac_to_wav(path)
    print("Convertion completed")


Converting files from /home/julia/DeepVoice_project/LibriSpeech_small/train-clean-100 ...
Convertion completed
Converting files from /home/julia/DeepVoice_project/LibriSpeech_small/test-clean ...
Convertion completed


In [6]:
test_dataset = get_dataset(test_path, convert_to_wav = True)

In [16]:
len(test_dataset), test_dataset[0]['utterance']

(7860, '0008.flac')

In [14]:
train_dataset = get_dataset(train_path, convert_to_wav = True)

In [15]:
def batch_generator(dataset,
                    batch_size = 32,
                    max_sequence_length = 6144):
    n_batches = len(dataset) // batch_size
    for batch_i in range(n_batches):
        cropped_wavs, ids = [], []
        while len(cropped_wavs) < batch_size:
            idx_i = np.random.choice(np.arange(len(dataset)))
            fname_i = dataset[idx_i]['name']
            id_i = dataset[idx_i]['id']
            wav_i = wavfile.read(fname_i)[1]
            sample = np.random.choice(range(len(wav_i) - max_sequence_length))
            cropped_wav = wav_i[sample:sample + max_sequence_length]
            if np.max(np.abs(cropped_wav) / maxval) > threshold:
                if normalize:
                    cropped_wav = cropped_wav / maxval
                cropped_wavs.append(cropped_wav)
                ids.append(id_i)
        yield np.array(cropped_wavs, np.float32), np.array(ids, np.int32)

In [17]:
len(train_dataset), train_dataset[0]

(85617,
 {'chapter': '76549',
  'id': '6836',
  'name': '/home/julia/DeepVoice_project/LibriSpeech/train-clean-100/6836/76549/6836-76549-0007.flac.wav',
  'utterance': '0007.flac'})

# Lets make convenient folders for our experiments
From **LibriSpeech/test-clean**(128 speakers) data we will build **train**(100 speakers), **validation**(9 speakers) and **test**(19 speakers) in the following way:
for each speaker we will divide his recordings for 3 parts.

In [55]:
import shutil

def split_data(path):

    train_files, val_files, test_files = [], [], []

    for abs_names, dir_names, file_names in os.walk(path):
        if len(file_names) > 0:
            files = ['/'.join([abs_names, fname]) for fname in file_names if fname.endswith('.wav')]

            length = len(files) - 1
            train_length = int(0.78*length)
            val_length = int((0.78+0.07)*length)

            train_files.extend(files[: train_length])
            val_files.extend(files[train_length: val_length])
            test_files.extend(files[val_length:])

    return train_files, val_files, test_files

# copy to destination folder all .wav files longer than 40000 (2.5 min)
def copy_files(files, dest_dir):
    c = 0
    for wav_file in files:
        signal, sr = librosa.core.load(wav_file, sr = 16000)
        if len(signal) < 40000:
            c+=1
            continue
#         # uncomment if you really want to copy, not just count
#         else:
#             shutil.copy(wav_file, dest_dir)
    print('{} signals of length < 40000, was not copied'.format(c))

In [51]:
path = '/home/julia/DeepVoice_project/LibriSpeech_wav/test-clean'
train_files, val_files, test_files = split_data(path)

print(list(map(len, [train_files, val_files, test_files])))


[1933, 178, 509]


In [56]:
dest_path = '/home/julia/DeepVoice_project/LibriSpeech_to_classify/'

copy_files(train_files, dest_path + 'train')
copy_files(val_files, dest_path + 'val')
copy_files(test_files, dest_path + 'test')

133 signals of length < 40000, was not copied
13 signals of length < 40000, was not copied
41 signals of length < 40000, was not copied


In [8]:
signal, sr = librosa.load('/home/julia/DeepVoice_project/saved_data/test/5142-36377-0003_gaussian_add.wav',
                          sr = 16000)
Audio(signal, rate = sr)