In [2]:
import os
import multiprocessing as mp
from pathlib import Path
from itertools import chain

# Make the ffmpeg binaries discoverable on PATH (presumably required by the
# audio decoding behind s3_tools.load_raw_audio_from_s3 -- TODO confirm).
# The guard keeps the cell idempotent: re-running it no longer appends the same
# entry again. os.pathsep is used instead of a hard-coded ':' and an unset or
# empty PATH no longer raises KeyError / produces an empty leading entry.
FFMPEG_BIN_DIR = '/home/csci5980/piehl008/software/ffmpeg/bin/'
_current_path = os.environ.get('PATH', '')
if FFMPEG_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + FFMPEG_BIN_DIR if _current_path else FFMPEG_BIN_DIR
    )

import pandas as pd
import numpy as np

import s3_tools
import data_tools

import warnings

In [3]:
# Shared S3 handles: the bucket name and client are reused by the cells below.
bucket = 'fma-dataset'
client = s3_tools.get_s3_client()

# Directory of raw audio files; later cells read its 'track_id' and 'file' columns.
raw_audio_directory = s3_tools.load_csv_from_s3(
    client,
    bucket,
    'raw_audio_directory.csv',
)

In [4]:
def create_second_waveforms(tid, raw_file, basename, sr=44100, length_threshold=0.95):
    """Split one raw audio file into one-second clips and upload each clip to S3.

    The audio is loaded through ``s3_tools.load_raw_audio_from_s3`` at ``sr``
    samples per second, leading and trailing zero samples are trimmed, and the
    remainder is cut into consecutive chunks of ``sr`` samples. Each chunk is
    written with ``s3_tools.write_numpy_file_to_s3`` under the name
    ``'sr{sr}_{basename}-{i}.npy'``.

    Relies on the notebook-level globals ``client`` and ``bucket``.

    Parameters
    ----------
    tid
        Track identifier; it is only echoed back in the returned tuples.
    raw_file
        Key of the raw audio object handed to ``load_raw_audio_from_s3``.
    basename
        Name prefix for the uploaded clips (without sample-rate tag or index).
    sr : int
        Requested sample rate; also the number of samples in each clip.
    length_threshold : float
        Minimum length, in seconds, that the final partial chunk must have to
        be kept. A kept partial chunk is zero-padded to ``sr`` samples; a
        shorter one is dropped.

    Returns
    -------
    list of (tid, str)
        One ``(tid, split_name)`` pair per uploaded clip, in clip order. An
        empty list is returned (and a warning issued) when the audio cannot be
        loaded or comes back with a sample rate other than ``sr``.
    """
    try:
        # Silence loader noise only while loading. catch_warnings restores the
        # previous filters on exit -- including when the load raises -- so the
        # failure warning below is actually shown and the global filter list is
        # left untouched.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveform, true_sr = s3_tools.load_raw_audio_from_s3(client, bucket, raw_file, sr=sr)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if true_sr != sr:
            raise ValueError('expected sample rate {}, got {}'.format(sr, true_sr))
    except Exception:
        # Best effort: a track that cannot be decoded is skipped. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        warnings.warn('Unable to create waveform for {}'.format(raw_file))
        return []

    waveform = np.trim_zeros(waveform)
    # Cut points at sr, 2*sr, ...; the last chunk may be shorter than sr.
    split_waveform = np.split(waveform, np.arange(sr, len(waveform), sr))

    tail_length = len(split_waveform[-1])
    if tail_length / float(sr) < length_threshold:
        # Too short to be useful (this also covers an all-zero/empty waveform).
        split_waveform = split_waveform[:-1]
    elif tail_length < sr:
        # Long enough to keep: zero-pad on the right up to a full clip.
        split_waveform[-1] = np.pad(split_waveform[-1], (0, sr - tail_length),
                                    'constant', constant_values=(0, 0))

    split_files = []
    for i, split in enumerate(split_waveform):
        split_name = 'sr{}_{}-{}.npy'.format(sr, basename, i)
        s3_tools.write_numpy_file_to_s3(split, client, bucket, split_name)
        split_files.append((tid, split_name))

    return split_files

In [5]:
# Upload-name prefix for each track: 'waveforms/<stem of the raw file name>'.
basenames = raw_audio_directory['file'].map(
    lambda raw_path: str(Path('waveforms') / Path(raw_path).stem)
)

# Fan the tracks out over one worker per CPU core. Each job is a
# (track_id, raw file, basename) triple and yields a list of
# (track_id, waveform_file) pairs.
with mp.Pool(processes=mp.cpu_count()) as pool:
    waveform_files = pool.starmap(
        create_second_waveforms,
        zip(
            raw_audio_directory['track_id'],
            raw_audio_directory['file'],
            basenames,
        ),
    )

# Flatten the per-track lists into one row per clip, then attach the track
# metadata from the raw directory by track_id.
wdf = pd.DataFrame(
    [pair for track_pairs in waveform_files for pair in track_pairs],
    columns=['track_id', 'waveform_file'],
)
wdf = wdf.join(raw_audio_directory.set_index('track_id'), on='track_id')

# Persist the clip directory to S3.
resource = s3_tools.get_s3_resource()
s3_tools.write_csv_to_s3(
    wdf,
    resource,
    'fma-dataset',
    'sr{}_waveform_directory.csv'.format(44100),
)

wdf

Unnamed: 0,track_id,waveform_file,file,genre_top,genres,genres_all,split,subset
0,2,sr44100_waveforms/000002-0.npy,raw-audio/000/000002.mp3,Hip-Hop,[21],[21],training,small
1,2,sr44100_waveforms/000002-1.npy,raw-audio/000/000002.mp3,Hip-Hop,[21],[21],training,small
2,2,sr44100_waveforms/000002-2.npy,raw-audio/000/000002.mp3,Hip-Hop,[21],[21],training,small
3,2,sr44100_waveforms/000002-3.npy,raw-audio/000/000002.mp3,Hip-Hop,[21],[21],training,small
4,2,sr44100_waveforms/000002-4.npy,raw-audio/000/000002.mp3,Hip-Hop,[21],[21],training,small
...,...,...,...,...,...,...,...,...
239570,155066,sr44100_waveforms/155066-25.npy,raw-audio/155/155066.mp3,Hip-Hop,"[21, 811]","[811, 21]",training,small
239571,155066,sr44100_waveforms/155066-26.npy,raw-audio/155/155066.mp3,Hip-Hop,"[21, 811]","[811, 21]",training,small
239572,155066,sr44100_waveforms/155066-27.npy,raw-audio/155/155066.mp3,Hip-Hop,"[21, 811]","[811, 21]",training,small
239573,155066,sr44100_waveforms/155066-28.npy,raw-audio/155/155066.mp3,Hip-Hop,"[21, 811]","[811, 21]",training,small


## Create Training/Validation/Test Split CSV Files

In [18]:
# S3 resource handle used for the CSV uploads in the following cells.
resource = s3_tools.get_s3_resource()

# Load the 44100 Hz waveform directory CSV from S3.
wdf = s3_tools.load_csv_from_s3(
    client,
    'fma-dataset',
    'sr{}_waveform_directory.csv'.format(44100),
)

In [19]:
# Rows whose 'split' value is 'training', uploaded as their own directory CSV.
wdf_train = wdf[wdf['split'].eq('training')]
s3_tools.write_csv_to_s3(
    wdf_train,
    resource,
    'fma-dataset',
    'sr{}_waveform_training_directory.csv'.format(44100),
)

In [20]:
# Rows whose 'split' value is 'validation', uploaded as their own directory CSV.
wdf_val = wdf[wdf['split'].eq('validation')]
s3_tools.write_csv_to_s3(
    wdf_val,
    resource,
    'fma-dataset',
    'sr{}_waveform_validation_directory.csv'.format(44100),
)

In [21]:
# Rows whose 'split' value is 'test', uploaded as their own directory CSV.
wdf_test = wdf[wdf['split'].eq('test')]
s3_tools.write_csv_to_s3(
    wdf_test,
    resource,
    'fma-dataset',
    'sr{}_waveform_test_directory.csv'.format(44100),
)

In [22]:
# Sanity check: total rows across the three splits (compare with the full directory size).
sum(len(part) for part in (wdf_train, wdf_val, wdf_test))

239575

In [23]:
# Row count of the full waveform directory; should equal the sum over the splits.
wdf.shape[0]

239575