In [1]:
import time
from itertools import chain

import h5py
import numpy as np
import librosa
from re import sub
from loguru import logger
from pathlib import Path
from tqdm import tqdm
from tools.file_io import load_csv_file, write_pickle_file

In [2]:
def load_metadata(dataset, csv_file):
    """Read a caption CSV and return audio file names with processed captions.

    Args:
        dataset: dataset name. 'AudioCaps' combined with a csv path containing
            'train' selects the single-caption layout (column 'caption');
            every other combination reads 'caption_1' .. 'caption_5'.
        csv_file: path of the CSV file, handed to ``load_csv_file``.

    Returns:
        dict with two numpy arrays:
            'audio_name': the 'file_name' value of every row.
            'captions': one processed caption per row (single-caption layout)
                or a list of five processed captions per row.
    """
    single_caption = dataset == 'AudioCaps' and 'train' in csv_file
    if single_caption:
        caption_keys = None
    else:
        caption_keys = ['caption_{}'.format(i) for i in range(1, 6)]

    rows = load_csv_file(csv_file)

    audio_names, captions = [], []
    for row in rows:
        file_name = row['file_name']
        if caption_keys is None:
            row_captions = _sentence_process(row['caption'])
        else:
            row_captions = [
                _sentence_process(row[key], add_specials=False)
                for key in caption_keys
            ]
        audio_names.append(file_name)
        captions.append(row_captions)

    return {'audio_name': np.array(audio_names), 'captions': np.array(captions)}


In [3]:
def _create_vocabulary(captions):
    vocabulary = []
    for caption in captions:
        caption_words = caption.strip().split()
        vocabulary.extend(caption_words)
    words_list = list(set(vocabulary))
    words_list.sort(key=vocabulary.index)
    words_freq = [vocabulary.count(word) for word in words_list]
    words_list.append('<sos>')
    words_list.append('<eos>')
    words_list.append('<ukn>')
    words_freq.append(len(captions))
    words_freq.append(len(captions))
    words_freq.append(0)

    return words_list, words_freq


def _sentence_process(sentence, add_specials=False):

    # transform to lower case
    sentence = sentence.lower()

    if add_specials:
        sentence = '<sos> {} <eos>'.format(sentence)

    # remove any forgotten space before punctuation and double space
    sentence = sub(r'\s([,.!?;:"](?:\s|$))', r'\1', sentence).replace('  ', ' ')

    # remove punctuations
    sentence = sub('[,.!?;:\"]', ' ', sentence).replace('  ', ' ')
    return sentence


def pad_or_truncate(x, audio_length):
    """Zero-pad or truncate a 1-D signal to exactly ``audio_length`` items.

    Args:
        x: 1-D array-like signal.
        audio_length: target number of samples.

    Returns:
        (signal, valid_length): ``signal`` has ``audio_length`` entries and
        the same dtype as ``x``; ``valid_length`` is the number of leading
        entries that come from ``x`` (``min(len(x), audio_length)``).
    """
    x = np.asarray(x)
    length = len(x)
    if length > audio_length:
        return x[:audio_length], audio_length

    # Pad with zeros of the input dtype: a plain np.zeros() is float64 and
    # would silently promote float32 audio to float64 in this branch only.
    padding = np.zeros(audio_length - length, dtype=x.dtype)
    return np.concatenate((x, padding), axis=0), length


In [None]:

def pack_dataset_to_hdf5(dataset):
    """Pack every split of a dataset into HDF5 and pickle the training vocabulary.

    For each split in ('train', 'val', 'test') this reads
    ``data/<dataset>/csv_files/<split>.csv``, loads every listed audio file
    from ``data/<dataset>/waveforms/<split>/`` (resampled to 32000 Hz, mono),
    pads or truncates it to a fixed length and writes
    ``data/<dataset>/hdf5s/<split>/<split>.h5`` containing the datasets
    'audio_name', 'audio_length', 'waveform' and 'caption'.

    The vocabulary built from the training captions is written to
    ``data/<dataset>/pickles/words_list.p``.

    Args:
        dataset: dataset name. Only 'Clotho' is currently enabled.

    Raises:
        NotImplementedError: if ``dataset`` is anything other than 'Clotho'.
    """
    splits = ['train', 'val', 'test']
    sampling_rate = 32000
    all_captions = []

    # AudioCaps support (audio_duration = 10) is currently disabled, so every
    # name other than 'Clotho' is rejected. The AudioCaps-specific branches
    # further down are kept for when it is re-enabled.
    if dataset == 'Clotho':
        audio_duration = 30
    else:
        raise NotImplementedError(f'No dataset named: {dataset}')

    max_audio_length = audio_duration * sampling_rate  # 30 * 32000 samples

    for split in splits:
        csv_path = 'data/{}/csv_files/{}.csv'.format(dataset, split)
        audio_dir = 'data/{}/waveforms/{}/'.format(dataset, split)
        hdf5_path = 'data/{}/hdf5s/{}/'.format(dataset, split)

        # make dir for hdf5
        Path(hdf5_path).mkdir(parents=True, exist_ok=True)

        meta_dict = load_metadata(dataset, csv_path)
        # meta_dict: {'audio_name': [...], 'captions': [...]}

        audio_nums = len(meta_dict['audio_name'])

        if split == 'train':
            # Only training captions contribute to the vocabulary.
            if dataset == 'Clotho':
                # Each entry holds several captions; flatten them.
                for caps in meta_dict['captions']:
                    all_captions.extend(caps)
            else:
                all_captions.extend(meta_dict['captions'])

        start_time = time.time()

        with h5py.File(hdf5_path + '{}.h5'.format(split), 'w') as hf:

            hf.create_dataset('audio_name', shape=(audio_nums,), dtype=h5py.special_dtype(vlen=str))
            hf.create_dataset('audio_length', shape=(audio_nums,), dtype=np.uint32)
            hf.create_dataset('waveform', shape=(audio_nums, max_audio_length), dtype=np.float32)

            # One caption per clip for the AudioCaps training split, five otherwise.
            if split == 'train' and dataset == 'AudioCaps':
                hf.create_dataset('caption', shape=(audio_nums,), dtype=h5py.special_dtype(vlen=str))
            else:
                hf.create_dataset('caption', shape=(audio_nums, 5), dtype=h5py.special_dtype(vlen=str))

            for i in tqdm(range(audio_nums)):
                audio_name = meta_dict['audio_name'][i]

                audio, _ = librosa.load(audio_dir + audio_name, sr=sampling_rate, mono=True)
                # audio_length is the number of samples that are real signal
                # (before zero padding).
                audio, audio_length = pad_or_truncate(audio, max_audio_length)

                hf['audio_name'][i] = audio_name.encode()
                hf['audio_length'][i] = audio_length
                hf['waveform'][i] = audio
                hf['caption'][i] = meta_dict['captions'][i]

        logger.info(f'Packed {split} set to {hdf5_path} using {time.time() - start_time} s.')

    # Word frequencies are computed by the helper but only the word list is saved.
    words_list, _ = _create_vocabulary(all_captions)
    logger.info(f'Creating vocabulary: {len(words_list)} tokens!')

    pickle_path = 'data/{}/pickles/words_list.p'.format(dataset)
    # Create the output directory, mirroring what is done for the hdf5 files.
    Path(pickle_path).parent.mkdir(parents=True, exist_ok=True)
    write_pickle_file(words_list, pickle_path)


In [4]:
# Notebook-level settings; they mirror the local values used inside
# pack_dataset_to_hdf5 and are read by the exploratory cells below.
splits = ['train', 'val', 'test']
# Sampling rate (the packing function passes the same value as `sr` to librosa.load).
sampling_rate = 32000
# Clip duration used for Clotho — presumably seconds; TODO confirm.
audio_duration = 30
# Fixed waveform length in samples.
max_audio_length = audio_duration * sampling_rate # 30 * 32000
dataset='Clotho'

In [5]:
import os
# Show the working directory: the relative 'data/Clotho/...' paths used by
# later cells are resolved against it.
os.getcwd()

'/home/clim-lab/바탕화면/main_drive/jiwon/retrieval'

In [6]:
# Root folder of the Clotho data. Resolved against the current working
# directory (expected to be the project root, as the relative
# 'data/Clotho/...' paths in later cells already assume) instead of a
# hard-coded, machine-specific absolute path.
data_path = os.path.abspath(os.path.join('data', 'Clotho'))

In [7]:
# CSV with the training file names and captions.
csv_path = os.path.join(data_path, 'csv_files/train.csv')

In [8]:
import pandas as pd
# Preview the raw training CSV (file_name plus caption_1 .. caption_5 columns).
pd.read_csv(csv_path)

Unnamed: 0,file_name,caption_1,caption_2,caption_3,caption_4,caption_5
0,Distorted AM Radio noise.wav,A muddled noise of broken channel of the TV,A television blares the rhythm of a static TV.,Loud television static dips in and out of focus,The loud buzz of static constantly changes pit...,heavy static and the beginnings of a signal on...
1,Paper_Parchment_Rustling.wav,A person is turning a map over and over.,A person is very carefully rapping a gift for ...,A person is very carefully wrapping a gift for...,"He sighed as he turned the pages of the book, ...","papers are being turned, stopped, then turned ..."
2,03 Whales Slowing Down.wav,Several barnyard animals mooing in a barn whil...,"The vocalization of several whales, along with...","Underwater, large numbers of shrimp clicking a...",Whales sing to one another over the flowing wa...,wales sing to one another with water flowing i...
3,Rope tied to boat in port.wav,An office chair is squeaking as someone bends ...,Popping and squeaking gradually tapers off to ...,Someone is opening a creaky door slowly while ...,Squeaking and popping followed by gradual popp...,an office chair is squeaking as someone leans ...
4,carpenter bee.wav,A flying bee is buzzing loudly around an objec...,An annoying fly is buzzing loudly and consiste...,An insect buzzing in the foreground as birds c...,"An insect trapped in a spider web struggles, b...","Outdoors, insect trapped in a spider web and t..."
...,...,...,...,...,...,...
3834,Metallic Ping CPU Heatsink.wav,Tools are being tried to make different sounds.,The metallic clang is made at different and va...,One at a time the metal chimes are being chimed.,A metallic clang is made at various pitches.,Metal chimes being chimed one at a time.
3835,Fumbling.wav,"multiple items are picked up, and a tin pan is...",They are sorting through objects and dropping ...,A person works moving cans and other items and...,Cans and items are being moved around clunking...,Going through all of the trash can noisily.
3836,cold waterdrops in a hot pot.wav,Someone is flipping over food on a hot grill.,Someone is flipping over food above a hot grill.,All recorded audio is drowned out by the loudn...,Someone waters down a boat with a high pressur...,"A match is lit, then another match is lit."
3837,Dry leaves falling on cement floor.wav,Tapping noises are being made before paper is ...,Repeated tapping hits a hard surface and multi...,A person is typing and pauses a few times in b...,They are continually dropping clips on the table.,Sticks crunch and break while being walked on.


In [9]:
# Locations for the training split: caption CSV, waveform folder and the
# target HDF5 file.
csv_path = os.path.join(data_path, 'csv_files/train.csv')
audio_dir = os.path.join(data_path, 'waveforms/train')
hdf5_path = os.path.join(data_path, 'hdf5s/train/train.h5')

In [10]:
# Load file names and processed captions for the training split.
meta_dict = load_metadata(dataset, csv_path)
# meta_dict: {'audio_name': np.ndarray, 'captions': np.ndarray}
print(meta_dict)

{'audio_name': array(['Distorted AM Radio noise.wav', 'Paper_Parchment_Rustling.wav',
       '03 Whales Slowing Down.wav', ...,
       'cold waterdrops in a hot pot.wav',
       'Dry leaves falling on cement floor.wav', 'Wood chips.wav'],
      dtype='<U133'), 'captions': array([['a muddled noise of broken channel of the tv',
        'a television blares the rhythm of a static tv ',
        'loud television static dips in and out of focus',
        'the loud buzz of static constantly changes pitch and volume ',
        'heavy static and the beginnings of a signal on a transistor radio'],
       ['a person is turning a map over and over ',
        'a person is very carefully rapping a gift for someone else ',
        'a person is very carefully wrapping a gift for someone else ',
        'he sighed as he turned the pages of the book stopping to scan the information ',
        'papers are being turned stopped then turned again and someone is breathing '],
       ['several barnyard animal

In [14]:
# Inspect the audio file names and how many entries the training split has.
display(meta_dict['audio_name'])
display(len(meta_dict['audio_name']))
# Number of audio entries listed in the metadata.
audio_nums = len(meta_dict['audio_name'])

array(['Distorted AM Radio noise.wav', 'Paper_Parchment_Rustling.wav',
       '03 Whales Slowing Down.wav', ...,
       'cold waterdrops in a hot pot.wav',
       'Dry leaves falling on cement floor.wav', 'Wood chips.wav'],
      dtype='<U133')

3839

In [15]:
# Gather every training caption into one flat list.
all_captions = []

meta_dict = load_metadata(dataset, csv_path)
# meta_dict: {'audio_name': [...], 'captions': [...]}
audio_nums = len(meta_dict['audio_name'])

if dataset == 'Clotho':
    # Each entry holds several captions; flatten them one level.
    all_captions.extend(chain.from_iterable(meta_dict['captions']))
else:
    all_captions.extend(meta_dict['captions'])


In [16]:
# Walk over all splits the same way pack_dataset_to_hdf5 does (without
# writing any audio) and collect the training captions.
splits = ['train', 'val', 'test']
sampling_rate = 32000
all_captions = []

# NOTE: audio_duration is not set in this cell; it comes from the
# configuration cell above.
max_audio_length = audio_duration * sampling_rate  # 30 * 32000

for split in splits:
    csv_path = 'data/Clotho/csv_files/{}.csv'.format(split)
    audio_dir = 'data/Clotho/waveforms/{}/'.format(split)
    hdf5_path = 'data/Clotho/hdf5s/{}/'.format(split)

    # Make sure the output folder for this split exists.
    Path(hdf5_path).mkdir(parents=True, exist_ok=True)

    meta_dict = load_metadata(dataset, csv_path)
    # meta_dict: {'audio_name': [...], 'captions': [...]}
    audio_nums = len(meta_dict['audio_name'])

    # Only the training split contributes captions.
    if split != 'train':
        continue

    if dataset == 'Clotho':
        # Each entry holds several captions; flatten them one level.
        all_captions.extend(chain.from_iterable(meta_dict['captions']))
    else:
        all_captions.extend(meta_dict['captions'])


In [17]:
# Build the vocabulary (words in first-occurrence order plus the special
# tokens) and the matching frequency list from the training captions.
words_list, words_freq = _create_vocabulary(all_captions)

In [18]:
# Map every vocabulary token to its frequency; the two lists returned by
# _create_vocabulary are index-aligned.
words_dictionary = dict(zip(words_list, words_freq))
words_dictionary

{'a': 19595,
 'muddled': 1,
 'noise': 533,
 'of': 3826,
 'broken': 37,
 'channel': 16,
 'the': 10205,
 'tv': 6,
 'television': 43,
 'blares': 23,
 'rhythm': 56,
 'static': 272,
 'loud': 639,
 'dips': 3,
 'in': 5689,
 'and': 9560,
 'out': 585,
 'focus': 1,
 'buzz': 75,
 'constantly': 113,
 'changes': 36,
 'pitch': 126,
 'volume': 95,
 'heavy': 425,
 'beginnings': 1,
 'signal': 44,
 'on': 2572,
 'transistor': 1,
 'radio': 211,
 'person': 1492,
 'is': 7333,
 'turning': 69,
 'map': 1,
 'over': 773,
 'very': 356,
 'carefully': 10,
 'rapping': 6,
 'gift': 2,
 'for': 243,
 'someone': 1603,
 'else': 32,
 'wrapping': 8,
 'he': 49,
 'sighed': 1,
 'as': 2670,
 'turned': 207,
 'pages': 83,
 'book': 77,
 'stopping': 60,
 'to': 1668,
 'scan': 1,
 'information': 1,
 'papers': 63,
 'are': 3160,
 'being': 1367,
 'stopped': 33,
 'then': 1848,
 'again': 347,
 'breathing': 50,
 'several': 487,
 'barnyard': 2,
 'animals': 82,
 'mooing': 11,
 'barn': 3,
 'while': 2357,
 'it': 1478,
 'rains': 43,
 'outside':

In [19]:
# Vocabulary size, including the three special tokens appended by
# _create_vocabulary ('<sos>', '<eos>', '<ukn>').
len(words_freq)

4368