# Create MP3 files
The purpose of this notebook is to trial creating MP3 files from the source recordings, in a form suitable for building a Hugging Face (HF) dataset.

In [8]:
import pandas as pd
import numpy as np
import webrtcvad
import os
import sys
module_path = os.path.abspath(os.path.join('../vad_utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from vad_utils import SAMPLING_RATE, FRAME_SIZE_MS, SAMPLES_PER_FRAME
import vad_utils as vu
from pydub import AudioSegment


/home/jovyan/work/vox-grn/Projects/vad_utils


In [9]:
# Load the catalogue of items to be processed and list its columns.
ITEMS_CSV = "/home/jovyan/work/GRN-Notebooks/Data/all_items_for_processing.csv"
items = pd.read_csv(ITEMS_CSV)
items.columns


Index(['Unnamed: 0', 'LanguageID', 'ISO', 'Language', 'Program', 'Track',
       'Recordist', 'Location', 'Year', 'Path', 'Filename', 'Size', 'Length',
       'ID', 'Program Item Number', 'Tape Side', 'Title', 'Item Type',
       'composite', 'start', 'end'],
      dtype='object')

In [3]:
# Get a subset of items to make a small dataset: keep only the
# 'Scripture Stories' items, index them by 'ID', and drop the columns that are
# not needed downstream.
# Written as one chain without inplace=True so that the filtered slice of
# `items` is never mutated in place (which is what triggers pandas'
# SettingWithCopyWarning).
working_set = (
    items[items['Item Type'] == 'Scripture Stories']
    .set_index('ID')
    .drop(columns=['Unnamed: 0', 'LanguageID', 'Language', 'Program', 'Track', 'Recordist', 'Size', 'Length', 'Program Item Number', 'Tape Side', 'Item Type', 'composite'])
)

Now, for each item, we want to create a set of shorter segments based on the time divisions.

In [4]:
# Module-level detector and filter, created once and reused by every call to
# audio_to_voice_segments.
vad = webrtcvad.Vad(2)
# NOTE(review): vadfilt is shared across calls; if vu.VADFilter keeps internal
# state, that state carries over from one recording to the next — confirm.
vadfilt = vu.VADFilter()


def audio_to_voice_segments(audio_segment, segment_size_secs):
    """Split ``audio_segment`` into voice segments.

    The audio is cut into frames by vu.generate_frames_from_audio_segments,
    each frame is classified with ``vad.is_speech``, the per-frame decisions
    are passed one by one through ``vadfilt.filt``, and the filtered decisions
    are grouped by vu.form_segments.

    Returns whatever vu.divide_into_segments produces for those groups and
    ``segment_size_secs``.
    """
    frames = vu.generate_frames_from_audio_segments(FRAME_SIZE_MS, audio_segment, SAMPLING_RATE)
    # First pass: raw per-frame speech decisions.
    speech_flags = [vad.is_speech(frame.bytes, SAMPLING_RATE) for frame in frames]
    # Second pass: run every decision through the filter, in order.
    filtered_flags = list(map(vadfilt.filt, speech_flags))
    voiced = vu.form_segments(filtered_flags)
    return vu.divide_into_segments(voiced, segment_size_secs)


In [5]:

def generate_audio_segments(item, segment_size_secs):
    """Describe the voice segments of one item without writing any audio.

    Loads the item's WAV file from '/media/programs/' + Path + Filename, trims
    it to ``item.start``..``item.end`` (seconds, converted to milliseconds for
    slicing) and runs audio_to_voice_segments on the result.

    Returns a list with one dict per segment. Each dict holds the item's own
    fields, with 'start' and 'end' set to the segment bounds as returned by
    vu.convert_frames_to_seconds and 'seg' set to the segment's position.
    """
    source = AudioSegment.from_file('/media/programs/' + item.Path + item.Filename, format='wav')
    clip = source[int(item.start * 1000):int(item.end * 1000)]
    # NOTE(review): unlike generate_mp3_segments, the clip is not passed
    # through condition_audio_segment before segmentation — confirm intended.
    voiced = audio_to_voice_segments(clip, segment_size_secs)
    return [
        {
            **item._asdict(),
            'start': vu.convert_frames_to_seconds(seg.start),
            'end': vu.convert_frames_to_seconds(seg.stop),
            'seg': idx,
        }
        for idx, seg in enumerate(voiced)
    ]


In [6]:
from pathlib import Path

def prepare_dir(item):
    """Make sure the output directory for ``item`` exists and return its name.

    The directory is '/media/programs/dataset/stories/' followed by
    ``item.ISO``. Missing parent directories are created, and an already
    existing directory is not an error. The name is returned as a string.
    """
    out_dir = '/media/programs/dataset/stories/' + item.ISO
    Path(out_dir).mkdir(exist_ok=True, parents=True)
    return out_dir

def condition_audio_segment(audio_seg):
    """Return ``audio_seg`` as single-channel, 2-byte-sample audio at SAMPLING_RATE.

    Each property is converted only when it differs from the target, so a
    segment that already matches on all three is returned unchanged.
    """
    mono = audio_seg if audio_seg.channels == 1 else audio_seg.set_channels(1)
    pcm16 = mono if mono.sample_width == 2 else mono.set_sample_width(2)
    return pcm16 if pcm16.frame_rate == SAMPLING_RATE else pcm16.set_frame_rate(SAMPLING_RATE)
        
# We want to store the data as 16 bit 16kHz data
def generate_mp3_segments(item, segment_size_secs):
    """Cut one item into voice segments and export each one as an MP3 file.

    Loads the item's WAV file from '/media/programs/' + Path + Filename, trims
    it to ``item.start``..``item.end`` (seconds), conditions it with
    condition_audio_segment and splits it with audio_to_voice_segments. Each
    segment is written to ``<prepare_dir(item)>/<item.Index>_<NNN>.mp3`` at a
    bitrate of 32k, where NNN is the zero-padded segment number.

    Returns a list with one dict per exported segment. Each dict holds the
    item's own fields plus:
      'start', 'stop' -- segment bounds from vu.convert_frames_to_seconds
      'seg'           -- segment number within the item
      'fname'         -- path of the exported MP3 file
    The item's own 'end' value is left untouched, so it still refers to the
    end of the whole item rather than to the segment.
    """
    audio_seg = AudioSegment.from_file('/media/programs/' + item.Path + item.Filename, format='wav')
    start_ms = int(item.start * 1000)
    end_ms = int(item.end * 1000)
    audio_seg = condition_audio_segment(audio_seg[start_ms:end_ms])
    segs = audio_to_voice_segments(audio_seg, segment_size_secs)
    dirname = prepare_dir(item)
    rows = []
    for i, seg in enumerate(segs):
        fname = f'{dirname}/{item.Index}_{i:0>3}.mp3'
        start = vu.convert_frames_to_ms(seg.start)
        stop = vu.convert_frames_to_ms(seg.stop)
        # export() returns the open output file object; close it explicitly
        # instead of leaving one open handle per segment for the garbage
        # collector to clean up.
        audio_seg[start:stop].export(fname, format='mp3', bitrate='32k').close()
        desc = dict(item._asdict())
        desc['start'] = vu.convert_frames_to_seconds(seg.start)
        desc['stop'] = vu.convert_frames_to_seconds(seg.stop)
        desc['seg'] = i
        desc['fname'] = fname
        rows.append(desc)

    return rows


In [7]:
# Generate the working-set dataset: export MP3 segments for every item and
# collect one metadata record per exported segment.
SEGMENT_SIZE_SECS = 6.0  # passed as segment_size_secs for every item
segmented_segs = []
for n, item_row in enumerate(working_set.itertuples()):
    segmented_segs += generate_mp3_segments(item_row, SEGMENT_SIZE_SECS)
    # Progress: one dot per item, plus the running index on every 50th item.
    print('.', end='')
    if not n % 50:
        print(n)

seg_df = pd.DataFrame.from_records(segmented_segs)


.0
..................................................50
..................................................100
..................................................150
......

Now we need to generate a JSON file to go with the dataset.

In [9]:
# Show which columns the per-segment records ended up with.
print(seg_df.columns)

Index(['Index', 'ISO', 'Location', 'Year', 'Path', 'Filename', 'Title',
       'start', 'end', 'stop', 'seg', 'fname'],
      dtype='object')


In [10]:
# The source file location is not needed in the dataset metadata.
seg_df = seg_df.drop(columns=['Path', 'Filename'])

In [12]:
# Drop the timing columns and rename 'Index' (the field itertuples uses for
# the row index) to 'item'.
seg_df = (
    seg_df
    .drop(columns=['start', 'stop', 'end'])
    .rename(columns={'Index': 'item'})
)

In [13]:
from datasets import Dataset
# Wrap the segment table in a Hugging Face Dataset, show the first record and
# write the whole dataset out as JSON.
ds = Dataset.from_pandas(seg_df)
print(ds[0])
# NOTE(review): this path is under '/media/programs/datasets/' (plural), while
# the MP3 files are written under '/media/programs/dataset/' — confirm intended.
ds.to_json('/media/programs/datasets/stories/working_set.json')

{'item': 'A23090_001', 'ISO': 'krw', 'Location': 'Liberia', 'Year': 1990.0, 'Title': "The Poor Woman's Offering Mark 12:41-44", 'seg': 0, 'fname': '/media/programs/dataset/stories/krw/A23090_001_000.mp3'}


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

418940

In [15]:
# now try loading the dataset
from datasets import load_dataset

# Read the JSON file back in through the 'json' builder. With split='train'
# the result is indexed directly (try_ds[0]) rather than via a split name.
try_ds = load_dataset('json', data_files='/media/programs/datasets/stories/working_set.json', split='train')


Using custom data configuration default-fc0693ab274b3ef7


Downloading and preparing dataset json/default to /home/jovyan/.cache/huggingface/datasets/json/default-fc0693ab274b3ef7/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/json/default-fc0693ab274b3ef7/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


In [19]:
# Sanity check: show the first record and the number of records loaded.
print(try_ds[0])
print(len(try_ds))

{'item': 'A23090_001', 'ISO': 'krw', 'Location': 'Liberia', 'Year': 1990.0, 'Title': "The Poor Woman's Offering Mark 12:41-44", 'seg': 0, 'fname': '/media/programs/dataset/stories/krw/A23090_001_000.mp3'}
2180


In [21]:
# Rename the path column to 'file_name', the column name expected in metadata.csv.
seg_df = seg_df.rename(columns={'fname': 'file_name'})

Now I want to create the metadata.csv file used by the `audiofolder` loader on Hugging Face.
It requires:

    1. the repo structure to be:
        metadata.csv
        data/*split1*/file.mp3
        data/*split2*/file.mp3
    2. the metadata.csv to include a column called file_name that has names relative to the root directory of the dataset (where metadata.csv is located)

I am not sure whether it handles different configurations. Let's keep it simple at first and not use a split.

In [22]:
# Display the first record to check the renamed column.
seg_df.iloc[0]

item                                                A23090_001
ISO                                                        krw
Location                                               Liberia
Year                                                    1990.0
Title                  The Poor Woman's Offering Mark 12:41-44
seg                                                          0
file_name    /media/programs/dataset/stories/krw/A23090_001...
Name: 0, dtype: object

In [23]:
# Build the metadata table from a copy of seg_df. Plain assignment would only
# give the same DataFrame a second name, so every later in-place edit of
# `metadata` would silently change seg_df as well.
metadata = seg_df.copy()
# Keep only the part of each path that follows
# '/media/programs/dataset/stories/'. expand=False makes str.extract return a
# Series for the single capture group instead of a one-column DataFrame.
metadata['file_name'] = metadata['file_name'].str.extract(
    r'/media/programs/dataset/stories/(.*)', expand=False
)

In [24]:
# Prefix every path with 'data/' so it is relative to the dataset root.
# Not idempotent: each run of this cell prepends another 'data/'.
metadata['file_name'] = 'data/' + metadata['file_name']

In [26]:
# Make 'file_name' the index of the metadata table (modifies it in place).
metadata.set_index('file_name', inplace=True)

In [27]:
# Write metadata.csv into the root of the grnvox_test dataset directory.
metadata.to_csv('/media/programs/dataset/stories/grnvox_test/metadata.csv')

I used the git command line to check out the repo and add the data files. Let's try to load it.

In [41]:
# Load the dataset by its repo id 'johno-grn/grnvox_test'; use_auth_token=True
# asks the loader to use the locally stored Hugging Face credentials.
grnvox_test = load_dataset('johno-grn/grnvox_test', use_auth_token=True)

Using custom data configuration johno-grn--grnvox_test-fdbb0a063e17a22e
Reusing dataset csv (/home/jovyan/.cache/huggingface/datasets/johno-grn___csv/johno-grn--grnvox_test-fdbb0a063e17a22e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [42]:
# Show the first record of the 'train' split.
print(grnvox_test['train'][0])

{'file_name': 'data/krw/A23090_001_000.mp3', 'item': 'A23090_001', 'ISO': 'krw', 'Location': 'Liberia', 'Year': 1990.0, 'Title': "The Poor Woman's Offering Mark 12:41-44", 'seg': 0}


In [2]:
from datasets import load_dataset
# Load the local copy through the 'audiofolder' builder, pointing data_dir at
# the directory that holds metadata.csv.
grnvox_test = load_dataset("audiofolder", data_dir='/media/programs/dataset/stories/grnvox_test', use_auth_token=True)

Resolving data files:   0%|          | 0/2181 [00:00<?, ?it/s]

Using custom data configuration default-1c0bdf108f173cb3


Downloading and preparing dataset audiofolder/default to /home/jovyan/.cache/huggingface/datasets/audiofolder/default-1c0bdf108f173cb3/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...
               

Downloading data files #1:   0%|          | 0/137 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/137 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/137 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/136 [00:00<?, ?obj/s]

 

Downloading data files #4:   0%|          | 0/137 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/137 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/136 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/audiofolder/default-1c0bdf108f173cb3/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Inspect the 'audio' field of one record from the 'train' split.
print(grnvox_test['train'][2000]['audio'])

{'path': '/media/programs/dataset/stories/grnvox_test/data/yon/A63676_005_000.mp3', 'array': array([0.        , 0.        , 0.        , ..., 0.02023047, 0.01743877,
       0.01676089], dtype=float32), 'sampling_rate': 16000}


In [9]:
from datasets import load_dataset
# Load through the 'audiofolder' builder again, this time from a single
# tar.gz archive passed via data_files.
grnvox_test_rem = load_dataset("audiofolder", data_files=['/media/programs/dataset/stories/grnvox_test/grnvox_test.tar.gz'], use_auth_token=True)

Using custom data configuration default-375a603028e41e9a


Downloading and preparing dataset audiofolder/default to /home/jovyan/.cache/huggingface/datasets/audiofolder/default-375a603028e41e9a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/audiofolder/default-375a603028e41e9a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Inspect the 'audio' field of one record loaded from the archive.
print(grnvox_test_rem['train'][200]['audio'])


{'path': '/home/jovyan/.cache/huggingface/datasets/downloads/extracted/b32c294ef61fc35f17fcefc55357171fb07e40ac7e73a70a92eb4a1490637c80/data/yon/A63675_013_014.mp3', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.0140425 ,
       -0.01663685, -0.01981813], dtype=float32), 'sampling_rate': 16000}
