# Partial Process of Segment GRN Data for vox-grn

See SegmentVox.ipynb for a full description.

In an attempt to speed up the processing, I have divided the items into 10 groups so that they can all be processed in parallel. It probably won't be 10 times faster, but it should still be quicker.


In [1]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../vad_utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from vad_utils import SAMPLING_RATE, FRAME_SIZE_MS, SAMPLES_PER_FRAME
import vad_utils as vu
from pydub import AudioSegment
import pickle as pkl
from pathlib import Path
import time


In [2]:
# Identifier of this notebook; it is embedded in the per-notebook file names below.
NOTEBOOK_ID = 13

# Root directories for the raw segment data and for the generated datasets.
SEGMENTS_DIR = '/media/originals/segments/'
DATASETS_DIR = '/media/originals/datasets/'

# Data sub-directory of each dataset (appended to DATASETS_DIR when used).
SEC_4_DATA_DIR = 'seg_4sec/data/'
SEC_6_DATA_DIR = 'seg_6sec/data/'
SEC_10_DATA_DIR = 'seg_10sec/data/'

# Per-notebook CSV files describing the segments written for each dataset.
SEG_4_SEC_DF = SEGMENTS_DIR + f'seg_4sec_{NOTEBOOK_ID}.csv'
SEG_6_SEC_DF = SEGMENTS_DIR + f'seg_6sec_{NOTEBOOK_ID}.csv'
SEG_10_SEC_DF = SEGMENTS_DIR + f'seg_10sec_{NOTEBOOK_ID}.csv'

# Segment size used for each dataset (seconds, going by the constant names).
SEG_4_SEC = 4.0
SEG_6_SEC = 6.0
SEG_10_SEC = 10.0

In [3]:
# Load this notebook's item descriptions and index them by the 'ID' column.
# The trailing expression displays the remaining column names in the notebook.
items = pd.read_csv(f'/media/originals/segments/items_{NOTEBOOK_ID}.csv').set_index('ID')
items.columns


Index(['iso', 'program', 'location', 'year', 'path', 'filename', 'item_no',
       'title', 'start', 'end'],
      dtype='object')

In [4]:
# Show one row (position 100) as a sanity check of the loaded columns.
print(items.iloc[100])

iso                                       swk
program                                A82313
location                     Blantyre, Malawi
year                                   1998.0
path        Programs/82/82313/A82313/PM-1401/
filename                       A82313-023.wav
item_no                                    23
title                              Picture 66
start                                     0.0
end                                      77.0
Name: A82313_023, dtype: object


In [5]:
# Show the item IDs that make up the index, together with their count.
print(items.index)

Index(['A19381_051', 'A65140_030', 'A65141_006', 'A19381_050', 'A72880_012',
       'A72880_013', 'A72880_014', 'A65140_021', 'A65140_020', 'A19381_049',
       ...
       'A21071_017', 'A21071_003', 'A21071_001', 'A21071_034', 'A21071_035',
       'A21071_036', 'A21071_037', 'A21071_038', 'A21071_039', 'A21071_040'],
      dtype='object', name='ID', length=12582)


In [6]:
# generate directories and filenames
def prepare_dir(dirname):
    """Make sure a directory exists and return its path with a trailing '/'.

    Args:
        dirname: directory path as a string or ``os.PathLike`` object, with or
            without a trailing slash.

    Returns:
        The path as a string that always ends in '/', so callers can append a
        file name by plain concatenation.

    Raises:
        ValueError: if ``dirname`` is empty.
    """
    # Accept pathlib.Path as well as str; indexing a Path would fail.
    dirname = os.fspath(dirname)
    if not dirname:
        # Previously an empty string surfaced as an opaque IndexError.
        raise ValueError('dirname must not be empty')
    if not dirname.endswith('/'):
        dirname += '/'
    # Create missing parents too; an already existing directory is fine.
    Path(dirname).mkdir(parents=True, exist_ok=True)
    return dirname

def prepare_raw_seg_dir(item):
    """Create (if needed) and return the raw-segment directory for the item's ``iso`` code.

    The directory is ``SEGMENTS_DIR`` followed by ``item.iso``; the returned
    string ends in '/'.
    """
    iso_dir = SEGMENTS_DIR + item.iso
    return prepare_dir(iso_dir)

def raw_seg_filename(item):
    """Return the pickle file name for an item: ``<program>_<item_no>.pkl``.

    ``item_no`` is zero-padded to at least three digits.
    """
    program, number = item.program, item.item_no
    return f'{program}_{number:03d}.pkl'

def prepare_dataset_data_dir(item, dataset_dir):
    """Create (if needed) and return the data directory of a dataset for the item's ``iso`` code.

    The directory is ``DATASETS_DIR`` + ``dataset_dir`` + ``item.iso``; the
    returned string ends in '/'.
    """
    iso_dir = DATASETS_DIR + dataset_dir + item.iso
    return prepare_dir(iso_dir)

def seg_mp3_filename(item, seg):
    """Return the mp3 file name for segment number ``seg`` of an item.

    The name is ``<program>_<item_no>_<seg>.mp3`` with both numbers
    zero-padded to at least three digits.
    """
    program, number = item.program, item.item_no
    return f'{program}_{number:03d}_{seg:03d}.mp3'


In [7]:

def condition_audio_segment(audio_seg):
    """Return ``audio_seg`` converted to 1 channel, sample width 2 and ``SAMPLING_RATE``.

    Each property is only converted when it differs from the wanted value, so
    a segment that already matches is returned unchanged.
    """
    # (attribute to inspect, wanted value, setter to call), applied in this order.
    conversions = (
        ('channels', 1, 'set_channels'),
        ('sample_width', 2, 'set_sample_width'),
        ('frame_rate', SAMPLING_RATE, 'set_frame_rate'),
    )
    for attribute, wanted, setter in conversions:
        if getattr(audio_seg, attribute) != wanted:
            audio_seg = getattr(audio_seg, setter)(wanted)
    return audio_seg
        


In [8]:

def save_the_segment_info(item, segs):
    """Pickle ``segs`` into the item's raw-segment file, replacing any existing file."""
    target = prepare_raw_seg_dir(item) + raw_seg_filename(item)
    with open(target, 'wb') as out:
        pkl.dump(segs, out)

def read_the_segment_info(item):
    """Return the segments previously pickled for ``item``.

    An empty list is returned when the item's pickle file is missing or has
    zero length.
    """
    cache_path = prepare_raw_seg_dir(item) + raw_seg_filename(item)
    has_content = os.path.exists(cache_path) and os.path.getsize(cache_path) > 0
    if not has_content:
        return []
    with open(cache_path, 'rb') as cached:
        return pkl.load(cached)
    
def extract_audio_segment_for_item(item):
    """Load the audio of ``item`` and obtain its raw voice segments.

    The source file is read from '/media/programs/' + ``item.path`` +
    ``item.filename``, cut to the ``item.start``..``item.end`` range (given in
    seconds, converted to milliseconds for slicing) and conditioned with
    ``condition_audio_segment``. Voice segments come from the item's pickle
    file when it holds any; otherwise they are computed with
    ``vu.audio_to_raw_voice_segments`` and saved for the next run.

    Returns:
        A tuple ``(audio_seg, segs)``.
    """
    # Files are treated as wav unless the name ends in '.mp3'.
    fmt = 'mp3' if item.filename[-4:] == '.mp3' else 'wav'
    whole_file = AudioSegment.from_file('/media/programs/' + item.path + item.filename, format=fmt)

    start_ms, end_ms = int(item.start * 1000), int(item.end * 1000)
    audio_seg = condition_audio_segment(whole_file[start_ms:end_ms])

    # Reuse cached voice segments when available; compute and cache otherwise.
    segs = read_the_segment_info(item)
    if len(segs) == 0:
        segs = vu.audio_to_raw_voice_segments(audio_seg)
        save_the_segment_info(item, segs)

    return audio_seg, segs

def create_mp3_files(audio_seg, segs, time_per_segment, dataset_dir, item):
    """Export the fixed-length segments of an item as mp3 files and describe them.

    ``segs`` is regrouped by ``vu.divide_into_segments`` using
    ``time_per_segment``. Each resulting segment is exported as a 32k mp3 in
    the dataset's data directory unless that file already exists.

    Returns:
        One dict per segment holding all fields of ``item`` plus 'seg_start',
        'seg_stop', 'seg' (the segment number) and 'file_name' (the path
        starting at ``dataset_dir``).
    """
    chunks = vu.divide_into_segments(segs, time_per_segment)
    out_dir = prepare_dataset_data_dir(item, dataset_dir)

    rows = []
    for number, chunk in enumerate(chunks):
        mp3_name = seg_mp3_filename(item, number)
        relative_name = dataset_dir + item.iso + '/' + mp3_name
        absolute_name = out_dir + mp3_name

        # Existing files are kept, so an interrupted run can be resumed cheaply.
        if not os.path.exists(absolute_name):
            start_ms = vu.convert_frames_to_ms(chunk.start)
            stop_ms = vu.convert_frames_to_ms(chunk.stop)
            audio_seg[start_ms:stop_ms].export(absolute_name, format='mp3', bitrate='32k')

        row = dict(item._asdict())
        row.update(
            seg_start=vu.convert_frames_to_seconds(chunk.start),
            seg_stop=vu.convert_frames_to_seconds(chunk.stop),
            seg=number,
            file_name=relative_name,
        )
        rows.append(row)

    return rows

def update_dataframes(seg_df_csv, seg_records):
    """Append ``seg_records`` to the CSV file ``seg_df_csv``, creating it if needed.

    The records are indexed by their 'file_name' key. When the CSV already
    exists its rows are kept and the new rows are added after them. Nothing is
    read or written when ``seg_records`` is empty.
    """
    if len(seg_records) == 0:
        return

    if os.path.isfile(seg_df_csv):
        existing = pd.read_csv(seg_df_csv, index_col='file_name')
        fresh = pd.DataFrame.from_records(seg_records, index='file_name')
        combined = pd.concat([existing, fresh])
    else:
        combined = pd.DataFrame.from_records(seg_records, index='file_name')

    combined.to_csv(seg_df_csv)


In [9]:
# processing these items might take a very long time. To permit the process to be interrupted and restarted the indexes of processed items
# are maintained in a set that is pickled on each batch. This allows the batch to quickly pick up where it left off.
def process_record_batch(items_df, *, batch_size=10):
    """Process up to ``batch_size`` not-yet-processed items and persist the progress.

    For every selected item the audio is extracted and the 4, 6 and 10 second
    mp3 segments are written. The segment descriptions collected during the
    batch are appended to the three per-notebook CSV files, and the set of
    processed item IDs is pickled so a later call (or a restarted notebook)
    continues where this one stopped.

    Args:
        items_df: DataFrame of items; its index values identify the items.
        batch_size: maximum number of items handled in this call.

    Returns:
        The set of IDs of all items processed so far, including earlier runs.
    """
    processed_file = f'/media/originals/segments/processed16_{NOTEBOOK_ID}.pkl'
    if os.path.isfile(processed_file):
        with open(processed_file, 'rb') as pklFile:
            processed_items = pkl.load(pklFile)
    else:
        processed_items = set()

    segmented_4sec_segs = []
    segmented_6sec_segs = []
    segmented_10sec_segs = []

    batch_proc = 0
    for item in items_df.itertuples():
        if batch_proc >= batch_size:
            break
        if item.Index in processed_items:
            continue

        try:
            audio_seg, voice_segs = extract_audio_segment_for_item(item)
            segmented_4sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_4_SEC, SEC_4_DATA_DIR, item))
            segmented_6sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_6_SEC, SEC_6_DATA_DIR, item))
            segmented_10sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_10_SEC, SEC_10_DATA_DIR, item))
        except Exception as exc:
            # Only ordinary errors are swallowed. KeyboardInterrupt and
            # SystemExit propagate, so interrupting the run no longer marks
            # the item in flight as processed.
            print(f'exception while processing {item.Index}: {exc!r}')

        # we want to add an item that has an exception so it is not reprocessed on every batch
        processed_items.add(item.Index)
        batch_proc += 1

    update_dataframes(SEG_4_SEC_DF, segmented_4sec_segs)
    update_dataframes(SEG_6_SEC_DF, segmented_6sec_segs)
    update_dataframes(SEG_10_SEC_DF, segmented_10sec_segs)

    with open(processed_file, 'wb') as pklFile:
        pkl.dump(processed_items, pklFile)

    return processed_items



In [None]:
start_time = time.time()
# Process the items batch by batch until every item has been attempted.
items_processed = 0
items_to_process = len(items)
while items_to_process > items_processed:
    processed_items = process_record_batch(items, batch_size=50)
    if len(processed_items) == items_processed:
        # The batch handled nothing new (for example because the index holds
        # duplicate IDs, so the set of processed IDs can never reach
        # len(items)). Stop instead of looping forever.
        print(f'No further progress after {items_processed} out of {items_to_process}; stopping')
        break
    items_processed = len(processed_items)
    print(f'Processed {items_processed} out of {items_to_process} in {time.time()-start_time} seconds')
