# Partial Process of Segment GRN Data for vox-grn

To speed up processing, the files have been split into 10 groups that are processed in parallel. This notebook processes one of those groups, selected by `NOTEBOOK_ID`.

# Data to be Created

Three datasets will be made: 4-second, 6-second, and 10-second segments.

This notebook will take a long time to run. For that reason it checkpoints its progress at regular intervals by writing smaller files. If it is restarted for any reason, it uses those files to determine where it was up to. Segmenting the data is the slow part. For each audio file a segment file is generated, so if mp3 files are later generated with different parameters, the saved segment files should make that process much quicker.

The input to this process is all_items_for_processing.csv with musical items removed.

The output will be:
    
    1. /media/originals/fsegs/[iso]/[filename].pkl
        where each pkl file contains the list of raw segments for the item.
    2. /media/originals/fsegs/seg_4sec_[notebook_id].csv
        all the metadata for the seg_4sec dataset. Metadata needed for the dataset can be derived from this.
    3. /media/originals/fsegs/seg_6sec_[notebook_id].csv
        all the metadata for the seg_6sec dataset. Metadata needed for the dataset can be derived from this.
    4. /media/originals/fsegs/seg_10sec_[notebook_id].csv
        all the metadata for the seg_10sec dataset. Metadata needed for the dataset can be derived from this.
    5. /media/originals/datasets/fseg_4sec/data/[iso]/[filename without extension]_[seg].mp3
        which is an mp3 for each segment
    6. /media/originals/datasets/fseg_6sec/data/[iso]/[filename without extension]_[seg].mp3
        which is an mp3 for each segment
    7. /media/originals/datasets/fseg_10sec/data/[iso]/[filename without extension]_[seg].mp3
        which is an mp3 for each segment


In [1]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../vad_utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from vad_utils import SAMPLING_RATE, FRAME_SIZE_MS, SAMPLES_PER_FRAME
import vad_utils as vu
from pydub import AudioSegment
import pickle as pkl
from pathlib import Path
import time
import glob


In [2]:
# define the locations for each of the file types
# NOTEBOOK_ID selects this notebook's input list (files_{NOTEBOOK_ID}.csv) and is
# embedded in the names of the per-notebook metadata CSVs below.
NOTEBOOK_ID=0
# root directory for the pickled raw segments and the metadata CSVs
SEGMENTS_DIR = '/media/originals/fsegs/'
# root directory under which each dataset's mp3 files are written
DATASETS_DIR = '/media/originals/datasets/'
# per-dataset data sub-directories, relative to DATASETS_DIR
SEC_4_DATA_DIR = 'fseg_4sec/data/'
SEC_6_DATA_DIR = 'fseg_6sec/data/'
SEC_10_DATA_DIR = 'fseg_10sec/data/'

# define specific files used in the process
# one metadata CSV per dataset, suffixed with NOTEBOOK_ID so parallel notebooks
# write to different files
SEG_4_SEC_DF = f'{SEGMENTS_DIR}seg_4sec_{NOTEBOOK_ID}.csv'
SEG_6_SEC_DF = f'{SEGMENTS_DIR}seg_6sec_{NOTEBOOK_ID}.csv'
SEG_10_SEC_DF = f'{SEGMENTS_DIR}seg_10sec_{NOTEBOOK_ID}.csv'

# define segment sizes for each dataset
# passed to vu.divide_into_segments as the time per segment (seconds, per the names)
SEG_4_SEC = 4.0
SEG_6_SEC = 6.0
SEG_10_SEC = 10.0

In [3]:
# Load the list of input files assigned to this notebook, keyed by item ID,
# and display the columns that are available for each file.
fd = pd.read_csv(f'/media/originals/fsegs/files_{NOTEBOOK_ID}.csv').set_index('ID')
fd.columns


Index(['Unnamed: 0', 'iso', 'language_name', 'track', 'location', 'year',
       'path', 'filename', 'length', 'program'],
      dtype='object')

In [4]:
# Spot-check a single input record (positional row 100) to see the available fields.
print(fd.iloc[100])

Unnamed: 0                                  198133
iso                                            aar
language_name                                 Afar
track                                            5
location                                    Asmara
year                                        1967.0
path             Programs/02/02110/A02110/From_CM/
filename                            C02110A-05.wav
length                                  176.137625
program                                       2110
Name: 02110_005, dtype: object


In [5]:
# Show the item IDs that index the input frame.
print(fd.index)

Index(['14610_001', '14610_002', '13981_002', '13981_001', '64296_012',
       '64296_011', '64296_009', '64296_008', '64296_010', '64296_006',
       ...
       '62996_023', '62996_022', '62996_021', '62996_020', '62996_019',
       '62996_018', '62996_017', '62996_016', '62996_015', '62996_014'],
      dtype='object', name='ID', length=20837)


In [6]:
# generate directories and filenames
def prepare_dir(dirname):
    """Create ``dirname`` (and any missing parents) and return it with a trailing '/'.

    The directory may already exist; that is not an error.
    """
    target = dirname if dirname[-1] == '/' else dirname + '/'
    Path(target).mkdir(parents=True, exist_ok=True)
    return target

def prepare_raw_seg_dir(fd):
    """Ensure the raw-segment directory for the record's language exists.

    Returns the directory under SEGMENTS_DIR named after ``fd.iso``, with a
    trailing '/'.
    """
    iso_dir = SEGMENTS_DIR + fd.iso
    return prepare_dir(iso_dir)

def raw_seg_filename(fd):
    """Return the name of the pickle that stores the raw segments for ``fd``.

    The name is the record's full filename (original extension included)
    followed by '.pkl'.
    """
    return '{}.pkl'.format(fd.filename)

def prepare_dataset_data_dir(fd, dataset_dir):
    """Ensure the per-language data directory of a dataset exists.

    Returns DATASETS_DIR + ``dataset_dir`` + ``fd.iso`` with a trailing '/'.
    """
    language_dir = DATASETS_DIR + dataset_dir + fd.iso
    return prepare_dir(language_dir)

def seg_mp3_filename(fd, seg):
    """Return the mp3 file name for segment number ``seg`` of the record ``fd``.

    The last four characters of ``fd.filename`` (the extension) are dropped and
    the segment number is appended, zero-padded to three digits.
    """
    stem = fd.filename[:-4]
    return '{}_{:03d}.mp3'.format(stem, seg)

def get_fname(fd):
    """Return the full path of the audio file described by ``fd``.

    File names containing the Unicode replacement character (U+FFFD) are
    looked up with a glob in which that character becomes '*'.  If exactly one
    file matches, its path is returned; otherwise the path is built directly
    from ``fd.path`` and ``fd.filename``.
    """
    folder = fd.path if fd.path[-1] == '/' else fd.path + '/'
    base = '/media/programs/' + folder
    matches = glob.glob(base + fd.filename.replace('\ufffd', '*'))
    return matches[0] if len(matches) == 1 else base + fd.filename



In [7]:

def condition_audio_segment(audio_seg):
    """Normalise an audio segment to the format used for segmentation.

    The result has one channel, a sample width of 2 and a frame rate of
    SAMPLING_RATE.  A conversion is only applied where the current value
    differs, so an already-conforming segment is returned as is.
    """
    # (attribute to check, required value, method that converts to it);
    # applied in this order: channels, sample width, frame rate.
    requirements = (
        ('channels', 1, 'set_channels'),
        ('sample_width', 2, 'set_sample_width'),
        ('frame_rate', SAMPLING_RATE, 'set_frame_rate'),
    )
    for attribute, required, setter_name in requirements:
        if getattr(audio_seg, attribute) != required:
            audio_seg = getattr(audio_seg, setter_name)(required)
    return audio_seg
        


In [8]:

def save_the_segment_info(fd, segs):
    """Pickle the raw segments of one input file into its per-language directory."""
    target = prepare_raw_seg_dir(fd) + raw_seg_filename(fd)
    with open(target, 'wb') as out:
        pkl.dump(segs, out)

def read_the_segment_info(fd):
    """Load the previously pickled raw segments for ``fd``.

    Returns an empty list when no pickle exists for the file or the pickle is
    zero bytes long.
    """
    cache_path = prepare_raw_seg_dir(fd) + raw_seg_filename(fd)
    if not os.path.exists(cache_path) or os.path.getsize(cache_path) <= 0:
        return []
    with open(cache_path, 'rb') as src:
        return pkl.load(src)
    
def extract_audio_segments_for_file(fd):
    """Load one audio file and obtain its raw voice segments.

    The audio is decoded as mp3 when the file name ends in '.mp3' (any case)
    and as wav otherwise, then conditioned with ``condition_audio_segment``.
    Raw segments are taken from the pickle saved by an earlier run when one
    is available; otherwise they are computed with
    ``vu.audio_to_raw_voice_segments`` and saved for next time.

    Returns:
        (audio_seg, segs): the conditioned audio and its raw segments.
    """
    is_mp3 = fd.filename[-4:].lower() == '.mp3'
    source_format = 'mp3' if is_mp3 else 'wav'
    audio_seg = condition_audio_segment(
        AudioSegment.from_file(get_fname(fd), format=source_format))

    # Reuse the cached segmentation if present; segmenting is the slow step.
    segs = read_the_segment_info(fd)
    if len(segs) == 0:
        segs = vu.audio_to_raw_voice_segments(audio_seg)
        save_the_segment_info(fd, segs)

    return audio_seg, segs

def create_mp3_files(audio_seg, segs, time_per_segment, dataset_dir, fd):
    """Export the fixed-length segments of one audio file as mp3s and describe them.

    Args:
        audio_seg: conditioned audio for the file (sliced in milliseconds).
        segs: raw segments for the file, as passed to ``vu.divide_into_segments``.
        time_per_segment: segment length handed to ``vu.divide_into_segments``.
        dataset_dir: data directory of the target dataset, relative to
            DATASETS_DIR (e.g. ``SEC_4_DATA_DIR``).
        fd: the input record (a row from ``DataFrame.itertuples``).

    Returns:
        A list with one dict per kept segment: all fields of ``fd`` plus
        ``seg_start`` / ``seg_stop`` (from ``vu.convert_frames_to_seconds``),
        ``seg`` (segment number) and ``file_name`` (mp3 path relative to
        DATASETS_DIR).  The list is empty when there is at most one segment.

    An mp3 that already exists is not exported again.  Each mp3 is written to
    a temporary '.part' file and renamed into place only once complete, so an
    interrupted or failed export never leaves a truncated file that a later
    run would mistake for a finished one.
    """
    segs_for_time = vu.divide_into_segments(segs, time_per_segment)

    # now write out the segments for this dataset
    dirname = prepare_dataset_data_dir(fd, dataset_dir)
    rows = list()

    # note that we discard the first segment, so ``i`` numbers the kept
    # segments from 0 (slicing a list of 0 or 1 items simply yields nothing)
    for i, seg in enumerate(segs_for_time[1:]):
        mp3_name = seg_mp3_filename(fd, i)
        file_name = dataset_dir + fd.iso + '/' + mp3_name
        fname = dirname + mp3_name
        if not os.path.exists(fname):
            start = vu.convert_frames_to_ms(seg.start)
            stop = vu.convert_frames_to_ms(seg.stop)
            tmp_name = fname + '.part'
            try:
                # export() returns the open output file handle; close it so the
                # data is flushed before the file is renamed into place.
                audio_seg[start:stop].export(tmp_name, format='mp3', bitrate='32k').close()
                os.replace(tmp_name, fname)
            finally:
                # only still present if the export or rename did not complete
                if os.path.exists(tmp_name):
                    os.remove(tmp_name)
        desc = dict(fd._asdict())
        desc['seg_start'] = vu.convert_frames_to_seconds(seg.start)
        desc['seg_stop'] = vu.convert_frames_to_seconds(seg.stop)
        desc['seg'] = i
        desc['file_name'] = file_name
        rows.append(desc)

    return rows

def update_dataframes(seg_df_csv, seg_records):
    """Add a batch of segment records to a dataset's metadata CSV.

    Args:
        seg_df_csv: path of the CSV file; created if it does not exist.
        seg_records: list of dicts, each with a ``file_name`` key that becomes
            the index column.  Nothing is written when the list is empty.

    New rows are appended to the existing file rather than re-reading and
    rewriting the whole file on every batch, which keeps the cost of a batch
    independent of how many rows were written before.  If the new records do
    not have the same columns as the existing file, the file is merged and
    rewritten in full so that columns stay aligned by name.
    """
    if len(seg_records) == 0:
        return

    new_rows = pd.DataFrame.from_records(seg_records, index='file_name')

    if not os.path.isfile(seg_df_csv) or os.path.getsize(seg_df_csv) == 0:
        new_rows.to_csv(seg_df_csv)
        return

    # Read only the header to learn the column layout already on disk.
    existing_cols = pd.read_csv(seg_df_csv, index_col='file_name', nrows=0).columns
    if set(existing_cols) == set(new_rows.columns):
        # Same fields: append in the on-disk column order, without a header.
        new_rows[list(existing_cols)].to_csv(seg_df_csv, mode='a', header=False)
    else:
        # Different fields: align by column name and rewrite the whole file.
        merged = pd.concat([pd.read_csv(seg_df_csv, index_col='file_name'), new_rows])
        merged.to_csv(seg_df_csv)


In [9]:
# processing these items might take a very long time. To permit the process to be interrupted and restarted the indexes of processed items
# are maintained in a set that is pickled on each batch. This allows the batch to quickly pick up where it left off.
def process_record_batch(files_df, *, batch_size=10):
    """Process up to ``batch_size`` not-yet-processed files and checkpoint the result.

    Args:
        files_df: DataFrame of input files, indexed by item ID.
        batch_size: maximum number of files to process in this call.

    Returns:
        The set of item IDs processed so far (earlier runs included).

    For each file the raw segments are extracted and the 4, 6 and 10 second
    mp3 datasets are written.  At the end of the batch the new rows are added
    to the three metadata CSVs and the set of processed IDs is pickled, which
    is what allows an interrupted run to resume.

    A file whose processing raises an ordinary exception is reported and still
    marked as processed, so it is not retried on every batch.  KeyboardInterrupt
    and SystemExit are not caught: stopping the run must not mark the file that
    was in progress as processed.
    """
    processed_file = f'/media/originals/fsegs/processed16_{NOTEBOOK_ID}.pkl'
    if os.path.isfile(processed_file):
        with open(processed_file, 'rb') as pklFile:
            processed_files = pkl.load(pklFile)
    else:
        processed_files = set()

    segmented_4sec_segs = []
    segmented_6sec_segs = []
    segmented_10sec_segs = []

    batch_proc = 0
    for fd in files_df.itertuples():
        if batch_proc >= batch_size:
            break
        if fd.Index in processed_files:
            continue

        try:
            audio_seg, voice_segs = extract_audio_segments_for_file(fd)
            segmented_4sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_4_SEC, SEC_4_DATA_DIR, fd))
            segmented_6sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_6_SEC, SEC_6_DATA_DIR, fd))
            segmented_10sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_10_SEC, SEC_10_DATA_DIR, fd))
        except Exception as exc:
            # best effort: report the failure and carry on with the next file
            print(f'exception {fd.filename}: {exc!r}')

        # we want to add an fd that has an exception so it is not reprocessed on every batch
        processed_files.add(fd.Index)
        batch_proc += 1

    update_dataframes(SEG_4_SEC_DF, segmented_4sec_segs)
    update_dataframes(SEG_6_SEC_DF, segmented_6sec_segs)
    update_dataframes(SEG_10_SEC_DF, segmented_10sec_segs)

    with open(processed_file, 'wb') as pklFile:
        pkl.dump(processed_files, pklFile)

    return processed_files



In [10]:
start_time = time.time()
# Run batches until every input file has been handled.  Each batch checkpoints
# its progress, so re-running this cell resumes where the previous run stopped.
files_processed = 0
files_to_process = len(fd)
while files_to_process > files_processed:
    processed_files = process_record_batch(fd, batch_size=50)
    made_progress = len(processed_files) > files_processed
    files_processed = len(processed_files)
    print(f'Processed {files_processed} out of {files_to_process} in {time.time()-start_time} seconds')
    if not made_progress:
        # Nothing new was processed (e.g. duplicate IDs mean the processed set
        # can never reach len(fd)); stop instead of looping forever.
        break


Processed 50 out of 20837 in 249.3887071609497 seconds
Processed 100 out of 20837 in 588.2970640659332 seconds
Processed 150 out of 20837 in 986.3829157352448 seconds
Processed 200 out of 20837 in 1178.7418448925018 seconds
Processed 250 out of 20837 in 1279.6168038845062 seconds
Processed 300 out of 20837 in 1399.6771149635315 seconds
Processed 350 out of 20837 in 1569.3259797096252 seconds
Processed 400 out of 20837 in 1853.361209154129 seconds
Processed 450 out of 20837 in 2141.530957221985 seconds
Processed 500 out of 20837 in 2489.4116706848145 seconds
Processed 550 out of 20837 in 2825.726236343384 seconds
Processed 600 out of 20837 in 2940.6272230148315 seconds
Processed 650 out of 20837 in 3537.9733049869537 seconds
Processed 700 out of 20837 in 3934.046510219574 seconds
Processed 750 out of 20837 in 4939.300346851349 seconds
Processed 800 out of 20837 in 5230.479301214218 seconds
Processed 850 out of 20837 in 5413.860548019409 seconds
Processed 900 out of 20837 in 6066.2525346