# Setup for segmenting GRN Data for vox-grn
This notebook started off as the place where the datasets would be created; however, it was found that greater speed could be attained by breaking the data into multiple sections and running multiple notebooks in parallel. This notebook therefore morphed into the place where the items are prepared for the other notebooks to use.

# Data to be Created

Three sets of data will be made: 4-, 6-, and 10-second data.

This file will take a long time to run. For that reason it will checkpoint its progress at regular intervals by writing smaller files. If it is restarted for any reason, it will use those files to determine where it was up to. Segmenting the data is the slow part, so a segment file will be generated for each audio file. If mp3 files are to be generated later using different parameters, the segment files should make that process much quicker.

The input to this process is all_items_for_processing.csv with musical items removed.

The output will be:
    
    1. /media/originals/segments/[iso]/[program_id]_[item_no].pkl
        where each pkl file contains the list of raw segments for the item.
    2. /media/originals/segments/seg_4sec.csv
        all the metadata for the seg_4sec dataset. Metadata needed for the dataset can be derived from this.
    3. /media/originals/segments/seg_6sec.csv
        all the metadata for the seg_6sec dataset. Metadata needed for the dataset can be derived from this.
    4. /media/originals/segments/seg_10sec.csv
        all the metadata for the seg_10sec dataset. Metadata needed for the dataset can be derived from this.
    5. /media/originals/datasets/seg_4sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment
    6. /media/originals/datasets/seg_6sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment
    7. /media/originals/datasets/seg_10sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment


In [26]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../vad_utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from vad_utils import SAMPLING_RATE, FRAME_SIZE_MS, SAMPLES_PER_FRAME
import vad_utils as vu
from pydub import AudioSegment
import pickle as pkl
from pathlib import Path
import time
import glob


In [20]:
# Load the description of the input, discard the columns that are not needed
# and rename the remaining ones so they are valid python attribute names.
items = (
    pd.read_csv('/home/jovyan/work/GRN-Notebooks/Data/all_items_for_processing.csv')
    .drop(columns=[
        'Unnamed: 0', 'LanguageID', 'Language', 'Track', 'Recordist',
        'Size', 'Length', 'Tape Side', 'Item Type', 'composite',
    ])
    .rename(columns={
        'ISO': 'iso',
        'Location': 'location',
        'Year': 'year',
        'Path': 'path',
        'Filename': 'filename',
        'Title': 'title',
        'Program Item Number': 'item_no',
        'Program': 'program',
    })
)

# The ID supplied with the data is based on the track number. We want one based
# on the item number instead: '<program>_<item_no zero-padded to 3 digits>'.
item_no_suffix = items['item_no'].astype(int).apply('{:0>3d}'.format)
items['ID'] = items['program'] + '_' + item_no_suffix

items = items.set_index('ID')
items.columns


Index(['iso', 'program', 'location', 'year', 'path', 'filename', 'item_no',
       'title', 'start', 'end'],
      dtype='object')

In [21]:
# keep only the items that have an iso value
items = items.loc[items['iso'].notna()]

In [22]:
# spot-check a single row (position 100) to see what an item looks like
sample_item = items.iloc[100]
print(sample_item)

iso                                       cja
program                                A03410
location                             Cambodia
year                                   1955.0
path        Programs/03/03410/A03410/From_CM/
filename                       C03410B-01.wav
item_no                                     4
title                          The Lost Sheep
start                                     0.0
end                                     192.0
Name: A03410_004, dtype: object


In [23]:
# show the ID index that was built from program and item number
item_ids = items.index
print(item_ids)

Index(['A03101_001', 'A03101_002', 'A03101_003', 'A03101_004', 'A03101_005',
       'A03101_006', 'A03101_007', 'A03111_008', 'A03111_009', 'A03111_010',
       ...
       'A03031_012', 'C03060_009', 'C03060_006', 'C03060_010', 'C03060_004',
       'C03060_008', 'C03060_001', 'C03060_015', 'C03060_012', 'C03060_013'],
      dtype='object', name='ID', length=201317)


In [24]:
# Remove all items that have zero length: only rows with a positive `end` are kept.
# NOTE(review): `start` is not consulted here - this assumes a non-positive `end`
# is what marks a zero-length item; confirm against the input data.
has_positive_end = items['end'] > 0
items = items[has_positive_end]

In [25]:
# Check the index for duplicates: print how many IDs repeat an earlier ID,
# then collect every row whose ID occurs more than once (all occurrences).
repeated_id_count = items.index.duplicated().sum()
print(repeated_id_count)
dup_items = items[items.index.duplicated(keep=False)]


262


In [27]:
# Identifier embedded in the metadata csv file names below
# (presumably one value per parallel notebook - confirm).
NOTEBOOK_ID = 0

# root directories for the raw segments and for the generated datasets
SEGMENTS_DIR = '/media/originals/segments/'
DATASETS_DIR = '/media/originals/datasets/'

# data directory of each dataset, appended to DATASETS_DIR when used
SEC_4_DATA_DIR = 'seg_4sec/data/'
SEC_6_DATA_DIR = 'seg_6sec/data/'
SEC_10_DATA_DIR = 'seg_10sec/data/'

# metadata csv file for each dataset, tagged with the notebook id
SEG_4_SEC_DF = '{}seg_4sec_{}.csv'.format(SEGMENTS_DIR, NOTEBOOK_ID)
SEG_6_SEC_DF = '{}seg_6sec_{}.csv'.format(SEGMENTS_DIR, NOTEBOOK_ID)
SEG_10_SEC_DF = '{}seg_10sec_{}.csv'.format(SEGMENTS_DIR, NOTEBOOK_ID)

# segment size (in seconds) for each dataset
SEG_4_SEC = 4.0
SEG_6_SEC = 6.0
SEG_10_SEC = 10.0

In [29]:
# generate directories and filenames
def prepare_dir(dirname):
    """Make sure the directory exists and return its name ending in '/'.

    A trailing '/' is appended to ``dirname`` when it is missing. The directory,
    together with any missing parents, is created; an already existing
    directory is left as it is.
    """
    with_slash = dirname if dirname[-1] == '/' else dirname + '/'
    Path(with_slash).mkdir(parents=True, exist_ok=True)
    return with_slash

def prepare_raw_seg_dir(item):
    """Return the raw-segment directory for the item's iso, creating it if needed.

    The directory is SEGMENTS_DIR followed by ``item.iso``; the returned name
    ends in '/'.
    """
    iso_seg_dir = SEGMENTS_DIR + item.iso
    return prepare_dir(iso_seg_dir)

def raw_seg_filename(item):
    """Return the pickle file name for the item's raw segments.

    The name is '<program>_<item_no>.pkl' with the item number zero-padded
    to at least three digits.
    """
    return '{}_{:03d}.pkl'.format(item.program, item.item_no)

def prepare_dataset_data_dir(item, dataset_dir):
    """Return the data directory for the item's iso in a dataset, creating it if needed.

    The directory is DATASETS_DIR + ``dataset_dir`` + ``item.iso``; the returned
    name ends in '/'.
    """
    iso_data_dir = DATASETS_DIR + dataset_dir + item.iso
    return prepare_dir(iso_data_dir)

def seg_mp3_filename(item, seg):
    """Return the mp3 file name for one segment of the item.

    The name is '<program>_<item_no>_<seg>.mp3' with the item number and the
    segment number each zero-padded to at least three digits.
    """
    return '{}_{:03d}_{:03d}.mp3'.format(item.program, item.item_no, seg)


In [37]:
# Segment data can only be reused when we know which item it came from. For items
# whose ID is duplicated we have no idea which item was actually analysed, so every
# file that was generated for them is removed.
dataset_data_dirs = [SEC_4_DATA_DIR, SEC_6_DATA_DIR, SEC_10_DATA_DIR]
for item in dup_items.itertuples():
    print(item)

    # the raw segment pickle of the item
    # (prepare_raw_seg_dir also creates the iso directory when it is missing)
    seg_file = prepare_raw_seg_dir(item) + raw_seg_filename(item)
    if os.path.exists(seg_file):
        os.remove(seg_file)
        print(f'Deleted {seg_file}')

    # remove any mp3 files that were created for this item, in each dataset
    mp3_name_pat = f'{item.program}_{item.item_no:03d}_*.mp3'
    for data_dir in dataset_data_dirs:
        mp3_pat = DATASETS_DIR + data_dir + item.iso + '/' + mp3_name_pat
        for mp3_file in glob.glob(mp3_pat):
            os.remove(mp3_file)
            print(f'Deleted {mp3_file}')
    


Pandas(Index='A23091_001', iso='krw', program='A23091', location='Liberia', year=1990.0, path='Programs/23/23091/A23091/PM-0904/', filename='A23091-01.wav', item_no=1, title='Introduction to Good News Reader III', start=0.0, end=24.0)
Deleted /media/originals/segments/krw/A23091_001.pkl
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_000.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_001.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_002.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_003.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_004.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_005.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_006.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_007.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_008.mp3
Deleted /media/originals/datasets/seg_4sec/data/krw/A23091_001_009.mp3
De

In [15]:
# The duplicates appear to be caused by items that have been incorrectly numbered.
# Keep only the last row of each duplicated ID: every earlier occurrence is
# flagged by duplicated(keep='last') and filtered out.
is_earlier_duplicate = items.index.duplicated(keep='last')
items = items[~is_earlier_duplicate]

# Parallel Processing
Through trial and error, the ideal number of notebooks to run in parallel was found to be about 10; with more than this, VS Code starts to cause errors. Divide the data into 10 lots.

In [17]:
# Split the items, sorted by iso, into exactly N_LOTS contiguous lots and write
# each lot to its own csv file (items_0.csv ... items_9.csv).
#
# The cut points are computed proportionally (n * k // N_LOTS). A fixed step of
# n // 10 produced an 11th lot holding the remainder whenever the number of
# items was not a multiple of 10, and raised ValueError (zero step) for fewer
# than 10 items. With proportional cut points the lot sizes differ by at most one.
# NOTE(review): a run of the earlier logic may have left an items_10.csv behind;
# it is not rewritten here - confirm whether it should be deleted.
N_LOTS = 10

sorted_items = items.sort_values('iso')
n_items = len(sorted_items)

# the N_LOTS - 1 interior row positions at which the sorted frame is cut
row_divisions = [n_items * k // N_LOTS for k in range(1, N_LOTS)]

item_df = []
start = 0
for row_end in row_divisions:
    item_df.append(sorted_items.iloc[start:row_end])
    start = row_end
item_df.append(sorted_items.iloc[start:])

# every item must land in exactly one of the N_LOTS lots
assert len(item_df) == N_LOTS
assert sum(len(lot) for lot in item_df) == n_items

# now write the divided rows out as csv files
for i, df in enumerate(item_df):
    df.to_csv(f'/media/originals/segments/items_{i}.csv')

In [18]:
# read one of the lots back in as a check on what was written
check_path = '/media/originals/segments/items_1.csv'
df = pd.read_csv(check_path)