# Segment GRN data using pyAudioAnalysis

The group of notebooks SegmentPyVOXn.ipynb (n=0-9) will be used to segment the data using the segmentation algorithm I developed in DevelopASSegmentation.ipynb. It follows the same pattern as used in SegmentFVOXn.ipynb.

# Data to be Created

Three datasets will be made, containing 4-second, 6-second, and 10-second segments respectively.

This notebook takes a long time to run. For that reason it checkpoints its progress at regular intervals by writing smaller files. If the run is interrupted and restarted for any reason, it uses those files to determine where it was up to. Segmenting the data is the slow part. For each audio file a segment file is generated, so if mp3 files are to be generated later using different parameters, the saved segment files should make that process much quicker.

The input to this process is the same as SegmentFVOX.ipynb. We actually use its division of the files into 10 groups.

The output will be:
    
    1. /media/originals/py_audio_seg/[iso]/[filename].pkl
        where each pkl file contains the list of raw segments for the item.
    2. /media/originals/py_audio_seg/seg_4sec_[n].csv
        all the metadata for the seg_4sec dataset, one file per notebook (n is the notebook number). Metadata needed for the dataset can be derived from this.
    3. /media/originals/py_audio_seg/seg_6sec_[n].csv
        all the metadata for the seg_6sec dataset, one file per notebook (n is the notebook number). Metadata needed for the dataset can be derived from this.
    4. /media/originals/py_audio_seg/seg_10sec_[n].csv
        all the metadata for the seg_10sec dataset, one file per notebook (n is the notebook number). Metadata needed for the dataset can be derived from this.
    5. /media/originals/datasets/py_audio_seg_4sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment
    6. /media/originals/datasets/py_audio_seg_6sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment
    7. /media/originals/datasets/py_audio_seg_10sec/data/[iso]/[program_id]_[item_no]_[seg].mp3
        which is an mp3 for each segment


In [1]:
import pandas as pd
import numpy as np
import os
import sys
import time
import pickle as pkl
from pathlib import Path
import glob

# Make the local pyAudioAnalysis checkout importable.
# os.path.abspath() does not expand '~' (it would yield '<cwd>/~/work/...'),
# so the home directory has to be expanded explicitly first.
module_path = os.path.abspath(os.path.expanduser('~/work/pyAudioAnalysis'))
if module_path not in sys.path:
    sys.path.append(module_path)
from collections import namedtuple
from pydub import AudioSegment

from pyAudioAnalysis import audioSegmentation as aS
from pyAudioAnalysis import audioTrainTest as at
from pyAudioAnalysis import MidTermFeatures as mtf
from pyAudioAnalysis import audioBasicIO as aIO

In [2]:
# Which of the ten file groups this copy of the notebook works on.
NOTEBOOK_ID = 7

# Root directories: raw segment pickles / metadata CSVs, and the generated datasets.
SEGMENTS_DIR = '/media/originals/py_audio_seg/'
DATASETS_DIR = '/media/originals/datasets/'

# Per-dataset data directories, relative to DATASETS_DIR.
SEC_4_DATA_DIR = 'py_audio_seg_4sec/data/'
SEC_6_DATA_DIR = 'py_audio_seg_6sec/data/'
SEC_10_DATA_DIR = 'py_audio_seg_10sec/data/'

# Metadata CSV written by this notebook for each dataset.
SEG_4_SEC_DF = SEGMENTS_DIR + f'seg_4sec_{NOTEBOOK_ID}.csv'
SEG_6_SEC_DF = SEGMENTS_DIR + f'seg_6sec_{NOTEBOOK_ID}.csv'
SEG_10_SEC_DF = SEGMENTS_DIR + f'seg_10sec_{NOTEBOOK_ID}.csv'

# Segment length, in seconds, for each dataset.
SEG_4_SEC, SEG_6_SEC, SEG_10_SEC = 4.0, 6.0, 10.0

# Frame rate every audio file is converted to before segments are exported.
SAMPLING_RATE = 16000

def convert_to_ms(sec):
    """Convert a time in seconds to whole milliseconds (truncating toward zero)."""
    milliseconds = sec * 1000.0
    return int(milliseconds)

In [3]:
# Read the description of this notebook's group of input files, index it by
# item ID, and show the columns that are available.
fd = pd.read_csv(f'/media/originals/fsegs/files_{NOTEBOOK_ID}.csv').set_index('ID')
fd.columns


Index(['Unnamed: 0', 'iso', 'language_name', 'track', 'location', 'year',
       'path', 'filename', 'length', 'program'],
      dtype='object')

In [4]:
# Read the file list of group 9 as well; one third of it is added to this
# notebook's own file list in the next cell.
fd9 = pd.read_csv('/media/originals/fsegs/files_9.csv').set_index('ID')
print(len(fd9))


20841


In [5]:
# Append the middle third of group 9 (rows third .. 2*third) to this notebook's file list.
# NOTE: re-running this cell without re-running the load cell appends the slice again.
third = len(fd9) // 3
middle_third_of_fd9 = fd9.iloc[third:2 * third]
fd = pd.concat([fd, middle_third_of_fd9])

In [6]:
# Spot-check a single record (row position 100) of the file list.
print(fd.iloc[100])

Unnamed: 0                                   70361
iso                                            raw
language_name                               Rawang
track                                           29
location                                    Yangon
year                                        2005.0
path             Programs/62/62595/A62595/PM-0603/
filename                             A62595-29.wav
length                                   48.413333
program                                      62595
Name: 62595_029, dtype: object


In [7]:
# Show the item IDs that index the file list, and how many there are.
print(fd.index)

Index(['37914_039', '37914_038', '37914_030', '37914_031', '37914_032',
       '37914_033', '37914_034', '37914_035', '37914_036', '37914_020',
       ...
       '38296_010', '38296_011', '38296_012', '38296_007', '38296_006',
       '38296_005', '38296_004', '12390_001', '12390_002', '12390_003'],
      dtype='object', name='ID', length=27784)


In [8]:
# generate directories and filenames
def prepare_dir(dirname):
    """Ensure ``dirname`` exists and return it with a trailing '/'.

    Missing parent directories are created too; an already existing
    directory is not an error.
    """
    with_slash = dirname if dirname[-1] == '/' else dirname + '/'
    Path(with_slash).mkdir(parents=True, exist_ok=True)
    return with_slash

def prepare_raw_seg_dir(fd):
    """Create (if needed) and return the raw-segment directory for ``fd``'s language (``fd.iso``)."""
    iso_dir = SEGMENTS_DIR + fd.iso
    return prepare_dir(iso_dir)

def raw_seg_filename(fd):
    """Return the name of the pickle file holding the raw segments of ``fd``.

    The name is the audio file name (extension included) with '.pkl' appended.
    """
    return '{}.pkl'.format(fd.filename)

def prepare_dataset_data_dir(fd, dataset_dir):
    """Create (if needed) and return the data directory of ``dataset_dir`` for ``fd``'s language.

    ``dataset_dir`` is relative to DATASETS_DIR; the language sub-directory is ``fd.iso``.
    """
    iso_dir = DATASETS_DIR + dataset_dir + fd.iso
    return prepare_dir(iso_dir)

def seg_mp3_filename(fd, seg):
    """Return the mp3 file name for segment number ``seg`` of ``fd``.

    The last four characters of ``fd.filename`` (the extension) are dropped and
    the segment number is appended, zero-padded to three digits.
    """
    stem = fd.filename[:-4]
    return '{}_{:03d}.mp3'.format(stem, seg)

def get_fname(fd):
    """Return the full path of the audio file described by ``fd``.

    File names may contain the Unicode replacement character (U+FFFD) where the
    original character could not be decoded. Each one is treated as a wildcard:
    if exactly one file on disk matches, that file is returned. Otherwise the
    path is built literally from ``fd.path`` and ``fd.filename``.
    """
    root = '/media/programs/'
    directory = fd.path
    if not directory[-1] == '/':
        directory += '/'

    candidates = glob.glob(root + directory + fd.filename.replace('\ufffd', '*'))
    return candidates[0] if len(candidates) == 1 else root + directory + fd.filename



In [9]:

def condition_audio_segment(audio_seg):
    """Return ``audio_seg`` as single-channel audio with sample width 2 at SAMPLING_RATE.

    Each property is only converted when it differs from the target, in the
    order channels, sample width, frame rate.
    """
    conversions = (
        ('channels', 1, 'set_channels'),
        ('sample_width', 2, 'set_sample_width'),
        ('frame_rate', SAMPLING_RATE, 'set_frame_rate'),
    )
    for attribute, wanted, setter in conversions:
        if getattr(audio_seg, attribute) != wanted:
            audio_seg = getattr(audio_seg, setter)(wanted)
    return audio_seg
        


In [10]:
import warnings
# sklearn puts out a lot of annoying warnings
# NOTE: with no category or module given, this filter silences every warning
# from every library for the rest of the session, not only sklearn's.
warnings.filterwarnings('ignore')

# One classified stretch of audio: start and end time (seconds from the start of
# the file, as used by the code below) and the class name given by the classifier.
Segment = namedtuple('Segment', ['start', 'end', 'classification'])

# re-implement a simplified version of mid_term_file_classification
# it is implemented as a class to allow the model to be cached.
class ExtractVoiceSegments():
    """Classify the mid-term windows of an audio signal with a pre-trained model.

    The model is loaded once, when the class body is executed, and is shared by
    all instances through the class attributes below.
    """
    classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model('/home/jovyan/work/pyAudioAnalysis/pyAudioAnalysis/data/models/svm_rbf_4class')

    def __init__(self):
        pass

    def segments_in(self, signal, sampling_rate, offset):
        """Classify ``signal`` and return its runs of equal class as Segments.

        ARGUMENTS:
            - signal:         the audio samples to classify
            - sampling_rate:  sampling rate of ``signal``
            - offset:         value added to every segment start/end, so that
                              times are relative to the whole file rather than
                              to ``signal``
        RETURNS:
            - list of Segment tuples (empty when no window could be classified)
        """
        model = ExtractVoiceSegments

        # mid-term feature extraction:
        mt_feats, _, _ = \
            mtf.mid_feature_extraction(signal, sampling_rate,
                                    model.mt_win * sampling_rate,
                                    model.mid_step * sampling_rate,
                                    round(sampling_rate * model.st_win),
                                    round(sampling_rate * model.st_step))

        # classify each feature vector (i.e. each fix-sized window):
        labels = []
        for col_index in range(mt_feats.shape[1]):
            # normalize the current feature vector
            feature_vector = (mt_feats[:, col_index] - model.mean) / model.std
            label_predicted, _ = \
                at.classifier_wrapper(model.classifier, 'svm', feature_vector)
            labels.append(label_predicted)

        # nothing was classified, so there is nothing to turn into segments
        if not labels:
            return []

        segs, classes = aS.labels_to_segments(labels, model.mid_step)
        # there is a bug in labels to segments when there is a single label. In this case
        # it returns a single [start, end] list rather than a list of lists, so wrap it.
        # (list.append() returns None, so `[].append(segs)` would discard the segment.)
        if len(labels) == 1:
            segs = [segs]
        if segs is None:
            return []

        return [Segment(seg[0] + offset, seg[1] + offset, model.class_names[int(label)])
                for seg, label in zip(segs, classes)]



def extract_voice_segments(input_file, *, __extract_voice_segments=ExtractVoiceSegments()):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    Silence is removed first; each remaining non-silent stretch is classified
    on its own and its segments are reported relative to the whole file.
    ARGUMENTS:
        - input_file:        path of the input WAV/mp3 file
        - __extract_voice_segments: the classifier object; the default is
                             created once, when this function is defined, so
                             the model is shared by all calls
    RETURNS:
        - list of Segments (empty if the file could not be read)
    """
    segments = []

    # load input file
    sampling_rate, signal = aIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return segments

    # convert stereo (if) to mono
    signal = aIO.stereo_to_mono(signal)

    # find the non-silent stretches of the signal
    non_silent_segments = aS.silence_removal(signal, sampling_rate, 0.02, 0.02, smooth_window=1.0, weight=0.3)

    # classify each non-silent stretch; seg holds its [start, end] in seconds
    for seg in non_silent_segments:
        start = int(seg[0]*sampling_rate)
        stop = int(seg[1]*sampling_rate)
        segments.extend(__extract_voice_segments.segments_in(signal[start:stop], sampling_rate, seg[0]))

    return segments

In [11]:
# A fixed-length stretch of speech: start and end time in the same units as the segments.
Epoch = namedtuple('Epoch', ['start', 'end'])
def speech_epochs_from_segments(segments, epoch_length=4.0, silence_tolerance=0.0):
    """Cut the speech in ``segments`` into epochs of exactly ``epoch_length``.

    A speech segment at least ``epoch_length`` long yields as many back-to-back
    epochs as fit into it. What is left over (or a speech segment that is too
    short to begin with) is joined with the following segment when that one is
    speech too and the gap between them fits in the remaining silence
    allowance. The allowance starts at ``silence_tolerance`` and is reset
    whenever an epoch is emitted, a non-speech segment is met, or a join fails.

    ARGUMENTS:
        - segments:          sequence of objects with ``start``, ``end`` and
                             ``classification`` attributes (e.g. Segment).
                             The sequence is NOT modified, so the same list can
                             be reused for several epoch lengths.
        - epoch_length:      length of each epoch; must be positive
        - silence_tolerance: total gap that may be bridged while building one epoch
    RETURNS:
        - list of Epoch tuples
    RAISES:
        - ValueError if ``epoch_length`` is not positive (the loop could not terminate)
    """
    if epoch_length <= 0:
        raise ValueError('epoch_length must be positive')

    # Work on private (start, end, classification) tuples: the algorithm rewrites
    # entries as it goes, and the caller's list must stay intact.
    working = [(seg.start, seg.end, seg.classification) for seg in segments]

    epochs = []
    i = 0
    silence_this_epoch = silence_tolerance
    while i < len(working):
        start, end, classification = working[i]
        if classification != 'speech':
            silence_this_epoch = silence_tolerance

        elif end - start >= epoch_length:
            epochs.append(Epoch(start, start + epoch_length))
            silence_this_epoch = silence_tolerance
            # process the remainder of the same segment again
            new_start = start + epoch_length
            if new_start < end:
                working[i] = (new_start, end, classification)
                continue

        elif i + 1 < len(working):
            next_start, next_end, next_classification = working[i + 1]
            if end + silence_this_epoch >= next_start and next_classification == 'speech':
                # did we use up any silence tolerance
                if end < next_start:
                    silence_this_epoch -= (next_start - end)
                # extend the next segment backwards so it also covers this one
                working[i + 1] = (start, next_end, classification)
            else:
                silence_this_epoch = silence_tolerance

        i += 1

    return epochs

In [12]:

def save_the_segment_info(fd, segs):
    """Pickle the raw segments ``segs`` of ``fd`` into its per-language segment directory."""
    target = prepare_raw_seg_dir(fd) + raw_seg_filename(fd)
    with open(target, 'wb') as out:
        pkl.dump(segs, out)

def read_the_segment_info(fd):
    """Return the previously pickled raw segments of ``fd``.

    An empty list is returned when the pickle file does not exist or is empty.
    """
    source = prepare_raw_seg_dir(fd) + raw_seg_filename(fd)
    if not os.path.exists(source) or os.path.getsize(source) == 0:
        return []
    # the pickle is one this notebook wrote itself; do not point this at untrusted files
    with open(source, 'rb') as saved:
        return pkl.load(saved)
    
def update_dataframes(seg_df_csv, seg_records):
    """Add ``seg_records`` to the metadata CSV ``seg_df_csv``.

    The records (dicts with a 'file_name' key, which becomes the index) are
    appended to the rows already in the CSV, or start a new CSV if there is
    none yet. Nothing is read or written when ``seg_records`` is empty.
    """
    if len(seg_records) == 0:
        return

    if os.path.isfile(seg_df_csv):
        existing_rows = pd.read_csv(seg_df_csv, index_col='file_name')
        new_rows = pd.DataFrame.from_records(seg_records, index='file_name')
        combined = pd.concat([existing_rows, new_rows])
    else:
        combined = pd.DataFrame.from_records(seg_records, index='file_name')
    combined.to_csv(seg_df_csv)


In [13]:
def extract_audio_segments_for_file(fd):
    """Load the audio of ``fd`` and obtain its raw classified segments.

    The audio is decoded as mp3 when the file name ends in '.mp3' (any case)
    and as wav otherwise, then conditioned. Raw segments are taken from the
    saved pickle when it holds any; otherwise the file is segmented and the
    result is saved.

    RETURNS:
        - (audio_seg, segs): the conditioned audio and the list of raw segments
    """
    is_mp3 = fd.filename[-4:].lower() == '.mp3'
    source_path = get_fname(fd)
    raw_audio = AudioSegment.from_file(source_path, format='mp3' if is_mp3 else 'wav')

    # now condition the audio and fetch (or compute) the raw segments.
    audio_seg = condition_audio_segment(raw_audio)
    segs = read_the_segment_info(fd)
    if len(segs) == 0:
        segs = extract_voice_segments(source_path)
        save_the_segment_info(fd, segs)

    return audio_seg, segs


def create_mp3_files(audio_seg, segs, time_per_segment, dataset_dir, fd):
    """Export the speech epochs of one file as mp3s and describe them.

    ARGUMENTS:
        - audio_seg:        the conditioned audio of the file
        - segs:             its raw classified segments
        - time_per_segment: epoch length in seconds; a quarter of it is
                            allowed as silence tolerance
        - dataset_dir:      dataset data directory, relative to DATASETS_DIR
        - fd:               the file's row (a named tuple from ``itertuples``)
    RETURNS:
        - one dict per epoch: all fields of ``fd`` plus 'seg_start' and
          'seg_stop' (milliseconds), 'seg' (epoch number) and 'file_name'
          (mp3 path relative to DATASETS_DIR)
    """
    epochs = speech_epochs_from_segments(segs, epoch_length=time_per_segment, silence_tolerance=time_per_segment/4.0)

    # write one mp3 per epoch into the dataset's directory for this language
    out_dir = prepare_dataset_data_dir(fd, dataset_dir)
    records = []

    for seg_no, epoch in enumerate(epochs):
        mp3_name = seg_mp3_filename(fd, seg_no)
        relative_name = dataset_dir + fd.iso + '/' + mp3_name
        target = out_dir + mp3_name
        start_ms = convert_to_ms(epoch.start)
        stop_ms = convert_to_ms(epoch.end)

        # an mp3 left over from an earlier (interrupted) run is not exported again
        if not os.path.exists(target):
            audio_seg[start_ms:stop_ms].export(target, format='mp3', bitrate='32k')

        record = dict(fd._asdict())
        record.update(seg_start=start_ms, seg_stop=stop_ms, seg=seg_no, file_name=relative_name)
        records.append(record)

    return records


       

In [14]:
# processing these items might take a very long time. To permit the process to be interrupted and restarted the indexes of processed items
# are maintained in a set that is pickled on each batch. This allows the batch to quickly pick up where it left off.
def process_record_batch(files_df, *, batch_size=10):
    """Process the next ``batch_size`` unprocessed files and checkpoint the result.

    For each file the 4, 6 and 10 second mp3 segments are created. After the
    batch, the three metadata CSVs are extended and the set of processed
    indexes is pickled.

    A file that fails with an ordinary error is reported and still marked as
    processed, so it is not retried on every batch. KeyboardInterrupt and
    SystemExit are NOT swallowed: they abort the batch before anything is
    checkpointed, so the interrupted file is picked up again on restart.

    ARGUMENTS:
        - files_df:   dataframe of files to process, indexed by item ID
        - batch_size: maximum number of files to process in this call
    RETURNS:
        - the set of indexes processed so far (including earlier runs)
    """
    processed_file = f'{SEGMENTS_DIR}processed16_{NOTEBOOK_ID}.pkl'
    if os.path.isfile(processed_file):
        with open(processed_file, 'rb') as pklFile:
            processed_files = pkl.load(pklFile)
    else:
        processed_files = set()

    segmented_4sec_segs = []
    segmented_6sec_segs = []
    segmented_10sec_segs = []

    batch_proc = 0
    for fd in files_df.itertuples():
        if batch_proc >= batch_size:
            break
        if fd.Index in processed_files:
            continue

        try:
            audio_seg, voice_segs = extract_audio_segments_for_file(fd)

            segmented_4sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_4_SEC, SEC_4_DATA_DIR, fd))
            segmented_6sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_6_SEC, SEC_6_DATA_DIR, fd))
            segmented_10sec_segs.extend(create_mp3_files(audio_seg, voice_segs, SEG_10_SEC, SEC_10_DATA_DIR, fd))
        except Exception as exc:
            # best effort: report the failure (with its cause) and move on
            print(f'exception {fd.filename}: {exc!r}')

        # we want to add an fd that has an exception so it is not reprocessed on every batch
        processed_files.add(fd.Index)
        batch_proc += 1

    update_dataframes(SEG_4_SEC_DF, segmented_4sec_segs)
    update_dataframes(SEG_6_SEC_DF, segmented_6sec_segs)
    update_dataframes(SEG_10_SEC_DF, segmented_10sec_segs)

    with open(processed_file, 'wb') as pklFile:
        pkl.dump(processed_files, pklFile)

    return processed_files



In [15]:
def format_time(seconds):
    """Format a duration in seconds as HH:MM:SS (fractions of a second are dropped)."""
    whole_seconds = int(seconds)
    hours = whole_seconds // 3600
    minutes = (whole_seconds % 3600) // 60
    secs = whole_seconds % 60
    return f"{hours:02}:{minutes:02}:{secs:02}"

start_time = time.time()
# Process the file list batch by batch until every file has been handled,
# reporting progress and elapsed time after each batch.
files_processed = 0
files_to_process = len(fd)
while files_to_process > files_processed:
    processed_files = process_record_batch(fd, batch_size=20)
    previously_processed, files_processed = files_processed, len(processed_files)
    print(f'Processed {files_processed} out of {files_to_process} in {format_time(time.time()-start_time)}')
    # A batch that adds nothing means every ID in fd is already processed. Without
    # this check the loop would never end if the count can't reach len(fd),
    # e.g. when the index contains duplicate IDs.
    if files_processed == previously_processed:
        break


Processed 9840 out of 27784 in 00:02:33
Processed 9860 out of 27784 in 00:05:36
Processed 9880 out of 27784 in 00:09:09
Processed 9900 out of 27784 in 00:12:17
Processed 9920 out of 27784 in 00:20:44
Processed 9940 out of 27784 in 00:24:53
Processed 9960 out of 27784 in 00:27:49
Processed 9980 out of 27784 in 00:31:47
Processed 10000 out of 27784 in 00:36:40
Processed 10020 out of 27784 in 00:41:53
Processed 10040 out of 27784 in 00:44:54
Processed 10060 out of 27784 in 00:48:30
Processed 10080 out of 27784 in 00:51:45
Processed 10100 out of 27784 in 00:54:22
Processed 10120 out of 27784 in 00:57:32
Processed 10140 out of 27784 in 01:02:13
Processed 10160 out of 27784 in 01:05:34
Processed 10180 out of 27784 in 01:09:27
Processed 10200 out of 27784 in 01:14:25
Processed 10220 out of 27784 in 01:20:48
Processed 10240 out of 27784 in 01:51:23
Processed 10260 out of 27784 in 01:56:17
Processed 10280 out of 27784 in 02:04:28
Processed 10300 out of 27784 in 02:12:10
Processed 10320 out of 2