# SIDA: Speaker Identification for Archives

In [None]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import unicodecsv
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio

## Create the working directory for the training set (-p: parents are created, no error if it already exists)
!mkdir -p /sharedfolder/sida_classifier/training_set/

## Make it the current directory so the download and unzip below land inside it
os.chdir('/sharedfolder/sida_classifier/training_set/')

## Download audio files for training
#### (You may want to comment out the lines below once the download is complete.)
!wget -N http://xtra.arloproject.com/datasets/audio/The_World_WGBH_100_episodes.zip
!unzip The_World_WGBH_100_episodes.zip

## Directory name that later cells append to the training-set path to locate the media files
## (expected to be the directory the archive unpacks to -- the archive layout is not visible here)
training_audio_dir_name = "The_World_WGBH_100_episodes"

In [None]:
## Download new 1-second labels

csv_url = "https://raw.githubusercontent.com/hipstas/aapb-speaker-labels/master/speaker_labels_randomized/The_World_WGBH_labels_100_episodes.csv"

## NOTE: despite its name, csv_string is the open file-like HTTP response, not a string
csv_string = urllib2.urlopen(csv_url)

train_table = []


## Load CSV as list of lists
## (the response is closed in `finally` so the connection is released even if parsing fails)

try:
    csv_reader = unicodecsv.reader(csv_string)
    train_table.extend(csv_reader)
finally:
    csv_string.close()

## Preview the first rows; as the last expression in the cell this is what the notebook displays
train_table[:10] + ['...']

In [None]:
## Remove header row (if present)

## The truthiness check keeps an empty table (e.g. an empty download) from raising
## IndexError when the first row is inspected.
if train_table and 'Media file basename' in train_table[0]:
    train_table = train_table[1:]

In [None]:
%%capture
## Excerpting WAV clips corresponding to labels
#### (This may take a while.)

os.chdir('/sharedfolder/sida_classifier/')

media_dir_pathname = "/sharedfolder/sida_classifier/training_set/" + training_audio_dir_name

class_dir_pathname = '/sharedfolder/sida_classifier/training_set/_classes_' + training_audio_dir_name

for row in train_table:
    try:
        basename, start, duration, class_name, labeled_by = row  ## Assigning values in row to variables
        filename = str(basename + '.mp3')
        start = float(start)
        end = float(start) + float(duration)
        wav_out_pathname = str(os.path.join(class_dir_pathname, class_name.replace(' ','_')))
        try: 
            subprocess.call(['mkdir', '-p', wav_out_pathname])
        except:
            pass
        attk.subclip(os.path.join(media_dir_pathname, filename), float(start), end, wav_out_pathname)
    except Exception as e: 
        print(row)
        print(e)

In [None]:
## Defining functions we will use below

# Extract audio segments to WAV for an audio/video pathname and a list of 2-tuple time values
def extract_vowel_pairs(media_path, vowel_time_ranges):
    """Write each sufficiently long time range of a media file to its own WAV clip.

    Clips are written into the relative directory '_vowel_clips', resolved
    against the current working directory. This function does not create that
    directory, so the caller must have done so. Each clip is named
    '<basename>__<start>_<end>.wav', with both times rounded to 4 decimal
    places. A clip whose output file already exists is not rewritten.

    Args:
        media_path: Pathname of the audio/video file to excerpt.
        vowel_time_ranges: Iterable of (start, end) pairs. Each value is
            converted with float(); the values are handed to the clip's
            subclip(), so they are presumably seconds -- TODO confirm.
    """
    snd = AudioFileClip.AudioFileClip(media_path)
    # File name without directory or extension; splitext handles extensions of
    # any length instead of assuming exactly three characters.
    basename = os.path.splitext(os.path.basename(media_path))[0]
    for start, end in vowel_time_ranges:
        start = float(start)
        end = float(end)
        if end - start < 0.05:  ## Ignore clips shorter than 0.05 second
            continue
        out_filename = basename+'__'+str(round(start, 4))+'_'+str(round(end, 4))+'.wav'
        out_pathname = os.path.join('_vowel_clips', out_filename)
        # Skip clips left by an earlier run so the extraction can be resumed
        if not os.path.isfile(out_pathname):
            snd.subclip(start, end).write_audiofile(out_pathname)

# Extract vowel segments to WAV for every audio/video file in a given directory
def batch_extract_vowels(media_dir):
    """Extract vowel clips for every .mp3/.wav/.mp4 file directly inside media_dir.

    The function changes into media_dir while it works, creates a
    '_vowel_clips' subdirectory there if needed, and always changes back to
    the original working directory before returning or propagating an error.
    A failure on one file is printed and does not stop the others.

    Args:
        media_dir: Directory containing the media files; may be relative to
            the current working directory.

    Raises:
        OSError: If media_dir cannot be entered or listed.
    """
    starting_location = os.getcwd()
    os.chdir(media_dir)
    try:
        # Converts bin indices to seconds. Numerically this is 2048 / 44100,
        # i.e. presumably 2048-sample bins at a 44.1 kHz sample rate --
        # TODO confirm against attk.get_vowel_segments.
        bin_2048_to_sec_constant = 0.046439909297052155
        try:
            os.mkdir('_vowel_clips')
        except OSError:
            # Usually: the directory already exists. If it really could not be
            # created, the per-file writes below will report the error.
            pass
        filenames = [item for item in os.listdir('./') if item[-4:].lower() in ('.mp3', '.wav', '.mp4')]
        for filename in filenames[::-1]:
            try:
                vowel_bools = attk.get_vowel_segments(filename)
                vowel_bin_ranges = attk.labels_to_ranges(vowel_bools, label=True)
                vowel_time_ranges = [(s*bin_2048_to_sec_constant, e*bin_2048_to_sec_constant) for s, e in vowel_bin_ranges]
                extract_vowel_pairs(filename, vowel_time_ranges)
            except Exception as e:
                # Best effort: report the file and continue with the next one
                print("ERROR: " + filename)
                print(e)
    finally:
        # Restore the caller's working directory even when interrupted, so
        # later relative paths (e.g. the next class directory) still resolve.
        os.chdir(starting_location)

In [None]:
%%capture
## Extract vowel segments from labeled audio clips
#### (This may take a while.)

os.chdir(class_dir_pathname)

batch_extract_vowels('Marco_Werman')
batch_extract_vowels('Male')
batch_extract_vowels('Female')
batch_extract_vowels('Carol_Hills')

In [None]:
#%%capture
## Extract features (MFCCs, deltas, and delta-deltas) from the vowel clips of every class directory, then write features to CSVs

os.chdir(class_dir_pathname)

dir_names = [item for item in os.listdir('./') if os.path.isdir(item)]

for dir_name in dir_names:
    print("> Starting " + dir_name)
    vowel_clips_dir = os.path.join(class_dir_pathname, dir_name, '_vowel_clips')
    ## Skipping class directories for which we didn't extract vowels
    if not os.path.isdir(vowel_clips_dir):
        print('SKIPPING DIRECTORY: ' + dir_name)
        continue
    try:
        ## Work from inside the clips directory; the output paths below are relative to it
        os.chdir(vowel_clips_dir)
        try:
            os.mkdir('../_vowel_mfccs_and_deltas')
        except OSError:
            ## Usually: the directory already exists. If it could not be created,
            ## the per-file writes below report the error.
            pass
        filenames = [item for item in os.listdir('./') if item[-4:].lower()=='.wav']
        for filename in filenames:
            csv_out_path = '../_vowel_mfccs_and_deltas/' + filename[:-4] + '.mfcc.csv'
            ## Existing feature files are kept, so an interrupted run can be resumed
            if os.path.isfile(csv_out_path):
                continue
            try:
                mfccs = attk.get_mfccs_and_deltas(filename)
                ## len() rather than truthiness: mfccs may be an array-like whose
                ## truth value is ambiguous -- TODO confirm the return type
                if len(mfccs) > 0:
                    with open(csv_out_path, 'w') as fo:
                        csv_writer = csv.writer(fo)
                        csv_writer.writerows(mfccs)
            except Exception as e:
                print('FILE ERROR: ' + filename)
                print(e)
    except Exception as e:
        ## Unexpected failure (the expected "no vowel clips" case is handled above),
        ## so show the cause instead of hiding it
        print('SKIPPING DIRECTORY: ' + dir_name)
        print(e)

In [None]:
%%capture

## Download and unzip prepared UBM feature set

## Both archives are downloaded into, and unpacked in, the classifier's top-level directory
os.chdir('/sharedfolder/sida_classifier/')

## -q silences wget; -O sets the local file name, which would otherwise include the '?raw=true' query string
!wget -q https://github.com/hipstas/aapb-ubm/blob/master/UBM_feature_set/AAPB_female_vowel_mfccs_and_deltas.zip?raw=true -O AAPB_female_vowel_mfccs_and_deltas.zip
!wget -q https://github.com/hipstas/aapb-ubm/blob/master/UBM_feature_set/AAPB_male_vowel_mfccs_and_deltas.zip?raw=true -O AAPB_male_vowel_mfccs_and_deltas.zip

## Unpack the female and male feature archives in place
!unzip AAPB_female_vowel_mfccs_and_deltas.zip
!unzip AAPB_male_vowel_mfccs_and_deltas.zip

In [None]:
## Continue to the next notebook to train and run the speaker ID classifier.