# SIDA: Speaker Identification for Archives: Bill Clinton, notebook 1 of 2

## Before you begin, download the training corpus of audio files and the labeled data you'll need:

### Bill Clinton Audio: 
- http://xtra.arloproject.com/datasets/audio/Bill_Clinton_Speeches_UVA_Miller_Center.zip
- http://xtra.arloproject.com/datasets/audio/AAPB_Hand_Labeled/Bill_Clinton_wav.zip

### CSV that will allow you to create excerpts of Bill Clinton speaking from the above corpora: 
- https://github.com/hipstas/aapb-speaker-labels/blob/master/speaker_labels_randomized/Bill_Clinton/Bill_Clinton_5K_labels.csv

### Male and Female audio for a universal background model: 
- http://xtra.arloproject.com/datasets/audio/Mozilla_CV_UBM_Subset/females_5k.zip
- http://xtra.arloproject.com/datasets/audio/Mozilla_CV_UBM_Subset/males_5k.zip
- http://xtra.arloproject.com/datasets/aapb-ubm/Female_AAPB_171110.zip
- http://xtra.arloproject.com/datasets/aapb-ubm/Male_AAPB_171110.zip


In [None]:
# import modules and create the training directory
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio
import pandas as pd

# Create the working directory via an IPython shell escape; `-p` keeps this
# from failing when the directory already exists.
!mkdir -p /sharedfolder/sida_classifier/

# Make the working directory the current directory.
os.chdir('/sharedfolder/sida_classifier/')

Before this step, you must move the downloaded CSV file (`Bill_Clinton_5K_labels.csv`) into the `/sharedfolder/sida_classifier/` directory.

In [None]:
## Load the 1-second speaker labels into a DataFrame

csv_path = '/sharedfolder/sida_classifier/Bill_Clinton_5K_labels.csv'

# Column names are supplied explicitly, so pandas treats every line of the
# file as data (presumably the CSV has no header row -- confirm).
label_columns = ['Media file basename', 'Start', 'Duration', 'Label']
train_table_df = pd.read_csv(csv_path, names=label_columns)

# Preview the first rows as the cell's output.
train_table_df.head()

In [None]:
## Choose which labels to extract and assign the paths used by the cells below

# Name of the directory holding the source audio; also used as the suffix of
# the per-class output directory.
training_audio_dir_name = "Bill_Clinton"

# Only rows whose label appears in this list are excerpted.
labels_to_use = ['Bill Clinton']

# Both paths live directly under the shared working directory.
sida_root = '/sharedfolder/sida_classifier/'

# Where the downloaded source audio is expected.
media_dir_pathname = sida_root + training_audio_dir_name

# Where the per-label clip directories are written.
class_dir_pathname = sida_root + '_classes_' + training_audio_dir_name

Before running the next cell, you must have both sets of the Bill Clinton audio files together in a */sharedfolder/sida_classifier/Bill_Clinton* directory.

In [None]:
%%capture
## Excerpt WAV clips corresponding to labels
## This cell will take some time.
##
## For every (media file, label) pair in the label table whose label is in
## `labels_to_use`, collect the (start, duration) pairs and hand them to
## attk.subclip_list, which writes the clips into a per-label directory
## under `class_dir_pathname`.

os.chdir('/sharedfolder/sida_classifier/')

train_table_groups = train_table_df.groupby(['Media file basename', 'Label'])

for (basename, label), group in train_table_groups:
    # Skip unwanted labels up front so no paths are built for them.
    if label not in labels_to_use:
        continue

    # Prefer a WAV source and fall back to MP3 with the same basename.
    media_path = os.path.join(media_dir_pathname, str(basename) + '.wav')
    if not os.path.isfile(media_path):
        media_path = os.path.join(media_dir_pathname, str(basename) + '.mp3')
    if not os.path.isfile(media_path):
        # Neither format is present: report it instead of passing a
        # nonexistent path down to the clipping helper.
        print('MISSING MEDIA FILE: ' + str(basename))
        continue

    label_dir_pathname = str(os.path.join(class_dir_pathname, label.replace(' ', '_')))

    # Create the output directory in-process. The isdir guard keeps this
    # working on Python 2, where os.makedirs has no exist_ok argument.
    if not os.path.isdir(label_dir_pathname):
        try:
            os.makedirs(label_dir_pathname)
        except OSError as e:
            print(e)
            continue

    try:
        # Converting inside the try means one malformed row only skips this
        # media file instead of aborting the whole loop.
        subclip_pairs = [(float(start), float(duration))
                         for start, duration in zip(group['Start'], group['Duration'])]
        attk.subclip_list(media_path, subclip_pairs, label_dir_pathname)
    except Exception as e:
        # Best effort: report the failure and move on to the next media file.
        print(e)

Before beginning this step, be sure all the male and female voices that you downloaded are unzipped into directories inside the `_classes_Bill_Clinton` directory, like so: `/sharedfolder/sida_classifier/_classes_Bill_Clinton/Male_AAPB_171110` and `/sharedfolder/sida_classifier/_classes_Bill_Clinton/Female_AAPB_171110`

In [None]:
%%capture
## Extract vowel segments from labeled audio clips
#### (This may take a while.)

os.chdir(class_dir_pathname)

# Snapshot the class subdirectories once, before any processing starts.
class_dir_names = [entry for entry in os.listdir('./') if os.path.isdir(entry)]

for dir_name in class_dir_names:
    try:
        attk.batch_extract_vowels(dir_name)
    except Exception as e:
        # Report the failing class directory and carry on with the rest.
        print("ERROR: " + dir_name)
        print(e)
    # Return to the classes directory after every class (presumably the attk
    # helper may change the working directory -- confirm).
    os.chdir(class_dir_pathname)

In [None]:
%%capture 
# The 'magic' command above suppresses this cell's output (including errors) 
# in order to avoid surpassing the browser's memory limit. Comment it out 
# to see output and errors.

## Extract features (MFCCs, deltas, and delta-deltas) from all clips, then write features to CSVs
##
## For each class directory, every WAV file in its `_vowel_clips`
## subdirectory is passed to attk.get_mfccs_and_deltas and the result is
## written to `_vowel_mfccs_and_deltas/<clip>.mfcc.csv`. Clips that already
## have a CSV are skipped, so the cell can be re-run after an interruption.

os.chdir(class_dir_pathname)

for dir_name in [item for item in os.listdir('./') if os.path.isdir(item)]:
    print("> Starting " + dir_name)

    # Absolute paths, so nothing below depends on '../' resolving correctly.
    vowel_clips_dir = os.path.join(class_dir_pathname, dir_name, '_vowel_clips')
    mfcc_dir = os.path.join(class_dir_pathname, dir_name, '_vowel_mfccs_and_deltas')

    try:
        # chdir fails for class directories that have no '_vowel_clips'.
        os.chdir(vowel_clips_dir)
        # Guarded mkdir: "already exists" is the only case skipped silently.
        if not os.path.isdir(mfcc_dir):
            os.mkdir(mfcc_dir)
        filenames = [item for item in os.listdir('./') if item[-4:].lower() == '.wav']
    except Exception as e:
        print('SKIPPING DIRECTORY: ' + dir_name)     ## Skipping class directories for which we didn't extract vowels
        print(e)
        continue

    for filename in filenames:
        csv_out_path = os.path.join(mfcc_dir, filename[:-4] + '.mfcc.csv')
        if os.path.isfile(csv_out_path):
            # Already extracted on a previous run.
            continue
        try:
            mfccs = attk.get_mfccs_and_deltas(filename, n_mfcc=30, n_fft=4096, freq_min=100, freq_max=16000)
            # Write nothing for empty results, so no empty CSV is left behind.
            if len(mfccs) > 0:
                with open(csv_out_path, 'w') as fo:
                    csv_writer = csv.writer(fo)
                    csv_writer.writerows(mfccs)
        except Exception as e:
            # Best effort per file: report and continue with the next clip.
            print('FILE ERROR: ' + filename)
            print(e)

In [None]:
## Continue to the next notebook to train and run the speaker ID classifier.