# SIDA: Speaker Identification for Archives
## Bill Clinton

In [None]:
%%capture

!pip install git+git://github.com/hipstas/audio-tagging-toolkit.git

In [None]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import unicodecsv
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio
import pandas as pd

## Set up the classifier working directory
#### (All training audio and extracted features are stored under this directory.)

# Name of the per-speaker working directory.
training_audio_dir_name = "Bill_Clinton_Classifier"

# Absolute path (with trailing slash) that later cells extend by plain string concatenation.
classifier_dir_pathname = '/sharedfolder/' + training_audio_dir_name + '/'

# Create the directory only when it is missing. Any real failure (missing parent
# directory, permissions) now raises here with a clear error instead of being
# silently swallowed and surfacing later at os.chdir().
if not os.path.isdir(classifier_dir_pathname):
    os.mkdir(classifier_dir_pathname)

# Later cells use paths relative to this directory.
os.chdir(classifier_dir_pathname)

In [None]:
%%capture
## Download audio files for training
#### (Uncomment the lines below for the first run; you may want to comment them out again once the download is complete.)

#!wget -N http://xtra.arloproject.com/datasets/audio/Bill_Clinton_Speeches_UVA_Miller_Center.zip
#!unzip Bill_Clinton_Speeches_UVA_Miller_Center.zip

# Background Model vowels
#!wget -N http://xtra.arloproject.com/datasets/aapb-ubm/Female_AAPB_vowels_171110.zip
#!wget -N http://xtra.arloproject.com/datasets/aapb-ubm/Male_AAPB_vowels_171110.zip

#!unzip Female_AAPB_vowels_171110.zip
#!unzip Male_AAPB_vowels_171110.zip

In [None]:
## Renaming UBM audio clips to generic 'Male' and 'Female'

# Done in Python rather than with `!mkdir` / `!mv` so the cell can be re-run safely:
# existing directories are left alone, and an unzipped archive is never moved
# *inside* an already-present `_vowel_clips` directory.
# Paths are relative to the current working directory.
for ubm_label in ('Male', 'Female'):
    unzipped_dir = ubm_label + '_AAPB_vowels_171110'
    vowel_clips_dir = os.path.join(ubm_label, '_vowel_clips')

    if not os.path.isdir(ubm_label):
        os.mkdir(ubm_label)

    if os.path.exists(vowel_clips_dir):
        # A previous run already put the clips in place.
        print('Already in place: ' + vowel_clips_dir)
    elif os.path.isdir(unzipped_dir):
        os.rename(unzipped_dir, vowel_clips_dir)
    else:
        print('Not found (has the archive been downloaded and unzipped?): ' + unzipped_dir)

In [None]:
## Download new 1-second labels

csv_url = "https://raw.githubusercontent.com/hipstas/aapb-speaker-labels/master/speaker_labels_randomized/Bill_Clinton/Bill_Clinton_Miller_Center_Labels.csv"

# pandas fetches the URL itself. A separate urlopen() call would only open a second
# connection whose response is never read or closed.
train_table_df = pd.read_csv(csv_url)

# Show the first rows as a quick sanity check of the downloaded table.
train_table_df.head()

In [None]:
## Labels to extract, plus the derived names used by the cells below

# Labels (as they appear in the label table) that should be kept.
labels_to_use = ["Male", "Bill Clinton", "Female"]

# Directory-safe form of each label: every space becomes an underscore.
label_dir_names = list('_'.join(label_text.split(' ')) for label_text in labels_to_use)

# Individual names for the three labels, in list order.
speaker_0_label = labels_to_use[0]
speaker_1_label = labels_to_use[1]
speaker_2_label = labels_to_use[2]

In [None]:
%%capture
## Excerpting WAV clips corresponding to labels
#### (This may take a while.)

os.chdir(classifier_dir_pathname)

# Directory expected to hold the source MP3s (one per 'Media file basename').
media_dir_pathname = classifier_dir_pathname + 'Bill_Clinton_Speeches_UVA_Miller_Center'

# One group per (media file, label) pair, so each media file is handed to attk once per label.
train_table_groups = train_table_df.groupby(['Media file basename', 'Label'])

# This cell runs under %%capture, which hides anything printed below. Failures are
# therefore also collected here so they can be inspected from a later cell.
excerpt_failures = []

for name, group in train_table_groups:
    list_of_lists = group.values.tolist()
    # Rows are unpacked positionally: the table is assumed to have exactly these
    # five columns in this order -- TODO confirm against the CSV header.
    basename, start, duration, label, labeled_by = list_of_lists[0]

    # Skip labels we are not training on before doing any path work.
    if label not in labels_to_use:
        continue

    media_path = os.path.join(media_dir_pathname, str(basename) + '.mp3')
    label_dir_pathname = str(os.path.join(classifier_dir_pathname, label.replace(' ', '_')))

    # (start, duration) pairs taken from the 2nd and 3rd columns of every row in the group.
    subclip_pairs = [(float(row[1]), float(row[2])) for row in list_of_lists]

    try:
        # Create the output directory in-process. Kept inside the try so a failure
        # is reported and recorded like any other, and the loop moves on.
        if not os.path.isdir(label_dir_pathname):
            os.makedirs(label_dir_pathname)
        attk.subclip_list(media_path, subclip_pairs, label_dir_pathname)
    except Exception as e:
        print(e)
        excerpt_failures.append((media_path, label, str(e)))

In [None]:
#%%capture
## Pull vowel segments out of the labeled audio clips
#### (Expect this step to take some time.)

def _extract_vowels_or_report(label_dir):
    """Run attk's batch vowel extraction on one directory, printing any error instead of raising."""
    try:
        attk.batch_extract_vowels(label_dir)
    except Exception as problem:
        print("ERROR: " + label_dir)
        print(problem)

# Directory names are passed as-is, so they resolve relative to the classifier directory.
os.chdir(classifier_dir_pathname)

for label_dir in label_dir_names:
    _extract_vowels_or_report(label_dir)

In [None]:
#%%capture
## Extract features (MFCCs, deltas, and delta-deltas) from Speaker 1 & UBM vowel clips, then write features to CSVs

os.chdir(classifier_dir_pathname)

for dir_name in label_dir_names:
    print("> Starting " + dir_name)
    try:
        # Work from inside the vowel-clip directory so clips can be passed by bare
        # filename. A missing directory raises here and is reported as a skip below.
        os.chdir(os.path.join(classifier_dir_pathname, dir_name, '_vowel_clips'))

        # Sibling output directory. Created only when missing, so a real failure
        # raises instead of being silently swallowed.
        if not os.path.isdir('../_vowel_mfccs_and_deltas'):
            os.mkdir('../_vowel_mfccs_and_deltas')

        filenames = [item for item in os.listdir('./') if item.lower().endswith('.wav')]
        for filename in filenames:
            csv_out_path = '../_vowel_mfccs_and_deltas/' + filename[:-4] + '.mfcc.csv'
            # An existing CSV is treated as already done, which makes re-runs cheap.
            if os.path.isfile(csv_out_path):
                continue
            try:
                mfccs = attk.get_mfccs_and_deltas(filename, n_mfcc=30, n_fft=4096, freq_min=100, freq_max=16000)
                # Only write a file when attk returned at least one row.
                if len(mfccs) > 0:
                    with open(csv_out_path, 'w') as fo:
                        csv.writer(fo).writerows(mfccs)
            except Exception as e:
                print('FILE ERROR: ' + filename)
                print(e)
    except Exception as e:
        print('SKIPPING DIRECTORY: ' + dir_name)     ## Skipping class directories for which we didn't extract vowels
        print(e)

# Return to the classifier directory so the kernel is not left inside the last
# label's _vowel_clips directory.
os.chdir(classifier_dir_pathname)

In [None]:
## Continue to the next notebook to train and run the speaker ID classifier.