In [1]:
import os
import csv
import subprocess
import array
import random
import pandas as pd
import numpy as np
from pyAudioAnalysis import audioSegmentation as aS
from pyAudioAnalysis import audioTrainTest as aT
from itertools import groupby
from operator import itemgetter
from IPython.display import display, Audio
from pydub import AudioSegment
from pydub.utils import get_array_type
import matplotlib.pyplot as plt
import shutil
import timeit
%matplotlib inline

random.seed(999)

os.chdir('/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Model_training_clips')

In [2]:
#############################
## Enter Speaker Name Here ##
#############################

speaker="Clinton, Bill"

#############################

last_name = speaker.split(', ')[0]+'_Bill'

print(speaker)
print(last_name)

Clinton, Bill
Clinton_Bill


In [3]:
## Loading dictionary of applause classification values

applause_labels_dir="/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Applause_labels/"

applause_filenames=os.listdir(applause_labels_dir)


applause_dict={}

for filename in applause_filenames:
    basename=filename.replace('.applause.csv','')
    path=os.path.join(applause_labels_dir,filename)
    range_table=[]
    with open(path) as csvfile:
        spamreader = csv.reader(csvfile)
        for start,numeric_id,duration in spamreader:
            if float(duration)>2.0:
                if (float(start)-1.0) > 0:
                    adjusted_start=float(start)-1.0
                else:
                    adjusted_start=float(start)
                range_table.append([adjusted_start,float(start)+float(duration)])
    applause_dict[basename]=range_table

In [4]:
applause_dict['cpb-aacip-15-5m6251fq65__barcode349720_.h264']

[[99.0, 103.0],
 [301.0, 305.0],
 [741.0, 745.0],
 [949.0, 954.0],
 [1522.0, 1526.0],
 [1591.0, 1599.0],
 [1684.0, 1688.0],
 [1696.0, 1703.0],
 [1725.0, 1738.0],
 [1753.0, 1770.0],
 [1796.0, 1801.0],
 [1907.0, 1942.0],
 [1949.0, 1957.0],
 [1962.0, 1970.0],
 [2720.0, 2734.0],
 [2747.0, 2751.0],
 [2754.0, 2765.0],
 [2799.0, 2830.0],
 [3237.0, 3244.0],
 [3318.0, 3329.0]]

In [5]:
def is_between(value,range_pair):
    if float(range_pair[0]) < float(value) < float(range_pair[1]):
        return True
    else:
        return False

def spans_overlap(range_1,range_2):
    if is_between(range_2[0],range_1):
        return True
    elif is_between(range_2[1],range_1):
        return True
    elif is_between(range_1[0],range_2):
        return True
    elif is_between(range_1[1],range_2):
        return True
    else: return False

In [6]:
range_1=[1725.0, 1738.0]
range_2=[1727.0, 1728.0]
range_3=[900.0, 901.0]

print(spans_overlap(range_1,range_2))
print(spans_overlap(range_2,range_3))

True
False


In [7]:
## Loading table of labeled 2-second audio segments

aapb_metadata_all_split=pd.read_csv("/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/AAPB_ARLO_All_170414_2_sec_segs.csv")
aapb_metadata_all_split.tail(1)

Unnamed: 0,AAPB Unique Identifier (GUID),Type,Value,class ID,Timecode IN,Timecode OUT,Tag Duration,File Duration,SonyCi ID,Filename,Pathname
178273,cpb-aacip/15-9w37kv75,Sound quality,audience clapping,,223,225,2,1319.976,090d1475c56c45808fad6567b2f88b3d,cpb-aacip-15-9w37kv75__213877_,/Volumes/McLaughlin-6TB-1/Extended_Corpus/Gera...


In [8]:
## Function for extracting speaker clips

def extract_clips(x_table):
    for unique_audio_path in [item for item in sorted(list(set(list(x_table['Pathname'])))) if str(item)!='nan']:
        inputfile=unique_audio_path
        if os.path.exists(inputfile):
            if inputfile.lower()[-4:].lower() in ('.wav','.mp3','.mp4'):
                wav_source=True
                if inputfile.lower()[-4:]=='.mp4':     # Creates a temporary WAV
                    wav_source=False                         # if input is MP4
                    temp_filename=inputfile.split('/')[-1]+'_temp.wav'
                    audio_path='/var/tmp/'+temp_filename   # Pathname for temp WAV
                    subprocess.call(['ffmpeg', '-y', '-i', inputfile, audio_path]) # '-y' option overwrites existing file if present
                else:
                    audio_path=inputfile

                song=None
                from pydub import AudioSegment
                
                if inputfile[-4:].lower()=='.mp3':
                    song = AudioSegment.from_mp3(audio_path)
                else:
                    song = AudioSegment.from_wav(audio_path)


        for index, rowref in x_table[x_table['Pathname']==unique_audio_path].iterrows():
            row=aapb_metadata_all_split.iloc[index] ## Ref to master split table
            time_in=row['Timecode IN']
            clip_duration=row['Tag Duration']
            inputfile=row['Pathname']
            basename=row['Filename']
            dir_name=row['Value']

            
            try: os.mkdir(dir_name)
            except: pass

            applause_table = applause_dict[basename]
            
            contains_applause=False
            
            span_pair=[float(time_in),float(time_in)+float(clip_duration)]
            
            for row in applause_table:
                if spans_overlap(row,span_pair):
                    contains_applause=True
            
            if contains_applause==False:
                from pydub import AudioSegment 
                start_msec = float(time_in) * 1000.0
                duration_msec = float(clip_duration) * 1000
                clip_pathname=basename+'.start_'+str(time_in)[:8]+'.dur_2s.wav'
                if not os.path.exists(os.path.join(dir_name,clip_pathname)):
                    clip_data = song[start_msec:start_msec+duration_msec]
                    clip_data=clip_data.set_channels(1)
                    clip_data.export(os.path.join(dir_name,clip_pathname), format="wav")

        try:
            if wav_source==False:
                os.remove(audio_path)
        except: pass

    print("*** All segments extracted! ***")



In [9]:
print(str(len(aapb_metadata_all_split[aapb_metadata_all_split['Value']==speaker]))+" clips available for "+speaker)

2692 clips available for Clinton, Bill


In [19]:
##### Running the Random Clip Extractor #####


num_clips=2692     ## number of 2-second clips desired

x_table=aapb_metadata_all_split[aapb_metadata_all_split['Value']==speaker].sample(n=num_clips)

########### skipping files in haystack ############
haystack_files=['cpb-aacip-111-53wstzp5.h264', 'cpb-aacip-111-53wsv001.h264', 'cpb-aacip-111-569325x7.h264', 'cpb-aacip-189-12m6402b.h264', 'cpb-aacip-293-br8mc8rr8k__HUT00000045001_.h264', 'cpb-aacip-503-j96057dh5p__NHPR95198', 'cpb-aacip-503-s17sn01t72__NHPR95200']

x_table=x_table[~x_table['Filename'].isin(haystack_files)]

print(len(x_table))




extract_clips(x_table)



2692
*** All segments extracted! ***


In [20]:
## Creating ID string appended to each model's filename ##

clip_count=len(os.listdir(speaker))

model_id = '_'+last_name+'_UBM_'+str(clip_count)+'x2s'


In [21]:
## Assembling UBM files

os.chdir('/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Model_training_clips')

new_ubm_dir='UBM'+'_'+last_name


try: os.mkdir(new_ubm_dir)
except: pass

ubm_source="/Volumes/U/AAPB_Corpus_May_2017/test_set_616_clips"
ubm_files=[item for item in os.listdir(ubm_source) if (speaker not in item)&('16000.wav' in item)]
ubm_files=random.sample(ubm_files,60)
for filename in ubm_files:
    shutil.copy(os.path.join(ubm_source,filename),new_ubm_dir)

    
ubm_source="/Volumes/U/AAPB_Corpus_May_2017/PennSound_UBM_for_Creeley_full_clips_16000"
ubm_files=[item for item in os.listdir(ubm_source) if (speaker not in item)&('.wav' in item)]
ubm_files=random.sample(ubm_files,60)
for filename in ubm_files:
    shutil.copy(os.path.join(ubm_source,filename),new_ubm_dir)



In [22]:

## Train Model
import timeit
tic=timeit.default_timer()
#print(timeit.default_timer() - tic)



os.chdir('/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Model_training_clips')


aT.featureAndTrain([new_ubm_dir,speaker], 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, "svm", "svm"+model_id, False)
print("done")
print(timeit.default_timer() - tic)



Feature extraction complexity ratio: 41.4 x realtime
Feature extraction complexity ratio: 44.1 x realtime
Number of training experiments changed to 50 due to high number of samples
Number of training experiments changed to 10 due to high number of samples
Param = 0.00100 - Classifier Evaluation Experiment 1 of 10
Param = 0.00100 - Classifier Evaluation Experiment 2 of 10
Param = 0.00100 - Classifier Evaluation Experiment 3 of 10
Param = 0.00100 - Classifier Evaluation Experiment 4 of 10
Param = 0.00100 - Classifier Evaluation Experiment 5 of 10
Param = 0.00100 - Classifier Evaluation Experiment 6 of 10
Param = 0.00100 - Classifier Evaluation Experiment 7 of 10
Param = 0.00100 - Classifier Evaluation Experiment 8 of 10
Param = 0.00100 - Classifier Evaluation Experiment 9 of 10
Param = 0.00100 - Classifier Evaluation Experiment 10 of 10
Param = 0.01000 - Classifier Evaluation Experiment 1 of 10
Param = 0.01000 - Classifier Evaluation Experiment 2 of 10
Param = 0.01000 - Classifier Evalua

In [23]:
print(timeit.default_timer() - tic)/60.0

13.3024674177


In [24]:
## Other classifiers ##

#os.chdir('/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Model_training_clips')

#tic=timeit.default_timer()
#aT.featureAndTrain([new_ubm_dir,speaker], 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, "gradientboosting", "gradientboosting"+model_id, False)
#print("done")
#print(timeit.default_timer() - tic)
#tic=timeit.default_timer()
#aT.featureAndTrain([new_ubm_dir,speaker], 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, "extratrees", "extratrees"+model_id, False)
#print("done")
#print(timeit.default_timer() - tic)
#tic=timeit.default_timer()
#aT.featureAndTrain([new_ubm_dir,speaker], 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, "randomforest", "randomforest"+model_id, False)
#print("done")
#print(timeit.default_timer() - tic)
#tic=timeit.default_timer()
#aT.featureAndTrain([new_ubm_dir,speaker], 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, "knn", "knn"+model_id, False)
#print("done")
#print(timeit.default_timer() - tic)

In [25]:
print(timeit.default_timer() - tic)/60.0

13.3077500502


In [26]:
# Takes list of 1-second segments classified as speaker (1.0) or 
# non-speaker (0.0) and returns list of 2-tuples specifying speaker ranges.
# Add 1 to 2nd speaker in each 2-tuple for inclusive time span.
def seconds_list_to_ranges(seconds_list): 
    ranges = []                
    for k, g in groupby(enumerate(seconds_list), lambda (i,x):i-x):
        group = map(itemgetter(1), g)
        ranges.append((group[0], group[-1]))
    return ranges


# Displays specified audio segment in Jupyter window using IPython.display
def display_clip(wav_path,start_time,end_time):
    track_data = AudioSegment.from_wav(file=wav_path)
    track_data = track_data.set_channels(1)
    bit_depth = track_data.sample_width * 8
    clip_data = track_data[int(1000*start_time):int(1000*end_time)]
    array_type = get_array_type(bit_depth)
    numeric_array = array.array(array_type, clip_data._data)
    display(Audio(numeric_array, rate=track_data.frame_rate))

# Classifies audio at 1-second resolution, plots results if speaker found, 
# and returns speaker ranges as list of 2-tuples.
# Add 1 to 2nd speaker in each 2-tuple for inclusive time span.

def find_speaker(audio_path,classifier_model_path):
    classifier_model_name = classifier_model_path.split('/')[-1]
    classifier_model_type = classifier_model_name.split('_')[0].lower() # assuming model file begins svm_etc
    
    buffer_secs=0
    is_mp3=False
    if audio_path.lower()[-4:] in ['.mp3','.mp4','.wav']:    # Creates a temporary WAV
        is_mp3=True                        # if input is MP3
        random.seed(audio_path)
        wav_path='/var/tmp/'+str(random.random())+'_temp.wav' # Filename for temp WAV is a random float
        subprocess.call(['ffmpeg', '-i', audio_path, '-y', '-ar', '16000', '-ac', '1', '-af', "volume=0.99,highpass=f=150, lowpass=f=5000",  wav_path]) # '-y' option overwrites existing file if present
    else:
        wav_path=audio_path
    print(wav_path)
    output, classesAll, acc, CM = aS.mtFileClassification(wav_path, classifier_model_path, classifier_model_type) #or replace with 'svm' etc. as needed
    output = list(output)
    counter=0
    speaker_secs=[]
    for speaker in output:
        if speaker>0.0:
            speaker_secs.append(counter)
        counter+=1
    speaker_ranges=seconds_list_to_ranges(speaker_secs)
    #if len(speaker_ranges)>0:
    #    print speaker_ranges
    #    print '\n'
    #    pd.Series(output).plot()                      # uncomment to display plot and audio clips in notebook
    #    plt.show()
    #for pair in speaker_ranges:
    #    print pair
    #    display_clip(wav_path,pair[0],pair[1]+1)
    if is_mp3==True:
        os.remove(wav_path)
    outputfile=audio_path.split('/')[-1][:-4]+"_"+classifier_model_name+".csv"
    with open(outputfile, 'w') as csv_fo:
        speaker_ranges_expanded=[(start,1,end-start+1) for start,end in speaker_ranges]
        csv_writer = csv.writer(csv_fo)
        csv_writer.writerows(speaker_ranges_expanded)
    print(outputfile)
    return speaker_ranges_expanded

In [27]:
test_dir="/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/YouTube_corpus/"+speaker

test_dir="/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill"

test_files=[os.path.join(test_dir,item) for item in os.listdir(test_dir) \
            if (".DS_Store" not in item)&(item[-4:].lower() in ('.mp4','.mp3','.wav'))]

print(test_files)

['/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-503-j96057dh5p__NHPR95198.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-503-s17sn01t72__NHPR95200.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-293-br8mc8rr8k__HUT00000045001_.h264.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-111-53wsv001.h264.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-111-53wstzp5.h264.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-189-12m6402b.h264.16000.wav', '/Volumes/U/AAPB_Corpus_May_2017/AAPB_Test_Haystack_Clinton_Bill/cpb-aacip-111-569325x7.h264.16000.wav']


In [28]:
classifier_model_paths=["/Volumes/McLaughlin-6TB-1/Dropbox/aapb-hipstas/Model_training_clips/"+"svm"+model_id]

In [29]:
# Example speaker search
import timeit
tic=timeit.default_timer()
#print(timeit.default_timer() - tic)

#audio_path=test_files[2]

for classifier_model_path in classifier_model_paths:
    for audio_path in test_files:
        try: 
            dd=find_speaker(audio_path,classifier_model_path)
            print(timeit.default_timer() - tic)
        except: pass

/var/tmp/0.0705723580741_temp.wav
cpb-aacip-503-j96057dh5p__NHPR95198.16000_svm_Clinton_Bill_UBM_2684x2s.csv
32.6739301682
/var/tmp/0.148064958265_temp.wav
cpb-aacip-503-s17sn01t72__NHPR95200.16000_svm_Clinton_Bill_UBM_2684x2s.csv
996.240266085
/var/tmp/0.963435849973_temp.wav
cpb-aacip-293-br8mc8rr8k__HUT00000045001_.h264.16000_svm_Clinton_Bill_UBM_2684x2s.csv
1141.51437998
/var/tmp/0.914327820858_temp.wav
cpb-aacip-111-53wsv001.h264.16000_svm_Clinton_Bill_UBM_2684x2s.csv
1596.13094616
/var/tmp/0.455338151996_temp.wav
cpb-aacip-111-53wstzp5.h264.16000_svm_Clinton_Bill_UBM_2684x2s.csv
1674.11130118
/var/tmp/0.357730644585_temp.wav
cpb-aacip-189-12m6402b.h264.16000_svm_Clinton_Bill_UBM_2684x2s.csv
1681.94317317
/var/tmp/0.679796493049_temp.wav
cpb-aacip-111-569325x7.h264.16000_svm_Clinton_Bill_UBM_2684x2s.csv
1750.07323813


In [30]:
print(timeit.default_timer() - tic)

1750.08189106
