In [None]:
%%capture
# Install the latest version of attk (if necessary)
#!pip install -U git+https://github.com/hipstas/audio-tagging-toolkit.git

In [None]:
import attk
import os
import numpy as np
import librosa
from IPython.display import display, Audio
import timeit
import random
from itertools import groupby
from operator import itemgetter
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
import subprocess
import unicodecsv
import urllib2

## Working directories. Edit PROJECT_DIR to match where the repository is mounted.
PROJECT_DIR = '/sharedfolder/GitHub/sida/'
TRAINING_AUDIO_DIR = os.path.join(PROJECT_DIR, '___training_audio')

os.chdir(PROJECT_DIR)

## Create the training-audio directory only when it is missing, so that re-running
## this cell does not produce a "File exists" error the way a plain mkdir does.
if not os.path.isdir(TRAINING_AUDIO_DIR):
    os.makedirs(TRAINING_AUDIO_DIR)

## Relative paths in later cells resolve against this directory.
os.chdir(TRAINING_AUDIO_DIR)

## Uncomment lines below to download audio files for training

#!wget http://www.stephenmclaughlin.net/HILT/audio_corpora/NPR_Fresh_Air_diarized.zip
#!unzip NPR_Fresh_Air_diarized.zip

In [None]:
## Loading label data CSV as a list of lists

csv_url = "https://raw.githubusercontent.com/hipstas/aapb-labels/master/Terry_Gross/Terry_Gross_labels.csv"

## NOTE: despite its name, csv_string is the file-like HTTP response returned by
## urlopen, not a string. The name is kept in case other cells refer to it.
csv_string = urllib2.urlopen(csv_url)

train_table = []

try:
    csv_reader = unicodecsv.reader(csv_string)
    ## One list per CSV row; everything is read before the connection is closed.
    for row in csv_reader:
        train_table.append(row)
finally:
    ## Release the HTTP connection even if reading or parsing fails.
    csv_string.close()

## Preview of the first rows.
train_table[:10]+['...']

In [None]:
## Removing header row if present

## The emptiness check keeps an empty table from raising IndexError on train_table[0].
if train_table and 'Media file basename' in train_table[0]:
    train_table = train_table[1:]

## Shuffle with a dedicated, seeded generator so the row order is the same on every
## run and the module-level `random` state is not consumed by this cell.
SHUFFLE_SEED = 0
random.Random(SHUFFLE_SEED).shuffle(train_table)

In [None]:
%%capture
## Excerpting labeled WAV clips

training_audio_pathname = "NPR_Fresh_Air_diarized"
out_dir = '_classes_' + training_audio_pathname

## Rows that could not be processed, kept as (row, exception) pairs. This cell runs
## under %%capture, which hides the printed diagnostics, so this list is the way to
## inspect failures after the cell has finished.
excerpt_failures = []

for row in train_table:
    try:
        basename , start, duration, class_name, labeled_by = row  ## Assigning values in row to variables
        filename = str(basename + '.mp3')
        start = float(start)
        end = start + float(duration)
        ## One output directory per class label, with spaces replaced by underscores.
        out_pathname = str(os.path.join(out_dir, class_name.replace(' ','_')))
        ## A failure to create the directory is reported with the row below instead
        ## of being silently ignored.
        if not os.path.isdir(out_pathname):
            os.makedirs(out_pathname)
        attk.subclip(os.path.join(training_audio_pathname, filename), start, end, out_pathname) ## <- attk
    except Exception as e:
        excerpt_failures.append((row, e))
        print(row)
        print(e)



In [None]:

def extract_pairs(media_path, vowel_ranges, min_frames=4, hop_length=512, sample_rate=44100.0):
    """Write one WAV clip per sufficiently long vowel range found in `media_path`.

    Clips are written into `_vowel_clips` relative to the current working directory
    and are named `<basename>_<start>_<duration>.wav`, with start and duration in
    seconds.

    Parameters
    ----------
    media_path : path of the audio file to cut clips from.
    vowel_ranges : iterable of (start, duration) pairs counted in analysis frames.
        NOTE(review): presumably the pairs produced by attk.labels_to_ranges - confirm.
    min_frames : ranges whose duration is below this many frames are skipped.
    hop_length, sample_rate : one frame is converted to hop_length / sample_rate
        seconds. NOTE(review): assumes attk's frame hop matches these defaults - confirm.
    """
    snd = AudioFileClip.AudioFileClip(media_path)
    file_duration = attk.duration(media_path)
    seconds_per_frame = hop_length / float(sample_rate)
    # Loop-invariant: file name without directory and extension.
    basename = os.path.splitext(os.path.basename(media_path))[0]
    for pair in vowel_ranges:
        if int(pair[1]) < min_frames:
            continue
        start, duration = pair
        start = float(start) * seconds_per_frame
        duration = float(duration) * seconds_per_frame
        # Clamp clips that would run past the end of the file.
        if start + duration > file_duration:
            duration = file_duration - start
        # A range starting at or after the end of the file leaves nothing to write;
        # skip it rather than asking moviepy for an empty or negative-length clip.
        if duration <= 0:
            continue
        out_filename = basename + '_' + str(start) + '_' + str(duration) + '.wav'
        snd.subclip(start, start + duration).write_audiofile(os.path.join('_vowel_clips', out_filename))


def batch_extract_vowels(media_dir):
    """Extract vowel clips from every .mp3/.wav file directly inside `media_dir`.

    The working directory is switched to `media_dir` for the duration of the call and
    is restored afterwards, including when the call is interrupted. A `_vowel_clips`
    sub-directory is created if it is missing. A file that fails is reported on stdout
    and skipped; the elapsed time is printed at the end.
    """
    starting_location = os.getcwd()

    os.chdir(media_dir)

    try:
        try:
            os.mkdir('_vowel_clips')
        except OSError:
            # Usually "already exists". If the directory truly cannot be created,
            # each file is reported by the loop below when its clips cannot be written.
            pass

        filenames = [item for item in os.listdir('./') if item[-4:].lower() in ('.mp3', '.wav')]

        tic = timeit.default_timer()

        for filename in filenames:
            try:
                vowel_bools = attk.get_vowel_segments(filename)
                vowel_ranges = attk.labels_to_ranges(vowel_bools, label=True)
                extract_pairs(filename, vowel_ranges)
            # `Exception` rather than a bare except, so that KeyboardInterrupt and
            # SystemExit still stop the batch instead of being logged as a file error.
            except Exception as e:
                print("***** ERROR: "+filename)
                print(e)

        print("Time elapsed: "+str(timeit.default_timer() - tic))
    finally:
        # Always return to where the caller was, even if something above raised.
        os.chdir(starting_location)


In [None]:
%%capture

## Run vowel extraction for each class directory named below.
## (To process every sub-directory of the current directory instead, iterate over
##  [item for item in os.listdir('./') if os.path.isdir(item)].)
## NOTE(review): these names are resolved against the current working directory,
## while the excerpting cell passes out_dir/<class> to attk.subclip - confirm that
## the two locations line up.

for class_dir_name in ('Terry_Gross', 'Background_Speaker'):
    batch_extract_vowels(class_dir_name)

In [None]:


def classify_clip(clip_pathname, model=None):
    """Return the mean predicted class over the vowel frames of one audio clip.

    Parameters
    ----------
    clip_pathname : path of the clip to classify.
    model : fitted estimator exposing predict(); defaults to the notebook-level
        `classifier`. (`random_forest` is only assigned in a commented-out cell, so
        it cannot be relied on after a kernel restart.)

    Returns
    -------
    Mean of the per-frame predictions restricted to frames flagged as vowels, or NaN
    when the clip has no vowel frames.
    """
    if model is None:
        model = classifier
    ## Both helpers are reached through the attk module, the only form this notebook imports.
    mfccs = attk.get_mfccs_and_deltas(clip_pathname)
    results = model.predict(mfccs)  ## Predicting new observation
    ## Every second vowel flag is kept.
    ## NOTE(review): presumably this matches the flags to the MFCC frame rate - confirm.
    vowel_bools = attk.get_vowel_segments(clip_pathname)[::2]

    ## zip stops at the shorter sequence, so a length mismatch cannot raise IndexError.
    vowel_results = [result for result, is_vowel in zip(results, vowel_bools) if is_vowel]

    if not vowel_results:
        return np.nan
    return np.mean(vowel_results) ## Vowels only


def seconds_list_to_ranges(seconds_list):
    """Collapse a list of integers into inclusive (first, last) runs.

    Neighbouring values that each exceed the previous one by exactly 1 are merged
    into a single pair; an isolated value n becomes (n, n). An empty list gives [].
    """
    ranges = []
    ## Within a run of consecutive values, position minus value is constant, so
    ## grouping on that difference splits the list into runs. The key takes the
    ## (index, value) pair as one argument, which is valid on Python 2 and 3.
    for _key, run in groupby(enumerate(seconds_list), lambda pair: pair[0] - pair[1]):
        ## Materialise the values so the first and last can be indexed.
        group = [value for _index, value in run]
        ranges.append((group[0], group[-1]))
    return ranges

seconds_list_to_ranges([1,2,3,7,8,9,34,99,100,101,102,199])

In [None]:
## Extracting features

def _collect_mfccs(clip_dir):
    """Return the feature rows of every clip in `clip_dir` whose name contains '.wav'."""
    rows = []
    ## Sorted so the row order, and therefore the later positional train/test split,
    ## does not depend on the arbitrary order in which os.listdir returns names.
    for filename in sorted(os.listdir(clip_dir)):
        if '.wav' in filename:
            rows += attk.get_mfccs_and_deltas(os.path.join(clip_dir, filename))
    return rows


tic=timeit.default_timer()

speaker_1_mfccs = _collect_mfccs('_classes/Carol_Hills/_vowel_clips')

print(timeit.default_timer() - tic)

tic=timeit.default_timer()

##

bg_mfccs = _collect_mfccs('_classes/Background_Speaker/_vowel_clips')

print(timeit.default_timer() - tic)



In [None]:
## Extra-trees classifier

#from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

def _holdout_split(rows, holdout_divisor=10):
    """Split `rows` into (train, test), with the last tenth (rounded up) as test."""
    ## Written with floor division on the negated length so the result is an int on
    ## both Python 2 and 3 and equals the old `-len(rows)/10` slice bound under
    ## Python 2 (which rounds the test size up).
    n_test = -((-len(rows)) // holdout_divisor)
    cut = len(rows) - n_test
    return rows[:cut], rows[cut:]


tic=timeit.default_timer()

speaker_1_train, speaker_1_test = _holdout_split(speaker_1_mfccs)
bg_train, bg_test = _holdout_split(bg_mfccs)

## Label 0 marks rows from speaker_1_mfccs, label 1 rows from bg_mfccs.
X = speaker_1_train + bg_train
y = [0]*len(speaker_1_train) + [1]*len(bg_train)

X_test = speaker_1_test + bg_test
y_test = [0]*len(speaker_1_test) + [1]*len(bg_test)

## NOTE(review): no random_state is passed, so the fitted model differs between runs.
classifier = ExtraTreesClassifier().fit(X, y)

## Saving trained model
joblib.dump(classifier,'hills_vowels_extratrees_2048.pkl')
## Reloading straight away checks that the saved file can be read back.
classifier=joblib.load('hills_vowels_extratrees_2048.pkl')

print(timeit.default_timer() - tic)

In [None]:
## Score of the fitted classifier on the held-out rows (X_test, y_test).
classifier.score(X_test,y_test)

In [None]:
## Loading pre-trained model

#from sklearn.ensemble import RandomForestClassifier

#random_forest=joblib.load('pesca_vowels_random_forest_2048.pkl')

In [None]:

def classify_clip(clip_pathname, model=None):
    """Return the mean predicted class over the vowel frames of one audio clip.

    Parameters
    ----------
    clip_pathname : path of the clip to classify.
    model : fitted estimator exposing predict(); defaults to the notebook-level
        `classifier`. (`random_forest` is only assigned in a commented-out cell, so
        it cannot be relied on after a kernel restart.)

    Returns
    -------
    Mean of the per-frame predictions restricted to frames flagged as vowels, or NaN
    when the clip has no vowel frames.
    """
    if model is None:
        model = classifier
    ## Both helpers are reached through the attk module, the only form this notebook imports.
    mfccs = attk.get_mfccs_and_deltas(clip_pathname)
    results = model.predict(mfccs)  ## Predicting new observation
    ## Every second vowel flag is kept.
    ## NOTE(review): presumably this matches the flags to the MFCC frame rate - confirm.
    vowel_bools = attk.get_vowel_segments(clip_pathname)[::2]

    ## zip stops at the shorter sequence, so a length mismatch cannot raise IndexError.
    vowel_results = [result for result, is_vowel in zip(results, vowel_bools) if is_vowel]

    if not vowel_results:
        return np.nan
    return np.mean(vowel_results) ## Vowels only


In [None]:
## Classifying short clips


tic=timeit.default_timer()

## Pick one clip at random from the unseen/ directory.
filename = random.choice(os.listdir('unseen/'))
test_pathname = 'unseen/'+filename
test_mfccs=attk.get_mfccs_and_deltas(test_pathname)

print(test_pathname)

results = classifier.predict(test_mfccs)  ## Predicting new observation

print(results)

## get_vowel_segments is reached through the attk module: this notebook only runs
## `import attk`, so the bare function name is not available on a fresh kernel.
vowel_bools = attk.get_vowel_segments(test_pathname)[::2]

## Keep the predictions of vowel frames only; zip stops at the shorter sequence, so
## a length mismatch cannot raise IndexError.
vowel_results = [result for result, is_vowel in zip(results, vowel_bools) if is_vowel]

display(Audio(test_pathname))


print("All: "+str(np.mean(results)))
print("Vowels only: "+str(np.mean(vowel_results)))

#print("Time elapsed: "+str(timeit.default_timer() - tic))

In [None]:
## Number of vowel flags returned for the clip chosen above. The helper is called
## through attk because this notebook only runs `import attk`.
len(attk.get_vowel_segments(test_pathname))

In [None]:
%%capture

## Classifying a long audio file


tic=timeit.default_timer()


media_path = "/sharedfolder/3_training_classes/unseen_full_episodes/SM5931850435.mp3"


snd = AudioFileClip.AudioFileClip(media_path)

classifications=[]
## Start offsets (in seconds) of the one-second clips that could not be classified.
## This cell runs under %%capture, which hides the printed messages, so this list is
## how skipped seconds can be inspected afterwards.
missed_seconds=[]

## attk.duration is the duration helper already used by extract_pairs; the bare name
## media_duration is not defined in the visible part of this notebook.
for i in range(int(attk.duration(media_path))):
    try:
        ## Each one-second window is written to a scratch file and classified from there.
        snd.subclip(i,i+1).write_audiofile('/tmp/temp_clip.wav')
        classifications.append(classify_clip('/tmp/temp_clip.wav'))
    ## Exception (not a bare except) so that interrupting the kernel stops the loop.
    except Exception as e:
        ## NOTE(review): nothing is appended for a missed second, so from that point
        ## on the list index no longer equals the second offset - confirm this is
        ## acceptable for the CSV written later.
        missed_seconds.append(i)
        print('missed one')
        print(e)

In [None]:
## Seconds elapsed since `tic` was last set.
print(timeit.default_timer() - tic)

In [None]:
# Writing classification output to CSV

## Smoothed scores below CLASS_0_MAX are assigned to class 0 and scores above
## CLASS_1_MIN to class 1; scores in between are left unlabelled.
CLASS_0_MAX = 0.34
CLASS_1_MIN = 0.38

class_0_secs=[]
class_1_secs=[]

## NOTE(review): `smooth` is not defined in the visible part of this notebook -
## confirm where it comes from.
for i, classification in enumerate(smooth(np.array(classifications))):
    if classification < CLASS_0_MAX:
        class_0_secs.append(i)
    if classification > CLASS_1_MIN:
        class_1_secs.append(i)

csv_path=media_path[:-4]+'_extratrees2048_labels.csv'

## One row per run of consecutive indices: first index, last index, label.
## NOTE(review): the class-0 label text is 'Pesca' while the model file saved earlier
## is named hills_vowels_extratrees_2048.pkl - confirm the label is the intended one.
with open(csv_path,'w') as fo:
    for pair in seconds_list_to_ranges(class_0_secs):
        fo.write(str(float(pair[0]))+','+str(float(pair[1]))+',Pesca\n')
    for pair in seconds_list_to_ranges(class_1_secs):
        fo.write(str(float(pair[0]))+','+str(float(pair[1]))+',Background\n')