In [2]:
import os
import numpy as np
import librosa
from IPython.display import display, Audio
import timeit
import random
from itertools import groupby
from operator import itemgetter
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
import subprocess

os.chdir('/home/sharedfolder')

In [3]:
def get_mfccs(wav_pathname):
    sample_array, sample_rate = librosa.load(wav_pathname)
    mfcc_frames = librosa.feature.mfcc(sample_array, sample_rate, hop_length=2048, n_mfcc=13).T
    mfcc_frames_sans_0th = [frame_values[1:] for frame_values in mfcc_frames]
    return mfcc_frames_sans_0th

def get_mfccs_and_deltas(wav_pathname):
    sample_array, sample_rate = librosa.load(wav_pathname)
    mfcc = librosa.feature.mfcc(sample_array, sample_rate, hop_length=2048, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    mfcc=mfcc.T     ### Transposing tables
    delta=delta.T   ## (We can instead set the axis above to do this without the extra step)
    delta2=delta2.T
    mfcc_sans_0th = [frame_values[1:] for frame_values in mfcc]
    all_features=[]
    for i in range(len(mfcc)):
        all_features.append(list(mfcc_sans_0th[i])+list(delta[i])+list(delta2[i]))
    return all_features

def get_vowel_segments(media_path):
    downsample = 1
    samplerate = 44100 // downsample

    win_s = 2048 // downsample # fft size
    hop_s = 2048  // downsample # hop size

    s = source(media_path, samplerate, hop_s)
    samplerate = s.samplerate

    tolerance = 0.6

    pitch_o = pitch("yin", win_s, hop_s, samplerate)
    pitch_o.set_unit("Hz")
    pitch_o.set_tolerance(tolerance)

    pitches = []
    confidences = []

    # total number of frames read
    total_frames = 0
    samples=[]
    pitches=[]
    while True:
        samples, read = s()
        pitch_ = pitch_o(samples)[0]
        #pitch = int(round(pitch))
        confidence = pitch_o.get_confidence()
        #print("%f %f %f" % (total_frames / float(samplerate), pitch, confidence))
        pitches += [pitch_]
        confidences += [confidence]
        total_frames += read
        if read < hop_s: break

    pitches = np.array(pitches)
    confidences = np.array(confidences)

    cleaned_pitches = ma.masked_where(confidences < tolerance, pitches)
    cleaned_pitches = ma.masked_where(cleaned_pitches > 1000, cleaned_pitches)
    return list(np.logical_not(cleaned_pitches.mask))


def media_duration(media_path):
    return float(subprocess.check_output(['ffprobe', '-v', 'quiet', '-of', 'csv=p=0', '-show_entries', 'format=duration', media_path]).strip())


def smooth(x,window_len=10,window='hanning'):
        if x.ndim != 1:
                raise ValueError, "smooth only accepts 1 dimension arrays."
        if x.size < window_len:
                raise ValueError, "Input vector needs to be bigger than window size."
        if window_len<3:
                return x
        if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
                raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
        s=np.r_[2*x[0]-x[window_len-1::-1],x,2*x[-1]-x[-1:-window_len:-1]]
        if window == 'flat': #moving average
                w=np.ones(window_len,'d')
        else:  
                w=eval('np.'+window+'(window_len)')
        y=np.convolve(w/w.sum(),s,mode='same')
        return y[window_len:-window_len+1]



def classify_clip(clip_pathname):
    mfccs=get_mfccs_and_deltas(clip_pathname)
    results = random_forest.predict(mfccs)  ## Predicting new observation
    vowel_results=[]
    vowel_bools = get_vowel_segments(clip_pathname)[::2]

    for i in range(len(results)):
        if vowel_bools[i]==True:
            vowel_results.append(results[i])

    return np.mean(vowel_results) ## Vowels only


def seconds_list_to_ranges(seconds_list): 
    ranges = []                
    for k, g in groupby(enumerate(seconds_list), lambda (i,x):i-x):
        group = map(itemgetter(1), g)
        ranges.append((group[0], group[-1]))
    return ranges

seconds_list_to_ranges([1,2,3,7,8,9,34,99,100,101,102,199])

[(1, 3), (7, 9), (34, 34), (99, 102), (199, 199)]

In [4]:
## Extracting features
tic=timeit.default_timer()

pesca_mfccs = []

for filename in os.listdir('3_training_classes/Mike_Pesca/_vowel_clips'):
    if '.wav' in filename:
        pesca_mfccs += get_mfccs_and_deltas('3_training_classes/Mike_Pesca/_vowel_clips/'+filename)

print(timeit.default_timer() - tic)

tic=timeit.default_timer()

##
bg_male_mfccs = []

for filename in os.listdir('3_training_classes/Background_male/_vowel_clips'):
    if '.wav' in filename:
        bg_male_mfccs += get_mfccs_and_deltas('3_training_classes/Background_male/_vowel_clips/'+filename)

print(timeit.default_timer() - tic)

##
tic=timeit.default_timer()

bg_female_mfccs = []

for filename in os.listdir('3_training_classes/Background_female/_vowel_clips'):
    if '.wav' in filename:
        bg_female_mfccs += get_mfccs_and_deltas('3_training_classes/Background_female/_vowel_clips/'+filename)

print(timeit.default_timer() - tic)

70.8567268848
40.5347089767
15.2988278866


In [184]:
## Decision tree

#from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

tic=timeit.default_timer()

X = pesca_mfccs[:-len(pesca_mfccs)/10] + bg_male_mfccs[:-len(bg_male_mfccs)/10] + bg_female_mfccs[:-len(bg_female_mfccs)/10]
y = [0]*len(pesca_mfccs[:-len(pesca_mfccs)/10]) + [1]*len(bg_male_mfccs[:-len(bg_male_mfccs)/10]) + [1]*len(bg_female_mfccs[:-len(bg_female_mfccs)/10])

X_test = pesca_mfccs[-len(pesca_mfccs)/10:] + bg_male_mfccs[-len(bg_male_mfccs)/10:] + bg_female_mfccs[:-len(bg_female_mfccs)/10:]
y_test = [0]*len(pesca_mfccs[-len(pesca_mfccs)/10:]) + [1]*len(bg_male_mfccs[-len(bg_male_mfccs)/10:]) + [1]*len(bg_female_mfccs[:-len(bg_female_mfccs)/10:])

classifier = ExtraTreesClassifier().fit(X, y)

## Saving trained model
joblib.dump(classifier,'pesca_vowels_extratrees_2048.pkl')
classifier=joblib.load('pesca_vowels_extratrees_2048.pkl')

print(timeit.default_timer() - tic)

0.45753121376


In [185]:
classifier.score(X_test,y_test)

0.85217940619077703

In [215]:
import tflearn

ImportError: No module named tensorflow

In [186]:
## Loading pre-trained model

#from sklearn.ensemble import RandomForestClassifier

#random_forest=joblib.load('pesca_vowels_random_forest_2048.pkl')

In [187]:

def classify_clip(clip_pathname):
    mfccs=get_mfccs_and_deltas(clip_pathname)
    results = random_forest.predict(mfccs)  ## Predicting new observation
    vowel_results=[]
    vowel_bools = get_vowel_segments(clip_pathname)[::2]

    for i in range(len(results)):
        if vowel_bools[i]==True:
            vowel_results.append(results[i])

    return np.mean(vowel_results) ## Vowels only


In [211]:
tic=timeit.default_timer()

filename = random.choice(os.listdir('3_training_classes/unseen/'))
test_pathname = '3_training_classes/unseen/'+filename
test_mfccs=get_mfccs_and_deltas(test_pathname)

print(test_pathname)

results = classifier.predict(test_mfccs)  ## Predicting new observation

print(results)


vowel_results=[]

vowel_bools = get_vowel_segments(test_pathname)[::2]

for i in range(len(results)):
    if vowel_bools[i]==True:
        vowel_results.append(results[i])

display(Audio(test_pathname))


print("All: "+str(np.mean(results)))
print("Vowels only: "+str(np.mean(vowel_results)))

#print("Time elapsed: "+str(timeit.default_timer() - tic))

3_training_classes/unseen/SM4892127271.mp3.0408_clip.wav
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0]


All: 0.212121212121
Vowels only: 0.190476190476


In [11]:
len(get_vowel_segments(test_pathname))

65

In [212]:
%%capture

## Classifying a long audio file


tic=timeit.default_timer()


media_path = "/home/sharedfolder/3_training_classes/unseen_full_episodes/SM5931850435.mp3"


snd = AudioFileClip.AudioFileClip(media_path)

classifications=[]

for i in range(int(media_duration(media_path))):
    try:
        snd.subclip(i,i+1).write_audiofile('/tmp/temp_clip.wav')
        classifications.append(classify_clip('/tmp/temp_clip.wav'))
    except: print('missed one')

In [213]:
print(timeit.default_timer() - tic)

403.742562056


In [214]:
# Writing classification output to CSV

counter=0

class_0_secs=[]
class_1_secs=[]

i=0

for classification in smooth(np.array(classifications)):
    if classification < 0.34:
        class_0_secs.append(i)
    if classification > 0.38:
        class_1_secs.append(i)
    i+=1


counter=0

csv_path=media_path[:-4]+'_extratrees2048_labels.csv'

with open(csv_path,'w') as fo:
    for pair in seconds_list_to_ranges(class_0_secs):
        fo.write(str(float(pair[0]))+','+str(float(pair[1]))+',Pesca\n')
    for pair in seconds_list_to_ranges(class_1_secs):
        fo.write(str(float(pair[0]))+','+str(float(pair[1]))+',Background\n')