In [None]:
%%capture
!pip install -U git+git://github.com/hipstas/audio-tagging-toolkit

In [None]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import unicodecsv
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio

## Start in the training-set directory; later cells change directory again as needed.
os.chdir('/sharedfolder/sida_classifier/training_set/')

In [None]:
## Load saved features

def load_features(dir_path):
    """Read every CSV file directly inside ``dir_path`` into one list of rows.

    Each non-empty CSV row becomes a list of floats. Rows from all files are
    concatenated into a single flat list.

    Parameters
    ----------
    dir_path : str
        Directory whose regular files are all parsed as CSV.

    Returns
    -------
    list of list of float
        One inner list per non-empty CSV row; ``[]`` if the directory holds
        no rows.

    Raises
    ------
    OSError
        If ``dir_path`` cannot be listed or a file cannot be opened.
    ValueError
        If any cell cannot be converted with ``float()``.
    """
    features = []
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore any positional train/test split) reproducible.
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        # Sub-directories cannot be opened as files, so skip them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as fi:
            for row in csv.reader(fi):
                # csv.reader yields [] for blank lines; an empty feature
                # vector would break model fitting downstream.
                if not row:
                    continue
                features.append([float(item) for item in row])
    return features

## Source directories, one per feature set
gross_dir = '/sharedfolder/sida_classifier/training_set/_classes_NPR_Fresh_Air_20_episodes/Terry_Gross/_vowel_mfccs_and_deltas'
## NOTE(review): this is the same directory as gross_dir, so the "UBM" set
## duplicates the target-speaker frames -- confirm the intended path.
fresh_air_ubm_dir = '/sharedfolder/sida_classifier/training_set/_classes_NPR_Fresh_Air_20_episodes/Terry_Gross/_vowel_mfccs_and_deltas'
aapb_male_dir = '/sharedfolder/sida_classifier/AAPB_male_vowel_mfccs_and_deltas'
aapb_female_dir = '/sharedfolder/sida_classifier/AAPB_female_vowel_mfccs_and_deltas'

## Load each set and report how many frames it contains
gross_features = load_features(gross_dir)
print(len(gross_features))

fresh_air_ubm_features = load_features(fresh_air_ubm_dir)
print(len(fresh_air_ubm_features))

m_ubm_features = load_features(aapb_male_dir)
print(len(m_ubm_features))

f_ubm_features = load_features(aapb_female_dir)
print(len(f_ubm_features))

In [None]:
## Show the MFCCs and deltas of one randomly chosen frame

sample_frame = random.choice(gross_features)
print(sample_frame)

In [None]:
## Assemble the two classes: target-speaker frames and pooled UBM frames

speaker_1_mfccs = gross_features

ubm_mfccs = []
for feature_set in (fresh_air_ubm_features, m_ubm_features, f_ubm_features):
    ubm_mfccs.extend(feature_set)

## Frame counts per class
for class_mfccs in (speaker_1_mfccs, ubm_mfccs):
    print(len(class_mfccs))

In [None]:
## Training and evaluating a simple multi-layer perceptron model
## (the final ~10% of each class is held out for scoring)

os.chdir('/sharedfolder/sida_classifier/')

from sklearn.neural_network import MLPClassifier
#from sklearn.ensemble import ExtraTreesClassifier

## Negative slice offsets marking where each class's hold-out tail begins.
## Floor division (//) keeps them integers on both Python 2 and Python 3;
## plain "/" produces a float on Python 3, which is not a valid slice index.
speaker_split = -len(speaker_1_mfccs) // 10
ubm_split = -len(ubm_mfccs) // 10

speaker_train = speaker_1_mfccs[:speaker_split]
speaker_test = speaker_1_mfccs[speaker_split:]
ubm_train = ubm_mfccs[:ubm_split]
ubm_test = ubm_mfccs[ubm_split:]

## Label 1 = target speaker, label 0 = UBM
X = speaker_train + ubm_train
y = [1]*len(speaker_train) + [0]*len(ubm_train)

X_test = speaker_test + ubm_test
y_test = [1]*len(speaker_test) + [0]*len(ubm_test)

#classifier = ExtraTreesClassifier().fit(X, y)
classifier = MLPClassifier().fit(X, y)

## Accuracy on the held-out frames
print(classifier.score(X_test,y_test))

In [None]:
## Fit an MLP on the complete data set (no hold-out) and persist it to disk

positive_count = len(speaker_1_mfccs)
negative_count = len(ubm_mfccs)

## Label 1 = target speaker, label 0 = UBM
X = speaker_1_mfccs + ubm_mfccs
y = [1]*positive_count + [0]*negative_count

#classifier = ExtraTreesClassifier().fit(X, y)
classifier = MLPClassifier().fit(X, y)

## Save the trained model, then reload it from the pickle
## NOTE(review): the filename says "et" although the model fitted here is an
## MLPClassifier -- confirm the intended name before relying on it.
model_filename = 'Terry_Gross_vowels_et_2048.pkl'
joblib.dump(classifier, model_filename)
classifier = joblib.load(model_filename)

In [None]:
##############################################
#### Start here to load pre-trained model ####
##############################################

os.chdir('/sharedfolder/sida_classifier/')
## Uncomment the line below to load a saved model instead of training one.
## While it stays commented out, `classifier` is only defined by the training cells.
## NOTE(review): the training cell saves 'Terry_Gross_vowels_et_2048.pkl', not the
## filename below -- confirm which pickle is intended.
#classifier = joblib.load('Terry_Gross_vowels_mlpc_2048.pkl')

In [None]:
%%capture
## Download and unzip a set of 358 3-second Fresh Air clips

os.chdir('/sharedfolder/sida_classifier/')
!wget -N https://github.com/hipstas/shaping-humanities-data/blob/master/audio/Fresh_Air_2017-07-31_3-sec_clips.zip?raw=true -O Fresh_Air_2017-07-31_3-sec_clips.zip
!unzip Fresh_Air_2017-07-31_3-sec_clips.zip

In [None]:
## Classifying short clips
#### Repeat this cell several times to help choose a classifier threshold value.

os.chdir('/sharedfolder/sida_classifier/Fresh_Air_2017-07-31_3-sec_clips/')

## Pick one entry of the clip directory at random
clip_name = random.choice(os.listdir('./'))
wav_pathname = os.path.abspath(clip_name)

test_mfccs = attk.get_mfccs_and_deltas(wav_pathname)

print(wav_pathname)

## One prediction per row of test_mfccs
results = classifier.predict(test_mfccs)

print(results)

## Keep only the predictions whose frame is flagged as a vowel
vowel_bools = attk.get_vowel_segments(wav_pathname)
vowel_results = [results[i] for i in range(len(results)) if vowel_bools[i]==True]

display(Audio(wav_pathname))

all_frames_mean = np.mean(results)
vowel_frames_mean = np.mean(vowel_results)
print("All samples: "+str(all_frames_mean))
print("Vowels only: "+str(vowel_frames_mean))

In [None]:
## Function that classifies vowel segments only and returns 
## average output for the full clip

def classify_clip(clip_pathname):
    """Score one audio clip as the mean classifier output over its vowel frames.

    Uses the notebook-level ``classifier`` and the ``attk`` helpers: features
    come from ``attk.get_mfccs_and_deltas``, per-frame predictions from
    ``classifier.predict``, and the vowel mask from
    ``attk.get_vowel_segments``.

    Parameters
    ----------
    clip_pathname : str
        Path of the audio clip to classify.

    Returns
    -------
    float
        Mean of the predictions on frames flagged as vowels, or ``0.0`` when
        no frame is flagged. Without that guard ``np.mean([])`` returns NaN.
        ``0.0`` matches the fallback score the long-file loop records for a
        failed window.

    Raises
    ------
    IndexError
        If the vowel mask is shorter than the prediction sequence.
    """
    mfccs = attk.get_mfccs_and_deltas(clip_pathname)
    results = classifier.predict(mfccs)  ## Predicting new observation
    vowel_bools = attk.get_vowel_segments(clip_pathname)

    # Index the mask explicitly (rather than zip) so a mask shorter than the
    # predictions still raises instead of being silently truncated.
    vowel_results = [label for i, label in enumerate(results) if vowel_bools[i] == True]

    if not vowel_results:
        return 0.0  ## No vowel frames: no evidence for the target speaker

    return np.mean(vowel_results) ## Vowels only

In [None]:
## Vowel-only score for the clip chosen in the short-clip cell
clip_score = classify_clip(wav_pathname)
print(clip_score)

In [None]:
%%capture
## Classifying a long audio file

resolution_secs = 5.0

os.chdir('/sharedfolder/')

!wget -N https://github.com/hipstas/shaping-humanities-data/blob/master/audio/Fresh_Air_2017-07-31.mp3?raw=true -O Fresh_Air_2017-07-31.mp3

import timeit
tic=timeit.default_timer()

media_path = "/sharedfolder/Fresh_Air_2017-07-31.mp3"

snd = AudioFileClip.AudioFileClip(media_path)

classifications = []

for i in range(int(attk.duration(media_path)/resolution_secs)):
    try:
        snd.subclip(i * resolution_secs , (i * resolution_secs) + resolution_secs).write_audiofile('/tmp/temp_clip.wav')
        classifications.append(classify_clip('/tmp/temp_clip.wav'))
    except:
        classifications.append(0.0)
        print("Error: " + str(i))

In [None]:
## Seconds since `tic` was set in the long-file classification cell
elapsed_secs = timeit.default_timer() - tic
print("Time elapsed: "+str(elapsed_secs))

In [None]:
## Writing classification output to CSV

## Smoothed window scores at or above this value are labelled as the target speaker
classifier_threshold = 0.06

## Exactly one 0/1 label per smoothed window. A NaN score fails both "<" and
## ">=", so two separate ifs would emit no label for it and shift every later
## window; a single conditional always emits one label (NaN -> 0).
classifier_output = [
    1 if classification >= classifier_threshold else 0
    for classification in attk.smooth(np.array(classifications))
]

## An earlier '_mlpc2048_labels.csv' assignment was immediately overwritten by
## this one, so only the effective filename is kept.
## NOTE(review): the name says "extratrees" -- confirm it matches the model in use.
csv_path = media_path[:-4]+'_extratrees2048_labels.csv'

with open(csv_path,'w') as fo:
    for pair in attk.labels_to_ranges(classifier_output, label=1):
        ## pair holds (start, end) window indices; scale them to seconds
        start = pair[0] * resolution_secs
        duration = (pair[1] - pair[0]) * resolution_secs
        fo.write(str(start) + ',' + str(start + duration) + ',Terry Gross\n')