# Humanities Research with Sound: Introduction to Audio Machine Learning

**Stephen McLaughlin and Tanya Clement**

**August 7, 2017**

In [1]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import unicodecsv
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio

## Create the working directory for training audio (no error if it already exists)
!mkdir -p /sharedfolder/_training_audio

## All relative paths in the training cells below are resolved against this directory
os.chdir('/sharedfolder/_training_audio/')

## Download audio files for training
## (Commented out; the excerpting cell below reads MP3s from an "NPR_Fresh_Air_diarized"
## directory, which presumably comes from unpacking this archive -- uncomment on first run.)

#!wget -N http://www.stephenmclaughlin.net/HILT/audio_corpora/NPR_Fresh_Air_diarized.zip
#!unzip NPR_Fresh_Air_diarized.zip

In [None]:
## Download new 1-second labels

csv_url = "https://raw.githubusercontent.com/hipstas/aapb-labels/master/Speaker_labels_randomized/Terry_Gross/Terry_Gross_labels.csv"

## Despite the name, this is the open HTTP response (a file-like object), not a string
csv_string = urllib2.urlopen(csv_url)

## Load CSV as list of lists (one inner list per CSV row)
csv_reader = unicodecsv.reader(csv_string)

train_table = []
train_table.extend(csv_reader)

## Preview the first ten rows
train_table[:10]+['...']

In [None]:
## Remove header row (if present)
## The emptiness check avoids an IndexError when the label download produced no rows.

if train_table and 'Media file basename' in train_table[0]:
    train_table = train_table[1:]

In [None]:
%%capture

## Excerpting WAV clips corresponding to labels
## For every label row, cut the labeled span out of the source MP3 and write it
## into a per-class directory under `out_dir`.

training_audio_pathname = "NPR_Fresh_Air_diarized"
out_dir = '_classes_' + training_audio_pathname

for row in train_table:
    try:
        basename, start, duration, class_name, labeled_by = row  ## Assigning values in row to variables
        filename = str(basename + '.mp3')
        start = float(start)
        end = start + float(duration)
        wav_out_pathname = str(os.path.join(out_dir, class_name.replace(' ','_')))
        ## Create the class directory in-process instead of spawning `mkdir` for every row
        if not os.path.isdir(wav_out_pathname):
            os.makedirs(wav_out_pathname)
        attk.subclip(os.path.join(training_audio_pathname, filename), start, end, wav_out_pathname) ## <- attk
    except Exception as e:
        ## Best effort: report the offending row and keep going.
        ## NOTE: this cell runs under %%capture, so these messages are not shown.
        print(row)
        print(e)

In [None]:
## Defining functions we'll use below

# Extract audio segments to WAV for an audio/video pathname and a list of 2-tuple time values
def extract_vowel_pairs(media_path, vowel_time_ranges):
    """Write one WAV file per time range of `media_path` into './_vowel_clips'.

    media_path        -- pathname of an audio/video file readable by moviepy
    vowel_time_ranges -- iterable of (start, end) pairs; values are converted
                         with float() (presumably seconds -- they are passed
                         straight to moviepy's subclip)

    Ranges shorter than 0.1 are skipped. Output files are named
    '<basename>__<start>_<end>.wav' with times rounded to 4 decimals.
    The '_vowel_clips' directory is resolved against the current working
    directory and must already exist.
    """
    snd = AudioFileClip.AudioFileClip(media_path)
    ## Strip directory and extension once, independent of the extension's length
    basename = os.path.splitext(os.path.basename(media_path))[0]
    for pair in vowel_time_ranges:
        start, end = pair
        start = float(start)
        end = float(end)
        if end-start >= 0.1:  ## Ignore clips shorter than 0.1 second
            out_filename = basename+'__'+str(round(start, 4))+'_'+str(round(end, 4))+'.wav'
            snd.subclip(start, end).write_audiofile(os.path.join('_vowel_clips',out_filename))

# Extract vowel segments to WAV for every audio/video file in a given directory
def batch_extract_vowels(media_dir):
    """Extract vowel clips for every .mp3/.wav/.mp4 file in `media_dir`.

    Changes into `media_dir`, creates a '_vowel_clips' subdirectory if needed,
    and for each media file asks attk for its vowel segments, converts the
    resulting bin ranges to time ranges, and writes the clips with
    extract_vowel_pairs(). A failure on one file is reported and does not stop
    the batch. The original working directory is always restored.
    """
    starting_location = os.getcwd()
    os.chdir(media_dir)
    try:
        ## Seconds per analysis bin; numerically equal to 2048/44100
        ## (presumably a 2048-sample bin at 44.1 kHz -- confirm against attk)
        bin_2048_to_sec_constant = 0.046439909297052155
        if not os.path.isdir('_vowel_clips'):
            os.mkdir('_vowel_clips')
        filenames=[item for item in os.listdir('./') if item[-4:].lower() in ('.mp3','.wav','.mp4')]
        for filename in filenames[::-1]:
            try:
                vowel_bools = attk.get_vowel_segments(filename)
                vowel_bin_ranges = attk.labels_to_ranges(vowel_bools, label=True)
                vowel_time_ranges = [(s*bin_2048_to_sec_constant, e*bin_2048_to_sec_constant) for s, e in vowel_bin_ranges]
                ## Previously called the undefined name `extract_pairs`; the resulting
                ## NameError was swallowed, so no clips were ever written.
                extract_vowel_pairs(filename,vowel_time_ranges)
            except Exception as e:
                print("***** ERROR: "+filename)
                print(e)
    finally:
        os.chdir(starting_location)

In [None]:
%%capture

## Extract vowel segments from labeled audio clips
## (one batch per class directory created by the excerpting cell)

os.chdir('/sharedfolder/_training_audio/_classes_NPR_Fresh_Air_diarized')

for class_dir in ('Terry_Gross', 'Background_Speaker'):
    batch_extract_vowels(class_dir)

In [None]:
## List the contents of the current working directory to check the extraction output
!ls

In [None]:
## Extract features from Terry Gross & UBM vowel clips & write CSVs (MFCCs, deltas, and delta-deltas)
## For each class directory that has a '_vowel_clips' subdirectory, compute features
## for every WAV clip and save them to '<class>/_vowel_mfccs_and_deltas/<clip>.mfcc.csv'.
## Clips whose CSV already exists are skipped, so the cell can be re-run safely.

classes_dir = '/sharedfolder/_training_audio/_classes_NPR_Fresh_Air_diarized'
os.chdir(classes_dir)

dir_names = [item for item in os.listdir('./') if os.path.isdir(item)]

for dir_name in dir_names:

    vowel_clip_dir = os.path.join(classes_dir, dir_name, '_vowel_clips')
    if not os.path.isdir(vowel_clip_dir):
        continue  ## Ignoring classes for which we haven't extracted vowels

    os.chdir(vowel_clip_dir)

    if not os.path.isdir('../_vowel_mfccs_and_deltas'):
        os.mkdir('../_vowel_mfccs_and_deltas')

    ## Process every extracted clip (the vowel extraction step writes .wav files)
    for filename in os.listdir('./'):
        if not filename.lower().endswith('.wav'):
            continue
        csv_out_path = '../_vowel_mfccs_and_deltas/' + filename[:-4] + '.mfcc.csv'
        if not os.path.isfile(csv_out_path):
            try:
                mfccs = attk.get_mfccs_and_deltas(filename)
                with open(csv_out_path, 'w') as fo:
                    csv_writer = csv.writer(fo)
                    csv_writer.writerows(mfccs)
            except Exception as e:
                ## Report the failing clip and continue with the rest
                print("ERROR on " + filename)
                print(e)

In [None]:
%%capture

## Download and unzip prepared feature sets

os.chdir('/sharedfolder/')

!wget -N https://raw.githubusercontent.com/hipstas/shaping-humanities-data/master/feature_sets/Terry_Gross_vowel_mfccs_and_deltas.zip
!wget -N https://raw.githubusercontent.com/hipstas/shaping-humanities-data/master/feature_sets/Fresh_Air_ubm_vowel_mfccs_and_deltas.zip
!wget -N https://raw.githubusercontent.com/hipstas/shaping-humanities-data/master/feature_sets/AAPB_female_vowel_mfccs_and_deltas.zip
!wget -N https://raw.githubusercontent.com/hipstas/shaping-humanities-data/master/feature_sets/AAPB_male_vowel_mfccs_and_deltas.zip

!unzip Terry_Gross_vowel_mfccs_and_deltas.zip
!unzip Fresh_Air_ubm_vowel_mfccs_and_deltas.zip
!unzip AAPB_female_vowel_mfccs_and_deltas.zip
!unzip AAPB_male_vowel_mfccs_and_deltas.zip

In [None]:
## Load saved features

def load_feature_dir(feature_dir):
    """Return all rows of every CSV file in `feature_dir` as lists of floats.

    Every entry in the directory is opened and parsed as CSV; each row becomes
    one list of floats. Files are read in sorted name order so the resulting
    list (and the train/test split taken from it later) is the same on every
    run, instead of following os.listdir's arbitrary order.
    """
    features = []
    for filename in sorted(os.listdir(feature_dir)):
        with open(os.path.join(feature_dir, filename)) as fi:
            for row in csv.reader(fi):
                features.append([float(item) for item in row])
    return features


gross_features = load_feature_dir('/sharedfolder/Terry_Gross_vowel_mfccs_and_deltas')
print(len(gross_features))

## Previously this re-read the Terry Gross directory, so the "background" set
## contained the target speaker's own frames. The directory name below follows
## the downloaded Fresh_Air_ubm_vowel_mfccs_and_deltas.zip archive.
fresh_air_ubm_features = load_feature_dir('/sharedfolder/Fresh_Air_ubm_vowel_mfccs_and_deltas')
print(len(fresh_air_ubm_features))

m_ubm_features = load_feature_dir('/sharedfolder/AAPB_male_vowel_mfccs_and_deltas')
print(len(m_ubm_features))

f_ubm_features = load_feature_dir('/sharedfolder/AAPB_female_vowel_mfccs_and_deltas')
print(len(f_ubm_features))



In [None]:
## Printing MFCCs and deltas for a single frame
## (one randomly chosen row of the Terry Gross feature list, as a sanity check)

print(random.choice(gross_features))

In [None]:
## Combining feature sets
## speaker_1_mfccs: frames of the target speaker
## ubm_mfccs: frames of everyone else (all three background sets concatenated)

speaker_1_mfccs = gross_features
ubm_mfccs = fresh_air_ubm_features + m_ubm_features + f_ubm_features

for feature_set in (speaker_1_mfccs, ubm_mfccs):
    print(len(feature_set))

In [None]:
## Training Extra Trees Classifier

os.chdir('/sharedfolder/')

from sklearn.ensemble import ExtraTreesClassifier

## Hold out the last tenth of each feature list for testing.
## The split index is negative (counted from the end); `//` keeps it an integer
## under both Python 2 and Python 3, where plain `/` would yield a float.
speaker_1_split = -len(speaker_1_mfccs)//10
ubm_split = -len(ubm_mfccs)//10

speaker_1_train = speaker_1_mfccs[:speaker_1_split]
speaker_1_test = speaker_1_mfccs[speaker_1_split:]
ubm_train = ubm_mfccs[:ubm_split]
ubm_test = ubm_mfccs[ubm_split:]

## Label 1 = target speaker, label 0 = background
X = speaker_1_train + ubm_train
y = [1]*len(speaker_1_train) + [0]*len(ubm_train)

X_test = speaker_1_test + ubm_test
y_test = [1]*len(speaker_1_test) + [0]*len(ubm_test)

classifier = ExtraTreesClassifier().fit(X, y)

## Saving trained model (and reloading it to confirm the saved file is usable)
joblib.dump(classifier,'gross_vowels_extratrees_2048.pkl')
classifier = joblib.load('gross_vowels_extratrees_2048.pkl')

## Mean accuracy on the held-out frames
print(classifier.score(X_test,y_test))

In [None]:
## Training Simple Multi-Layer Perceptron Model

os.chdir('/sharedfolder/')

from sklearn.neural_network import MLPClassifier

## Hold out the last tenth of each feature list for testing.
## The split index is negative (counted from the end); `//` keeps it an integer
## under both Python 2 and Python 3, where plain `/` would yield a float.
speaker_1_split = -len(speaker_1_mfccs)//10
ubm_split = -len(ubm_mfccs)//10

speaker_1_train = speaker_1_mfccs[:speaker_1_split]
speaker_1_test = speaker_1_mfccs[speaker_1_split:]
ubm_train = ubm_mfccs[:ubm_split]
ubm_test = ubm_mfccs[ubm_split:]

## Label 1 = target speaker, label 0 = background
X = speaker_1_train + ubm_train
y = [1]*len(speaker_1_train) + [0]*len(ubm_train)

X_test = speaker_1_test + ubm_test
y_test = [1]*len(speaker_1_test) + [0]*len(ubm_test)

#classifier = ExtraTreesClassifier().fit(X, y)
classifier = MLPClassifier().fit(X, y)

## Saving trained model (and reloading it to confirm the saved file is usable)
joblib.dump(classifier,'gross_vowels_mlpc_2048.pkl')
classifier = joblib.load('gross_vowels_mlpc_2048.pkl')

## Mean accuracy on the held-out frames
print(classifier.score(X_test,y_test))

In [3]:
##############################################
#### Start here to load pre-trained model ####
##############################################

## Loads the MLP model into the global `classifier` used by the cells below.
## NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
## only load model files from a source you trust.
## NOTE(review): the training cells save their .pkl files in /sharedfolder/, not in this
## directory; presumably a copy ships with the shaping-humanities-data repository -- confirm.
os.chdir('/sharedfolder/shaping-humanities-data')
classifier = joblib.load('gross_vowels_mlpc_2048.pkl')
#classifier = joblib.load('gross_vowels_extratrees_2048.pkl')

In [None]:
%%capture
## Download and unzip a set of 358 3-second Fresh Air clips

os.chdir('/sharedfolder/')
!wget -N https://github.com/hipstas/shaping-humanities-data/blob/master/audio/Fresh_Air_2017-07-31_3-sec_clips.zip?raw=true -O Fresh_Air_2017-07-31_3-sec_clips.zip
!unzip Fresh_Air_2017-07-31_3-sec_clips.zip

In [None]:
## Classifying short clips
## Pick one clip at random, classify every frame, and compare the mean
## prediction over all frames with the mean over vowel frames only.

os.chdir('/sharedfolder/Fresh_Air_2017-07-31_3-sec_clips/')

wav_pathname = os.path.abspath(random.choice(os.listdir('./')))

test_mfccs = attk.get_mfccs_and_deltas(wav_pathname)

print(wav_pathname)

results = classifier.predict(test_mfccs)  ## Predicting new observation

print(results)

vowel_bools = attk.get_vowel_segments(wav_pathname)

## Keep only the predictions for frames flagged as vowels
vowel_results = [results[i] for i in range(len(results)) if vowel_bools[i]==True]

display(Audio(wav_pathname))

print("All samples: "+str(np.mean(results)))
print("Vowels only: "+str(np.mean(vowel_results)))

In [4]:
## Function that classifies *vowel segments only* and returns 
## average output for the full recording

def classify_clip(clip_pathname):
    """Classify the vowel frames of one audio clip and return the mean prediction.

    Computes features for `clip_pathname` with attk, predicts a label for every
    frame with the global `classifier`, keeps only the frames that attk flags as
    vowels, and returns the mean of those predictions (with the 0/1 labels used
    in the training cells, that is the fraction of vowel frames labelled 1).

    Returns 0.0 when the clip has no vowel frames; np.mean of an empty list
    would otherwise return nan (with a RuntimeWarning), which breaks the
    threshold comparison applied to these scores.
    """
    mfccs = attk.get_mfccs_and_deltas(clip_pathname)
    results = classifier.predict(mfccs)  ## Predicting new observation
    vowel_bools = attk.get_vowel_segments(clip_pathname)

    vowel_results = [results[i] for i in range(len(results)) if vowel_bools[i]==True]

    if len(vowel_results) == 0:
        return 0.0

    return np.mean(vowel_results) ## Vowels only

In [None]:
## Vowel-only score for the clip chosen in the "Classifying short clips" cell
## (`wav_pathname` must already be defined by running that cell)
print(classify_clip(wav_pathname))

In [9]:
os.chdir('/sharedfolder/')
!mkdir fa_episodes
os.chdir('fa_episodes')
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/543481418/npr_543481418.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/542940836/npr_542940836.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/542872654/npr_542872654.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/542472146/npr_542472146.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/541728116/npr_541728116.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/540147314/npr_540147314.mp3
!wget https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/539331219/npr_539331219.mp3



mkdir: cannot create directory 'fa_episodes': File exists
--2017-08-15 12:07:54--  https://play.podtrac.com/npr-381444908/npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/543481418/npr_543481418.mp3
Resolving play.podtrac.com (play.podtrac.com)... 52.40.240.5
Connecting to play.podtrac.com (play.podtrac.com)|52.40.240.5|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/543481418/npr_543481418.mp3 [following]
--2017-08-15 12:07:54--  https://npr.mc.tritondigital.com/NPR_381444908/media/anon.npr-podcasts/podcast/381444908/543481418/npr_543481418.mp3
Resolving npr.mc.tritondigital.com (npr.mc.tritondigital.com)... 192.173.28.56, 192.173.28.58, 208.92.52.71, ...
Connecting to npr.mc.tritondigital.com (npr.mc.tritondigital.com)|192.173.28.56|:443... connected.
HTTP request sent, awaiting response... 302 Temporarily Moved
Location: https://1665

Resolving npr.mc.tritondigital.com (npr.mc.tritondigital.com)... 192.173.28.56, 192.173.28.58, 208.92.52.71, ...
Connecting to npr.mc.tritondigital.com (npr.mc.tritondigital.com)|192.173.28.56|:443... connected.
HTTP request sent, awaiting response... 302 Temporarily Moved
Location: https://16653.mc.tritondigital.com:443/NPR_381444908/media-session/85f6f9ba-abc8-4528-b0fb-89b9a61c4209/anon.npr-podcasts/podcast/381444908/541728116/npr_541728116.mp3 [following]
--2017-08-15 12:09:01--  https://16653.mc.tritondigital.com/NPR_381444908/media-session/85f6f9ba-abc8-4528-b0fb-89b9a61c4209/anon.npr-podcasts/podcast/381444908/541728116/npr_541728116.mp3
Resolving 16653.mc.tritondigital.com (16653.mc.tritondigital.com)... 192.173.28.56
Connecting to 16653.mc.tritondigital.com (16653.mc.tritondigital.com)|192.173.28.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Cookie coming from 16653.mc.tritondigital.com attempted to set domain to live.streamtheworld.com
Length: 24242472 

In [11]:
## Window length in seconds used when classifying long files
resolution_secs = 5.0

os.chdir('/sharedfolder/fa_episodes')

import timeit
tic = timeit.default_timer()

## List the episode files that will be classified: non-hidden names containing '.mp3'
mp3_names = [name for name in os.listdir('./') if not name.startswith('.') and '.mp3' in name.lower()]

for media_path in mp3_names:
    print(media_path)

npr_543481418.mp3
npr_542940836.mp3
npr_539829086.mp3
npr_542872654.mp3
npr_542472146.mp3
npr_541728116.mp3
npr_542669817.mp3
npr_542091509.mp3
npr_541436949.mp3
npr_540147314.mp3
npr_539331219.mp3


In [None]:
%%capture
## Classifying a long audio file
## Every MP3 in the episode directory is cut into consecutive windows of
## `resolution_secs` seconds; each window is scored with classify_clip(), the
## scores are smoothed and thresholded, and the resulting runs of positive
## windows are written as "start,end,Terry Gross" rows to a CSV next to the MP3.

resolution_secs = 5.0

## Smoothed scores at or above this value count as the target speaker
classifier_threshold = 0.06

os.chdir('/sharedfolder/fa_episodes')

import timeit
tic=timeit.default_timer()


for media_path in [item for item in os.listdir('./') if not item.startswith('.') and '.mp3' in item.lower()]:

    snd = AudioFileClip.AudioFileClip(media_path)

    classifications = []

    ## A trailing window shorter than resolution_secs is not classified
    for i in range(int(attk.duration(media_path)/resolution_secs)):
        window_start = i * resolution_secs
        try:
            snd.subclip(window_start, window_start + resolution_secs).write_audiofile('/tmp/temp_clip.wav')
            classifications.append(classify_clip('/tmp/temp_clip.wav'))
        except Exception as e:
            ## Keep one entry per window so indexes stay aligned with time.
            ## Catching Exception (not a bare except) leaves the cell interruptible.
            classifications.append(0.0)
            print("Error: " + str(i))
            print(e)

    ## Writing classification output to CSV

    ## Exactly one 0/1 label per window. A NaN score compares False and becomes 0;
    ## the former pair of `if` tests appended nothing for NaN, which shifted every
    ## later time range.
    classifier_output = [1 if classification >= classifier_threshold else 0
                         for classification in attk.smooth(np.array(classifications))]

    csv_path = media_path[:-4]+'_mlpc2048_labels.csv'
    #csv_path = media_path[:-4]+'_extratrees2048_labels.csv'

    with open(csv_path,'w') as fo:
        for pair in attk.labels_to_ranges(classifier_output, label=1):
            start = pair[0] * resolution_secs
            duration = (pair[1] - pair[0]) * resolution_secs
            fo.write(str(start) + ',' + str(start + duration) + ',Terry Gross\n')

In [13]:
## Seconds elapsed since `tic` was set at the start of the classification cell
print("Time elapsed: "+str(timeit.default_timer() - tic))

Time elapsed: 1091.93428802


In [None]:
## Writing classification output to CSV
## Re-writes the label CSV for the most recently classified file only, using the
## `classifications`, `media_path` and `resolution_secs` left by the cell above.

classifier_threshold = 0.06

## Exactly one 0/1 label per window. A NaN score compares False and becomes 0,
## instead of being dropped and shifting every later time range.
classifier_output = [1 if classification >= classifier_threshold else 0
                     for classification in attk.smooth(np.array(classifications))]

## The file name must match the model loaded above (MLP). The extratrees name used
## to be assigned unconditionally right after, overwriting this one.
csv_path = media_path[:-4]+'_mlpc2048_labels.csv'
#csv_path = media_path[:-4]+'_extratrees2048_labels.csv'

with open(csv_path,'w') as fo:
    for pair in attk.labels_to_ranges(classifier_output, label=1):
        start = pair[0] * resolution_secs
        duration = (pair[1] - pair[0]) * resolution_secs
        fo.write(str(start) + ',' + str(start + duration) + ',Terry Gross\n')