In [None]:
import attk
import os
import csv
import numpy as np
import librosa
import timeit
import random
import subprocess
import unicodecsv
import urllib2
from sklearn.externals import joblib
from numpy import ma
from aubio import source, pitch
from moviepy.audio.io import AudioFileClip
from IPython.display import display, Audio


# Name of the classifier's working directory under /sharedfolder/.
training_audio_dir_name = "Bill_Clinton_Classifier"

# Absolute path of that directory (trailing slash included).
classifier_dir_pathname = '/sharedfolder/' + training_audio_dir_name + '/'

# Relative file names used by later cells (saved model, scaler) are resolved
# against the current working directory, so switch to the classifier directory.
os.chdir(classifier_dir_pathname)

# Human-readable class labels. Later cells index this list with the integer
# class id (labels_to_use[mode_id]), so the order defines the id -> label mapping.
labels_to_use = ["Male", "Bill Clinton", "Female"]

# Same labels with spaces replaced by underscores.
label_dir_names = [item.replace(' ', '_') for item in labels_to_use]

# Tuple unpacking: this line requires labels_to_use to hold exactly three entries.
speaker_0_label, speaker_1_label, speaker_2_label = labels_to_use

In [None]:
def load_features(dir_path):
    """Load every feature row stored in the CSV files of one directory.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive). Only entries whose name ends in
        ``.csv`` are read.

    Returns
    -------
    list of list of float
        One inner list per non-empty CSV row, with every cell converted via
        ``float()``. Files are read in sorted name order, rows in file order.

    Raises
    ------
    OSError
        If ``dir_path`` cannot be listed or a file cannot be opened.
    ValueError
        If a cell cannot be converted to ``float``.
    """
    features = []
    # os.listdir() returns names in arbitrary order. Callers split the result
    # positionally into train/test parts, so sort to make the row order (and
    # therefore the split) reproducible across runs and filesystems.
    # endswith() avoids picking up names such as "frames.csv.bak".
    csv_filenames = sorted(name for name in os.listdir(dir_path)
                           if name.endswith('.csv'))
    for filename in csv_filenames:
        with open(os.path.join(dir_path, filename)) as fi:
            for row in csv.reader(fi):
                # csv.reader yields [] for blank lines; an empty feature row
                # would make the resulting feature matrix ragged.
                if not row:
                    continue
                features.append([float(item) for item in row])
    return features

In [None]:
## Load saved features

# Folder (inside each label's directory) that holds the saved feature CSVs.
feature_subdir_name = '_vowel_mfccs_and_deltas'

# label_dir_names follows the order of labels_to_use: Male, Bill_Clinton, Female.
# Building the paths from the configuration keeps them in step with
# training_audio_dir_name / labels_to_use instead of repeating absolute paths.
male_dir_name, speaker_1_dir_name, female_dir_name = label_dir_names

speaker_1_features = load_features(os.path.join(classifier_dir_pathname, speaker_1_dir_name, feature_subdir_name))
print(len(speaker_1_features))

aapb_ubm_male_features = load_features(os.path.join(classifier_dir_pathname, male_dir_name, feature_subdir_name))
print(len(aapb_ubm_male_features))

aapb_ubm_female_features = load_features(os.path.join(classifier_dir_pathname, female_dir_name, feature_subdir_name))
print(len(aapb_ubm_female_features))

In [None]:
## Printing MFCCs and deltas for a single frame
## (one randomly chosen row of speaker_1_features, i.e. one list of floats)

print(random.choice(speaker_1_features))

In [None]:
## Combining feature sets
## The speaker numbering matches the order of labels_to_use
## (Male, Bill Clinton, Female).

## Optional: uncomment to subsample the speaker 1 frames to 10000 rows.
#speaker_1_features = random.sample(speaker_1_features, 10000)

# Plain aliases (no copy is made): the male pool becomes speaker 0 ...
speaker_0_features = aapb_ubm_male_features

# ... and the female pool becomes speaker 2.
speaker_2_features = aapb_ubm_female_features

#print(len(speaker_1_features))

In [None]:
## Training a multi-layer perceptron model with 9/10 of the training data and
## evaluating its performance on the remaining 1/10

os.chdir(classifier_dir_pathname)

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

## The split below is positional (last tenth of each speaker's frames is held
## out). Uncomment to shuffle the frames first.
#random.shuffle(speaker_0_features)
#random.shuffle(speaker_1_features)
#random.shuffle(speaker_2_features)


def _holdout_split(features):
    """Return (training part, held-out part) of one speaker's frame list.

    The held-out part is the last tenth of the list. Floor division keeps the
    slice bound an integer on Python 3 and gives the same bound as the
    Python 2 expression ``-len(features)/10``.
    """
    holdout_start = -len(features) // 10
    return features[:holdout_start], features[holdout_start:]


# Class id == index into labels_to_use (0 = Male, 1 = Bill Clinton, 2 = Female),
# the same mapping the final model cell uses, so predictions from either
# classifier can be looked up with labels_to_use[class_id].
per_speaker_features = [speaker_0_features, speaker_1_features, speaker_2_features]

X, y, X_test, y_test = [], [], [], []
for class_id, speaker_features in enumerate(per_speaker_features):
    train_part, test_part = _holdout_split(speaker_features)
    X += train_part
    y += [class_id] * len(train_part)
    X_test += test_part
    y_test += [class_id] * len(test_part)

X_train = np.array(X)
y_train = np.array(y)

X_test = np.array(X_test)
y_test = np.array(y_test)

# Fit the scaler on the training part only, then apply it to both parts so the
# held-out frames do not influence the normalisation statistics.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifier = MLPClassifier(max_iter=2000, random_state=9,
                           hidden_layer_sizes=(100, 100), solver='adam',
                           activation='relu').fit(X_train_scaled, y_train)

# Mean accuracy on the held-out tenth.
print(classifier.score(X_test_scaled, y_test))

In [None]:
## Training and saving an MLP model with all training data
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# The model and scaler are saved under relative file names, and the
# "load pre-trained model" cell looks for them in the classifier directory,
# so make sure that is the working directory regardless of which cells ran before.
os.chdir(classifier_dir_pathname)

# Class id == index into labels_to_use (0 = Male, 1 = Bill Clinton, 2 = Female).
X = speaker_0_features + speaker_1_features + speaker_2_features
y = [0]*len(speaker_0_features) + [1]*len(speaker_1_features) + [2]*len(speaker_2_features)

scaler = StandardScaler()
scaler.fit(X)

X_scaled = scaler.transform(X)

classifier = MLPClassifier(max_iter=2000, random_state=9,
                           hidden_layer_sizes=(100, 100), solver='adam',
                           activation='relu').fit(X_scaled, y)


trained_model_filename = 'Bill_Clinton_vowels_mlpc_4096_100-16K_w_genders_scaled.pkl'
scaler_filename = 'Bill_Clinton_vowels_mlpc_4096_100-16K_w_genders_scaled.scaler'

print(trained_model_filename)
print(scaler_filename)

## Saving trained model and scaler
joblib.dump(classifier, trained_model_filename)
joblib.dump(scaler, scaler_filename)

# Reload immediately: confirms both files can be read back, and leaves
# `classifier` / `scaler` bound to exactly what a later session would load.
classifier = joblib.load(trained_model_filename)
scaler = joblib.load(scaler_filename)

In [None]:
##############################################
#### Start here to load pre-trained model ####
##############################################

# Same file names the "training and saving" cell writes. Both are relative,
# so they are resolved against the directory selected by os.chdir() below.
trained_model_filename = 'Bill_Clinton_vowels_mlpc_4096_100-16K_w_genders_scaled.pkl'
scaler_filename = 'Bill_Clinton_vowels_mlpc_4096_100-16K_w_genders_scaled.scaler'

os.chdir(classifier_dir_pathname)
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load model/scaler files you created yourself or fully trust.
classifier = joblib.load(trained_model_filename)
scaler = joblib.load(scaler_filename)

In [None]:
#%%capture
## Download unseen audio and split into 3-second WAV clips for testing
import subprocess
os.chdir(classifier_dir_pathname)

# Create the clip directory only when it is missing, so that real failures
# (permissions, read-only volume, ...) are reported instead of being swallowed.
test_clips_dir = os.path.join(classifier_dir_pathname, 'test_clips')
if not os.path.isdir(test_clips_dir):
    os.mkdir(test_clips_dir)

os.chdir(test_clips_dir)

#mp3_url = "https://archive.org/download/GreatSpeechesAndInterviewsWithPresidentBillClintonAndOthers/GSI071110A_64kb.mp3"
mp3_url = "http://www.stephenmclaughlin.net/hipstas/misc/nc6j0201.mp3"

mp3_filename = os.path.basename(mp3_url)

wav_filename = os.path.splitext(mp3_filename)[0] + '.wav'

# check_call raises CalledProcessError on a non-zero exit status, so a failed
# download or conversion stops the cell instead of silently producing no clips.
# wget -N skips the download when the local copy is already up to date.
subprocess.check_call(['wget', '-N', mp3_url])

# -y: overwrite existing output. Without it ffmpeg refuses to replace the WAV
# files left behind by a previous run of this cell.
subprocess.check_call(['ffmpeg', '-y', '-i', mp3_filename, wav_filename])

# Segment muxer: cut the WAV into consecutive 3-second files, numbered %04d.
subprocess.check_call(['ffmpeg', '-y', '-i', wav_filename, '-f', 'segment', '-segment_time', '3',
                       os.path.splitext(wav_filename)[0] + '_3_sec_%04d.wav'])


In [None]:
import scipy.stats  # kept: other cells call scipy.stats.mode directly
from collections import Counter

def most_common_class(class_ids):
    """Return the most frequent class id and the share of entries it covers.

    Parameters
    ----------
    class_ids : sequence
        Class ids, one per frame. May be a list or a 1-D numpy array.

    Returns
    -------
    tuple
        ``(mode_id, mode_id_percentage)`` where ``mode_id`` is an ``int`` and
        ``mode_id_percentage`` is a ``float`` fraction in (0, 1]. When several
        ids are tied for most frequent, the smallest id is returned (the same
        tie-break ``scipy.stats.mode`` applies).

    Raises
    ------
    ValueError
        If ``class_ids`` is empty (the mode is undefined).
    """
    # list() makes numpy arrays acceptable too: ndarray has no .count() method.
    ids = list(class_ids)
    if not ids:
        raise ValueError("most_common_class() needs at least one class id")
    counts = Counter(ids)
    top_count = max(counts.values())
    mode_id = int(min(class_id for class_id, count in counts.items() if count == top_count))
    mode_id_percentage = float(top_count) / len(ids)
    return (mode_id, mode_id_percentage)

In [None]:
## Classifying short clips
#### Repeat this cell several times to help choose a classifier threshold value.

import scipy.stats  # `import scipy` alone does not guarantee scipy.stats is loaded

os.chdir(os.path.join(classifier_dir_pathname, 'test_clips'))

# Pick one of the 3-second segments at random.
wav_pathname = os.path.abspath(random.choice([item for item in os.listdir('./') if '3_sec' in item]))

test_features = np.array(attk.get_mfccs_and_deltas(wav_pathname, n_mfcc=30, n_fft=4096, freq_min=100, freq_max=16000))
test_features = scaler.transform(test_features)

print(wav_pathname)

results = classifier.predict(test_features)  ## Predicted class id per frame
results_proba = classifier.predict_proba(test_features)  ## Class probabilities per frame

print(results)
print([round(max(item), 4) for item in list(results_proba)])

vowel_bools = attk.get_vowel_segments(wav_pathname)

# Keep only the predictions whose frame is flagged as a vowel. zip() stops at
# the shorter of the two sequences, which covers a length mismatch without a
# bare `except` that would also hide unrelated errors.
vowel_results = [frame_class for frame_class, is_vowel in zip(results, vowel_bools)
                 if is_vowel == True]

if not vowel_results:
    # The mode of an empty list is undefined; use every frame instead, as
    # classify_clip does when it receives no vowel flags.
    print("No vowel frames detected; using all frames instead.")
    vowel_results = list(results)

display(Audio(wav_pathname))

# np.ravel(...)[0] reads the mode whether scipy returns it as a one-element
# array (older releases) or as a scalar (newer releases).
print("MODE: " + str(np.ravel(scipy.stats.mode(results)[0])[0]))
print("MODE vowels only: " + str(np.ravel(scipy.stats.mode(vowel_results)[0])[0])) ## Vowels only

mode_id, mode_id_percentage = most_common_class(vowel_results)
top_label = labels_to_use[mode_id]

print('')
print("Speaker: " + str(top_label))
print("Confidence: " + str(mode_id_percentage))

print('')

print(str(mode_id) +','+ str(mode_id_percentage) + ',' + str(top_label) + '\n')

In [None]:
## Function that classifies vowel segments only and returns
## average output for the full clip

def classify_clip(clip_pathname):
    """Classify one audio clip and return a single fraction for it.

    Every frame of the clip is classified; only frames flagged as vowels by
    ``attk.get_vowel_segments`` are kept. The kept class ids are passed to
    ``most_common_class`` and the second element of its result (the fraction
    of frames assigned to the class it reports) is returned.

    Parameters
    ----------
    clip_pathname : str
        Path of the audio clip to classify.

    Returns
    -------
    float
        Fraction between 0 and 1. ``0.0`` when there is no frame to score
        (no frames at all, or vowel flags present but none of them set).
        When no vowel flags are returned at all, every frame is scored.
    """
    mfccs = np.array(attk.get_mfccs_and_deltas(clip_pathname, n_mfcc=30, n_fft=4096, freq_min=100, freq_max=16000))
    mfccs = scaler.transform(mfccs)
    results = classifier.predict(mfccs)  ## Predicted class id per frame
    vowel_bools = attk.get_vowel_segments(clip_pathname)

    if len(vowel_bools) == 0:
        # No vowel information available: fall back to every frame.
        # list() because most_common_class relies on list.count().
        selected_results = list(results)
    else:
        # zip() stops at the shorter sequence, so a vowel mask that is shorter
        # than the prediction array cannot raise IndexError.
        selected_results = [frame_class for frame_class, is_vowel in zip(results, vowel_bools)
                            if is_vowel == True]

    if not selected_results:
        # Nothing to score; avoids a division by zero downstream.
        return 0.0

    return most_common_class(selected_results)[1]   ###### Just the percentage


In [None]:
import scipy.stats
from collections import Counter

def most_common_class(class_ids, target_id=1):
    """Return ``(class_id, fraction)`` for a sequence of per-frame class ids.

    NOTE: this definition shadows any earlier ``most_common_class`` in the
    notebook. With the default ``target_id=1`` it does NOT compute the mode:
    it reports the share of entries equal to class 1, which is what the
    long-file classification cell writes out as the score for speaker 1.

    Parameters
    ----------
    class_ids : sequence
        Class ids, one per frame. May be a list or a 1-D numpy array.
    target_id : int or None, optional
        Class whose share is reported. Pass ``None`` to report the actual
        most frequent class instead (smallest id wins ties).

    Returns
    -------
    tuple
        ``(mode_id, mode_id_percentage)``; the fraction is ``0.0`` when
        ``class_ids`` is empty and a ``target_id`` is given.

    Raises
    ------
    ValueError
        If ``class_ids`` is empty and ``target_id`` is ``None``.
    """
    # list() makes numpy arrays acceptable too: ndarray has no .count() method.
    ids = list(class_ids)
    if target_id is None:
        if not ids:
            raise ValueError("most_common_class() needs at least one class id")
        counts = Counter(ids)
        top_count = max(counts.values())
        mode_id = int(min(class_id for class_id, count in counts.items() if count == top_count))
    else:
        mode_id = target_id
    if not ids:
        return (mode_id, 0.0)
    mode_id_percentage = float(ids.count(mode_id)) / len(ids)
    return (mode_id, mode_id_percentage)

In [None]:
%%capture
## Classifying a long audio file

## Writing classification output to CSV

classifier_threshold = 0.0      # Classifier values below this threshold will be discarded

os.chdir(os.path.join(classifier_dir_pathname, 'test_clips'))

resolution_secs = 3.0

media_filename = 'nc6j0201.wav'

media_path = os.path.join(classifier_dir_pathname, 'test_clips', media_filename)

csv_path = media_path[:-4]+'_Bill_Clinton_mlpc_4096_100-16K_w_genders_scaled_'+str(resolution_secs)+'s.csv'

# Start time for the "Time elapsed" cell that follows.
tic = timeit.default_timer()

snd = AudioFileClip.AudioFileClip(media_path)

# One score per clip, in clip order (0.0 for clips that failed), so a later
# cell can re-threshold the results without re-running the classifier.
classifications = []

duration = resolution_secs
clip_count = int(attk.duration(media_path) / resolution_secs)

# Open the CSV once in write mode: re-running the cell replaces the file
# instead of appending a second copy of every row to it.
with open(csv_path, 'w') as fo:
    for i in range(clip_count):
        start = i * resolution_secs
        try:
            snd.subclip(start, start + resolution_secs).write_audiofile('/tmp/temp_clip.wav')
            value = classify_clip('/tmp/temp_clip.wav')
        except Exception as err:
            # Keep going with the next clip, but record a placeholder so the
            # list stays aligned with clip indices, and report the cause.
            classifications.append(0.0)
            print("Error: " + str(i) + " (" + str(err) + ")")
            continue

        classifications.append(value)

        if value >= classifier_threshold:
            fo.write(str(start) + ',' + str(duration) +','+ str(value) + ',' + speaker_1_label + '\n')
            # Flush so partial results are on disk if the run is interrupted.
            fo.flush()

In [None]:
# `tic` is the timeit.default_timer() reading taken in the long-file
# classification cell; because this runs as a separate cell, the figure also
# includes any time that passed before this cell was executed.
print("Time elapsed: "+str(timeit.default_timer() - tic))

In [None]:
## Writing classification output to CSV
## (re-thresholds the per-clip scores collected in `classifications`)

classifier_threshold = 0.2      # Classifier values below this threshold will be discarded

# No chdir is needed here: media_path is absolute, so csv_path is too. (The
# previous chdir pointed at a directory belonging to a different classifier
# and raised OSError when that directory was absent.)
csv_path = media_path[:-4]+'_mlpc2048_labels_'+str(resolution_secs)+'s.csv'

duration = resolution_secs

with open(csv_path,'w') as fo:
    # The position in `classifications` is the clip number, so the clip's
    # start time is its index times the clip length.
    for clip_index, value in enumerate(classifications):
        if value >= classifier_threshold:
            start = clip_index * resolution_secs
            fo.write(str(start) + ',' + str(duration) +','+ str(value) + ',' + speaker_1_label + '\n')