In [1]:
#import needed libraries

import librosa as lb #used for feature extracion and resampling
import numpy as np 
import matplotlib.pyplot as plt
import librosa.display #builds on matplotlib to draw nice spectrograms.
import IPython.display as ipd #used to play .wav files from notebook
import pandas as pd 
# from scipy.io import wavfile #can be used to import .wav files but does not work on 24 bit depth audio files


import soundfile as sf #soundfile enables loading 24 bit depth audio files.
import time #used to measure code runtime. 
import pickle

The bottleneck in this code is reading the audio files. A single read takes approximately 0.23 seconds on my machine.
Loading all ~5400 wav files takes approximately 20 minutes.


In [10]:
#functions to be used in this exercise:

#function to read a wav file and return its samples as a mono list at a fixed sample rate:

def ReadAudio(filename, target_sr=22050):
    """Load an audio file as a mono signal resampled to ``target_sr``.

    Parameters
    ----------
    filename : path of the audio file, read with ``soundfile`` (which also
        handles 24 bit depth files).
    target_sr : sample rate of the returned signal. The default of 22050
        keeps memory usage low.

    Returns
    -------
    (samples, target_sr) : ``samples`` is a plain Python list holding the
        mono signal, ``target_sr`` is the sample rate it now has.
    """
    x, sr = sf.read(filename)
    #transpose so a multi-channel array is laid out the way librosa expects
    x = x.T

    #down-mix BEFORE resampling: any number of channels is handled, and only
    #one channel has to go through the (expensive) resampler.
    if x.ndim > 1:
        x = lb.to_mono(x)

    #the rates are passed by keyword: librosa >= 0.10 no longer accepts them positionally.
    x = lb.resample(x, orig_sr=sr, target_sr=target_sr)

    #change to a list.
    return list(x), target_sr

#function to find the largest number of samples over all sound files.
#Not used in the end: it needs every sound held in memory at once.
def SampleNumbersMax(Sounds):
    """Return the length of the longest sound in ``Sounds`` (a list of lists).

    Returns 0 when ``Sounds`` is empty.
    """
    return max((len(clip) for clip in Sounds), default=0)

#function to zero-pad a sound (a list of samples) at the end until it has numSamples samples
def IncreaseNumSamples(sound,numSamples):
    """Pad ``sound`` with trailing zeros up to ``numSamples`` samples.

    A sound that already has ``numSamples`` samples or more is returned
    as-is (the very same list object, never truncated); otherwise a new,
    longer list is returned.
    """
    missing=numSamples-len(sound)
    if missing<=0:
        return sound
    return sound+[0]*missing

#function that pads every sound to the same length. Also unused, to avoid storing all sounds in memory.
def MakeAllEqualLength(Sounds,numSamples):
    """Return a new list with each sound of ``Sounds`` passed through IncreaseNumSamples.

    The order of the sounds is preserved.
    """
    return [IncreaseNumSamples(clip,numSamples) for clip in Sounds]
    


#function to extract Features:
def getFeatures(Sound, sr=22050):
    """Compute five time-averaged librosa feature vectors for one signal.

    Parameters
    ----------
    Sound : 1-D numpy array with the audio samples.
    sr : sample rate of ``Sound``; defaults to 22050, the rate ReadAudio
        resamples to.

    Returns
    -------
    (mfccs, chroma, mel, contrast, tonnetz) : five 1-D arrays, each the mean
        over the second axis of the matrix librosa returns for that feature.
    """
    def timeMean(featureMatrix):
        #collapse the second axis of the librosa output, leaving one value per feature row
        return np.mean(featureMatrix.T,axis=0)

    #magnitude spectrogram, shared by the chroma and spectral-contrast features
    stft=np.abs(lb.stft(Sound))

    #audio is always passed as y=...: librosa >= 0.10 rejects it as a positional argument
    mfccs=timeMean(lb.feature.mfcc(y=Sound,sr=sr,n_mfcc=40))
    chroma=timeMean(lb.feature.chroma_stft(S=stft,sr=sr))
    mel=timeMean(lb.feature.melspectrogram(y=Sound,sr=sr))
    contrast=timeMean(lb.feature.spectral_contrast(S=stft,sr=sr))
    #tonnetz is computed on the harmonic component of the signal only
    tonnetz=timeMean(lb.feature.tonnetz(y=lb.effects.harmonic(Sound),sr=sr))

    return mfccs,chroma,mel,contrast,tonnetz

def getFeatures1(Sound, sr=22050, hop_length=5000):
    """Return the chromagram of ``Sound`` flattened into one 1-D vector.

    Parameters
    ----------
    Sound : sequence or array of audio samples.
    sr : sample rate of ``Sound`` (default 22050).
    hop_length : hop between analysis frames in samples (default 5000).

    Returns
    -------
    1-D numpy array holding every chroma value row after row. Its length
    grows with the length of ``Sound``.
    """
    #audio passed as y=...: librosa >= 0.10 rejects it as a positional argument
    C1=lb.feature.chroma_stft(y=np.asarray(Sound),sr=sr,hop_length=hop_length) #see librosa features

    return C1.reshape(-1)


In [3]:

#load the training metadata: every row of train.csv pairs a clip ID with its class label

Labels=np.array(pd.read_csv('train.csv'))

#parallel lists: the observation with ID ID[i] is in class Class[i]
ID=[]
Class=[]

#each row must hold exactly two values, otherwise the unpacking below raises
for idnow,classnow in Labels:
    ID.append(idnow)
    Class.append(classnow)

    


        



In [4]:
#serialize all training wav files:

#for ids in ID:
#    x,sr=ReadAudio('train/'+str(ids)+'.wav')
#    x=IncreaseNumSamples(x,88375)
#    with open('train_pickle/'+str(ids)+'.pickle','wb') as handle:
#       pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:

#unpack testing Data IDs

Labels=pd.read_csv('test.csv')

#change to numpy array; each row is expected to hold a single value, the clip ID
Labels=np.array(Labels)


#Build list of test IDs (there are no classes to unpack here)
IDtest=[]


for val in Labels:
    #take the first entry explicitly: int() on a whole array row is deprecated since NumPy 1.25
    idnow=int(val[0])
    IDtest.append(idnow)
    


In [6]:
#serialize all testing wav files:

#for ids in IDtest:
#    x,sr=ReadAudio('test/'+str(ids)+'.wav')
#    x=IncreaseNumSamples(x,88375)
#    with open('test_pickle/'+str(ids)+'.pickle','wb') as handle:
#        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
#find max sample count of any Sound: no need anymore, as have run and found maxNow (max number of samples till now)=88375
#maxNow=0
#start=time.time()

#for ids in ID:
 #       
  #      x,sr=ReadAudio('Train/'+str(ids)+'.wav')
   #     if len(x)>maxNow:
    #        maxNow=len(x)
       
#endtime=time.time()
#print(-start+endtime)

#maxNow turns out to be 88375
#time elapsed for all data=1312 seconds.

In [8]:
#get unique classes and keep reference vector
#sorted() pins the class<->index mapping: the iteration order of a set of strings
#changes between Python processes, so list(set(...)) gave a different numbering on every kernel restart.
#(assumes the class labels are mutually comparable, e.g. all strings)
Classes=sorted(set(Class)) #the reference list of classes. A class of i will be translated to Classes[i]


In [38]:
#get training features: one concatenated feature vector per training clip

features=[]
for ids in ID:
    #the pickle only has to stay open while the samples are being read
    with open('train_pickle/'+str(ids)+'.pickle', 'rb') as handle:
        x=pickle.load(handle)

    #getFeatures returns several 1-D arrays; join them into a single row
    clipFeatures=getFeatures(np.array(x))
    features.append(np.hstack(clipFeatures))

#stack the rows into one array
features=np.array(features)



In [None]:
#WeTryThe getFeatures function:

#features=[]
#for ids in ID:
    
#    with open('train_pickle/'+str(ids)+'.pickle', 'rb') as handle:
#        x = pickle.load(handle)
        
    
#    features.append(getFeatures(x))
    




#features=np.array(features)
#features.shape


#use every extracted feature vector for training (no hold-out split).
#`features` is intentionally NOT reset to [] any more: trainFeatures shared its memory,
#so the reset freed nothing and only made this cell fail when it was run a second time.
trainFeatures=np.asarray(features)
#testFeatures=features[4000:,:]

#change labels from names to their index in Classes (0,1,...,len(Classes)-1).
#A new list is built, so Class itself keeps the original names.
trainLabels=[Classes.index(label) for label in Class]
#testLabels=Class[4000:]
    
   

In [42]:
    
from sklearn.neural_network import MLPClassifier
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100,100,100), random_state=1)
#clf.fit(trainFeatures, trainLabels)     


#PredictedLabels=clf.predict(testFeatures)

#print(100*np.mean(PredictedLabels==testLabels))
#percent times predicts correct with using getFeatures to get features

In [54]:
#train the classifier on the features as extracted (no further transformation)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100,50,50),
    random_state=1,
)
clf.fit(trainFeatures, trainLabels)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [45]:
#Get features of Testing DATA: one concatenated feature vector per test clip

featuresTest=[]
for ids in IDtest:
    #the pickle only has to stay open while the samples are being read
    with open('test_pickle/'+str(ids)+'.pickle', 'rb') as handle:
        x=pickle.load(handle)

    #getFeatures returns several 1-D arrays; join them into a single row
    clipFeatures=getFeatures(np.array(x))
    featuresTest.append(np.hstack(clipFeatures))

#stack the rows into one array
featuresTest=np.array(featuresTest)

#class index predicted for every test clip, in the order of IDtest
PredictedLabels=clf.predict(featuresTest)







In [55]:
#if no change to features: predict again with the current clf, reusing the featuresTest already computed
#(same call as the last line of the test-feature cell; lets clf be refitted without re-extracting features)
PredictedLabels=clf.predict(featuresTest)

In [56]:

import csv

#one [class name, clip ID] row per prediction, in the order of IDtest.
#The two values are kept as separate fields: the previous version joined them into a
#single "class,id" field, which csv then wrote as "class/,id" because a delimiter inside
#a field is escaped under QUOTE_NONE (and quotechar='' is rejected by Python >= 3.11).
Export=[]
for index, lab in enumerate(PredictedLabels):
    Export.append([str(Classes[lab]),str(IDtest[index])])


csvfile = "results.csv"

#newline='' lets the csv writer control line endings itself, as the csv module requires
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(Export)


In [None]:
#Use this to hear Audio of File Desired
#NOTE(review): this path says 'Train/' while the commented-out serialization cell reads from 'train/';
#both only name the same folder on a case-insensitive filesystem - confirm the real folder name.

ipd.Audio('Train/193.wav')

In [None]:
#use this to check what variables still available, etc.