In [1]:
import numpy as np, matplotlib.pyplot as plt
from numpy import *
import pylab
import scipy as scipy
from scipy.io import wavfile
from numpy.fft import *
import scipy.signal as signal
import sklearn
from sklearn import mixture
import IPython
import librosa
import cmath
import itertools
import pydub
from pydub import AudioSegment
import random

## Random sampler

In [2]:
def random_gen(low, high):
    """Yield an endless stream of random integers drawn from [low, high).

    Parameters
    ----------
    low : int
        Smallest value that can be produced (inclusive).
    high : int
        Upper bound (exclusive), exactly as in ``random.randrange``.

    Yields
    ------
    int
        A fresh ``random.randrange(low, high)`` draw on every iteration.
    """
    while True:
        yield random.randrange(low, high)


# randrange excludes its upper bound, so each bound is one past the largest
# clip number: speech/music clips are numbered 1..60, genre clips 0..99.
# (With the previous bounds of 60 and 99 the last clip could never be drawn
# for training and therefore always ended up in the test split.)
gen = random_gen(1, 61)
gen2 = random_gen(0, 100)

training_samples = set()

# Keep drawing until 50 distinct clip numbers were collected; the set
# silently drops repeated draws.
for x in itertools.takewhile(lambda x: len(training_samples) < 50, gen):
    training_samples.add(x)

# Every clip number that was not picked for training is held out for testing.
testing_samples = set(range(1, 61)) - training_samples

# Sorted so the order of the ids does not depend on set iteration order.
training_samples = np.array(sorted(training_samples))
testing_samples = np.array(sorted(testing_samples))

# Making a speech detector
## Extra credit implemented as well

In [3]:
def _load_clips(folder, clip_ids):
    """Read ``<folder><id>.wav`` for every id in ``clip_ids``.

    Returns a list of ``(samplerate, sound)`` tuples, in the order of
    ``clip_ids``, exactly as produced by ``scipy.io.wavfile.read``.
    """
    return [wavfile.read(folder + str(clip_id) + ".wav") for clip_id in clip_ids]


def _compressed_spectrogram(clips):
    """Pool the STFT magnitudes of all clips into one matrix.

    Each clip is transformed with a 1024-sample Hann window and 512 samples
    of overlap; the per-clip results are joined along the time axis (axis 1)
    in a single concatenate, so no intermediate matrix is copied repeatedly.
    The magnitudes are then raised to the power 0.3.

    Parameters
    ----------
    clips : list of (samplerate, sound) tuples

    Returns
    -------
    numpy.ndarray
        ``abs(STFT) ** 0.3`` of all clips, side by side in clip order.
    """
    window = np.hanning(1024)
    per_clip = [
        signal.stft(sound, fs = samplerate, window = window, nperseg = 1024, noverlap = 512)[2]
        for samplerate, sound in clips
    ]
    return np.abs(np.concatenate(per_clip, axis = 1)) ** 0.3


# Raw audio, as (samplerate, sound) tuples; music and speech share the same
# clip numbers for each split.
music_training_data = _load_clips("./SpeechMusic/music/", training_samples)
speech_training_data = _load_clips("./SpeechMusic/speech/", training_samples)

music_testing_data = _load_clips("./SpeechMusic/music/", testing_samples)
speech_testing_data = _load_clips("./SpeechMusic/speech/", testing_samples)

# Features: one pooled, magnitude-compressed spectrogram per class and split.
music_training_data_Zxx = _compressed_spectrogram(music_training_data)
speech_training_data_Zxx = _compressed_spectrogram(speech_training_data)
music_testing_data_Zxx = _compressed_spectrogram(music_testing_data)
speech_testing_data_Zxx = _compressed_spectrogram(speech_testing_data)

## Learning Gaussian parameters

In [4]:
# Mean and variance of every row of the pooled music training spectrogram,
# taken across all of its columns (frames).
music_means = np.mean(music_training_data_Zxx, axis = 1)

# One vectorised call instead of a Python loop over a hard-coded row count,
# so the number of rows follows the data.
variances = np.var(music_training_data_Zxx, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
music_covariance_matrix = np.diag(variances)

In [5]:
# Mean and variance of every row of the pooled speech training spectrogram,
# taken across all of its columns (frames).
speech_means = np.mean(speech_training_data_Zxx, axis = 1)

# One vectorised call instead of a Python loop over a hard-coded row count,
# so the number of rows follows the data.
variances = np.var(speech_training_data_Zxx, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
speech_covariance_matrix = np.diag(variances)

## Making predictions

In [6]:
# Layout of the pooled test spectrograms: the clips sit side by side and are
# cut apart again every FRAMES_PER_CLIP columns.
# NOTE(review): this assumes every test clip produced exactly 647 STFT
# frames -- confirm that all clips have the same length.
FRAMES_PER_CLIP = 647
N_TEST_CLIPS = 10

num_dimension = music_training_data_Zxx.shape[0]

# The covariance matrices are diagonal, so only their diagonals are needed;
# no dense matrix inverse or matrix product is required.
music_variances = music_covariance_matrix.diagonal()
speech_variances = speech_covariance_matrix.diagonal()

# Per-frame normalising term d*log(2*pi) + log|Sigma|, evaluated entirely in
# the log domain so nothing like pi**(d/2) is ever materialised.
normalizing_factor_music = num_dimension * np.log(2 * np.pi) + np.sum(np.log(music_variances))
normalizing_factor_speech = num_dimension * np.log(2 * np.pi) + np.sum(np.log(speech_variances))


def _clip_log_likelihood(frames, means, variances, normalizing_factor):
    """Summed Gaussian log-likelihood of all frames of one clip.

    Parameters
    ----------
    frames : numpy.ndarray
        Feature matrix of one clip, one column per frame.
    means, variances : numpy.ndarray
        Per-row mean and variance of the diagonal Gaussian.
    normalizing_factor : float
        ``d*log(2*pi) + sum(log(variances))`` for this Gaussian.

    Returns
    -------
    float
        ``sum_t log N(frames[:, t]; means, diag(variances))``.
    """
    diff = frames - means[:, np.newaxis]
    # Squared Mahalanobis distance, summed over every frame of the clip.
    mahalanobis = np.sum(diff * diff / variances[:, np.newaxis])
    return -0.5 * (mahalanobis + frames.shape[1] * normalizing_factor)


def _predict_clips(spectrogram):
    """Label each clip of a pooled test spectrogram as "music" or "speech"."""
    labels = []
    for clip in range(N_TEST_CLIPS):
        frames = spectrogram[:, clip * FRAMES_PER_CLIP:(clip + 1) * FRAMES_PER_CLIP]
        likelihood_music = _clip_log_likelihood(
            frames, music_means, music_variances, normalizing_factor_music)
        likelihood_speech = _clip_log_likelihood(
            frames, speech_means, speech_variances, normalizing_factor_speech)
        # Ties fall to "speech", as before.
        labels.append("music" if likelihood_music > likelihood_speech else "speech")
    return labels


# Music clips first, then speech clips, matching the order of `actual`.
predicted = _predict_clips(music_testing_data_Zxx) + _predict_clips(speech_testing_data_Zxx)

actual = ["music"] * N_TEST_CLIPS + ["speech"] * N_TEST_CLIPS
correct = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
print("Accuracy of the model is " + str(float(correct/len(actual))*100) + "%")

Accuracy of the model is 85.0%


## Since we choose the training and testing samples randomly, the above accuracy varies from 85% to 95%. Run all of the above code to see different accuracies.

# Making a music genre classifier

In [7]:
# Fresh random split for the genre task: draw from gen2 until 50 distinct
# clip numbers are collected for training (repeats are dropped by the set).
training_samples = set()
for candidate in itertools.takewhile(lambda _: len(training_samples) < 50, gen2):
    training_samples.add(candidate)

# All clip numbers in 0..99 that were not drawn become the test split,
# in ascending order.
testing_samples = np.array([number for number in range(0, 100) if number not in training_samples])
training_samples = np.array(list(training_samples))


def _genre_clip_path(genre, clip_number):
    """Build the path of one genre clip, e.g. ./genres/pop/pop.00007.mp3."""
    return "./genres/" + genre + "/" + genre + ".000" + format(clip_number, '02d') + ".mp3"


print("Loading files...")
genres = ["classical", 'disco', 'metal', 'pop', 'reggae']

# One list of (samplerate, sound) tuples per genre, indexed like `genres`.
training_data = [[] for _ in genres]
testing_data = [[] for _ in genres]

for genre_index, genre in enumerate(genres):
    print(genre)
    for slot in range(50):
        # librosa returns (audio, rate); the tuples are stored as (rate, audio).
        sound, samplerate = librosa.core.load(_genre_clip_path(genre, training_samples[slot]))
        training_data[genre_index].append((samplerate, sound))
        sound, samplerate = librosa.core.load(_genre_clip_path(genre, testing_samples[slot]))
        testing_data[genre_index].append((samplerate, sound))
print("Finished loading")

# Ground-truth labels of the 250 test clips: 50 per genre, in `genres` order.
actual_results = [label for label in genres for _ in range(50)]

Loading files...
classical
disco
metal
pop
reggae
Finished loading


In [8]:
def _mfccs_for(clips):
    """Compute the MFCC matrix of every clip.

    Parameters
    ----------
    clips : list of (samplerate, sound) tuples

    Returns
    -------
    list of numpy.ndarray
        One ``librosa.feature.mfcc`` result (requested with ``n_mfcc = 513``)
        per clip, in the order of ``clips``.
    """
    return [
        librosa.feature.mfcc(y = np.array(sound, dtype = float), sr = samplerate, n_mfcc = 513)
        for samplerate, sound in clips
    ]


print("Extracting MFCCs...")
# training_data / testing_data hold one clip list per genre, ordered
# classical, disco, metal, pop, reggae.
(classical_mfccs, disco_mfccs, metal_mfccs,
 pop_mfccs, reggae_mfccs) = [_mfccs_for(clips) for clips in training_data]

(classical_mfccs_testing, disco_mfccs_testing, metal_mfccs_testing,
 pop_mfccs_testing, reggae_mfccs_testing) = [_mfccs_for(clips) for clips in testing_data]

print("Finished extracting MFCCs")
print("Preparing for training...")

# Pool the clips of each genre side by side along the frame axis. A single
# concatenate per matrix avoids re-copying the growing result for every clip.
classical_mfccs_prepared = np.concatenate(classical_mfccs, axis=1)
disco_mfccs_prepared = np.concatenate(disco_mfccs, axis=1)
metal_mfccs_prepared = np.concatenate(metal_mfccs, axis=1)
pop_mfccs_prepared = np.concatenate(pop_mfccs, axis=1)
reggae_mfccs_prepared = np.concatenate(reggae_mfccs, axis=1)

classical_mfccs_prepared_testing = np.concatenate(classical_mfccs_testing, axis=1)
disco_mfccs_prepared_testing = np.concatenate(disco_mfccs_testing, axis=1)
metal_mfccs_prepared_testing = np.concatenate(metal_mfccs_testing, axis=1)
pop_mfccs_prepared_testing = np.concatenate(pop_mfccs_testing, axis=1)
reggae_mfccs_prepared_testing = np.concatenate(reggae_mfccs_testing, axis=1)
print("Done preparing")

Extracting MFCCs...
Finished extracting MFCCs
Preparing for training...
Done preparing


In [9]:
# Mean and variance of every row of the pooled classical training MFCC
# matrix, taken across all of its columns (frames).
classical_means = np.mean(classical_mfccs_prepared, axis = 1)

# Vectorised over rows: no hard-coded row count, so this also works when the
# MFCC matrix has fewer than 513 rows.
variances = np.var(classical_mfccs_prepared, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
classical_covariance_matrix = np.diag(variances)

In [10]:
# Mean and variance of every row of the pooled disco training MFCC matrix,
# taken across all of its columns (frames).
disco_means = np.mean(disco_mfccs_prepared, axis = 1)

# Vectorised over rows: no hard-coded row count, so this also works when the
# MFCC matrix has fewer than 513 rows.
variances = np.var(disco_mfccs_prepared, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
disco_covariance_matrix = np.diag(variances)

In [11]:
# Mean and variance of every row of the pooled metal training MFCC matrix,
# taken across all of its columns (frames).
metal_means = np.mean(metal_mfccs_prepared, axis = 1)

# Vectorised over rows: no hard-coded row count, so this also works when the
# MFCC matrix has fewer than 513 rows.
variances = np.var(metal_mfccs_prepared, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
metal_covariance_matrix = np.diag(variances)

In [12]:
# Mean and variance of every row of the pooled pop training MFCC matrix,
# taken across all of its columns (frames).
pop_means = np.mean(pop_mfccs_prepared, axis = 1)

# Vectorised over rows: no hard-coded row count, so this also works when the
# MFCC matrix has fewer than 513 rows.
variances = np.var(pop_mfccs_prepared, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
pop_covariance_matrix = np.diag(variances)

In [13]:
# Mean and variance of every row of the pooled reggae training MFCC matrix,
# taken across all of its columns (frames).
reggae_means = np.mean(reggae_mfccs_prepared, axis = 1)

# Vectorised over rows: no hard-coded row count, so this also works when the
# MFCC matrix has fewer than 513 rows.
variances = np.var(reggae_mfccs_prepared, axis = 1)

# Diagonal covariance: the rows are modelled as independent.
reggae_covariance_matrix = np.diag(variances)

In [14]:
# Number of test clips scored per genre; keeps `predicted` aligned with the
# 50-labels-per-genre layout of `actual_results`.
CLIPS_PER_GENRE = 50

# Feature dimension taken from the fitted model instead of a literal.
num_dimension = classical_means.shape[0]

# Per-genre model parameters, ordered like `genres`
# (classical, disco, metal, pop, reggae). The covariance matrices are
# diagonal, so only their diagonals are needed -- no dense inverse.
genre_means = [classical_means, disco_means, metal_means, pop_means, reggae_means]
genre_variances = [
    covariance.diagonal()
    for covariance in (classical_covariance_matrix, disco_covariance_matrix,
                       metal_covariance_matrix, pop_covariance_matrix,
                       reggae_covariance_matrix)
]

# Per-frame normalising term d*log(2*pi) + log|Sigma|, evaluated entirely in
# the log domain so nothing like pi**(d/2) is ever materialised.
genre_normalizers = [
    num_dimension * np.log(2 * np.pi) + np.sum(np.log(variances))
    for variances in genre_variances
]
(normalizing_factor_classical, normalizing_factor_disco, normalizing_factor_metal,
 normalizing_factor_pop, normalizing_factor_reggae) = genre_normalizers


def _clip_log_likelihood(frames, means, variances, normalizing_factor):
    """Summed Gaussian log-likelihood of all frames of one clip.

    Parameters
    ----------
    frames : numpy.ndarray
        Feature matrix of one clip, one column per frame.
    means, variances : numpy.ndarray
        Per-row mean and variance of the diagonal Gaussian.
    normalizing_factor : float
        ``d*log(2*pi) + sum(log(variances))`` for this Gaussian.

    Returns
    -------
    float
        ``sum_t log N(frames[:, t]; means, diag(variances))``.
    """
    diff = frames - means[:, np.newaxis]
    # Squared Mahalanobis distance, summed over every frame of the clip.
    mahalanobis = np.sum(diff * diff / variances[:, np.newaxis])
    return -0.5 * (mahalanobis + frames.shape[1] * normalizing_factor)


# Score the per-clip MFCC matrices directly. Cutting the pooled
# *_prepared_testing matrices every 1293 columns is only correct when every
# clip yields exactly 1293 frames; clips of any other length would be split
# at the wrong columns.
testing_mfccs_by_genre = [classical_mfccs_testing, disco_mfccs_testing, metal_mfccs_testing,
                          pop_mfccs_testing, reggae_mfccs_testing]

predicted = []

for genre_clips in testing_mfccs_by_genre:
    for frames in genre_clips[:CLIPS_PER_GENRE]:
        likelihoods = [
            _clip_log_likelihood(frames, means, variances, normalizer)
            for means, variances, normalizer
            in zip(genre_means, genre_variances, genre_normalizers)
        ]
        # First maximum wins, so ties resolve towards the earlier genre.
        class_index = likelihoods.index(max(likelihoods))
        predicted.append(genres[class_index])

In [15]:
# Count the test clips (250 = 5 genres x 50 clips) whose predicted genre
# matches the ground truth, then express that as a percentage.
correct = sum(1 for clip in range(250) if actual_results[clip] == predicted[clip])
accuracy = float(correct / 250) * 100
print("The accuracy of my model is " + str(accuracy) + "%")

The accuracy of my model is 80.80000000000001%


## After fine-tuning the parameters, I am able to get an accuracy of 60-85%. Next, we try to get better results using sklearn.mixture.GaussianMixture

In [16]:
def _stack_frames(mfcc_list):
    """Pool the clips of one genre into a (frames, features) matrix.

    Every per-clip MFCC matrix is transposed so that each row is one frame,
    then all clips are joined along axis 0 in a single concatenate (no
    repeated copying of a growing matrix).
    """
    return np.concatenate([clip_mfcc.T for clip_mfcc in mfcc_list], axis=0)


def _new_genre_model():
    """Create an unfitted 5-component, diagonal-covariance Gaussian mixture."""
    return mixture.GaussianMixture(n_components = 5, covariance_type = "diag")


print("Preparing for training...")
# NOTE: the *_mfccs lists must hold the TRAINING features here. The later
# cell that re-extracts MFCCs rebinds these names to test features, so run
# this cell before that one.
classical_mfccs_prepared = _stack_frames(classical_mfccs)
disco_mfccs_prepared = _stack_frames(disco_mfccs)
metal_mfccs_prepared = _stack_frames(metal_mfccs)
pop_mfccs_prepared = _stack_frames(pop_mfccs)
reggae_mfccs_prepared = _stack_frames(reggae_mfccs)

print("Done preparing")
classical_model = _new_genre_model()
disco_model = _new_genre_model()
metal_model = _new_genre_model()
pop_model = _new_genre_model()
reggae_model = _new_genre_model()

# One mixture per genre, each fitted only on that genre's training frames.
for label, model, features in (
        ("classical", classical_model, classical_mfccs_prepared),
        ("disco", disco_model, disco_mfccs_prepared),
        ("metal", metal_model, metal_mfccs_prepared),
        ("pop", pop_model, pop_mfccs_prepared),
        ("reggae", reggae_model, reggae_mfccs_prepared)):
    print("Fitting " + label + " gaussian models")
    model.fit(features)
print("Fitting finished")
print("Fitting finished")

Preparing for training...
Done preparing
Fitting classical gaussian models
Fitting disco gaussian models
Fitting metal gaussian models
Fitting pop gaussian models
Fitting reggae gaussian models
Fitting finished


In [17]:
def _testing_mfccs(genre_index):
    """Return the MFCC matrices of the 50 test clips of one genre.

    ``genre_index`` selects the clip list inside ``testing_data``; each entry
    there is a ``(samplerate, sound)`` tuple.
    """
    clips = testing_data[genre_index]
    return [
        librosa.feature.mfcc(y = np.array(clips[k][1], dtype = float), sr = clips[k][0], n_mfcc = 513)
        for k in range(50)
    ]


print("Extracting MFCCs...")
# NOTE: from here on the *_mfccs names hold TEST features (they held the
# training features before); the scoring cell below relies on this.
classical_mfccs = _testing_mfccs(0)
disco_mfccs = _testing_mfccs(1)
metal_mfccs = _testing_mfccs(2)
pop_mfccs = _testing_mfccs(3)
reggae_mfccs = _testing_mfccs(4)
print("Finished extracting MFCCs")

Extracting MFCCs...
Finished extracting MFCCs


In [18]:
genres = ["classical", 'disco', 'metal', 'pop', 'reggae']

# Fitted mixtures, in the same order as `genres`, so that the position of the
# best score is directly the index of the predicted genre.
genre_models = (classical_model, disco_model, metal_model, pop_model, reggae_model)

predicted = []

# Test clips are scored genre by genre (classical first, reggae last), 50
# clips each, so `predicted` lines up with `actual_results`.
for genre_mfccs in (classical_mfccs, disco_mfccs, metal_mfccs, pop_mfccs, reggae_mfccs):
    for clip in range(50):
        # score_samples expects one row per frame, hence the transpose.
        mfcc = genre_mfccs[clip].T
        # Clip score under a model = sum of its per-frame log-likelihoods.
        scores = [np.sum(model.score_samples(mfcc)) for model in genre_models]
        class_index = scores.index(max(scores))
        predicted.append(genres[class_index])
    

In [19]:
# Count the test clips (250 = 5 genres x 50 clips) whose predicted genre
# matches the ground truth, then express that as a percentage.
correct = sum(1 for clip in range(250) if actual_results[clip] == predicted[clip])
accuracy = float(correct / 250) * 100
print("The accuracy of using this model is " + str(accuracy) + "%")

The accuracy of my model is 77.60000000000001%


## Accuracy of the model above ranges from 80 to 90%, which is slightly better than what we had with our previous methodology.