<a href="https://colab.research.google.com/github/Nedjagang/Music-Genre-Classification-system-using-Deep-Learning/blob/main/miniproject_final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt

In [None]:
example_file = "disco.00001.wav"

load audio file with librosa

In [None]:
signal, sample_rate = librosa.load(example_file)

In [None]:
print(signal)

In [None]:
print(sample_rate)

In [None]:
FIG_SIZE = (16,8)

## waveform

In [None]:
plt.figure(figsize=FIG_SIZE)
librosa.display.waveshow(signal,alpha=0.6)
plt.xlabel("Time (s)")
plt.ylabel("Amplitude")
plt.title("Waveform")

fourier transform(FFT) - frequency domain

In [None]:
fft=np.fft.fft(signal)

calculate the magnitude (abs values on complex numbers)

In [None]:
spectrum =  np.abs(fft)

create the frequency variable

In [None]:
f = np.linspace(0,sample_rate,len(spectrum))

plot spectrum

In [None]:
plt.figure(figsize=FIG_SIZE)
plt.plot(f,spectrum,alpha=0.5)
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.title("Power Spectrum")

take half of the spectrum and frequency

In [None]:
left_spectrum = spectrum[:int(len(spectrum)/2)]
left_f = f[:int(len(spectrum)/2)]

plot spectrum

In [None]:
plt.figure(figsize=FIG_SIZE)
plt.plot(left_f,left_spectrum,alpha=0.5)
plt.xlabel("Frequency")
plt.ylabel("Magnitude")
plt.title("Power Spectrum")

spectrogram(STFT)

In [None]:
hop_length=512 # num. of samples between successive STFT frames
n_fft = 2048 # num. of samples for each analysis window

perform STFT

In [None]:
stft= librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)

calculate the magnitude(abs values on complex numbers)

In [None]:
spectrogram = np.abs(stft)

plot the spectrogram

In [None]:
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar()
plt.title("Spectrogram")

apply logarithm to get values in decibels

In [None]:
log_spectrogram = librosa.amplitude_to_db(spectrogram)

plot the spectrogram in decibels

In [None]:
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(log_spectrogram, sr=sample_rate, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("Frequency")
# The colorbar format is a %-style format string: the leading "%" is required
# for the tick values to be substituted (signed, no decimals, " dB" suffix).
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")

MFCCs (we use 13 MFCCs)

In [None]:
help(librosa.feature.mfcc)

In [None]:
MFCCs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)

plot MFCCs

In [None]:
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(MFCCs, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("MFCC coefficients")
plt.colorbar()
plt.title("MFCCs")

In [None]:
import json
import os
import math

In [None]:
# Root folder of the dataset; save_mfcc treats every sub-folder as one genre
DATASET_PATH = "/content/drive/MyDrive/mini project dataset/Data/genres_original"
# Output file that receives the genre mapping, labels and MFCCs
JSON_PATH = "/content/data_10.json"
# Sampling rate passed to librosa.load when reading each track
SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
# Number of samples assumed per track; used to derive the segment length
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

In [None]:
def save_mfcc(DATASET_PATH, JSON_PATH, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
  """Extract MFCCs from a music dataset and save them to a JSON file with genre labels.

  Every sub-folder found below ``DATASET_PATH`` is treated as one genre. Each
  audio file is loaded at ``SAMPLE_RATE``, cut into ``num_segments`` equal
  segments (segment length is derived from the module-level
  ``SAMPLES_PER_TRACK``), and the MFCCs of each segment are stored.

  Args:
    DATASET_PATH: root folder of the dataset (one sub-folder per genre).
    JSON_PATH: path of the JSON file to write.
    num_mfcc: number of MFCC coefficients to extract per frame.
    n_fft: FFT window size in samples.
    hop_length: hop between successive frames in samples.
    num_segments: number of segments each track is divided into.

  The JSON file contains three parallel structures:
    "mapping": genre names, where the list index is the numeric label,
    "labels":  numeric label of every stored segment,
    "mfcc":    MFCC matrix (frames x coefficients) of every stored segment.
  """

  # dictionary to store mapping, labels, and MFCCs
  data = {
     "mapping": [],
     "labels": [],
     "mfcc": []
  }

  samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
  num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

  # loop through all genre sub-folders
  for dirpath, _dirnames, filenames in os.walk(DATASET_PATH):

    # skip the dataset root itself; strings must be compared by value,
    # identity ("is") only works by accident of how os.walk yields its argument
    if dirpath == DATASET_PATH:
      continue

    # save genre label (i.e., sub-folder name) in the mapping
    semantic_label = os.path.basename(dirpath)
    data["mapping"].append(semantic_label)
    # the label is the position of the genre in the mapping, so "labels" and
    # "mapping" always agree regardless of how the directory walk is numbered
    label = len(data["mapping"]) - 1
    print("\nProcessing: {}".format(semantic_label))

    # process all audio files in the genre sub-folder
    for f in filenames:
      # load audio file
      file_path = os.path.join(dirpath, f)
      signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

      # process all segments of the audio file
      for d in range(num_segments):

        # calculate start and finish sample for current segment
        start = samples_per_segment * d
        finish = start + samples_per_segment

        # extract mfcc (transposed so that each row is one frame)
        mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
        mfcc = mfcc.T

        # store only mfcc features with the expected number of vectors, so
        # every stored segment has the same shape (short tracks are dropped)
        if len(mfcc) == num_mfcc_vectors_per_segment:
          data["mfcc"].append(mfcc.tolist())
          data["labels"].append(label)
          print("{}, segment:{}".format(file_path, d+1))

  # save MFCCs to json file
  with open(JSON_PATH, "w") as fp:
    json.dump(data, fp, indent=4)

In [None]:
save_mfcc(DATASET_PATH, JSON_PATH,num_segments=10)

In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras

import matplotlib.pyplot as plt
import random

import librosa 
import math


In [None]:
DATA_PATH = "/content/data_10.json"

In [None]:
def load_data(data_path):
  """Read the MFCC dataset stored as JSON at ``data_path``.

  Returns a tuple ``(X, y)`` of NumPy arrays built from the "mfcc" and
  "labels" entries of the file.
  """
  with open(data_path, "r") as json_file:
    contents = json.load(json_file)

  # turn the plain Python lists into numpy arrays
  features = np.array(contents["mfcc"])
  targets = np.array(contents["labels"])
  print("Data successfully loaded!")
  return features, targets

load data

In [None]:
X,y = load_data(DATA_PATH)

In [None]:
X.shape

create train/test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Simple ANN

In [None]:
#build netwwork topology
model = keras.Sequential([
    
    #input layer
    keras.layers.Flatten(input_shape = (X.shape[1],X.shape[2])),
    #1st dense layer
    keras.layers.Dense(512, activation = 'relu'),
    #2nd dense layer 
    keras.layers.Dense(256 ,activation ='relu'),
    #3rd dense layer
    keras.layers.Dense(64, activation= 'relu'),
    #output layer
    keras.layers.Dense(10, activation= 'softmax')
])

compile model

In [None]:
optimiser = keras.optimizers.Adam(learning_rate = 0.0001)
# The targets are integer class ids and the output layer is a 10-way softmax,
# so the matching loss is sparse categorical cross-entropy. Mean squared error
# would compare one integer id against a 10-element probability vector.
model.compile(optimizer = optimiser,
              loss= 'sparse_categorical_crossentropy',
              metrics=['accuracy'])



In [None]:
model.summary()

train model

In [None]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),batch_size=32,epochs=100)

In [None]:
def plot_history(history):
  """Plot accuracy and error (loss) curves for training and validation.

  Args:
    history: object returned by ``model.fit``; its ``history`` dict must hold
      the per-epoch lists "accuracy", "val_accuracy", "loss" and "val_loss"
      (i.e. the model was compiled with the accuracy metric and fitted with
      validation data).
  """
  fig, axs = plt.subplots(2)

  # accuracy subplot
  axs[0].plot(history.history["accuracy"], label="train accuracy")
  axs[0].plot(history.history["val_accuracy"], label="test accuracy")
  axs[0].set_ylabel("Accuracy")
  axs[0].legend(loc="lower right")
  axs[0].set_title("Accuracy eval")

  # error subplot
  axs[1].plot(history.history["loss"], label="train error")
  axs[1].plot(history.history["val_loss"], label="test error")
  axs[1].set_ylabel("Error")
  axs[1].set_xlabel("Epoch")
  axs[1].legend(loc="upper right")
  axs[1].set_title("Error eval")

  # keep the titles and axis labels of the two stacked plots from overlapping
  fig.tight_layout()
  plt.show()

plot accuracy and error as a function of epochs

In [None]:
plot_history(history)

Managing Overfitting

In [None]:
#build network topology
model_regularized = keras.Sequential ([
    #input layer
    keras.layers.Flatten(input_shape=(X.shape[1],X.shape[2])),
    #1st dense layer
    keras.layers.Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    #2nd dense layer
    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.3),
    #3rd dense layer
    keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.3),
    #output layer 
    keras.layers.Dense(10, activation='softmax')
])

compile model

In [None]:
optimiser=keras.optimizers.Adam(learning_rate=0.0001)
# Integer class ids + softmax output -> sparse categorical cross-entropy
# (mean squared error is not a valid loss for this label/output pairing).
model_regularized.compile(optimizer=optimiser,
                          loss='sparse_categorical_crossentropy',
                          metrics=['accuracy'])

train model

In [None]:
history=model_regularized.fit(X_train, y_train, validation_data=(X_test, y_test),batch_size=32, epochs=5)

plot accuracy and error as a function of the epochs

In [None]:
plot_history(history)

Convolutional Neural Network

In [None]:
#create train, validation and test split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2)

# add an axis to input sets
X_train = X_train[..., np.newaxis]
X_validation = X_validation[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [None]:
X_train.shape

In [None]:
input_shape = (X_train.shape[1], X_train.shape[2],1)

In [None]:
#build the CNN
from keras import Sequential
# Build the container from the same tensorflow.keras namespace as the layers
# added below, instead of mixing in the standalone keras package.
model_cnn=keras.Sequential()

#1st conv layer 
model_cnn.add(keras.layers.Conv2D(32,(3,3),activation='relu',input_shape=input_shape))
model_cnn.add(keras.layers.MaxPooling2D((3,3),strides=(2,2),padding='same'))
model_cnn.add(keras.layers.BatchNormalization())

#2nd conv layer
model_cnn.add(keras.layers.Conv2D(32,(3,3),activation='relu'))
model_cnn.add(keras.layers.MaxPooling2D((3,3),strides=(2,2),padding='same'))
model_cnn.add(keras.layers.BatchNormalization())

#3rd conv layer
model_cnn.add(keras.layers.Conv2D(32,(2,2),activation='relu'))
model_cnn.add(keras.layers.MaxPooling2D((2,2),strides=(2,2),padding='same'))
model_cnn.add(keras.layers.BatchNormalization())

#flatten output and feed it into dense layer
model_cnn.add(keras.layers.Flatten())
model_cnn.add(keras.layers.Dense(64, activation='relu'))
model_cnn.add(keras.layers.Dropout(0.3))

#output layer
model_cnn.add(keras.layers.Dense(10, activation='softmax'))

compile model

In [None]:
from scipy.optimize import optimize
optimiser=keras.optimizers.Adam(learning_rate=0.0001)
# Integer class ids + softmax output -> sparse categorical cross-entropy
# (mean squared error is not a valid loss for this label/output pairing).
model_cnn.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

In [None]:
model_cnn.summary()

train model

In [None]:
history=model_cnn.fit(X_train, y_train,validation_data=(X_validation, y_validation), batch_size=32,epochs=5)

plot accuracy and error as a function of the epochs

In [None]:
plot_history(history)

evaluate model on test set

In [None]:
test_loss, test_acc = model_cnn.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:',test_acc)

In [None]:
model_cnn.save("Music_Genre_10_CNN")

In [None]:
model_cnn.save("Music_Genre_10_CNN.h5")

The saved file can be loaded to reconstruct an identical model

In [None]:
reconstructed_model=keras.models.load_model("Music_Genre_10_CNN.h5")

Prediction on Test Set

In [None]:
#pick a sample to predict from the test set
X_to_predict = X_test[100]
y_to_predict = y_test[100]

In [None]:
X_to_predict.shape

In [None]:
print("real Genre:",y_to_predict)

add a batch dimension to the sample's input data - model.predict() expects a 4D array in this case

In [None]:
X_to_predict = X_to_predict[np.newaxis, ...]# array shape(1,130,13,1)

In [None]:
X_to_predict.shape

perform prediction

In [None]:
prediction = model_cnn.predict(X_to_predict)

get index with max value

In [None]:
predicted_index = np.argmax(prediction, axis=1)
# argmax along axis 1 yields a length-1 array for a single sample; take the
# element explicitly because int() on a non-0-d array is deprecated in NumPy.
print("Predicted Genre:",int(predicted_index[0]))

pick a sample to predict from the test set

In [None]:
X_to_predict = X_test[50]
y_to_predict = y_test[50]
print("Real Genre:",y_to_predict)
X_to_predict= X_to_predict[np.newaxis, ...]
prediction = model_cnn.predict(X_to_predict)

get index with max value

In [None]:
predicted_index=np.argmax(prediction, axis=1)
# argmax along axis 1 yields a length-1 array for a single sample; take the
# element explicitly because int() on a non-0-d array is deprecated in NumPy.
print("Predicted Genre:",int(predicted_index[0]))

In [None]:
len(X_test)

In [None]:
# Predict ten randomly chosen test samples and print real vs. predicted label.
for n in range(10):
  i = random.randint(0,len(X_test)-1)
  # pick a sample to predict from the test set
  X_to_predict = X_test[i]
  y_to_predict = y_test[i]
  print("\nReal Genre:",y_to_predict)
  # add the batch dimension expected by model.predict()
  X_to_predict = X_to_predict[np.newaxis, ...]
  prediction = model_cnn.predict(X_to_predict)

  #get index with max value; argmax along axis 1 yields a length-1 array, so
  #take element 0 (int() on a non-0-d array is deprecated in NumPy)
  predicted_index = np.argmax(prediction, axis=1)
  print("Predicted Genre:",int(predicted_index[0]))

Prediction on New Songs

In [None]:
# audio files pre-processing
def process_input(audio_file, track_duration, segment=0):
  """Load an audio file and return the MFCCs of one of its segments.

  The track is assumed to last ``track_duration`` seconds and is divided into
  10 equal segments; the MFCCs (13 coefficients, window of 2048 samples, hop
  of 512 samples, 22050 Hz sampling rate) of the selected segment are returned.

  Args:
    audio_file: path of the audio file to load.
    track_duration: assumed duration of the track, measured in seconds.
    segment: zero-based index of the segment to extract. Defaults to 0, the
      first segment.

  Returns:
    The transposed MFCC matrix of the selected segment, one row per frame.

  Raises:
    ValueError: if ``segment`` is outside ``0..9``.
  """

  SAMPLE_RATE = 22050
  NUM_MFCC = 13
  N_FFT = 2048
  HOP_LENGTH = 512
  TRACK_DURATION = track_duration #measured in seconds
  SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
  NUM_SEGMENTS = 10

  # validate before doing any I/O so a bad index fails fast
  if not 0 <= segment < NUM_SEGMENTS:
    raise ValueError(
        "segment must be between 0 and {}, got {}".format(NUM_SEGMENTS - 1, segment))

  samples_per_segment = int(SAMPLES_PER_TRACK / NUM_SEGMENTS)

  #calculate start and finish sample for the requested segment
  start = samples_per_segment * segment
  finish = start + samples_per_segment

  signal, sample_rate = librosa.load(audio_file, sr=SAMPLE_RATE)

  #extract mfcc and transpose so that each row is one frame
  mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
  return mfcc.T

In [None]:
# Numeric label -> genre name used to print predictions.
# NOTE(review): this hard-coded order presumably mirrors the "mapping" list
# written by save_mfcc - verify against the JSON file before relying on it.
genre_dict = {0:"hiphop", 1:"country", 2:"jazz", 3:"classical", 4:"metal", 5:"pop", 6:"rock", 7:"blues", 8:"reggae", 9:"disco"}

In [None]:
new_input_mfcc= process_input('/content/disco.00001.wav',30)

In [None]:
type(new_input_mfcc)

In [None]:
new_input_mfcc.shape

In [None]:
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

In [None]:
prediction = model_cnn.predict(X_to_predict)

#get index with max value; argmax along axis 1 yields a length-1 array, so
#take element 0 (int() on a non-0-d array is deprecated in NumPy)
predicted_index = np.argmax(prediction,axis=1)
print("Predicted Genre:",genre_dict[int(predicted_index[0])])

In [None]:
new_input_mfcc = process_input('/content/disco.00001.wav',30)                                        

In [None]:
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value; argmax along axis 1 yields a length-1 array, so
# take element 0 (int() on a non-0-d array is deprecated in NumPy)
predicted_index = np.argmax(prediction, axis=1)

print("Predicted Genre:",genre_dict[int(predicted_index[0])])