Dataset Link: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/data

## Importing Libraries

In [2]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, Flatten, Dense,Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
import matplotlib.pyplot as plt
import seaborn as sns

### Data Preprocessing

In [3]:

# Root folder of the dataset; it is expected to hold one sub-folder per genre.
data_dir = './Data/genres_original'

# Genre names. A genre's position in this list is used as its integer class id.
classes = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

In [4]:

def load_and_preprocess_data(data_dir, classes, target_shape=(150, 150),
                             chunk_duration=4, overlap_duration=2):
    """Convert every ``.wav`` file under ``data_dir`` into resized mel-spectrogram chunks.

    Each file in ``data_dir/<class_name>`` is loaded at its native sampling rate,
    cut into overlapping windows, and every window is turned into a mel
    spectrogram that is resized to ``target_shape``.

    Args:
        data_dir: Folder containing one sub-folder per entry of ``classes``.
        classes: Sequence of class (sub-folder) names; the position of a name
            is used as its integer label.
        target_shape: ``(height, width)`` every spectrogram is resized to.
        chunk_duration: Window length in seconds.
        overlap_duration: Overlap between consecutive windows in seconds;
            must be smaller than ``chunk_duration``.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. ``data`` has one entry of
        shape ``target_shape + (1,)`` per chunk and ``labels`` holds the
        matching integer class ids. Both are empty when no ``.wav`` file is found.

    Raises:
        ValueError: If the chunk/overlap durations cannot produce a positive hop.
    """
    if chunk_duration <= 0 or overlap_duration < 0 or overlap_duration >= chunk_duration:
        raise ValueError(
            "chunk_duration must be positive and overlap_duration must be in [0, chunk_duration)"
        )

    data = []
    labels = []

    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing--",class_name)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) reproducible.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.wav'):
                continue

            file_path = os.path.join(class_dir, filename)
            audio_data, sample_rate = librosa.load(file_path, sr=None)
            if len(audio_data) == 0:
                # Nothing to window; an empty signal cannot yield a spectrogram.
                continue

            chunk_samples = int(chunk_duration * sample_rate)
            overlap_samples = int(overlap_duration * sample_rate)
            hop_samples = max(1, chunk_samples - overlap_samples)

            # Clips shorter than one window still contribute a single (short)
            # chunk instead of being silently dropped.
            num_chunks = max(
                1,
                int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1,
            )

            for i in range(num_chunks):
                start = i * hop_samples
                # NOTE(review): the final window may be shorter than
                # chunk_samples; resize() below stretches it to target_shape.
                chunk = audio_data[start:start + chunk_samples]

                mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
                # Add a trailing channel axis so the image resize accepts the array.
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                data.append(mel_spectrogram)
                labels.append(i_class)

    return np.array(data), np.array(labels)

In [5]:
# Load every genre folder and turn each audio chunk into a resized mel spectrogram
# (the train/test split happens in a later cell).
data, labels = load_and_preprocess_data(data_dir, classes)
 

Processing-- blues
Processing-- classical
Processing-- country
Processing-- disco
Processing-- hiphop
Processing-- jazz
Processing-- metal
Processing-- pop
Processing-- reggae
Processing-- rock


In [6]:
data.shape

(14975, 150, 150, 1)

In [7]:
labels.shape

(14975,)

In [8]:
labels

array([0, 0, 0, ..., 9, 9, 9])

In [9]:
# Convert the integer class ids into one-hot rows (one column per entry of `classes`).
# The ndim guard keeps the cell idempotent: re-running it must not one-hot
# encode an array that has already been encoded.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=len(classes))  # Convert labels to one-hot encoding
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [10]:
labels.shape

(14975, 10)

In [11]:
data.shape

(14975, 150, 150, 1)

### Splitting Dataset into Training and Test set

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the chunks for testing; random_state fixes the shuffle.
# NOTE(review): the split is per chunk, not per track. Consecutive chunks of one
# track overlap, so near-duplicate audio can end up in both sets and inflate the
# test score -- consider a per-track (grouped) split.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

## Loading the Trained Model

In [13]:
# Load the previously saved network from disk; no training happens in this cell.
model = tf.keras.models.load_model("Trained_model.h5")



In [4]:
model.summary()

## Model Evaluation

In [None]:
# Model evaluation on the training set.
# NOTE(review): despite its name, train_accuracy holds everything evaluate() returns;
# index 1 is printed as the accuracy -- presumably [loss, accuracy], confirm
# against the metrics the model was compiled with.
train_accuracy=model.evaluate(X_train,y_train,verbose=0)
print(train_accuracy[1])

0.9952420592308044


In [None]:
# Model evaluation on the held-out test set.
# NOTE(review): despite its name, test_accuracy holds everything evaluate() returns;
# index 1 is printed as the accuracy -- presumably [loss, accuracy], confirm
# against the metrics the model was compiled with.
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.9125208854675293


## Precision, Recall, and Confusion Matrix Calculation

In [None]:
# Class scores for every test chunk: one row per chunk, one column per genre.
y_pred = model.predict(X_test)
y_pred

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 289ms/step


array([[6.7807719e-05, 8.2583165e-06, 2.1095309e-05, ..., 9.9814165e-01,
        4.8310994e-05, 9.7084383e-04],
       [2.8971922e-11, 6.4629318e-16, 2.8884168e-13, ..., 2.5399535e-16,
        3.4620394e-15, 5.5360429e-06],
       [2.9120498e-10, 8.2325752e-10, 4.0649320e-10, ..., 7.5615452e-09,
        1.6638524e-11, 2.1784937e-03],
       ...,
       [1.0000000e+00, 6.0784043e-20, 1.2166884e-11, ..., 2.8372556e-19,
        1.4960564e-14, 7.6321882e-09],
       [1.5214436e-04, 9.9353611e-01, 7.7946577e-04, ..., 4.2678893e-04,
        7.7527232e-04, 8.9376251e-04],
       [1.9644211e-04, 1.7801986e-08, 2.7221961e-06, ..., 1.2046268e-03,
        1.2935428e-05, 1.4935723e-01]], dtype=float32)

In [None]:
y_pred.shape

(2995, 10)

In [None]:
y_test.shape

(2995, 10)

In [None]:
# Index of the highest-scoring class in each prediction row.
predicted_categories = np.argmax(y_pred, axis=1)
predicted_categories

array([7, 6, 6, ..., 0, 1, 4], dtype=int64)

In [None]:
y_test

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Recover the integer class ids from the one-hot encoded test labels.
true_categories = np.argmax(y_test, axis=1)
true_categories

array([7, 6, 6, ..., 0, 1, 4], dtype=int64)

In [35]:
def loadWav(file_path, target_shape = (150, 150), chunk_duration=4, overlap_duration=2):
    """Load one audio file and return its resized mel-spectrogram chunks.

    The file is loaded at its native sampling rate, cut into overlapping
    windows, and each window is converted to a mel spectrogram resized to
    ``target_shape``.

    Args:
        file_path: Path of the audio file to load.
        target_shape: ``(height, width)`` every spectrogram is resized to.
        chunk_duration: Window length in seconds.
        overlap_duration: Overlap between consecutive windows in seconds;
            must be smaller than ``chunk_duration``.

    Returns:
        NumPy array with one entry of shape ``target_shape + (1,)`` per chunk;
        an empty array when the file contains no samples.

    Raises:
        ValueError: If the chunk/overlap durations cannot produce a positive hop.
    """
    if chunk_duration <= 0 or overlap_duration < 0 or overlap_duration >= chunk_duration:
        raise ValueError(
            "chunk_duration must be positive and overlap_duration must be in [0, chunk_duration)"
        )

    data = []
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    if len(audio_data) == 0:
        return np.array(data)

    chunk_samples = int(chunk_duration * sample_rate)
    overlap_samples = int(overlap_duration * sample_rate)
    hop_samples = max(1, chunk_samples - overlap_samples)

    # A clip shorter than one window still yields a single (short) chunk
    # instead of an empty result.
    num_chunks = max(
        1,
        int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1,
    )

    for i in range(num_chunks):
        start = i * hop_samples
        # NOTE(review): the final window may be shorter than chunk_samples;
        # resize() below stretches it to target_shape.
        chunk = audio_data[start:start + chunk_samples]

        mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
        # Add a trailing channel axis so the image resize accepts the array.
        mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
        data.append(mel_spectrogram)
    return np.array(data)


In [36]:
# Single clip used to demonstrate inference. The path is relative, so the file
# must sit in the notebook's working directory.
file_path = "disco.00000.wav"
wavFile = loadWav(file_path)

In [38]:
# One row of class scores per chunk of the clip.
predictLabelValue = model.predict(wavFile)
predictLabelValue

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step


array([[3.18232707e-09, 3.64241498e-10, 1.02907431e-11, 9.99988317e-01,
        3.17497353e-07, 1.03109897e-13, 2.77124573e-10, 3.73905868e-06,
        6.65229436e-08, 7.64831566e-06],
       [1.83273817e-12, 2.01939558e-15, 8.55216387e-15, 9.99959350e-01,
        6.06243875e-11, 2.20752916e-18, 5.44586239e-15, 4.06248255e-05,
        3.12210169e-09, 1.12317622e-08],
       [5.18988932e-12, 2.71263640e-14, 5.80124436e-13, 9.99881148e-01,
        7.17737259e-11, 2.21978715e-17, 1.57074417e-14, 1.17583884e-04,
        1.29404464e-06, 6.44309495e-09],
       [1.00119401e-11, 1.09341679e-14, 6.76392603e-14, 9.99882698e-01,
        2.39926828e-10, 9.63920103e-18, 2.64635251e-14, 1.17345575e-04,
        1.16652652e-08, 3.69633857e-09],
       [5.70657514e-11, 2.24288931e-13, 5.66285191e-13, 9.99883413e-01,
        2.90195423e-10, 1.01236412e-16, 1.12683442e-13, 1.16141578e-04,
        6.99620060e-08, 3.52590689e-07],
       [5.37056294e-13, 4.04552321e-14, 4.27450745e-13, 9.99996781e-01,
   

In [39]:
def predictLabel(predictLabelValue, labels=None):
    """Map rows of class scores to class names.

    Args:
        predictLabelValue: 2-D array-like with one row of scores per sample and
            one column per class. A single 1-D score vector is also accepted
            and treated as one row.
        labels: Optional sequence of class names indexed by column. Defaults to
            the ten genre names, in the same order as the notebook's
            ``classes`` list.

    Returns:
        List with the name of the highest-scoring class for each row; an empty
        list when the input holds no scores.
    """
    if labels is None:
        labels = [
            "blues","classical","country","disco","hiphop","jazz","metal","pop","reggae","rock"
        ]

    scores = np.atleast_2d(np.asarray(predictLabelValue))
    if scores.size == 0:
        return []

    # np.argmax keeps the first index when several classes tie for the maximum.
    return [labels[idx] for idx in np.argmax(scores, axis=1)]

In [40]:
# Genre name for each chunk of the clip.
# NOTE(review): predictions are per chunk; a single clip-level label would need
# an aggregation step such as a majority vote.
predictLabels = predictLabel(predictLabelValue)
predictLabels

['disco',
 'disco',
 'disco',
 'disco',
 'disco',
 'disco',
 'disco',
 'disco',
 'disco',
 'rock',
 'disco',
 'disco',
 'disco',
 'disco',
 'disco']