# GTZAN music genre classification using mel-spectrogram images

In [None]:
#Importing the libraries 

import os
import numpy as np
import pandas as pd
import random
import shutil

import librosa
import matplotlib.pyplot as plt

# TF_CPP_MIN_LOG_LEVEL has to be in the environment before TensorFlow is
# imported; setting it afterwards does not silence the native start-up logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

print(tf.__version__)

# How a mel spectrogram is generated using librosa
> (Important concepts I learnt here: n_mels, the short-time Fourier transform, and the vmin/vmax range used to display the most important sounds)

In [None]:
# Librosa walkthrough: how a mel spectrogram is produced from a raw waveform.

# Explicit submodule import; older librosa releases do not expose
# `librosa.display` through a plain `import librosa`.
import librosa.display

audio_file = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/classical/classical.00007.wav'

y, sr = librosa.load(audio_file)
print(y.shape)   # number of audio samples in the waveform
print(sr)        # sampling rate (samples per second)

# Short-time Fourier transform: complex-valued (frequency bins, frames) array.
D = librosa.stft(y)
print(D.shape)

# melspectrogram(S=...) expects a *power* spectrogram. Feeding it the complex
# STFT would let the mel filters sum complex values (phases cancel), so the
# magnitude is squared first.
S = librosa.feature.melspectrogram(S=np.abs(D) ** 2, sr=sr, n_mels=256, fmax=8000)

# n_mels=256 divides the spectrum (from 0 Hz to fmax) into 256 non-linear
# frequency bins: more resolution at low frequencies and less at high
# frequencies, similar to human hearing.

print(S.shape)
# Convert power to decibels for visualisation. ref=np.max makes 0 dB the
# loudest bin; every other value is negative, relative to that peak.
S_db = librosa.power_to_db(S, ref=np.max)
print(S_db.shape)

plt.figure(figsize=(6, 4))
# fmax is passed so the mel axis labels match the filter bank built above.
# vmin/vmax restrict the colour range to the loudest 25 dB; lower vmin to
# reveal quieter content.
librosa.display.specshow(S_db, x_axis='time', y_axis='mel', sr=sr, fmax=8000, vmin=-25, vmax=0)

plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.show()

# Viewing the mel-spectrogram images of various genres
> TensorFlow libraries used here:
> * tf.io
> * tf.image

In [None]:
# Root folder holding one sub-folder of mel-spectrogram PNGs per genre.
folder_path = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original'

# The spatial structure of a mel spectrogram can be picked up by CNNs: each
# genre has a different spectrogram pattern, which the model captures and can
# therefore use to differentiate between music types.

total_genres = 10

# One sample image per displayed genre, keyed by the capitalised genre name.
# Each path has the form <folder_path>/<genre>/<genre><5-digit index>.png.
dictionary = {
    genre.capitalize(): f"{folder_path}/{genre}/{genre}{index:05d}.png"
    for genre, index in (
        ("hiphop", 1),
        ("metal", 9),
        ("classical", 7),
        ("rock", 10),
        ("blues", 48),
    )
}

def printing_melspecs(file, delta=0.2):
    """Load a PNG mel-spectrogram image, brighten it and display it.

    Args:
        file: Path to the PNG image to show.
        delta: Value added to every pixel, on the [0, 1] float scale, to
            brighten the image. Defaults to 0.2.
    """
    raw = tf.io.read_file(file)
    image = tf.io.decode_png(raw, channels=3)  # decode as 3-channel RGB

    print(image.shape)

    # Converting to float32 rescales the integer pixels into [0, 1], which is
    # the scale on which `delta` is applied.
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.adjust_brightness(image, delta=delta)
    # Brightening can push pixels above 1.0; clip explicitly so imshow gets a
    # valid float RGB range instead of clipping with a warning.
    image = tf.clip_by_value(image, 0.0, 1.0)

    image_arr = image.numpy()  # to numpy for visualisation

    plt.figure(figsize=(6, 4))
    # No cmap argument: matplotlib ignores colormaps for RGB input.
    plt.imshow(image_arr, interpolation='bicubic')

    plt.axis('off')
    plt.title('Melspectogram')
    plt.show()

# Show one sample spectrogram per genre, preceded by the genre label.
for genre_label, image_path in dictionary.items():
    print(genre_label)
    printing_melspecs(image_path)

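>Notice the difference between the mel spectrograms of different genres.
The X axis is time and the Y axis is frequency; the colour represents the amplitude (loudness) at each frequency, while pitch corresponds to the position on the Y axis.
The frequency axis uses the Mel scale, which is similar to how humans perceive audio, so each image shows how the frequency content changes over time.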

# Splitting the images from the folder into train, val and test sets

In [None]:
# Output locations for the three dataset splits, all under the Kaggle
# working directory.
base_dir = '/kaggle/working/'
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'val', 'test')
)

# Create the split folders up front; exist_ok keeps re-runs from failing.
for _split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(_split_dir, exist_ok=True)

# Fraction of each genre's images assigned to every split.
split_ratio = dict(train=0.7, val=0.15, test=0.15)

In [None]:
# Copy every genre's images into train/val/test sub-folders.
#
# The split is made reproducible with a sorted listing and a seeded RNG.
# Files are copied without clearing the destinations, so a different shuffle
# on a re-run would leave earlier copies behind and the same image could end
# up in more than one split (train/test leakage).
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

for genre in sorted(os.listdir(folder_path)):
    genre_folder = os.path.join(folder_path, genre)
    if not os.path.isdir(genre_folder):
        continue

    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(os.path.join(split_dir, genre), exist_ok=True)

    # os.listdir returns entries in arbitrary order, so sort before shuffling
    # to get the same permutation on every run; skip anything that is not a
    # regular file.
    images = sorted(
        name for name in os.listdir(genre_folder)
        if os.path.isfile(os.path.join(genre_folder, name))
    )
    split_rng.shuffle(images)

    total_images = len(images)
    train_size = int(total_images * split_ratio['train'])
    val_size = int(total_images * split_ratio['val'])

    # The test split takes whatever remains after train and val, so no image
    # is dropped by the integer truncation above.
    train_images = images[:train_size]
    val_images = images[train_size:train_size + val_size]
    test_images = images[train_size + val_size:]

    for split_dir, split_images in (
        (train_dir, train_images),
        (val_dir, val_images),
        (test_dir, test_images),
    ):
        for image in split_images:
            shutil.copy(os.path.join(genre_folder, image), os.path.join(split_dir, genre, image))

print("Dataset split completed!")

# Building a Sequential Keras model with conv layers, dropout, batch normalisation, etc.
> Padding/cropping/resizing is needed because the images have different dimensions; Conv2D expects a consistent input tensor shape.