# Music Genre Classifier
--- 
## Imports:

In [None]:
# Usual Libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import os
from IPython.display import Audio, display

# Librosa - for audio and music processing
import librosa
import librosa.display
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')

print("Succesfully imported libraries")

Succesfully imported libraries


In [None]:
# Fetch the GTZAN dataset via kagglehub; `path` is the dataset root that
# all later cells build their file paths from.
import kagglehub

path = kagglehub.dataset_download(
    "andradaolteanu/gtzan-dataset-music-genre-classification"
)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\janmi\.cache\kagglehub\datasets\andradaolteanu\gtzan-dataset-music-genre-classification\versions\1


: 

### The Data Folder contains:

* **genres original folder** (collection of 10 genres - each with 100 thirty second long audio files)
* **images original folder** (visual representation for each audio file)
* **features_30_seconds.csv file**  (containing features of the audio files)
* **features_3_seconds.csv file**   (same structure - but this time the songs were split into 3-second parts -> More Data) 

In [None]:
# List the contents of the dataset's Data folder. os.path.join avoids the
# invalid '\D' escape of the old f-string and works on any operating system.
print(os.listdir(os.path.join(path, 'Data')))

['features_30_sec.csv', 'features_3_sec.csv', 'genres_original', 'images_original']


: 

: 

---

## Explore Audio Data

In [None]:
# Load one example track and inspect the raw signal.
file_path = os.path.join(path, 'Data', 'genres_original', 'blues', 'blues.00014.wav')

# y is the sample array, sr the sample rate in Hz returned by librosa.
y, sr = librosa.load(file_path)

# librosa.load raises if the file is missing, so no separate existence
# check is needed before showing the inline player.
display(Audio(file_path))

print('y: ', y, '\n')
print('y shape: ', np.shape(y), '\n')
print('Sample Rate (Hz): ', sr, '\n')

# Duration comes from the actual number of samples, not a hard-coded count.
print("Length of Audio(samples/samplerate): ", len(y)/sr, "sec")

: 

: 

: 

In [None]:
# Strip leading and trailing silence. trim() returns (trimmed_signal, index);
# only the trimmed signal is kept.
audio_file = librosa.effects.trim(y)[0]

print("Audio File: ", audio_file, "\n")
print("Audio File shape: ", audio_file.shape)

: 

: 

: 

## 2D Representation:

In [None]:
# Amplitude-over-time view of the trimmed signal.
fig, ax = plt.subplots(figsize=(16, 6))
librosa.display.waveshow(y=audio_file, sr=sr, color="#A300F0", ax=ax);
ax.set_title("Sound waves in Blues.00014.wav", fontsize=23)

: 

: 

: 

### Fourier Transform:

* integral transform that takes a signal function as input and outputs another function that describes the extent to which various frequencies are present in the input function

* In the resulting spectrogram, the y-axis (frequency) is converted to a log scale and the "color" axis (amplitude) shows how strongly each frequency is present

In [None]:
# STFT parameters: window size and step between successive windows (in samples).
n_fft = 2048
hop_length = 512

# Short-time Fourier transform; keep only the magnitude of the complex result.
stft_complex = librosa.stft(audio_file, n_fft=n_fft, hop_length=hop_length)
D = np.abs(stft_complex)

print("Shape of D object: ", D.shape)



: 

: 

: 

In [None]:
# Raw magnitude plot: matplotlib draws one line per column of D.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(D);

: 

: 

: 

### FFT Spectrogram:

In [None]:
# Express the magnitude spectrogram in decibels relative to its peak value.
DB = librosa.amplitude_to_db(D, ref=np.max)

fig, ax = plt.subplots(figsize=(16, 6))
spec_img = librosa.display.specshow(
    DB,
    sr=sr,
    hop_length=hop_length,
    x_axis='time',
    y_axis='log',
    cmap='cool',
    ax=ax,
)
ax.set_title('Spectogram der FFT', fontsize=20)
fig.colorbar(spec_img, ax=ax);
plt.show()

: 

: 

: 

### Mel Spectrogram

In [None]:
# Trim silence, then compute the mel-scaled spectrogram.
y_trim, _ = librosa.effects.trim(y)

# Pass hop_length explicitly so the computation and the time axis of the
# plot below are guaranteed to use the same frame step.
S = librosa.feature.melspectrogram(y=y_trim, sr=sr, hop_length=hop_length)

# melspectrogram returns a *power* spectrogram by default, so the matching
# decibel conversion is power_to_db (amplitude_to_db would square it again).
S_DB = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(16,6))
# y_axis='mel' labels the rows as mel-frequency bins rather than linear FFT bins.
librosa.display.specshow(S_DB, y_axis='mel', sr=sr, hop_length=hop_length, x_axis='time', cmap='cool');
plt.colorbar();
plt.title('Mel Spectogram', fontsize=20)

: 

: 

: 

## Audio Features

### Zero Crossing Rate

* the rate at which the signal changes between positive and negative

In [None]:
# Boolean array marking the samples where the signal changes sign;
# summing it counts the crossings.
zero_crossings = librosa.zero_crossings(audio_file, pad=False)
print("Zero Crossings: ", zero_crossings.sum())

: 

: 

: 

### Harmonic and Percussive Components

In [None]:
# Harmonic-percussive source separation. hpss() returns the harmonic part
# first and the percussive part second; the second variable keeps its
# original (misspelled) name so other cells that may use it still work.
y_harmonic, y_perceptrual = librosa.effects.hpss(audio_file)

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(y_harmonic, color="#00B1C9");
ax.plot(y_perceptrual, color="#CF1302")

: 

: 

: 

### BPM (Tempo)

In [None]:
# beat_track returns (tempo, beat_frames); only the tempo estimate is used.
# Note that the untrimmed signal `y` is analysed here.
tempo = librosa.beat.beat_track(y=y, sr=sr)[0]
print("Tempo in BPM:", tempo)

: 

: 

: 

### Spectral Centroid

* a measure used in digital signal processing to characterise a spectrum
* indicates where the center of mass of the spectrum is located 
* perceptually, it has a robust connection with the impression of brightness of a sound

In [None]:
# Spectral centroid per frame; [0] selects the first (only) row of the result.
spectral_centroids = librosa.feature.spectral_centroid(y=audio_file, sr=sr)[0]

print('Centroids: ', spectral_centroids, '\n')
print('Shape of Spectral Centroids: ', spectral_centroids.shape, '\n')

# Computing the time variable for visualization
frames = range(len(spectral_centroids))

# Convert frame indices to seconds. Passing sr explicitly keeps the time
# axis correct even if the audio was loaded at a non-default sample rate.
t = librosa.frames_to_time(frames, sr=sr)

print('frames: ', frames, '\n')
print('t: ', t)

def normalize(x, axis=0):
    """Rescale `x` to the [0, 1] range along `axis` using min-max scaling.

    Parameters:
        x: array-like data to rescale.
        axis: axis along which minimum and maximum are taken (default 0).

    Returns:
        The rescaled array as produced by sklearn's `minmax_scale`.
    """
    # Import the function explicitly: `import sklearn` alone does not
    # guarantee that the `sklearn.preprocessing` submodule is loaded.
    from sklearn.preprocessing import minmax_scale

    return minmax_scale(x, axis=axis)

: 

: 

: 

In [None]:
# plt.figure (lower-case) creates and activates the figure; plt.Figure only
# builds a detached object, so the requested size was silently ignored.
plt.figure(figsize=(16,6))
# Waveform in the background, normalized spectral centroid on top.
librosa.display.waveshow(audio_file, sr=sr, alpha=0.4, color = '#00B1C9');
plt.plot(t, normalize(spectral_centroids), color='#CF1302' );


: 

: 

: 

### Spectral Rolloff

* The spectral roll-off point is defined as the frequency below which a specified proportion of the total energy of the spectrum is contained

In [None]:
# Roll-off frequency per frame; [0] selects the first (only) row of the result.
spectral_rollof = librosa.feature.spectral_rolloff(y=audio_file, sr=sr)[0]

# Overlay the normalized roll-off curve on the waveform, reusing the time
# axis `t` computed for the spectral centroid.
fig, ax = plt.subplots(figsize=(16, 6))
librosa.display.waveshow(audio_file, sr=sr, alpha=0.4, color='#00B1C9', ax=ax)
ax.plot(t, normalize(spectral_rollof), color='#CF1302')

: 

: 

: 

### Mel-Frequency Cepstral Coefficients (MFCCs):

* The Mel frequency cepstral coefficients (MFCCs) of a signal are a small set of features (usually about 10–20) which concisely describe the overall shape of a spectral envelope. They model the characteristics of the human voice.

In [None]:
# MFCC matrix of the trimmed signal.
mfccs = librosa.feature.mfcc(y=audio_file, sr=sr)
print('mfccs shape: ', mfccs.shape)

fig, ax = plt.subplots(figsize=(16, 6))
librosa.display.specshow(mfccs, sr=sr, x_axis='time', cmap='cool', ax=ax);

: 

: 

: 

#### ... scale data

In [None]:
# Standardize the MFCC matrix along axis 1, then print the overall
# mean and variance of the result.
mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
print('Mean: ', mfccs.mean(), '\n')
print('Var: ', mfccs.var())

fig, ax = plt.subplots(figsize=(16, 6))
librosa.display.specshow(mfccs, sr=sr, x_axis='time', cmap='cool', ax=ax);

: 

: 

: 

### Chroma STFT / Chroma Frequencies

* Chroma-STFT is a method of calculating chroma features from an audio signal
* Short-Term-Fourier-Transform is applied
* Data is assigned to the 12 pitches (C-B)


In [None]:
# Larger hop_length -> fewer, coarser frames in the chromagram.
# NOTE(review): this overwrites the notebook-wide `hop_length` (512 in the STFT
# cell); re-running earlier spectrogram cells afterwards will pick up 5000.
hop_length = 5000

chromagram = librosa.feature.chroma_stft(
    y=audio_file,
    sr=sr,
    hop_length=hop_length,
)
print('Chromagra shape: ', chromagram.shape)

fig, ax = plt.subplots(figsize=(16, 6))
librosa.display.specshow(
    chromagram,
    x_axis='time',
    y_axis='chroma',
    hop_length=hop_length,
    cmap='coolwarm',
    ax=ax,
);

: 

: 

: 

---
# Exploratory Data Analysis (EDA)

EDA is going to be performed on the features_30_sec.csv. This file contains the mean and variance, for each audio file, of the features analysed above.

So, the table has a final total of 1000 rows (10 genres x 100 audio files) and 60 features (dimensions).

In [None]:
# Load the per-track feature table; os.path.join keeps the path valid on
# every operating system (the old hard-coded backslashes only worked on Windows).
data = pd.read_csv(os.path.join(path, 'Data', 'features_30_sec.csv'))
data.head(10)

: 

: 

: 

## Correlation Heatmap for feature means

In [None]:
# Correlate only the columns whose name contains 'mean'.
spike_cols = [col for col in data.columns if 'mean' in col]
corr = data[spike_cols].corr()

# Mask the upper triangle so each pair is drawn once. The `np.bool` alias
# was removed in NumPy 1.24; the builtin `bool` works on every version.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up matplotlib figure
f, ax = plt.subplots(figsize=(16,11))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(0, 25, as_cmap=True, s=90, l=45, n=45)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={'shrink': .5}, ax=ax)

ax.set_title('Mean Variable Correlation Heatmap', fontsize = 23)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.savefig('corr heatmap.jpg')


: 

: 

: 

## Box Plot for Genre Distributions

In [None]:
# Keep only the two columns the plot needs: genre label and tempo.
x = data[['label', 'tempo']]

f, ax = plt.subplots(figsize=(16,9));
sns.boxplot(data=x, x='label', y='tempo', palette='husl', ax=ax);

ax.set_title('BPM Boxplot for Genres', fontsize=23)
ax.set_xlabel('Genre', fontsize=15)
ax.set_ylabel('Tempo(BPM)', fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=10)
plt.savefig('BPM boxplot.jpg')

: 

: 

: 

## Principal Component Analysis (PCA)

1. Normalization

2. PCA

3. The Scatter Plot

* an unsupervised linear-algebra technique used to reduce the dimensionality of a data set while preserving as much information (variance) as possible

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

# Drop the 'filename' ID column by name. Unlike a positional
# `iloc[:, 1:]`, this is idempotent: re-running the cell does not strip
# a further (real) feature column.
data = data.drop(columns=['filename'], errors='ignore')
y = data['label']
X = data.drop(columns='label')

# 1. Normalize X (MinMaxScaler)
cols = X.columns                                # save names of feature columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X)     # learn the per-column scale factors and apply them
X = pd.DataFrame(np_scaled, columns=cols)       # back to a DataFrame with the original column names

# 2. PCA Components
pca = PCA(n_components=2)
# fit -> learn the two principal directions; transform -> project X onto them
principal_components = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principal_components,
                           columns=['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, y], axis=1)

# Proportion of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)

# Sum the *ratios* (the same quantity printed above) and avoid naming the
# result `sum`, which would shadow the builtin for the rest of the notebook.
total_explained_ratio = pca.explained_variance_ratio_.sum()
print(f'Total variance is the sum of the two: {total_explained_ratio}')


: 

: 

: 

In [None]:
# 3. Scatter Plot: the two principal components, coloured by genre label.

fig, ax = plt.subplots(figsize=(16, 9))
sns.scatterplot(
    data=finalDf,
    x='principal component 1',
    y='principal component 2',
    hue='label',
    alpha=0.7,
    s=100,
    ax=ax,
);
ax.set_title('PCA on genres', fontsize=23)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=10)
plt.savefig('PCA scattert.jpg')


: 

: 

: 

---
# Machine Learning Classification

To build a classifier that accurately predicts the genres, we are going to use the `features_3_sec.csv` file.

### Libraries

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance
import os
import joblib
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
print('Imported successfully')

: 

: 

: 

In [None]:
# Read the 3-second feature CSV into a DataFrame; os.path.join keeps the
# path portable (the hard-coded backslashes only worked on Windows).
data = pd.read_csv(os.path.join(path, 'Data', 'features_3_sec.csv'))

# Drop the first column; the rest is kept for modelling.
data = data.iloc[:, 1:]

data.head()

: 

: 

: 

In [None]:
# Target: genre label. Features: every other column.
y = data['label']
X = data.drop(columns='label')

# Min-max scale every feature column, then restore the column names.
# NOTE(review): the scaler is fit on all rows before the train/test split in
# the next cell, so test-set minima/maxima influence the training features.
cols = X.columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X)
X = pd.DataFrame(np_scaled, columns=cols)

: 

: 

: 

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

: 

: 

: 

### Creating a predefined function to assess the accuracy of a model

In [None]:
def model_assess(model, title='Default'):
    """Fit `model` on the notebook's training split and print its test accuracy.

    Reads the globals X_train, y_train, X_test and y_test.

    Parameters:
        model: estimator exposing `fit` and `predict`.
        title: label used in the printed accuracy line.
    """
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print('Accuracy', title, ':', round(test_accuracy, 4), '\n')

: 

: 

: 

In [None]:
def xgb_model_assess(model, title='Default'):
    """Fit an XGBoost-style `model` on integer-encoded labels and print test accuracy.

    Reads the globals X_train, y_train, X_test and y_test. The string genre
    labels are converted to integer codes with a LabelEncoder before fitting.

    Parameters:
        model: estimator exposing `fit` and `predict` that expects integer labels.
        title: label used in the printed accuracy line.
    """
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    # Reuse the mapping learned from the training labels. Refitting on the
    # test labels could assign different integers to the same genre (e.g. if
    # a class is absent from the test split) and corrupt the accuracy.
    y_test_encoded = le.transform(y_test)

    model.fit(X_train, y_train_encoded)
    preds = model.predict(X_test)
    print('Accuracy', title, ':', round(accuracy_score(y_test_encoded, preds), 4), '\n')
    

: 

: 

: 

### Trying ten classification models to assess their performance:

In [None]:
# File the trained XGBoost model is saved to; its presence is used as a
# "training already done" marker so the expensive comparison is skipped.
model_file_name='genre_classifier_model.joblib'

if os.path.exists(model_file_name):
    print('Model already trained...')

else:
    # Naive Bayes
    nb = GaussianNB()
    model_assess(nb, 'Naive Bayes')

    # Stochastic Gradient Descent
    sgd = SGDClassifier(max_iter=5000, random_state=0)
    model_assess(sgd, 'Stochastic Gradient Descent')

    # K-Nearest Neighbour
    knn = KNeighborsClassifier(n_neighbors=8)
    model_assess(knn, 'K-Nearest Neighbour')

    # Decision Trees
    tree = DecisionTreeClassifier()
    model_assess(tree, 'Decission Trees')

    # Random Forest
    rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
    model_assess(rforest, 'Random Forest')

    # Support Vector Machine
    svm = SVC(decision_function_shape='ovo')
    model_assess(svm, 'Support Vector Machine')

    # Logistic Regression. With the lbfgs solver a multiclass problem is
    # already fit as multinomial, so the deprecated `multi_class` argument
    # is not passed.
    lg = LogisticRegression(random_state=0, solver='lbfgs')
    model_assess(lg, 'Logistic Regression')

    # Neural Nets
    nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5000, 10), random_state=1)
    model_assess(nn, 'Neural Nets')

    # Cross Gradient Booster. The parameter is `eval_metric` (singular); the
    # previous `eval_metrics` spelling is not a recognised XGBoost option.
    xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05, use_label_encoder=False, eval_metric='mlogloss')
    xgb_model_assess(xgb, 'Cross Gradient Booster')

    # Cross Gradient Booster (Random Forest)
    xgbrf = XGBRFClassifier(objective='multi:softmax', use_label_encoder=False, eval_metric='mlogloss')
    xgb_model_assess(xgbrf, 'Cross Gradient Booster (Random Forest)')

    # Persist the fitted XGBoost classifier (message below is German for
    # "training finished and model saved successfully under ...").
    joblib.dump(xgb, model_file_name)
    print(f"Training abgeschlossen und MOdell erfolgreich gespeichert unter '{model_file_name}'.")

: 

: 

: 

## XGBoost has the best accuracy (~90%)

### Create Final Model:

In [None]:
# Encode the string genre labels as integers for XGBoost.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
# Apply the mapping learned from the training labels; refitting on the test
# labels could yield a different label -> integer assignment.
y_test_encoded = le.transform(y_test)

# `eval_metric` (singular) is the XGBoost parameter name.
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05, use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train_encoded)

preds = xgb.predict(X_test)

print('Accuracy', ':' , round(accuracy_score(y_test_encoded, preds), 4), '\n')

: 

: 

: 

### Confusion Matrix:

In [None]:
confusion_matr = confusion_matrix(y_test_encoded, preds)

# Take the tick labels from the fitted encoder so they always match the
# integer codes, instead of a hand-maintained genre list.
genre_names = list(le.classes_)

plt.figure(figsize=(16,9))
# fmt='d' prints the integer counts in full; the default annotation format
# switches to scientific notation (e.g. 2.9e+02) for counts of 100 or more.
sns.heatmap(confusion_matr, cmap="Blues", annot=True, fmt='d',
            xticklabels=genre_names,
            yticklabels=genre_names);
# confusion_matrix puts the true labels on rows and predictions on columns.
plt.xlabel('Predicted genre')
plt.ylabel('True genre')
plt.savefig("conf matrix")

: 

: 

: 

### Feature Importance:
... indicates which of the extracted features the trained model uses the most

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(estimator=xgb, random_state=1)
# Score against the integer-encoded labels: the model predicts integer codes,
# so comparing with the string labels in y_test always gives a score of 0
# and makes every importance appear as 0.
perm.fit(X_test, y_test_encoded)

eli5.show_weights(estimator=perm, feature_names=X_test.columns.tolist(), top=10)
# ...values are all so small so that they are rounded to 0

: 

: 

: 

# Recommender Systems
... to find the best similarity of a given vector, ranked in descending order

For audio files, this will be done with scikit-learn's `cosine_similarity` function.

In [None]:
import IPython.display as ipd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

# Index the 30-second feature table by track file name; os.path.join keeps
# the path portable across operating systems.
data = pd.read_csv(os.path.join(path, 'Data', 'features_30_sec.csv'), index_col='filename')

# Keep the genre labels separately; their index (the file names) is reused
# to label the similarity matrix.
labels = data[['label']]

# Remove the non-feature columns before computing similarities.
data = data.drop(columns=['length', 'label'])
# A bare `data.head()` in the middle of a cell shows nothing; display it explicitly.
display(data.head())

# Standardize each feature column.
data_scaled=preprocessing.scale(data)
print('type of "data_scaled":', type(data_scaled))

: 

: 

: 

## Cosine similarity

*In data analysis, cosine similarity is a measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths.*

In [None]:
# Pairwise cosine similarity between all rows of the scaled feature matrix.
similarity = cosine_similarity(data_scaled)
print('Similarity shape:', similarity.shape)

# Plain integer-labelled frame, plus a copy labelled by file name on both
# axes so tracks can be looked up by name.
sim_df_labels = pd.DataFrame(similarity)
track_names = labels.index
sim_df_names = sim_df_labels.set_index(track_names)
sim_df_names.columns = track_names

: 

: 

: 

`find_similar_songs()` is a helper function that takes the name of a song and prints the top 5 best matches for that song.

In [None]:
def find_similar_songs(name, n = 5):
    """Print the `n` tracks most similar to `name`.

    Looks up the column `name` in the global `sim_df_names` similarity
    table, ranks it from most to least similar, removes the track itself
    and prints the top `n` entries.

    Parameters:
        name: file name of the reference track (a column of `sim_df_names`).
        n: number of matches to print (default 5).
    """
    ranked = sim_df_names[name].sort_values(ascending=False).drop(name)

    print("\n*******\nSimilar songs to ", name)
    print(ranked.head(n))

: 

: 

: 

In [None]:
# Folder holding one sub-folder of .wav files per genre.
base_path = f'{path}/Data/genres_original'

# Reference track, given relative to the genre folder.
relative_track_path = os.path.join('blues', 'blues.00014.wav')

# Join both parts, then normalize any backslashes to forward slashes.
full_file_path = os.path.join(base_path, relative_track_path).replace('\\', '/')
ipd.Audio(full_file_path)

: 

: 

: 

In [None]:
# Print the five tracks ranked most similar to the blues example (default n=5).
find_similar_songs('blues.00014.wav')

: 

: 

: 

In [None]:
# Build the paths with os.path.join so they are valid on every operating
# system (the hard-coded backslashes only worked on Windows).
blues_dir = os.path.join(path, 'Data', 'genres_original', 'blues')
sim_song1_path = os.path.join(blues_dir, 'blues.00022.wav')
sim_song2_path = os.path.join(blues_dir, 'blues.00021.wav')
sim_song3_path = os.path.join(blues_dir, 'blues.00015.wav')

# One loop instead of three copy-pasted blocks; files that are missing are
# skipped, as before.
for rank, song_path in enumerate((sim_song1_path, sim_song2_path, sim_song3_path), start=1):
    if os.path.exists(song_path):
        print(f'Similar Song {rank}:')
        display(Audio(song_path))



: 

: 

: 

## Similarity example with classical song:

In [None]:
# Print the closest matches for a classical track and play the track itself.
find_similar_songs('classical.00032.wav')
# os.path.join keeps the path portable across operating systems.
display(Audio(os.path.join(path, 'Data', 'genres_original', 'classical', 'classical.00032.wav')))


: 

: 

: 

In [None]:
# Build the paths with os.path.join so they are valid on every operating
# system (the hard-coded backslashes only worked on Windows).
classical_dir = os.path.join(path, 'Data', 'genres_original', 'classical')
sim_song1_path = os.path.join(classical_dir, 'classical.00078.wav')
sim_song2_path = os.path.join(classical_dir, 'classical.00063.wav')
sim_song3_path = os.path.join(classical_dir, 'classical.00081.wav')

# One loop instead of three copy-pasted blocks; files that are missing are
# skipped, as before.
for rank, song_path in enumerate((sim_song1_path, sim_song2_path, sim_song3_path), start=1):
    if os.path.exists(song_path):
        print(f'Similar Song {rank}:')
        display(Audio(song_path))


: 

: 

: 