In [51]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [52]:
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    precision_score, 
    recall_score,
)
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from sklearn.neighbors import KNeighborsClassifier


In [53]:
df = pd.read_csv("../../our_analyses/dataset_prepared.csv")
df_test= pd.read_csv("../../our_analyses/dataset_test_prepared.csv")

In [54]:
def apply_pattern_rules(df):
    """
    Apply the pattern rules to the DataFrame and filter out rows that do not conform to these rules.
    """
    # World/Commercial Pop
    df_wc_pop = df[(df['genre_group'] == 3) & 
                   (df['danceability'] > 0.695) & (df['danceability'] <= 0.98) & 
                   (df['energy'] > 0.709) & (df['energy'] <= 0.884)]

    # Electronic/Dance
    df_electronic_dance_1 = df[(df['genre_group'] == 0) & 
                               (df['valence'] > -0.001) & (df['valence'] <= 0.196) & 
                               (df['popularity'] > -0.001) & (df['popularity'] <= 14.0)]
    df_electronic_dance_2 = df[(df['genre_group'] == 0) & 
                               (df['loudness'] > -49.532) & (df['loudness'] <= -10.636) & 
                               (df['popularity'] > -0.001) & (df['popularity'] <= 14.0)]

    # Other
    df_other_1 = df[(df['genre_group'] == 4) & 
                    (df['energy'] > -0.001) & (df['energy'] <= 0.48) & 
                    (df['acousticness'] > 0.573) & (df['acousticness'] <= 0.996) & 
                    (df['loudness'] > -49.532) & (df['loudness'] <= -10.636)]
    df_other_2 = df[(df['genre_group'] == 4) & 
                    (df['acousticness'] > 0.573) & (df['acousticness'] <= 0.996) & 
                    (df['energy'] > -0.001) & (df['energy'] <= 0.48)]

    # Country/Folk/Pop
    df_country_folk_pop_1 = df[(df['genre_group'] == 2) & 
                               (df['energy'] > 0.884) & (df['energy'] <= 1.0) & 
                               (df['loudness'] > -5.101) & (df['loudness'] <= 3.156) & 
                               (df['instrumentalness'] > -0.001) & (df['instrumentalness'] <= 0.00313)]
    df_country_folk_pop_2 = df[(df['genre_group'] == 2) & 
                               (df['loudness'] > -5.101) & (df['loudness'] <= 3.156) & 
                               (df['energy'] > 0.884) & (df['energy'] <= 1.0)]

    # Metal/Rock
    df_metal_rock_1 = df[(df['genre_group'] == 1) & 
                         (df['popularity'] > 14.0) & (df['popularity'] <= 24.0) & 
                         (df['acousticness'] > -0.001) & (df['acousticness'] <= 0.00974)]
    df_metal_rock_2 = df[(df['genre_group'] == 1) & 
                         (df['energy'] > 0.884) & (df['energy'] <= 1.0) & 
                         (df['acousticness'] > -0.001) & (df['acousticness'] <= 0.00974)]

    # Combine all filtered DataFrames
    df_filtered = pd.concat([df_wc_pop, df_electronic_dance_1, df_electronic_dance_2, 
                             df_other_1, df_other_2, df_country_folk_pop_1, 
                             df_country_folk_pop_2, df_metal_rock_1, df_metal_rock_2])

    return df_filtered
from sklearn.preprocessing import LabelEncoder, StandardScaler


df = df.drop(['name', 'artists', 'album_name'], axis=1)
df_test = df_test.drop(['name', 'artists', 'album_name'], axis=1)

genre_groups = {
    'idm': 0, 'iranian': 0, 'study': 0,  # Electronic/Dance
    'black-metal': 1, 'breakbeat': 1, 'techno': 1,  # Metal/Rock
    'brazil': 2, 'forro': 2, 'happy': 2, 'spanish': 2, 'j-idol': 2,  # Country/Folk/Pop
    'afrobeat': 3, 'chicago-house': 3, 'industrial': 3, 'j-dance': 3,  # World/Commercial Pop
    'bluegrass': 4, 'disney': 4, 'indian': 4, 'mandopop': 4, 'sleep': 4  # Other
}

df['genre_group'] = df['genre'].map(genre_groups).astype(int)
df_test['genre_group'] = df_test['genre'].map(genre_groups).astype(int)

le = LabelEncoder()
df['explicit'] = le.fit_transform(df['explicit'])
df_test['explicit'] = le.transform(df_test['explicit'])

# Apply the pattern rules to the datasets
df= apply_pattern_rules(df)
df_test = apply_pattern_rules(df_test)

df.head()

Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,n_beats,genre,genre_group
3,238373,0,20,0.733,0.862,7,-5.813,1,0.0604,0.287,0.000532,0.466,0.745,107.981,4.0,427.0,afrobeat,3
7,347772,0,18,0.774,0.719,10,-8.976,0,0.0486,0.0181,0.128,0.0781,0.89,132.025,4.0,763.0,afrobeat,3
8,219746,0,31,0.762,0.858,9,-4.589,0,0.0313,0.321,0.0629,0.276,0.836,126.005,4.0,458.0,afrobeat,3
17,257693,1,20,0.759,0.78,0,-8.007,1,0.0385,0.0024,0.0506,0.321,0.51,125.05,4.0,530.0,afrobeat,3
26,175856,0,20,0.787,0.726,7,-5.637,1,0.164,0.0719,0.00313,0.087,0.407,107.024,4.0,303.0,afrobeat,3


In [55]:
df.head()

Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,n_beats,genre,genre_group
3,238373,0,20,0.733,0.862,7,-5.813,1,0.0604,0.287,0.000532,0.466,0.745,107.981,4.0,427.0,afrobeat,3
7,347772,0,18,0.774,0.719,10,-8.976,0,0.0486,0.0181,0.128,0.0781,0.89,132.025,4.0,763.0,afrobeat,3
8,219746,0,31,0.762,0.858,9,-4.589,0,0.0313,0.321,0.0629,0.276,0.836,126.005,4.0,458.0,afrobeat,3
17,257693,1,20,0.759,0.78,0,-8.007,1,0.0385,0.0024,0.0506,0.321,0.51,125.05,4.0,530.0,afrobeat,3
26,175856,0,20,0.787,0.726,7,-5.637,1,0.164,0.0719,0.00313,0.087,0.407,107.024,4.0,303.0,afrobeat,3


In [56]:
# Separazione delle features e dei target
X_train = df.drop(['genre_group', 'genre'], axis=1)
y_train = df['genre_group'].values
X_test = df_test.drop(['genre_group', 'genre'], axis=1)
y_test = df_test['genre_group'].values

y = np.array(df['genre_group'])

# Rimozione delle colonne 'genre_group' e 'genre' per ottenere le feature
X = df.drop(['genre_group', 'genre'], axis=1)

y_test = np.array(df['genre_group'])

X_test = df.drop(['genre_group', 'genre'], axis=1)


In [57]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

In [58]:
from sklearn.neighbors import KNeighborsClassifier

# Addestramento del classificatore KNN
clf = KNeighborsClassifier(n_neighbors=8, metric="cityblock", weights="uniform")
clf.fit(X_train_norm, y_train)

# Valutazione del classificatore sul set di addestramento
y_train_pred = clf.predict(X_train_norm)
print("Accuracy sul set di addestramento:", accuracy_score(y_train, y_train_pred))

# Valutazione del classificatore sul set di test
y_test_pred = clf.predict(X_test_norm)
print("Accuracy sul set di test:", accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))



Accuracy sul set di addestramento: 0.9081101190476191
Accuracy sul set di test: 0.9081101190476191
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1580
           1       0.85      0.87      0.86      1486
           2       0.87      0.94      0.91      1822
           3       0.90      0.74      0.81       505
           4       0.97      0.95      0.96      2671

    accuracy                           0.91      8064
   macro avg       0.90      0.88      0.89      8064
weighted avg       0.91      0.91      0.91      8064



In [59]:
import plotly.graph_objects as go
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, average_precision_score

# Binarizza le etichette in un formato one-vs-rest e calcola i punteggi di probabilità per le classi
Y = label_binarize(y_test, classes=np.unique(y_test))
y_scores = clf.predict_proba(X_test_norm)

# Inizializza dizionari per precision e recall
precision = dict()
recall = dict()
average_precision = dict()

# Calcola precision e recall per ogni classe
for i in range(len(np.unique(y_test))):
    precision[i], recall[i], _ = precision_recall_curve(Y[:, i], y_scores[:, i])
    average_precision[i] = average_precision_score(Y[:, i], y_scores[:, i])

# Crea un grafico Plotly
fig = go.Figure()

# Aggiunge una linea per ogni classe
for i, genre in enumerate(np.unique(y_test)):
    fig.add_trace(go.Scatter(x=recall[i], y=precision[i], mode='lines', name=f'Class {i} (AP={average_precision[i]:0.2f})'))

# Aggiorna il layout
fig.update_layout(
    title="Precision-Recall curve",
    xaxis_title="Recall",
     width=1000,  # Larghezza del plot
    height=800,  
    yaxis_title="Precision",
    legend_title="Classes"
)

# Mostra il grafico
fig.show()


In [60]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

# Assumiamo che 'y_test' e 'y_test_pred' siano già definiti e contengano le etichette vere e predette dal tuo modello

# Genera la matrice di confusione
cf = confusion_matrix(y_test, y_test_pred)

# Definisci le etichette per i generi in base ai raggruppamenti definiti
genre_labels = ['Dance/Electronic', 'Ambient/Relaxing', 'Global/Traditional', 'Metal/Industrial', 'Pop/World']

# Crea la heatmap usando plotly
fig = ff.create_annotated_heatmap(z=cf, x=genre_labels, y=genre_labels,
                                  annotation_text=cf.astype(str), colorscale='Greens')

# Aggiorna il layout
fig.update_layout(title_text='Confusion Matrix', title_x=0.5,
                  xaxis=dict(title='Predicted Labels', tickangle=45),
                  yaxis=dict(title='True Labels', tickmode='array', tickvals=list(range(len(genre_labels))), ticktext=genre_labels),
                  yaxis_autorange='reversed')  # Reverse the y-axis to have the first class at the top

# Mostra il grafico
fig.show()


In [61]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# Calcola ROC curve e AUC per ogni classe
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(np.unique(y_test))):
    fpr[i], tpr[i], _ = roc_curve(Y[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Crea un grafico Plotly
fig = go.Figure()

# Aggiunge una linea per ogni classe
for i, genre in enumerate(np.unique(y_test)):
    fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines', name=f'Class {i} (AUC={roc_auc[i]:0.2f})'))

# Aggiunge la linea diagonale per il random guessing
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Guessing', line=dict(dash='dash')))

# Aggiorna il layout
fig.update_layout(
    title="ROC curve per classe",
    xaxis_title="False Positive Rate",
     width=1000,  # Larghezza del plot
    height=800,  
    yaxis_title="True Positive Rate",
    legend_title="Classes"
)

# Mostra il grafico
fig.show()
