In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

df = pd.read_csv("../../our_analyses/dataset_prepared.csv")

from sklearn.preprocessing import LabelEncoder

df=df.drop(['name', 'artists', 'album_name', 'key', 'mode', 'time_signature'], axis=1)

genre_groups = {
    'idm': 0, 'iranian': 0, 'study': 0,  # Electronic/Dance
    'black-metal': 1, 'breakbeat': 1, 'techno': 1,  # Metal/Rock
    'brazil': 2, 'forro': 2, 'happy': 2, 'spanish': 2, 'j-idol': 2,  # Country/Folk/Pop
    'afrobeat': 3, 'chicago-house': 3, 'industrial': 3, 'j-dance': 3,  # World/Commercial Pop
    'bluegrass': 4, 'disney': 4, 'indian': 4, 'mandopop': 4, 'sleep': 4  # Other
}



# Mappatura dei generi del dataframe ai gruppi di generi
df['genre_group'] = df['genre'].map(genre_groups).astype(int)

# Stampa del mapping dei generi ai numeri corrispondenti
print(df['genre_group'].unique())

le = LabelEncoder()
df['explicit'] = le.fit_transform(df['explicit'])

from sklearn.model_selection import train_test_split

attributes = [col for col in df.columns if col != 'genre_group']
# Definizione di y come l'array delle etichette dei generi
y = np.array(df['genre_group'])

# Rimozione delle colonne 'genre_group' e 'genre' per ottenere le feature
X = df.drop(['genre_group', 'genre'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y) # we use 70% and 30% and we stratisfy to have the same distribution for the classes between them in the training and in the test set


from sklearn.preprocessing import StandardScaler

norm = StandardScaler()
norm.fit(X_train)

X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)
df.head()

[3 1 4 2 0]


Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,n_beats,genre,genre_group
0,293106,0,50,0.401,0.683,-5.722,0.0401,0.181,0.0,0.0837,0.226,135.951,654.0,afrobeat,3
1,194972,0,52,0.672,0.858,-5.233,0.145,0.456,0.811,0.104,0.963,166.689,537.0,afrobeat,3
2,178428,0,22,0.636,0.826,-7.486,0.0585,0.461,0.271,0.141,0.731,102.809,304.0,afrobeat,3
3,238373,0,20,0.733,0.862,-5.813,0.0604,0.287,0.000532,0.466,0.745,107.981,427.0,afrobeat,3
4,221893,0,22,0.712,0.225,-10.017,0.0533,0.93,0.001,0.123,0.429,87.52,317.0,afrobeat,3


In [5]:
# from sklearn.model_selection import GridSearchCV

# # Configura una nuova ricerca grid con parametri aggiuntivi o intervalli estesi
# param_grid = {
#     'max_depth': [None] + list(np.arange(5, 20)),
#     'min_samples_split': [2, 5, 10, 20, 40],
#     'min_samples_leaf': [1, 4, 10, 20, 40],
#     'criterion': ['gini', 'entropy'],
#     'max_features': [None, 'sqrt', 'log2'],
#     'min_impurity_decrease': [0, 0.1, 0.01, 0.001, 0.0001],
#     'splitter': ["best", "random"]
# }

# # Esegui la ricerca grid
# grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, n_jobs=-1, verbose=1)
# grid_search.fit(X_train_norm, y_train)

# # Valuta il miglior modello trovato sulla ricerca grid
# best_tree = grid_search.best_estimator_
# print(f'Migliori parametri trovati: {grid_search.best_params_}')
# print(f'Punteggio del miglior modello: {grid_search.best_score_}')

# # Prova ensemble learning con Random Forest
# from sklearn.ensemble import RandomForestClassifier

# # Configura e addestra il RandomForestClassifier
# rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
# rf_clf.fit(X_train_norm, y_train)

# # Valuta il RandomForestClassifier
# rf_train_acc = rf_clf.score(X_train_norm, y_train)
# rf_test_acc = rf_clf.score(X_test_norm, y_test)
# print(f'Random Forest train accuracy: {rf_train_acc}')
# print(f'Random Forest test accuracy: {rf_test_acc}')

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Definizione della griglia dei parametri
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 40, 50],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# Creazione del modello Decision Tree per il GridSearch
dt = DecisionTreeClassifier()

# Creazione del GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Addestramento usando il GridSearchCV
grid_search.fit(X_train_norm, y_train)

# Stampa i migliori parametri
print("Migliori parametri trovati:", grid_search.best_params_)

# Utilizza il miglior modello per fare previsioni sul test set
best_dt = grid_search.best_estimator_
test_accuracy = best_dt.score(X_test_norm, y_test)
print("Accuracy sul Test set:", test_accuracy)


Fitting 5 folds for each of 221760 candidates, totalling 1108800 fits


277200 fits failed out of a total of 1108800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
174136 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/francescocapria/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/francescocapria/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/francescocapria/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/francescocapria/anaconda3/lib/python3.11/site-packages/sklearn/utils/_pa

Migliori parametri trovati: {'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'min_impurity_decrease': 0.001, 'min_samples_leaf': 10, 'min_samples_split': 50, 'splitter': 'best'}
Accuracy sul Test set: 0.606


In [7]:
plt.figure(figsize=(20, 4), dpi=300)
plot_tree(dtp, feature_names=attributes, filled=True, max_depth=22)
plt.show()

NameError: name 'dtp' is not defined

<Figure size 6000x1200 with 0 Axes>

In [None]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.68      0.67      0.67       675
           1       0.57      0.52      0.54       675
           2       0.58      0.65      0.62      1125
           3       0.50      0.49      0.49       900
           4       0.71      0.68      0.69      1125

    accuracy                           0.61      4500
   macro avg       0.61      0.60      0.60      4500
weighted avg       0.61      0.61      0.61      4500



In [None]:
import plotly.graph_objects as go
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, average_precision_score

# Binarizza le etichette in un formato one-vs-rest e calcola i punteggi di probabilità per le classi
Y = label_binarize(y_test, classes=np.unique(y_test))
y_scores = dt.predict_proba(X_test_norm)

# Inizializza dizionari per precision e recall
precision = dict()
recall = dict()
average_precision = dict()

# Calcola precision e recall per ogni classe
for i in range(len(np.unique(y_test))):
    precision[i], recall[i], _ = precision_recall_curve(Y[:, i], y_scores[:, i])
    average_precision[i] = average_precision_score(Y[:, i], y_scores[:, i])

# Crea un grafico Plotly
fig = go.Figure()

# Aggiunge una linea per ogni classe
for i, genre in enumerate(np.unique(y_test)):
    fig.add_trace(go.Scatter(x=recall[i], y=precision[i], mode='lines', name=f'Class {i} (AP={average_precision[i]:0.2f})'))

# Aggiorna il layout
fig.update_layout(
    title="Precision-Recall curve",
    xaxis_title="Recall",
     width=1000,  # Larghezza del plot
    height=800,  
    yaxis_title="Precision",
    legend_title="Classes"
)

# Mostra il grafico
fig.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# Calcola ROC curve e AUC per ogni classe
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(np.unique(y_test))):
    fpr[i], tpr[i], _ = roc_curve(Y[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Crea un grafico Plotly
fig = go.Figure()

# Aggiunge una linea per ogni classe
for i, genre in enumerate(np.unique(y_test)):
    fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines', name=f'Class {i} (AUC={roc_auc[i]:0.2f})'))

# Aggiunge la linea diagonale per il random guessing
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Guessing', line=dict(dash='dash')))

# Aggiorna il layout
fig.update_layout(
    title="ROC curve per classe",
    xaxis_title="False Positive Rate",
     width=1000,  # Larghezza del plot
    height=800,  
    yaxis_title="True Positive Rate",
    legend_title="Classes"
)

# Mostra il grafico
fig.show()


In [None]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

# Assumiamo che 'y_test' e 'y_test_pred' siano già definiti e contengano le etichette vere e predette dal tuo modello

# Genera la matrice di confusione
cf = confusion_matrix(y_test, y_test_pred)

# Definisci le etichette per i generi in base ai raggruppamenti definiti
genre_labels = [' Electronic/Dance', 'Metal/Rock', 'Country/Folk/Pop', 'World/Commercial Pop', 'Other']

# Crea la heatmap usando plotly
fig = ff.create_annotated_heatmap(z=cf, x=genre_labels, y=genre_labels,
                                  annotation_text=cf.astype(str), colorscale='Greens')

# Aggiorna il layout
fig.update_layout(title_text='Confusion Matrix', title_x=0.02,
                  xaxis=dict(title='Predicted Labels', tickangle=45),
                  yaxis=dict(title='True Labels', tickmode='array', tickvals=list(range(len(genre_labels))), ticktext=genre_labels),
                  yaxis_autorange='reversed')  # Reverse the y-axis to have the first class at the top

# Mostra il grafico
fig.show()



In [None]:
# from sklearn.model_selection import cross_val_score
# import matplotlib.pyplot as plt

# # Fit a decision tree to find effective alphas
# path = DecisionTreeClassifier(**optimal_params).cost_complexity_pruning_path(X_train_norm, y_train)
# ccp_alphas, impurities = path.ccp_alphas, path.impurities

# # Perform cross-validation for each alpha
# cv_scores = []
# for ccp_alpha in ccp_alphas:
#     dt = DecisionTreeClassifier(**optimal_params, ccp_alpha=ccp_alpha)
#     scores = cross_val_score(dt, X_train_norm, y_train, cv=5)
#     cv_scores.append(np.mean(scores))

# # Plot the results
# plt.figure(figsize=(10, 6))
# plt.plot(ccp_alphas, cv_scores, marker='o', drawstyle="steps-post")
# plt.xlabel('ccp_alpha')
# plt.ylabel('Average CV Score')
# plt.title('ccp_alpha vs. Average CV Score')
# plt.show()

# # Find the best ccp_alpha
# best_ccp_alpha = ccp_alphas[np.argmax(cv_scores)]
# print('Best ccp_alpha:', best_ccp_alpha)

# # Re-train and evaluate the decision tree with the best ccp_alpha
# dt_optimized = DecisionTreeClassifier(**optimal_params, ccp_alpha=best_ccp_alpha)
# dt_optimized.fit(X_train_norm, y_train)
# train_accuracy = dt_optimized.score(X_train_norm, y_train)
# test_accuracy = dt_optimized.score(X_test_norm, y_test)

# print(f'Optimized Decision Tree train accuracy: {train_accuracy}')
# print(f'Optimized Decision Tree test accuracy: {test_accuracy}')
