In [118]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Builders for the individual row checks that make up a mined pattern rule.
def _equals(column, expected):
    """Return a check that holds when ``row[column] == expected``."""
    return lambda row: row[column] == expected


def _within(column, low, high):
    """Return a check that holds when ``low < row[column] <= high``."""
    return lambda row: low < row[column] <= high


# One entry per binary feature: (output column, checks that must ALL hold).
# Entry order is the order in which the columns are written to the row; check
# order is the order in which input columns are read (evaluation stops at the
# first failing check).
_PATTERN_RULES = (
    # World/Commercial Pop pattern
    ('wc_pop_pattern', (
        _equals('genre_group', 3),
        _within('danceability', 0.695, 0.98),
        _within('energy', 0.709, 0.884),
    )),
    # Electronic/Dance patterns
    ('electronic_dance_pattern_valence', (
        _equals('genre_group', 0),
        _within('valence', -0.001, 0.196),
        _within('popularity', -0.001, 14.0),
    )),
    ('electronic_dance_pattern_loudness', (
        _equals('genre_group', 0),
        _within('loudness', -49.532, -10.636),
        _within('popularity', -0.001, 14.0),
    )),
    # Other patterns
    ('other_pattern_energy_acousticness', (
        _equals('genre_group', 4),
        _within('energy', -0.001, 0.48),
        _within('acousticness', 0.573, 0.996),
        _within('loudness', -49.532, -10.636),
    )),
    ('other_pattern_acousticness', (
        _equals('genre_group', 4),
        _within('acousticness', 0.573, 0.996),
        _within('energy', -0.001, 0.48),
    )),
    # Country/Folk/Pop patterns
    ('country_folk_pop_pattern_energy', (
        _equals('genre_group', 2),
        _within('energy', 0.884, 1.0),
        _within('loudness', -5.101, 3.156),
        _within('instrumentalness', -0.001, 0.00313),
    )),
    ('country_folk_pop_pattern_loudness', (
        _equals('genre_group', 2),
        _within('loudness', -5.101, 3.156),
        _within('energy', 0.884, 1.0),
    )),
    # Metal/Rock patterns
    ('metal_rock_pattern_popularity', (
        _equals('genre_group', 1),
        _within('popularity', 14.0, 24.0),
        _within('acousticness', -0.001, 0.00974),
    )),
    ('metal_rock_pattern_energy', (
        _equals('genre_group', 1),
        _within('energy', 0.884, 1.0),
        _within('acousticness', -0.001, 0.00974),
    )),
    # Low energy and low loudness in mode 1 (no genre check)
    ('energy_low_loudness_low_mode1', (
        _equals('mode', 1),
        _within('energy', -0.001, 0.48),
        _within('loudness', -49.532, -10.636),
    )),
    # Low instrumentalness in mode 1 (despite the name, no genre check)
    ('instrumentalness_low_country_folk_pop_mode1', (
        _equals('mode', 1),
        _within('instrumentalness', -0.001, 0.00313),
    )),
    # Low instrumentalness and high loudness within Country/Folk/Pop
    ('instrumentalness_low_loudness_high_country_folk_pop', (
        _within('instrumentalness', -0.001, 0.00313),
        _within('loudness', -5.101, 3.156),
        _equals('genre_group', 2),
    )),
    # High acousticness and low energy within Other
    ('acousticness_high_other_energy_low', (
        _within('acousticness', 0.573, 0.996),
        _within('energy', -0.001, 0.48),
        _equals('genre_group', 4),
    )),
)


def add_pattern_features(row):
    """Attach the binary pattern-mining flags to a single row.

    Every output column listed in ``_PATTERN_RULES`` is first set to 0 and is
    then switched to 1 when all of that rule's checks hold for the row.

    Parameters
    ----------
    row : mapping-like (for example a ``pandas.Series`` passed in by
        ``DataFrame.apply(..., axis=1)``)
        Must provide the columns the evaluated checks read: ``genre_group``,
        ``mode``, ``danceability``, ``energy``, ``valence``, ``popularity``,
        ``loudness``, ``acousticness`` and ``instrumentalness``.

    Returns
    -------
    The same ``row`` object, modified in place with the 13 flag columns.

    Notes
    -----
    * NOTE(review): eleven of the thirteen rules test ``genre_group``, so those
      flags are a function of the label. If ``genre_group`` is also the
      prediction target, these columns leak it into the features - confirm
      this is intended.
    * ``other_pattern_acousticness`` and ``acousticness_high_other_energy_low``
      apply the same three checks (in a different order) and therefore always
      hold the same value.
    """
    # Reset every flag first so all columns exist, in rule order, before any
    # rule is evaluated.
    for feature_name, _ in _PATTERN_RULES:
        row[feature_name] = 0

    for feature_name, checks in _PATTERN_RULES:
        # all() over a generator stops at the first failing check, mirroring a
        # short-circuiting chain of ``and`` conditions.
        if all(check(row) for check in checks):
            row[feature_name] = 1

    return row
    


# Load the prepared training dataset
df = pd.read_csv("../../our_analyses/dataset_prepared.csv")

# Drop columns that are not used as model inputs
df = df.drop(columns=['name', 'artists', 'album_name', 'key', 'time_signature'])

# Map each genre to one of five integer genre groups (the classification target)
genre_groups = {
    'idm': 0, 'iranian': 0, 'study': 0,  # Electronic/Dance
    'black-metal': 1, 'breakbeat': 1, 'techno': 1,  # Metal/Rock
    'brazil': 2, 'forro': 2, 'happy': 2, 'spanish': 2, 'j-idol': 2,  # Country/Folk/Pop
    'afrobeat': 3, 'chicago-house': 3, 'industrial': 3, 'j-dance': 3,  # World/Commercial Pop
    'bluegrass': 4, 'disney': 4, 'indian': 4, 'mandopop': 4, 'sleep': 4  # Other
}

# A genre that is absent from genre_groups maps to NaN, and casting NaN to int
# fails with an error that does not say which genre was the problem. Check
# explicitly so the offending genres are reported.
genre_group_codes = df['genre'].map(genre_groups)
if genre_group_codes.isna().any():
    unmapped = sorted(df.loc[genre_group_codes.isna(), 'genre'].astype(str).unique())
    raise ValueError(f"Genres missing from genre_groups: {unmapped}")
df['genre_group'] = genre_group_codes.astype(int)

# Encode the 'explicit' column as integers; `le` is reused later to encode the
# held-out test file with the same mapping.
le = LabelEncoder()
df['explicit'] = le.fit_transform(df['explicit'])

# Add the pattern-mining flag columns, one row at a time.
# NOTE(review): add_pattern_features reads 'genre_group', which is also the
# target `y` below, so most of the generated flags encode the label. Confirm
# this is intended before trusting the reported scores.
df = df.apply(add_pattern_features, axis=1)

# Split into features and target. 'mode' is only needed to build the pattern
# flags and is not used as a model input.
# NOTE(review): the df.head() output lists an 'Unnamed: 0' header - if that is a
# real CSV column rather than the index, it is being used as a feature; verify.
X = df.drop(columns=['genre_group', 'genre', 'mode'])
y = df['genre_group'].values

# Hold out 30% of the rows for validation, stratified on the genre group
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

# Train a distance-weighted 6-nearest-neighbours classifier with Manhattan distance
knn = KNeighborsClassifier(n_neighbors=6, metric="cityblock", weights="distance")
knn.fit(X_train_norm, y_train)

# `knn`, `scaler`, `le` and `genre_groups` are reused by the evaluation cells below.


In [124]:
# Preview the first rows of `df`
df.head()

Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,other_pattern_energy_acousticness,other_pattern_acousticness,country_folk_pop_pattern_energy,country_folk_pop_pattern_loudness,metal_rock_pattern_popularity,metal_rock_pattern_energy,energy_low_loudness_low_mode1,instrumentalness_low_country_folk_pop_mode1,instrumentalness_low_loudness_high_country_folk_pop,acousticness_high_other_energy_low
0,293106,0,50,0.401,0.683,-5.722,1,0.0401,0.181,0.0,...,0,0,0,0,0,0,0,1,0,0
1,194972,0,52,0.672,0.858,-5.233,1,0.145,0.456,0.811,...,0,0,0,0,0,0,0,0,0,0
2,178428,0,22,0.636,0.826,-7.486,1,0.0585,0.461,0.271,...,0,0,0,0,0,0,0,0,0,0
3,238373,0,20,0.733,0.862,-5.813,1,0.0604,0.287,0.000532,...,0,0,0,0,0,0,0,1,0,0
4,221893,0,22,0.712,0.225,-10.017,1,0.0533,0.93,0.001,...,0,0,0,0,0,0,0,1,0,0


In [119]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)


In [120]:
# optimal_params = {
#     'criterion': 'entropy', 'max_depth': 9, 'max_features': None, 
#     'min_impurity_decrease': 0.001, 'min_samples_leaf': 10, 'min_samples_split': 2, 'splitter': 'best'
# }

# # Train the Decision Tree Classifier with optimal parameters on normalized data
# dt = DecisionTreeClassifier(**optimal_params)
# dt.fit(X_train_norm, y_train)  # Use normalized training data

In [121]:
# from sklearn.naive_bayes import GaussianNB, CategoricalNB

# dt = GaussianNB()
# dt.fit(X_train_norm, y_train)

# y_train_pred = dt.predict(X_train_norm)

In [122]:
# Load the held-out test dataset
df_test = pd.read_csv("../../our_analyses/dataset_test_prepared.csv")

# Apply the same preprocessing as for the training data
df_test = df_test.drop(columns=['name', 'artists', 'album_name', 'key', 'time_signature'])

# A genre that is absent from genre_groups maps to NaN, and casting NaN to int
# fails with an error that does not say which genre was the problem. Check
# explicitly so the offending genres are reported.
test_genre_group_codes = df_test['genre'].map(genre_groups)
if test_genre_group_codes.isna().any():
    unmapped_test = sorted(
        df_test.loc[test_genre_group_codes.isna(), 'genre'].astype(str).unique()
    )
    raise ValueError(f"Genres missing from genre_groups: {unmapped_test}")
df_test['genre_group'] = test_genre_group_codes.astype(int)

# Add the same pattern-mining flag columns as in training.
# NOTE(review): add_pattern_features reads 'genre_group', i.e. the label being
# predicted, so these flags are only computable when the true genre is known.
df_test = df_test.apply(add_pattern_features, axis=1)

# Encode 'explicit' with the LabelEncoder that was fitted on the training data
df_test['explicit'] = le.transform(df_test['explicit'])

# Drop the target columns and 'mode' so the feature set matches the one the
# scaler and the classifier were fitted on
X_test_new = df_test.drop(columns=['genre_group', 'genre', 'mode'])
y_test_new = df_test['genre_group'].values

# Standardize with the scaler fitted on the training split (no refitting)
X_test_new_norm = scaler.transform(X_test_new)

# Predict with the already-trained KNeighborsClassifier `knn`
predictions = knn.predict(X_test_new_norm)

# Report accuracy and per-class precision/recall/F1
print("Accuracy:", accuracy_score(y_test_new, predictions))
print(classification_report(y_test_new, predictions))


Accuracy: 0.755
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       750
           1       0.80      0.70      0.75       750
           2       0.76      0.74      0.75      1250
           3       0.64      0.68      0.66      1000
           4       0.76      0.81      0.79      1250

    accuracy                           0.76      5000
   macro avg       0.76      0.75      0.76      5000
weighted avg       0.76      0.76      0.76      5000



In [123]:
import plotly.graph_objects as go
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, average_precision_score


from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go
# Binarize the validation labels one-vs-rest and get per-class probability
# scores. The label columns are built from knn.classes_ because that is the
# column order of predict_proba, so label column i and score column i always
# refer to the same class.
class_labels = knn.classes_
Y = label_binarize(y_test, classes=class_labels)
y_scores = knn.predict_proba(X_test_norm)

# Compute the ROC curve and AUC for each class (dicts are keyed by column index)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(class_labels)):
    fpr[i], tpr[i], _ = roc_curve(Y[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Create the Plotly figure
fig = go.Figure()

# Add one line per class, labelled with the actual class value
for i, genre in enumerate(class_labels):
    fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines', name=f'Class {genre} (AUC={roc_auc[i]:0.2f})'))

# Add the diagonal reference line for random guessing
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Guessing', line=dict(dash='dash')))

# Update the layout; the title names the model that produced the scores (knn)
fig.update_layout(
    title="ROC curves of KNN classifier",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=1000,  # plot width in pixels
    height=800,  # plot height in pixels
    legend_title="Classes"
)

# Show the figure
fig.show()
