In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [3]:
df = pd.read_csv("music_genre.csv")
df

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


In [4]:
df.drop('mode', axis=1, inplace=True)

In [5]:
le_key = LabelEncoder()
df["key"] = le_key.fit_transform(df["key"])

df["tempo"] = pd.to_numeric(df["tempo"], errors="coerce")
df.dropna(subset=["tempo"], inplace=True)

In [6]:
features = [
    "popularity",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]

target = "music_genre"

X = df[features]
y = df[target]

In [7]:
le_genre = LabelEncoder()
y_encoded = le_genre.fit_transform(y)
y_categorical = to_categorical(y_encoded)

In [8]:
X_temp, X_real_test, y_temp, y_real_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42
)


X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_real_test_scaled = scaler.transform(X_real_test)

In [10]:
model = Sequential()
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(y_categorical.shape[1], activation="softmax"))

In [11]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=25,
    validation_data=(X_val_scaled, y_val),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [13]:
real_test_loss, real_test_accuracy = model.evaluate(X_real_test_scaled, y_real_test)
print(f"Real Test Accuracy: {real_test_accuracy:.4f}")

Real Test Accuracy: 0.5704


In [14]:
df[df['track_name'] == "Ron's Theme"]

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,obtained_date,valence,music_genre
11331,41965.0,empty_field,Ron's Theme,36.0,0.938,0.41,86093.0,0.0117,0.802,3,0.11,-28.914,0.0419,79.948,4-Apr,0.0567,Jazz


In [15]:
import plotly.express as px
from sklearn.decomposition import PCA

# Apply PCA to reduce the dimensionality of the features to 2D
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Create a DataFrame with the embedded points and genre labels
pca_df = pd.DataFrame({
    'PCA Dimension 1': X_pca[:, 0],
    'PCA Dimension 2': X_pca[:, 1],
    'Genre': df['music_genre'],
    'Track Name': df['track_name']
})

# Create an interactive scatter plot with hover information using Plotly Express
fig = px.scatter(pca_df, x='PCA Dimension 1', y='PCA Dimension 2', color='Genre', hover_data=['Genre', 'Track Name'])
fig.update_layout(title='PCA Visualization of Music Genres Based on Original Features')
fig.show()


In [16]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Assuming model is already trained and you have the test set X_real_test_scaled
# Get the output probabilities for the test set
y_pred_prob = model.predict(X_real_test_scaled)

# Apply PCA to reduce the dimensionality of the output probabilities to 2D
pca = PCA(n_components=2, random_state=42)
y_pred_embedded = pca.fit_transform(y_pred_prob)

# Perform K-Means clustering on the PCA-reduced data
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust the number of clusters as needed
clusters = kmeans.fit_predict(y_pred_embedded)

# Create a DataFrame with the embedded points and genre labels
pca_df = pd.DataFrame({
    'PCA Dimension 1': y_pred_embedded[:, 0],
    'PCA Dimension 2': y_pred_embedded[:, 1],
    'Genre': [le_genre.inverse_transform([label])[0] for label in y_real_test.argmax(axis=1)],
    'Track Name': df.loc[X_real_test.index, 'track_name'].values,
    'Cluster': clusters
})

pca_df['Cluster'] = pca_df['Cluster'].astype(str)

# Create an interactive scatter plot with hover information using Plotly Express
fig = px.scatter(pca_df, x='PCA Dimension 1', y='PCA Dimension 2', color='Cluster', hover_data=['Genre', 'Track Name'])
fig.update_layout(title='PCA Visualization of Music Genre Predictions with Clusters')
fig.show()

pca_df







Unnamed: 0,PCA Dimension 1,PCA Dimension 2,Genre,Track Name,Cluster
0,-0.398256,0.136899,Hip-Hop,Swerve,3
1,-0.437196,0.171099,Rap,No One Knows,3
2,-0.454805,0.183362,Hip-Hop,Bedtime Stories (feat. The Weeknd) - From SR3MM,3
3,0.501972,-0.236892,Anime,Polygon Teacher,0
4,-0.064641,-0.120586,Country,Cadillac Style,1
...,...,...,...,...,...
8999,-0.157451,-0.056258,Country,Sway,1
9000,0.463130,-0.230030,Anime,21,0
9001,-0.380051,0.123697,Electronic,No Coming Down,3
9002,0.117177,-0.146413,Country,Love Remembers,4
