In [179]:
# Import required libraries and dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [180]:

# Load the data into a Pandas DataFrame

df_music_genre = pd.read_csv(
    "Resources/music_genre.csv")

# Display sample data

df_music_genre.head(10)

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,4-Apr,0.27,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
5,89064.0,Axel Boman,Hello,47.0,0.00523,0.755,519468.0,0.731,0.854,D,0.216,-10.517,Minor,0.0412,?,4-Apr,0.614,Electronic
6,43760.0,Jordan Comolli,Clash,46.0,0.0289,0.572,214408.0,0.803,8e-06,B,0.106,-4.294,Major,0.351,149.995,4-Apr,0.23,Electronic
7,30738.0,Hraach,Delirio,43.0,0.0297,0.809,416132.0,0.706,0.903,G,0.0635,-9.339,Minor,0.0484,120.008,4-Apr,0.761,Electronic
8,84950.0,Kayzo,NEVER ALONE,39.0,0.00299,0.509,292800.0,0.921,0.000276,F,0.178,-3.175,Minor,0.268,149.94799999999998,4-Apr,0.273,Electronic
9,56950.0,Shlump,Lazer Beam,22.0,0.00934,0.578,204800.0,0.731,0.0112,A,0.111,-7.091,Minor,0.173,139.933,4-Apr,0.203,Electronic


In [181]:
# Drop unncessary columns

df_music_genre = df_music_genre.drop(['instance_id','artist_name', 'track_name', 'obtained_date'], axis=1)
df_music_genre.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,0.531,Electronic
2,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,0.27,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic


In [182]:
# Convert data type and drop NA values on tempo column

df_music_genre["tempo"] = pd.to_numeric(df_music_genre["tempo"], errors="coerce")
df_music_genre.dropna(subset=["tempo"], inplace=True)

In [183]:
# Scale price data, return, and variance values
music_genre_scaled = StandardScaler().fit_transform(
    df_music_genre[["popularity", "acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "liveness","loudness","speechiness","tempo","valence"]]
)

In [184]:
# Create a DataFrame with the scaled data
df_music_genre_scaled = pd.DataFrame(
    music_genre_scaled,
    columns=["popularity", "acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "liveness","loudness","speechiness","tempo","valence"]
)

# Copy the track_name from the original data
df_music_genre_scaled["track_name"] = df_music_genre.index

# Set the track_name column as index
df_music_genre_scaled = df_music_genre_scaled.set_index("track_name")

# Display sample data
df_music_genre_scaled.head()

Unnamed: 0_level_0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-1.109911,-0.884378,0.522585,-1.73213,1.290882,1.87255,-0.488216,0.639344,-0.187086,-0.622164,1.224343
1,-0.852739,-0.860885,0.354853,-0.022528,1.09807,2.357446,-0.432561,0.34014,-0.628606,-0.161578,0.301856
2,-1.045618,-0.889123,0.343671,-0.043517,0.587686,-0.521854,2.102795,0.734206,-0.584257,0.262424,-0.499252
3,-0.65986,-0.823684,1.204697,-0.425216,0.379752,-0.550303,-0.228496,0.753536,1.431165,0.263077,-0.75415
4,-0.788446,-0.884466,0.44431,0.009394,-0.047459,2.231618,-0.228496,0.466352,-0.517241,0.818601,-0.539712


In [185]:
# Encode the categorical variables using get_dummies for columns key, mode, music_genre
key_dummies = pd.get_dummies(df_music_genre["key"])
mode_dummies = pd.get_dummies(df_music_genre["mode"])

#key_dummies.head()
#mode_dummies.head()


In [196]:
# Concatenate the "key", "mode", "music_genre" variables with the scaled data DataFrame.

df_music_genre_scaled = pd.concat([df_music_genre_scaled, key_dummies, mode_dummies, df_music_genre['music_genre']], axis = 1)

# Display the sample data
df_music_genre_scaled.head()


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,D,D#,E,F,F#,G,G#,Major,Minor,music_genre
0,-1.109911,-0.884378,0.522585,-1.73213,1.290882,1.87255,-0.488216,0.639344,-0.187086,-0.622164,...,False,False,False,False,False,False,False,False,True,Electronic
1,-0.852739,-0.860885,0.354853,-0.022528,1.09807,2.357446,-0.432561,0.34014,-0.628606,-0.161578,...,True,False,False,False,False,False,False,False,True,Electronic
2,-1.045618,-0.889123,0.343671,-0.043517,0.587686,-0.521854,2.102795,0.734206,-0.584257,0.262424,...,False,False,False,False,False,False,True,True,False,Electronic
3,-0.65986,-0.823684,1.204697,-0.425216,0.379752,-0.550303,-0.228496,0.753536,1.431165,0.263077,...,False,False,False,False,False,False,False,True,False,Electronic
4,-0.788446,-0.884466,0.44431,0.009394,-0.047459,2.231618,-0.228496,0.466352,-0.517241,0.818601,...,False,False,False,False,True,False,False,True,False,Electronic


In [198]:
# Split target column from dataset

y = df_music_genre_scaled["music_genre"]
X = df_music_genre_scaled.drop(columns="music_genre")



In [199]:
# Split the dataset into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_test.shape

(11255, 39)

In [206]:
# Create a Logistric Regression Model

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=1)

classifier

In [207]:
# Fit (train) or model using the training data

classifier.fit(X_train, y_train)


In [208]:
# Score the model using the test data

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score:  {classifier.score(X_test, y_test)}")

Training Data Score: 0.5331852509995557
Testing Data Score:  0.5239449133718347


In [209]:
# Make predictions

predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,Rock,Alternative
1,Rock,Rap
2,Alternative,Alternative
3,Classical,Classical
4,Rap,Hip-Hop
5,Anime,Anime
6,Electronic,Jazz
7,Country,Rock
8,Rap,Rap
9,Hip-Hop,Rap


In [210]:
# Calculated the accuracy score

from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.5239449133718347

In [211]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[340,   8,  32,   3, 261,  86,  91,  71,  37, 195],
       [ 27, 699, 112, 139,  48,  64,   0,  32,   0,   3],
       [ 35, 174, 521,  22, 109,  46,   2, 155,   0,  54],
       [ 23,  57,  29, 903,  10,  30,   0,  65,   0,   8],
       [ 68,  17, 127,   3, 628,  33,  18,  49,   9, 169],
       [ 71,  89,  54,  12,  45, 666,  37,  95,  18,  30],
       [ 88,   0,   1,   0,  47,  16, 524,  16, 366,  72],
       [ 31,  43, 134,  91, 104, 184,  34, 463,   6,  40],
       [ 93,   0,   1,   0,  32,   5, 432,  11, 421, 131],
       [125,   4,   8,   0, 143,   9,  23,  23,  73, 732]], dtype=int64)

In [215]:
#classification report

from sklearn.metrics import classification_report


print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

 Alternative       0.38      0.30      0.34      1124
       Anime       0.64      0.62      0.63      1124
       Blues       0.51      0.47      0.49      1118
   Classical       0.77      0.80      0.79      1125
     Country       0.44      0.56      0.49      1121
  Electronic       0.58      0.60      0.59      1117
     Hip-Hop       0.45      0.46      0.46      1130
        Jazz       0.47      0.41      0.44      1130
         Rap       0.45      0.37      0.41      1126
        Rock       0.51      0.64      0.57      1140

    accuracy                           0.52     11255
   macro avg       0.52      0.52      0.52     11255
weighted avg       0.52      0.52      0.52     11255

