In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Create DataFrame from CSV
t0 = time.time()
filtered_lyric_TF_df = pd.read_csv('../../Data/filtered_lyric_TF.csv')
filtered_lyric_TF_df = filtered_lyric_TF_df.drop([0,1])
filtered_lyric_TF_df.index = filtered_lyric_TF_df.index - 2
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_TF_df.head()

Run time: 45.03232479095459 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Heat Waves,Glass Animals,pop,8.0,"['shiver', 'indietronica', 'gauze']",0.761,0.525,11.0,1.0,0.0944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,r u ok,Tate McRae,pop,8.0,"['dance', 'pop', 'electropop', 'post-teen']",0.666,0.593,2.0,1.0,0.0373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Create a dictionary of category names and ids
cat_name_id = {}
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    cat_id = list(filtered_lyric_TF_df[filtered_lyric_TF_df['category_name']==cat]['category_id'])[0]
    cat_name_id[cat] = cat_id
cat_name_id

{'blues': 0.0,
 'classical': 1.0,
 'country': 2.0,
 'funk': 3.0,
 'hiphop': 4.0,
 'indie_alt': 5.0,
 'jazz': 6.0,
 'metal': 7.0,
 'pop': 8.0,
 'punk': 9.0,
 'rnb': 10.0,
 'rock': 11.0,
 'romance': 12.0,
 'soul': 13.0}

In [4]:
# Define the features set
X = filtered_lyric_TF_df.copy()
X = X.drop(['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list'], axis=1)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.662,0.413,0.0,1.0,0.0299,0.213,0.0,0.134,0.467,93.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.761,0.525,11.0,1.0,0.0944,0.44,7e-06,0.0921,0.531,80.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Define the target set
y = filtered_lyric_TF_df['category_id'].ravel()
y[:5]

array([8., 8., 8., 8., 8.])

In [6]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6030, 12081)
(2011, 12081)
(6030,)
(2011,)


In [7]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fitting the Model

In [8]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [9]:
# Fit the model
t0 = time.time()
rf_model = rf_model.fit(X_train_scaled, y_train)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 149.7766330242157 seconds


# Making Predictions

In [10]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([11., 11.,  1., ...,  4.,  2.,  4.])

# Model Evaluation

In [11]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from cm
cm_df = pd.DataFrame(
    cm, index=['Actual blues', 'Actual classical', 'Actual country', 'Actual funk',
               'Actual hiphop', 'Actual indie_alt', 'Actual jazz', 'Actual metal',
               'Actual pop', 'Actual punk', 'Actual rnb', 'Actual rock',
               'Actual romance', 'Actual soul'],
        columns=['Predicted blues', 'Predicted classical', 'Predicted country',
                 'Predicted funk', 'Predicted hiphop', 'Predicted indie_alt', 
                 'Predicted jazz', 'Predicted metal', 'Predicted pop', 'Predicted punk',
                 'Predicted rnb', 'Predicted rock', 'Predicted romance', 'Predicted soul'])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [12]:
# Display the results
print('Confusion Matrix')
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print('Classification Report')
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,29,0,38,0,1,2,0,4,1,0,0,25,0,0
Actual classical,0,5,6,0,0,2,0,4,0,0,0,9,0,0
Actual country,0,0,286,0,1,2,0,0,7,0,0,15,1,0
Actual funk,0,0,23,8,4,4,0,2,0,0,0,24,0,6
Actual hiphop,0,0,7,0,166,0,0,1,5,0,0,6,0,0
Actual indie_alt,0,0,36,0,1,19,0,9,12,3,0,84,2,0
Actual jazz,1,0,13,0,0,6,3,0,4,0,0,26,1,0
Actual metal,0,0,4,0,2,1,0,169,0,2,0,39,0,0
Actual pop,1,0,50,0,17,11,0,0,53,1,1,34,1,0
Actual punk,0,0,21,0,2,3,0,31,3,12,0,58,1,0


Accuracy Score: 0.48085529587270015
Classification Report
              precision    recall  f1-score   support

         0.0       0.85      0.29      0.43       100
         1.0       1.00      0.19      0.32        26
         2.0       0.46      0.92      0.62       312
         3.0       0.57      0.11      0.19        71
         4.0       0.71      0.90      0.79       185
         5.0       0.28      0.11      0.16       166
         6.0       1.00      0.06      0.11        54
         7.0       0.64      0.78      0.70       217
         8.0       0.45      0.31      0.37       169
         9.0       0.52      0.09      0.16       131
        10.0       0.57      0.08      0.14        97
        11.0       0.34      0.62      0.44       313
        12.0       0.19      0.05      0.07        66
        13.0       0.44      0.12      0.18       104

    accuracy                           0.48      2011
   macro avg       0.57      0.33      0.33      2011
weighted avg       0.5

In [13]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([1.34395719e-02, 1.64195198e-02, 4.68758317e-03, ...,
       1.43812444e-04, 6.11705917e-06, 7.51994614e-06])

In [14]:
# Sort the features by importance
sorted_features = sorted(zip(importances, X.columns), reverse=True)
sorted_features

[(0.019403907177405706, 'audio_ft_acousticness'),
 (0.016419519805076745, 'audio_ft_energy'),
 (0.013622638849146905, 'audio_ft_speechiness'),
 (0.013439571930083985, 'audio_ft_danceability'),
 (0.012571988056526186, 'audio_ft_duration_ms'),
 (0.011703269688264935, 'audio_ft_instrumentalness'),
 (0.011170179651739267, 'audio_ft_valence'),
 (0.007703447014237269, 'audio_ft_tempo'),
 (0.007311099556655294, 'audio_ft_liveness'),
 (0.005097917117156922, 'love'),
 (0.005035080222632455, 'im'),
 (0.004853294516427201, 'aint'),
 (0.004687583169848566, 'audio_ft_key'),
 (0.004629517339064106, 'like'),
 (0.0045914757468940685, 'baby'),
 (0.004512864905282869, '?'),
 (0.004502373929136803, 'know'),
 (0.004417256617596204, 'yeah'),
 (0.00435981511206345, 'dont'),
 (0.00426102678729207, 'oh'),
 (0.00402528923745883, 'got'),
 (0.003603567448806245, 'get'),
 (0.00342391782098942, 'youre'),
 (0.003360423020118291, 'audio_ft_mode'),
 (0.003214564525646555, 'niggas'),
 (0.0030299268883627476, 'cause'),

# Dropping Lower Ranked Features

In [15]:
# Create a dictionary of features and accuracy scores
features_accuracy = {}
feature_percents = [0.3, 0.4, 0.5]
for percent in feature_percents:
    # Define the features and target sets
    important_features = []
    sum = 0
    count = 0
    while sum < percent:
        sum = sum + sorted_features[count][0]
        important_features.append(sorted_features[count][1])
        count+=1
    X = filtered_lyric_TF_df[important_features]
    y = filtered_lyric_TF_df['category_id'].ravel()
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.time()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.time()
    print(f'Run time to fit model with {percent*100}% of total features: {t1-t0} seconds')
    # Make predictions
    predictions = rf_model.predict(X_test_scaled)
    # Evaluate the model
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=['Actual blues', 'Actual classical', 'Actual country', 'Actual funk',
               'Actual hiphop', 'Actual indie_alt', 'Actual jazz', 'Actual metal',
               'Actual pop', 'Actual punk', 'Actual rnb', 'Actual rock',
               'Actual romance', 'Actual soul'],
        columns=['Predicted blues', 'Predicted classical', 'Predicted country',
                 'Predicted funk', 'Predicted hiphop', 'Predicted indie_alt', 
                 'Predicted jazz', 'Predicted metal', 'Predicted pop', 'Predicted punk',
                 'Predicted rnb', 'Predicted rock', 'Predicted romance', 'Predicted soul'])
    acc_score = accuracy_score(y_test, predictions)
    print('Confusion Matrix')
    display(cm_df)
    print(f'Accuracy Score: {acc_score}')
    print('Classification Report')
    print(classification_report(y_test, predictions))
    features_accuracy[percent] = acc_score

Run time to fit model with 30.0% of total features: 11.03878402709961 seconds
Confusion Matrix


Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,32,0,26,2,0,13,0,4,2,0,1,20,0,0
Actual classical,0,13,4,0,0,5,0,0,1,0,0,0,3,0
Actual country,3,0,255,0,2,10,0,0,12,0,0,25,5,0
Actual funk,7,0,9,25,2,5,1,1,2,0,2,11,0,6
Actual hiphop,0,0,6,1,162,0,0,0,12,0,1,3,0,0
Actual indie_alt,2,1,19,5,2,37,1,7,17,4,0,67,3,1
Actual jazz,6,4,7,1,2,8,13,0,4,0,0,4,5,0
Actual metal,0,0,3,0,1,1,0,161,0,9,0,42,0,0
Actual pop,4,0,33,1,14,11,0,0,80,0,5,19,2,0
Actual punk,0,0,21,0,3,3,0,25,3,36,0,40,0,0


Accuracy Score: 0.5067130780706116
Classification Report
              precision    recall  f1-score   support

         0.0       0.44      0.32      0.37       100
         1.0       0.72      0.50      0.59        26
         2.0       0.52      0.82      0.64       312
         3.0       0.50      0.35      0.41        71
         4.0       0.73      0.88      0.79       185
         5.0       0.31      0.22      0.26       166
         6.0       0.62      0.24      0.35        54
         7.0       0.67      0.74      0.70       217
         8.0       0.46      0.47      0.47       169
         9.0       0.47      0.27      0.35       131
        10.0       0.46      0.16      0.24        97
        11.0       0.40      0.54      0.46       313
        12.0       0.17      0.09      0.12        66
        13.0       0.42      0.13      0.20       104

    accuracy                           0.51      2011
   macro avg       0.49      0.41      0.43      2011
weighted avg       0.49

Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,32,0,26,2,1,10,0,3,3,0,1,22,0,0
Actual classical,0,12,5,0,0,4,1,0,0,0,0,1,3,0
Actual country,1,0,263,0,2,6,0,0,12,0,0,22,6,0
Actual funk,2,0,12,27,3,5,0,1,2,0,0,13,0,6
Actual hiphop,0,0,3,1,164,1,0,1,13,0,1,1,0,0
Actual indie_alt,1,1,21,3,2,31,2,6,15,6,0,73,3,2
Actual jazz,3,3,8,0,1,4,14,0,6,0,0,10,4,1
Actual metal,0,0,3,0,2,2,0,164,0,6,0,40,0,0
Actual pop,3,0,31,0,15,13,0,0,76,0,7,22,2,0
Actual punk,0,0,22,0,4,3,0,25,2,31,0,44,0,0


Accuracy Score: 0.5087021382396818
Classification Report
              precision    recall  f1-score   support

         0.0       0.62      0.32      0.42       100
         1.0       0.75      0.46      0.57        26
         2.0       0.52      0.84      0.64       312
         3.0       0.55      0.38      0.45        71
         4.0       0.72      0.89      0.80       185
         5.0       0.30      0.19      0.23       166
         6.0       0.70      0.26      0.38        54
         7.0       0.69      0.76      0.72       217
         8.0       0.44      0.45      0.45       169
         9.0       0.45      0.24      0.31       131
        10.0       0.48      0.16      0.25        97
        11.0       0.37      0.54      0.44       313
        12.0       0.23      0.14      0.17        66
        13.0       0.44      0.15      0.23       104

    accuracy                           0.51      2011
   macro avg       0.52      0.41      0.43      2011
weighted avg       0.50

Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,34,0,29,2,0,6,0,3,2,0,1,23,0,0
Actual classical,0,12,4,0,0,4,1,0,0,0,0,3,2,0
Actual country,1,0,277,0,2,2,0,0,6,0,0,18,6,0
Actual funk,2,0,12,25,2,4,0,1,6,0,0,13,0,6
Actual hiphop,0,0,4,0,163,0,0,0,13,0,0,5,0,0
Actual indie_alt,1,1,25,2,1,30,1,8,14,4,0,77,2,0
Actual jazz,5,3,7,1,1,4,15,0,5,0,0,9,4,0
Actual metal,0,0,3,0,2,2,0,165,0,6,0,39,0,0
Actual pop,2,0,34,0,15,14,0,0,76,0,5,21,2,0
Actual punk,0,0,21,0,3,3,0,29,5,31,0,39,0,0


Accuracy Score: 0.5121829935355544
Classification Report
              precision    recall  f1-score   support

         0.0       0.63      0.34      0.44       100
         1.0       0.75      0.46      0.57        26
         2.0       0.52      0.89      0.66       312
         3.0       0.57      0.35      0.43        71
         4.0       0.72      0.88      0.79       185
         5.0       0.32      0.18      0.23       166
         6.0       0.75      0.28      0.41        54
         7.0       0.66      0.76      0.71       217
         8.0       0.45      0.45      0.45       169
         9.0       0.50      0.24      0.32       131
        10.0       0.46      0.12      0.20        97
        11.0       0.38      0.54      0.44       313
        12.0       0.16      0.08      0.10        66
        13.0       0.44      0.14      0.22       104

    accuracy                           0.51      2011
   macro avg       0.52      0.41      0.43      2011
weighted avg       0.50

In [16]:
# Print feature percents and accuracy scores
for percent in features_accuracy:
    acc_score = features_accuracy[percent]
    print(f'Accuracy Score for {percent*100}% of total features: {acc_score}')

Accuracy Score for 30.0% of total features: 0.5067130780706116
Accuracy Score for 40.0% of total features: 0.5087021382396818
Accuracy Score for 50.0% of total features: 0.5121829935355544


In [17]:
# Get the highest accuracy score
best_percent = max(features_accuracy, key=features_accuracy.get)
best_acc = features_accuracy[best_percent]
print(f'The highest accuracy score of {best_acc} was achieved using {best_percent*100}% of all features.')

The highest accuracy score of 0.5121829935355544 was achieved using 50.0% of all features.
