In [4]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.stem.snowball import SnowballStemmer

In [5]:
# Create DataFrame from CSV
t0 = time.time()
filtered_lyric_SF_df = pd.read_csv('../../Data/filtered_lyric_SF.csv')
filtered_lyric_SF_df = filtered_lyric_SF_df.drop([0,1])
filtered_lyric_SF_df.index = filtered_lyric_SF_df.index - 2
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_SF_df.head(3)

Run time: 32.34048008918762 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,cut,ale,lack,slogan,libido,oop,scorch,muslim,heavyweight,mozambiqu
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Create a dictionary of category names and ids
cat_name_id = {}
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    cat_id = list(filtered_lyric_SF_df[filtered_lyric_SF_df['category_name']==cat]['category_id'])[0]
    cat_name_id[cat] = cat_id
cat_name_id

{'blues': 0.0,
 'classical': 1.0,
 'country': 2.0,
 'funk': 3.0,
 'hiphop': 4.0,
 'indie_alt': 5.0,
 'jazz': 6.0,
 'metal': 7.0,
 'pop': 8.0,
 'punk': 9.0,
 'rnb': 10.0,
 'rock': 11.0,
 'romance': 12.0,
 'soul': 13.0}

In [7]:
# Define the features set
X = filtered_lyric_SF_df.copy()
X = X.drop(['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list'], axis=1)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,cut,ale,lack,slogan,libido,oop,scorch,muslim,heavyweight,mozambiqu
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.662,0.413,0.0,1.0,0.0299,0.213,0.0,0.134,0.467,93.005,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.761,0.525,11.0,1.0,0.0944,0.44,7e-06,0.0921,0.531,80.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Define the target set
y = filtered_lyric_SF_df['category_id'].ravel()
y[:5]

array([8., 8., 8., 8., 8.])

In [9]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6007, 9094)
(2003, 9094)
(6007,)
(2003,)


In [10]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fitting the Model

In [11]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [12]:
# Fit the model
t0 = time.time()
rf_model = rf_model.fit(X_train_scaled, y_train)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 104.51782822608948 seconds


# Making Predictions

In [13]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([ 4.,  7.,  4., ..., 11., 11., 11.])

# Model Evaluation

In [14]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.4762855716425362


In [15]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([1.46572756e-02, 1.75094054e-02, 4.90917771e-03, ...,
       0.00000000e+00, 9.38612500e-07, 3.47818644e-06])

In [16]:
# Sort the features by importance
sorted_features = sorted(zip(importances, X.columns), reverse=True)
sorted_features

[(0.02036758181048321, 'audio_ft_acousticness'),
 (0.01750940539816833, 'audio_ft_energy'),
 (0.014657275586057351, 'audio_ft_danceability'),
 (0.014059530038994262, 'audio_ft_speechiness'),
 (0.012934004392029588, 'audio_ft_duration_ms'),
 (0.011745613968368348, 'audio_ft_instrumentalness'),
 (0.01138992872823068, 'audio_ft_valence'),
 (0.008083221675542438, 'audio_ft_tempo'),
 (0.007393848375712972, 'audio_ft_liveness'),
 (0.005705104978739002, 'love'),
 (0.005279117678818059, 'im'),
 (0.005036847963099851, 'babi'),
 (0.004958799246856015, 'aint'),
 (0.00490917770963651, 'audio_ft_key'),
 (0.004908729092319602, 'like'),
 (0.004885000485906144, 'know'),
 (0.0047117059649376715, 'oh'),
 (0.0046196673675253045, '?'),
 (0.004591347958176494, 'dont'),
 (0.004392317403626376, 'yeah'),
 (0.0043213773341498245, 'got'),
 (0.004221922584223984, 'nigga'),
 (0.0040546830873037665, 'get'),
 (0.0037426031665133944, 'audio_ft_mode'),
 (0.0036451276798007805, 'feel'),
 (0.003435600498543328, 'your')

# Dropping Lower Ranked Features

In [17]:
# Create a dictionary of features and accuracy scores
features_accuracy = {}
feature_percents = [0.4, 0.5, 0.6]
for percent in feature_percents:
    # Define the features and target sets
    important_features = []
    sum = 0
    count = 0
    while sum < percent:
        sum = sum + sorted_features[count][0]
        important_features.append(sorted_features[count][1])
        count+=1
    X = filtered_lyric_SF_df[important_features]
    y = filtered_lyric_SF_df['category_id'].ravel()
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.time()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.time()
    print(f'Run time to fit model with {percent*100}% of total features: {t1-t0} seconds')
    # Make predictions
    predictions = rf_model.predict(X_test_scaled)
    # Evaluate the model
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=['Actual blues', 'Actual classical', 'Actual country', 'Actual funk',
               'Actual hiphop', 'Actual indie_alt', 'Actual jazz', 'Actual metal',
               'Actual pop', 'Actual punk', 'Actual rnb', 'Actual rock',
               'Actual romance', 'Actual soul'],
        columns=['Predicted blues', 'Predicted classical', 'Predicted country',
                 'Predicted funk', 'Predicted hiphop', 'Predicted indie_alt', 
                 'Predicted jazz', 'Predicted metal', 'Predicted pop', 'Predicted punk',
                 'Predicted rnb', 'Predicted rock', 'Predicted romance', 'Predicted soul'])
    acc_score = accuracy_score(y_test, predictions)
    print('Confusion Matrix')
    display(cm_df)
    print(f'Accuracy Score: {acc_score}')
    print('Classification Report')
    print(classification_report(y_test, predictions))
    features_accuracy[percent] = acc_score

Run time to fit model with 40.0% of total features: 11.472492218017578 seconds
Confusion Matrix


Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,32,0,27,6,0,5,1,1,0,0,0,22,0,1
Actual classical,1,15,3,0,0,1,2,0,1,0,0,1,0,0
Actual country,3,0,258,0,2,2,0,0,5,1,0,15,0,0
Actual funk,2,0,11,23,7,7,1,1,1,0,0,9,0,6
Actual hiphop,0,0,2,0,165,2,0,0,10,0,1,4,0,0
Actual indie_alt,4,1,26,2,3,42,1,7,15,4,0,61,1,1
Actual jazz,2,3,10,1,2,6,8,0,6,0,0,5,1,1
Actual metal,0,0,1,0,2,1,0,140,0,11,1,64,0,0
Actual pop,4,0,46,0,17,11,0,0,70,1,5,15,1,0
Actual punk,0,0,19,1,2,7,0,17,1,38,0,60,0,0


Accuracy Score: 0.5012481278082875
Classification Report
              precision    recall  f1-score   support

         0.0       0.54      0.34      0.42        95
         1.0       0.71      0.62      0.67        24
         2.0       0.48      0.90      0.63       286
         3.0       0.48      0.34      0.40        68
         4.0       0.69      0.90      0.78       184
         5.0       0.38      0.25      0.30       168
         6.0       0.47      0.18      0.26        45
         7.0       0.66      0.64      0.65       220
         8.0       0.48      0.41      0.44       170
         9.0       0.48      0.26      0.34       145
        10.0       0.50      0.17      0.25        95
        11.0       0.38      0.54      0.45       326
        12.0       0.67      0.11      0.19        73
        13.0       0.43      0.12      0.19       104

    accuracy                           0.50      2003
   macro avg       0.53      0.41      0.43      2003
weighted avg       0.51

Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,37,0,27,6,0,7,2,0,0,0,0,16,0,0
Actual classical,1,13,3,0,0,2,3,0,1,0,0,1,0,0
Actual country,1,0,261,0,4,1,0,0,4,0,0,15,0,0
Actual funk,2,0,9,22,9,9,1,1,2,0,0,8,0,5
Actual hiphop,1,0,1,0,167,1,0,0,11,1,0,2,0,0
Actual indie_alt,2,0,26,2,3,42,3,5,13,3,0,68,0,1
Actual jazz,2,3,11,1,2,8,7,0,4,0,0,6,0,1
Actual metal,0,0,1,0,0,1,0,153,0,6,0,59,0,0
Actual pop,2,0,41,2,17,12,0,0,76,1,4,13,2,0
Actual punk,0,0,16,1,2,5,0,17,2,37,0,65,0,0


Accuracy Score: 0.5162256615077384
Classification Report
              precision    recall  f1-score   support

         0.0       0.63      0.39      0.48        95
         1.0       0.72      0.54      0.62        24
         2.0       0.49      0.91      0.64       286
         3.0       0.45      0.32      0.38        68
         4.0       0.69      0.91      0.79       184
         5.0       0.38      0.25      0.30       168
         6.0       0.37      0.16      0.22        45
         7.0       0.69      0.70      0.69       220
         8.0       0.51      0.45      0.48       170
         9.0       0.55      0.26      0.35       145
        10.0       0.56      0.15      0.23        95
        11.0       0.38      0.56      0.46       326
        12.0       0.73      0.11      0.19        73
        13.0       0.54      0.14      0.23       104

    accuracy                           0.52      2003
   macro avg       0.55      0.42      0.43      2003
weighted avg       0.53

Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted punk,Predicted rnb,Predicted rock,Predicted romance,Predicted soul
Actual blues,36,0,28,4,0,7,2,1,0,0,0,17,0,0
Actual classical,1,14,3,0,0,1,2,0,1,0,0,2,0,0
Actual country,2,0,265,0,2,1,0,0,4,0,0,12,0,0
Actual funk,2,0,10,23,8,8,1,1,1,0,0,10,0,4
Actual hiphop,1,0,2,0,167,0,0,0,11,1,0,2,0,0
Actual indie_alt,2,0,31,2,1,35,2,7,13,3,1,70,0,1
Actual jazz,1,2,14,1,2,8,7,0,3,0,0,5,1,1
Actual metal,0,0,1,0,1,1,0,150,0,6,0,61,0,0
Actual pop,2,0,41,0,14,8,0,0,77,1,6,20,1,0
Actual punk,0,0,15,1,3,5,0,18,2,34,0,67,0,0


Accuracy Score: 0.5152271592611083
Classification Report
              precision    recall  f1-score   support

         0.0       0.68      0.38      0.49        95
         1.0       0.78      0.58      0.67        24
         2.0       0.49      0.93      0.64       286
         3.0       0.51      0.34      0.41        68
         4.0       0.71      0.91      0.80       184
         5.0       0.38      0.21      0.27       168
         6.0       0.41      0.16      0.23        45
         7.0       0.68      0.68      0.68       220
         8.0       0.53      0.45      0.49       170
         9.0       0.49      0.23      0.32       145
        10.0       0.52      0.14      0.22        95
        11.0       0.38      0.58      0.46       326
        12.0       0.73      0.11      0.19        73
        13.0       0.52      0.14      0.23       104

    accuracy                           0.52      2003
   macro avg       0.56      0.42      0.43      2003
weighted avg       0.53

In [18]:
# Print feature percents and accuracy scores
for percent in features_accuracy:
    acc_score = features_accuracy[percent]
    print(f'Accuracy Score for {percent*100}% of total features: {acc_score}')

Accuracy Score for 40.0% of total features: 0.5012481278082875
Accuracy Score for 50.0% of total features: 0.5162256615077384
Accuracy Score for 60.0% of total features: 0.5152271592611083


In [32]:
# Get the highest accuracy score
best_percent = max(features_accuracy, key=features_accuracy.get)
best_acc = features_accuracy[best_percent]
print(f'The highest accuracy score of {best_acc} was achieved using {best_percent*100}% of all features.')

The highest accuracy score of 0.5162256615077384 was achieved using 50.0% of all features.
