In [30]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [4]:
# Create DataFrames from CSV files.
# NOTE(review): the leading row(s) are discarded (one from the raw table, two
# from the filtered table); the reason is not visible in this notebook —
# confirm against whatever produced these files.
# reset_index(drop=True) renumbers the remaining rows from 0, replacing the
# hand-computed `index - k` shift that had to match the number of dropped rows.
lyric_TF_df = (
    pd.read_csv('../Data/lyric_TF.csv')
    .drop(0)
    .reset_index(drop=True)
)
filtered_lyric_TF_df = (
    pd.read_csv('../Data/filtered_lyric_TF.csv')
    .drop([0, 1])
    .reset_index(drop=True)
)
filtered_lyric_TF_df.head()

Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,entirely,basket,car,shawn,nothingness,amused,corners,interlude,sting,axis
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Heat Waves,Glass Animals,pop,8.0,"['shiver', 'indietronica', 'gauze']",0.761,0.525,11.0,1.0,0.0944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,r u ok,Tate McRae,pop,8.0,"['dance', 'pop', 'electropop', 'post-teen']",0.666,0.593,2.0,1.0,0.0373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Define the features set: every column except the song metadata and the label.
# DataFrame.drop returns a new frame, so filtered_lyric_TF_df is left untouched
# and no separate .copy() of the whole table is needed first.
X = filtered_lyric_TF_df.drop(
    columns=['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list']
)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,entirely,basket,car,shawn,nothingness,amused,corners,interlude,sting,axis
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.662,0.413,0.0,1.0,0.0299,0.213,0.0,0.134,0.467,93.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.761,0.525,11.0,1.0,0.0944,0.44,7e-06,0.0921,0.531,80.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Define the target set: the category_id column as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas; to_numpy() is the supported
# way to get the underlying array.
y = filtered_lyric_TF_df['category_id'].to_numpy()
y[:5]

array([8., 8., 8., 8., 8.])

In [7]:
# Hold out a test split (scikit-learn's default quarter of the rows, spelled
# out here) using a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=78
)
# Sanity check: one shape per line, in the order X_train, X_test, y_train, y_test.
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep='\n',
)

(6010, 12020)
(2004, 12020)
(6010,)
(2004,)


In [8]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits.
# NOTE(review): tree ensembles are generally insensitive to feature scaling —
# confirm whether this step is needed for the random forest fitted below.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the train split first, then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

# Fitting the Model

In [9]:
# Build the (not yet fitted) random forest classifier: 500 trees, with a fixed
# random_state for repeatable runs.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [12]:
# Fit the model on the scaled training split and report how long it took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
t0 = time.perf_counter()
rf_model = rf_model.fit(X_train_scaled, y_train)
t1 = time.perf_counter()
print(f'Run time: {t1-t0} seconds')

Run time: 108.46778917312622 seconds


# Making Predictions

In [13]:
# Predict a category_id for every row of the scaled test split.
predictions = rf_model.predict(X_test_scaled)
# Bare expression: displays the prediction array as the cell output.
predictions

array([ 4.,  5.,  2., ...,  7., 11., 11.])

# Model Evaluation

In [14]:
# Score the full-feature model: fraction of test rows whose predicted
# category matches the true one.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.4875249500998004


In [15]:
# Read the per-feature importance scores from the fitted random forest.
# The next cell pairs these values positionally with X.columns, so X must still
# be the frame the model was trained on when that cell runs.
importances = rf_model.feature_importances_
# Bare expression: displays the importance array as the cell output.
importances

array([1.28796022e-02, 1.61203484e-02, 4.58746405e-03, ...,
       1.87367985e-05, 6.17654453e-05, 3.25046964e-06])

In [16]:
# Sort the features by importance, highest first, as (importance, name) pairs.
# Guard against stale notebook state: zip() silently truncates to the shorter
# input, and the feature-dropping loop further down rebinds X to a reduced
# column set. Without this check, re-running this cell out of order would pair
# importances with the wrong column names and raise no error.
assert len(importances) == len(X.columns), (
    'importances and X.columns are out of sync; re-run the cells above in order'
)
sorted_features = sorted(zip(importances, X.columns), reverse=True)
# Display only the top of the ranking; the full list has one entry per feature
# column and would flood the notebook output.
sorted_features[:25]

[(0.018558419644226703, 'audio_ft_acousticness'),
 (0.016120348406361288, 'audio_ft_energy'),
 (0.013759492304239115, 'audio_ft_speechiness'),
 (0.012879602218548126, 'audio_ft_danceability'),
 (0.012287168267296962, 'audio_ft_duration_ms'),
 (0.011544071130427352, 'audio_ft_instrumentalness'),
 (0.011004135005541498, 'audio_ft_valence'),
 (0.007687367421617996, 'audio_ft_tempo'),
 (0.007407305306099179, 'audio_ft_liveness'),
 (0.005448093349164862, 'love'),
 (0.005135047608053254, 'im'),
 (0.00506156044111541, 'baby'),
 (0.0048104495613703075, 'aint'),
 (0.00469749015955842, 'know'),
 (0.0046808652187268355, '?'),
 (0.004587464048735523, 'audio_ft_key'),
 (0.004576035934733442, 'like'),
 (0.00440368734712412, 'oh'),
 (0.004360684564801681, 'dont'),
 (0.004305611097156661, 'got'),
 (0.004275101693409484, 'yeah'),
 (0.0034644088339752742, 'get'),
 (0.003394729373796402, 'youre'),
 (0.003214708040559218, 'cause'),
 (0.0030723106007657826, 'audio_ft_mode'),
 (0.002961236747112174, 'go'),


# Dropping Lower Ranked Features

In [29]:
# For several cumulative-importance thresholds, keep only the top-ranked
# features needed to reach the threshold, refit the random forest on them, and
# record the test accuracy. features_accuracy maps threshold -> accuracy.
#
# NOTE: this loop rebinds the notebook-level names X, y, X_train, X_test,
# y_train, y_test, scaler, X_scaler, X_train_scaled, X_test_scaled, rf_model,
# predictions and acc_score. Re-run the earlier cells before reusing any of
# them for the full-feature model.
features_accuracy = {}
# Each value is a share of the total feature importance to retain — it is NOT
# a share of the number of feature columns.
feature_percents = [0.3, 0.4, 0.5, 0.6, 0.7]
# The target does not depend on the threshold, so build it once.
# (to_numpy() replaces the deprecated Series.ravel().)
y = filtered_lyric_TF_df['category_id'].to_numpy()
for percent in feature_percents:
    # Walk the ranking from most to least important, collecting feature names
    # until their summed importance reaches the threshold.
    important_features = []
    cumulative_importance = 0  # deliberately not called `sum`: that would shadow the builtin
    count = 0
    # The bounds check prevents an IndexError if the threshold can never be
    # reached (e.g. the importances total slightly less than the threshold).
    while cumulative_importance < percent and count < len(sorted_features):
        importance, feature_name = sorted_features[count]
        cumulative_importance += importance
        important_features.append(feature_name)
        count += 1
    X = filtered_lyric_TF_df[important_features]
    # Split into training and testing sets (same seed as the full-feature run)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model, timing only the fit itself.
    # perf_counter() is the monotonic clock intended for measuring intervals.
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.perf_counter()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.perf_counter()
    # The label names what `percent` actually is: a cumulative-importance
    # threshold, not a fraction of the feature count.
    print(f'Run time to fit model with features covering {percent:.0%} of total importance: {t1-t0} seconds')
    # Make predictions and record the test accuracy for this threshold
    predictions = rf_model.predict(X_test_scaled)
    acc_score = accuracy_score(y_test, predictions)
    features_accuracy[percent] = acc_score

Run time to fit model with 30.0% of total features: 7.6728739738464355 seconds
Run time to fit model with 40.0% of total features: 9.352822065353394 seconds
Run time to fit model with 50.0% of total features: 12.480792999267578 seconds
Run time to fit model with 60.0% of total features: 16.652825832366943 seconds
Run time to fit model with 70.0% of total features: 20.81318497657776 seconds


In [32]:
# Print each threshold with the test accuracy it achieved.
# The keys of features_accuracy are cumulative-importance thresholds (the loop
# that fills it adds features until their summed importance reaches the key),
# so the label says "of total importance" rather than "of total features".
# The `.0%` format also avoids float artefacts from `percent*100`.
for percent, acc_score in features_accuracy.items():
    print(f'Accuracy Score with features covering {percent:.0%} of total importance: {acc_score}')

Accuracy Score for 30.0% of total features: 0.5099800399201597
Accuracy Score for 40.0% of total features: 0.5159680638722555
Accuracy Score for 50.0% of total features: 0.5129740518962076
Accuracy Score for 60.0% of total features: 0.5074850299401198
Accuracy Score for 70.0% of total features: 0.5029940119760479


In [37]:
# Get the threshold with the highest accuracy score. On ties, max() keeps the
# first entry in insertion order, i.e. the smallest threshold tried.
best_percent, best_acc = max(features_accuracy.items(), key=operator.itemgetter(1))
# best_percent is a cumulative-importance threshold, not a fraction of the
# number of features, so the sentence is worded accordingly.
print(f'The highest accuracy score of {best_acc} was achieved using the features covering {best_percent:.0%} of total importance.')

The highest accuracy score of 0.5159680638722555 was achieved using 40.0% of all features.
