In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [24]:
# Load the full and the genre-filtered lyric term-frequency tables.
# The row labelled 0 is discarded from each table as it is read.
# NOTE(review): the reason the first row is discarded is not visible in this notebook — confirm.
lyric_TF_df = pd.read_csv('../Data/lyric_TF.csv').drop(index=0)
filtered_lyric_TF_df = pd.read_csv('../Data/filtered_lyric_TF.csv').drop(index=0)

# Shift the remaining row labels down by one so they start at 0 again
lyric_TF_df.index -= 1
filtered_lyric_TF_df.index -= 1

filtered_lyric_TF_df.head()

Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,filtered_genres,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,...,embrace,flames,tearin,situation,trojan,difference,head,time,end,aphrodite
0,Monster (Shawn Mendes & Justin Bieber),Shawn Mendes,toplists,32.0,"['pop', 'viral', 'canadian', 'post-teen', 'dan...","['pop', 'viral', 'canadian', 'dance']",0.652,0.383,2.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Therefore I Am,Billie Eilish,toplists,32.0,"['pop', 'electropop']",['pop'],0.889,0.34,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,positions,Ariana Grande,toplists,32.0,"['pop', 'post-teen']",['pop'],0.736,0.802,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,HOLIDAY,Lil Nas X,toplists,32.0,"['lgbtq+', 'pop', 'queer', 'country']","['pop', 'country']",0.81,0.511,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,On Me,Lil Baby,toplists,32.0,"['atl', 'rap', 'trap']","['atl', 'rap', 'trap']",0.856,0.564,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Feature matrix: every column except the descriptive text columns and the
# target ('category_id'). drop() returns a new frame, so the source table
# itself is left untouched.
X = filtered_lyric_TF_df.drop(
    columns=[
        'song_name',
        'artist_name',
        'category_name',
        'category_id',
        'genre_list',
        'filtered_genres',
    ]
)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,embrace,flames,tearin,situation,trojan,difference,head,time,end,aphrodite
0,0.652,0.383,2.0,0.0,0.0516,0.0676,0.0,0.0828,0.549,145.765,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.889,0.34,11.0,0.0,0.0697,0.218,0.13,0.055,0.716,94.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.736,0.802,0.0,1.0,0.0864,0.468,0.0,0.094,0.675,144.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.81,0.511,5.0,0.0,0.164,0.12,0.0,0.0832,0.837,151.947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.856,0.564,6.0,0.0,0.392,0.00327,0.0,0.134,0.483,77.972,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Define the target set: the category id of every song as a 1-D NumPy array.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# since pandas 2.2 (a Series is already one-dimensional).
y = filtered_lyric_TF_df['category_id'].to_numpy()
y[:5]

array([32., 32., 32., 32., 32.])

In [27]:
# Hold out a test set (default split proportions, fixed seed) and report
# the shape of each of the four resulting pieces, one per line
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(343, 3503)
(115, 3503)
(343,)
(115,)


In [28]:
# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split first, then to the test split
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

# Fitting the Model

In [29]:
# Set up an unfitted random forest: 500 trees, seeded so runs are repeatable
rf_model = RandomForestClassifier(random_state=78, n_estimators=500)

In [30]:
# Train the forest on the scaled training features and their category ids
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

# Making Predictions

In [31]:
# Predict a category id for every song in the scaled test split
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([31., 30., 22., 27., 31., 27., 21.,  6., 21., 27., 15., 13., 32.,
       34., 13., 21., 24., 30., 31., 30., 30., 27.,  3., 15., 30., 15.,
        6., 14., 21., 31., 24., 27., 21., 30., 31., 27., 34.,  3., 21.,
       27., 27., 21., 13., 31., 30., 21., 30., 30., 13., 15., 30., 34.,
        3., 30., 30., 31., 27., 30., 30., 31., 29., 30., 30., 32., 30.,
       31., 22., 31., 31., 31.,  6., 13., 30.,  3., 26., 29., 21., 27.,
        8., 34., 30., 30., 13.,  6., 14., 30., 24., 34.,  3., 14., 31.,
       21., 13., 34., 25.,  2., 14.,  6., 27., 34., 30., 15., 21., 15.,
       14., 21., 15.,  8., 21., 27., 15., 13., 13.,  0., 21.])

# Model Evaluation

In [32]:
# Share of test songs whose predicted category id matches the true one
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.21739130434782608


In [33]:
# Read the per-feature importances from the fitted random forest.
# The array is positional: the cell that ranks the features pairs it with
# X.columns, so X must still be the frame the model was trained on.
importances = rf_model.feature_importances_
importances

array([0.01246891, 0.01492085, 0.00563795, ..., 0.00325996, 0.00121955,
       0.00010098])

In [34]:
# Pair each importance with its column name, then order the pairs from the
# most to the least important feature
sorted_features = list(zip(importances, X.columns))
sorted_features.sort(reverse=True)
sorted_features

[(0.016881727450647994, 'audio_ft_acousticness'),
 (0.014920850224812612, 'audio_ft_energy'),
 (0.012738362190869513, 'audio_ft_speechiness'),
 (0.012468912024795607, 'audio_ft_danceability'),
 (0.012359759032534874, 'audio_ft_valence'),
 (0.009678725379002572, 'audio_ft_duration_ms'),
 (0.009440622481246035, 'audio_ft_instrumentalness'),
 (0.009071739099956079, 'Unnamed: 18'),
 (0.007348560273207401, 'audio_ft_tempo'),
 (0.0069778696419316825, 'audio_ft_liveness'),
 (0.006168564466999422, 'know'),
 (0.006108189046258794, 'im'),
 (0.005637954824839265, 'audio_ft_key'),
 (0.005425392089849414, 'dont'),
 (0.005301905650121557, 'love'),
 (0.005100170870969183, '?'),
 (0.004910138577359585, 'got'),
 (0.004747877393528886, 'baby'),
 (0.004504047536876694, 'yeah'),
 (0.004410277286174885, 'oh'),
 (0.004375791195816822, 'like'),
 (0.004123310012993835, 'go'),
 (0.00391677285336589, 'say'),
 (0.0038916694578910497, 'aint'),
 (0.0038802922360005525, 'youre'),
 (0.0037914245437913395, 'never'),


# Dropping Lower Ranked Features

In [40]:
# Retrain the random forest on the top-ranked features only.
#
# Features are taken from `sorted_features` (most important first) until their
# cumulative importance reaches IMPORTANCE_THRESHOLD; the feature that crosses
# the threshold is included. The model is then re-split, re-scaled, re-fitted
# and re-scored with the same seeds as the full-feature run.
#
# NOTE: this cell rebinds X, y, the train/test splits, the scaler and rf_model.
# Re-running the earlier importance-ranking cells after this one would rank
# the reduced model instead, so run the notebook top to bottom.
IMPORTANCE_THRESHOLD = 0.6  # cumulative share of total feature importance to keep

important_features = []
cumulative_importance = 0.0  # deliberately not named `sum`, which would shadow the builtin
for feature_importance, feature_name in sorted_features:
    # Iterating (rather than indexing with a counter) cannot run past the end
    # of the list if the threshold is never reached.
    if cumulative_importance >= IMPORTANCE_THRESHOLD:
        break
    cumulative_importance += feature_importance
    important_features.append(feature_name)
count = len(important_features)  # number of features kept; retained for later cells that may read it

# Define the reduced features set and the target set
X = filtered_lyric_TF_df[important_features]
# Series.to_numpy() replaces Series.ravel(), which is deprecated since pandas 2.2
y = filtered_lyric_TF_df['category_id'].to_numpy()

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Scale the data (scaler fitted on the training split only)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit the random forest model
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)

# Make predictions and report the accuracy on the test split
predictions = rf_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.23478260869565218
