In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Create DataFrame from CSV file
music_df = pd.read_csv('../preliminary_dataframes/practice_data.csv')
music_df = music_df.drop('Unnamed: 2', axis=1)
music_df.head()

Unnamed: 0,song_name,artist_name,south,triangular,shaun,inteprint,jamn,dayeleyves,myrioscope,gurus,...,puckered,industries,blisters,balacleivka,silliness,welders,영원,delilahs,okhorwan,category/genre
0,Monster (Shawn Mendes & Justin Bieber),Shawn Mendes,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
1,positions,Ariana Grande,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
2,Therefore I Am,Billie Eilish,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
3,Errbody,Lil Baby,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
4,Whoopty,CJ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34


In [3]:
# Define the features set
X = music_df.copy()
X = X.drop(['song_name', 'artist_name', 'category/genre'], axis=1)
X.head()

Unnamed: 0,south,triangular,shaun,inteprint,jamn,dayeleyves,myrioscope,gurus,tubal,stimuli,...,sublumbunate,puckered,industries,blisters,balacleivka,silliness,welders,영원,delilahs,okhorwan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Define the target set
y = music_df['category/genre'].ravel()
y[:5]

array([34, 34, 34, 34, 34])

In [5]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(399, 59443)
(134, 59443)
(399,)
(134,)


In [6]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fitting the Model

In [7]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [8]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making Predictions

In [20]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([17, 23, 18, 33, 31,  6, 19, 31, 31, 31,  1, 33, 17, 17, 17,  6, 28,
       17, 33, 31, 34, 18,  9,  4,  3, 18, 31, 28, 17, 31, 13, 36,  9, 31,
       31, 28, 33, 31, 31,  3, 17, 33, 37,  6, 31, 21, 31, 31, 36, 33, 31,
       36, 28, 33, 17,  6, 18,  3,  3, 20, 33,  9,  8,  5,  3, 35, 33, 30,
       31, 17,  8, 33, 31, 25, 35, 29, 33, 34,  0,  9, 31, 23,  8, 33, 16,
       35, 18, 17, 31,  1, 29, 33, 29, 35, 17, 17, 33, 31, 34, 31, 12, 33,
       17, 31, 34, 31, 30, 31, 33, 31,  0, 35, 31, 31, 31, 19, 31,  6, 33,
        6, 31, 31,  1, 18,  6, 33, 37, 33,  1, 33, 31,  8, 29, 17])

# Model Evaluation

In [22]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.13432835820895522


In [23]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([9.72811094e-05, 0.00000000e+00, 0.00000000e+00, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [24]:
# Sort the features by importance
sorted(zip(importances, X.columns), reverse=True)

[(0.004023984640439881, 'im'),
 (0.0037573708631012687, 'yeah'),
 (0.003391450140605273, 'know'),
 (0.0032968157619402204, 'oh'),
 (0.0032079225754901904, 'dont'),
 (0.0031293646418839283, 'love'),
 (0.0031206832595246496, 'baby'),
 (0.002730780591036965, 'youre'),
 (0.002723226360494005, 'say'),
 (0.002682999995958051, 'got'),
 (0.0026052084911618637, 'get'),
 (0.002593820563535668, 'like'),
 (0.0024949950802810065, 'aint'),
 (0.0024705944742657335, 'see'),
 (0.002374338106226387, 'feel'),
 (0.0023192805342145987, 'cause'),
 (0.0023181780945443213, 'time'),
 (0.0023176246869266097, 'one'),
 (0.0022983536841055885, 'wanna'),
 (0.002256427398144394, 'ooh'),
 (0.0022350697583531985, 'go'),
 (0.0022086245571163966, 'want'),
 (0.002146151546125573, 'take'),
 (0.002080390711773558, 'back'),
 (0.0020673882315370823, 'cant'),
 (0.002061231542028301, 'heart'),
 (0.0020572084552276174, 'never'),
 (0.0020546038663212148, 'make'),
 (0.002007872521454089, 'tell'),
 (0.001999181941348124, 'ill'),
 

# Dropping Lower Ranked Features

In [26]:
# Define the features and target sets
X = music_df[['im', 'yeah', 'know', 'oh', 'dont', 'love', 'baby', 'youre', 'say', 'got', 
              'get', 'like', 'aint', 'see', 'feel', 'cause', 'time', 'one', 'wanna', 'ooh',
              'go', 'want', 'take', 'back', 'cant', 'heart', 'never', 'make', 'tell',
              'ill', 'think', 'come', 'life', 'let', 'need', 'way', 'said', 'ive', 'well',
              'girl', 'night', 'right', 'thats', 'away', 'good', 'eyes', 'please', 'mind',
              'world', 'light', 'fuck', 'seh', 'shit', 'look', 'day', 'man']]
y = music_df['category/genre'].ravel()
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
# Scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# Fit the random forest model
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)
# Make predictions
predictions = rf_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.15671641791044777
