In [1]:
#pip install imblearn

In [88]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from collections import Counter
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [89]:
# Create DataFrame from CSV
t0 = time.time()
filtered_lyric_SF_df = pd.read_csv('../../Data/filtered_lyric_SF.csv')
filtered_lyric_SF_df = filtered_lyric_SF_df.drop([0,1])
filtered_lyric_SF_df.index = filtered_lyric_SF_df.index - 2
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_SF_df.head(3)

Run time: 30.241528749465942 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,cut,ale,lack,slogan,libido,oop,scorch,muslim,heavyweight,mozambiqu
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# Create a dictionary of category names and ids
cat_name_id = {}
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    cat_id = list(filtered_lyric_SF_df[filtered_lyric_SF_df['category_name']==cat]['category_id'])[0]
    cat_name_id[cat] = cat_id
    print(cat, cat_id)

blues 0.0
classical 1.0
country 2.0
funk 3.0
hiphop 4.0
indie_alt 5.0
jazz 6.0
metal 7.0
pop 8.0
punk 9.0
rnb 10.0
rock 11.0
romance 12.0
soul 13.0


In [91]:
# Create a genre lists column
t0 = time.time()
genre_lists = []
for index, row in filtered_lyric_SF_df.iterrows():
    genres = row['genre_list'].replace('[', '').replace(']', '').replace("'", '').split(', ')
    genre_lists.append(genres)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(genre_lists)

Run time: 30.63402009010315 seconds


8010

In [92]:
# Replace the genre_list column
filtered_lyric_SF_df['genre_list'] = genre_lists

In [93]:
# Print how many songs there are in each category
for cat in category_list:
    song_count = len(filtered_lyric_SF_df[filtered_lyric_SF_df['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 423 songs in the blues category.
There are 85 songs in the classical category.
There are 1256 songs in the country category.
There are 280 songs in the funk category.
There are 682 songs in the hiphop category.
There are 654 songs in the indie_alt category.
There are 196 songs in the jazz category.
There are 892 songs in the metal category.
There are 692 songs in the pop category.
There are 569 songs in the punk category.
There are 362 songs in the rnb category.
There are 1228 songs in the rock category.
There are 280 songs in the romance category.
There are 411 songs in the soul category.


In [94]:
# Get the most popular genres for each category
t0 = time.time()
category_genres = {}
for category in category_list:
    genres = []
    for index, row in filtered_lyric_SF_df[filtered_lyric_SF_df['category_name']==category].iterrows():
        genres.extend(row['genre_list'])
    unique_genres = list(set(genres))
    genre_counts = []
    for genre in unique_genres:
        genre_counts.append((genre, genres.count(genre)))
    category_genres[category] = sorted(genre_counts, key = lambda x: x[1], reverse=True)[0:5]
    print(category, category_genres[category])
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

blues [('blues', 260), ('electric', 190), ('modern', 168), ('traditional', 146), ('classic', 106)]
classical [('classical', 13), ('operatic', 12), ('dance', 12), ('', 9), ('pop', 9)]
country [('country', 976), ('contemporary', 847), ('modern', 277), ('pop', 88), ('nashville', 78)]
funk [('funk', 132), ('soul', 117), ('motown', 96), ('quiet', 81), ('disco', 78)]
hiphop [('rap', 345), ('hip', 304), ('pop', 227), ('trap', 192), ('southern', 142)]
indie_alt [('indie', 290), ('alternative', 180), ('new', 153), ('modern', 148), ('art', 141)]
jazz [('vocal', 92), ('contemporary', 82), ('jazz', 77), ('adult', 63), ('neo', 38)]
metal [('metal', 365), ('alternative', 284), ('nu', 221), ('rock', 218), ('hard', 205)]
pop [('pop', 344), ('indie', 180), ('dance', 164), ('post-teen', 129), ('electropop', 129)]
punk [('pop', 195), ('punk', 138), ('skate', 94), ('alternative', 84), ('melodic', 72)]
rnb [('pop', 219), ('r&b', 213), ('urban', 200), ('dance', 169), ('hip', 136)]
rock [('rock', 638), ('mod

In [95]:
# Find indeces to keep and indeces to remove
t0 = time.time()
cat_genre_dict = {'pop': ['pop'], 
                  'hiphop': ['rap', 'hip', 'trap'],
                  'metal': ['metal'], 
                  'rock': ['classic'], 
                  'jazz': ['jazz'], 
                  'rnb': ['r&b'], 
                  'romance': ['soft'], 
                  'soul': ['soul'], 
                  'indie_alt': ['indie', 'alternative'],
                  'classical': ['classical', 'operatic', 'dance'],
                  'blues': ['blues', 'modern'],
                  'funk': ['funk', 'disco'], 
                  'punk': ['punk'],
                  'country': ['country']} 
indeces_to_keep = []
indeces_to_remove = []
for index, row in filtered_lyric_SF_df.iterrows():
    keep = False
    category = row['category_name']
    for genre in cat_genre_dict[category]:
        if genre in row['genre_list']:
            keep = True
            break
    if keep == True:
        indeces_to_keep.append(index)
    else:
        indeces_to_remove.append(index)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
print(len(indeces_to_keep))
len(indeces_to_remove)

Run time: 40.396340131759644 seconds
3975


4035

In [96]:
# Create non destructive copy and remove the unnecessary indeces
t0 = time.time()
filtered_lyric_SF_df2 = filtered_lyric_SF_df.drop(indeces_to_remove)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_SF_df2.head()

Run time: 0.24582409858703613 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,cut,ale,lack,slogan,libido,oop,scorch,muslim,heavyweight,mozambiqu
0,willow,Taylor Swift,pop,8.0,"[dance, pop]",0.392,0.574,7.0,1.0,0.17,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"[indie, pop, electropop]",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,r u ok,Tate McRae,pop,8.0,"[dance, pop, electropop, post-teen]",0.666,0.593,2.0,1.0,0.0373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,BOYSHIT,Madison Beer,pop,8.0,"[dance, pop, electropop, post-teen]",0.638,0.699,1.0,1.0,0.0892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Safe With Me (with Audrey Mika),Gryffin,pop,8.0,"[dance, pop, tropical, edm]",0.497,0.757,11.0,1.0,0.127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# Remove the romance, punk, and blues categories
filtered_lyric_SF_df2 = filtered_lyric_SF_df2[filtered_lyric_SF_df2['category_name']!='romance']
filtered_lyric_SF_df2 = filtered_lyric_SF_df2[filtered_lyric_SF_df2['category_name']!='punk']
filtered_lyric_SF_df2 = filtered_lyric_SF_df2[filtered_lyric_SF_df2['category_name']!='blues']
#filtered_lyric_SF_df2 = filtered_lyric_SF_df2[filtered_lyric_SF_df2['category_name']!='indie_alt']
len(filtered_lyric_SF_df2)

3443

In [98]:
# Print how many songs there are in each category 
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    song_count = len(filtered_lyric_SF_df2[filtered_lyric_SF_df2['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 0 songs in the blues category.
There are 33 songs in the classical category.
There are 976 songs in the country category.
There are 162 songs in the funk category.
There are 446 songs in the hiphop category.
There are 374 songs in the indie_alt category.
There are 77 songs in the jazz category.
There are 365 songs in the metal category.
There are 344 songs in the pop category.
There are 0 songs in the punk category.
There are 213 songs in the rnb category.
There are 245 songs in the rock category.
There are 0 songs in the romance category.
There are 208 songs in the soul category.


In [99]:
# Define the features set
X = filtered_lyric_SF_df2.copy()
X = X.drop(['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list'], axis=1)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,cut,ale,lack,slogan,libido,oop,scorch,muslim,heavyweight,mozambiqu
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.638,0.699,1.0,1.0,0.0892,0.0203,0.0,0.0879,0.57,121.122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.497,0.757,11.0,1.0,0.127,0.0449,0.0,0.119,0.507,169.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
# Define the target set
y = filtered_lyric_SF_df2['category_id'].ravel()
y[:5]

array([8., 8., 8., 8., 8.])

In [135]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2582, 138)
(861, 138)
(2582,)
(861,)


# Combination Sampling with SMOTEENN

In [123]:
# Perform combination sampling on the data
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [124]:
# Count the resampled classes
Counter(y_resampled)

Counter({1.0: 154,
         2.0: 12,
         3.0: 70,
         4.0: 80,
         5.0: 93,
         6.0: 30,
         7.0: 53,
         8.0: 90,
         10.0: 82,
         11.0: 60,
         13.0: 84})

# Fitting the Model

In [136]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [137]:
# Fit the model
t0 = time.time()
rf_model = rf_model.fit(X_resampled, y_resampled)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 2.033416271209717 seconds


# Making Predictions

In [138]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test)
predictions

array([ 8.,  7.,  5., 11.,  8.,  8.,  5.,  4.,  4., 11.,  5.,  5., 13.,
        8.,  8.,  3.,  8.,  5.,  8.,  5., 13.,  5., 10.,  4., 13.,  5.,
        4.,  5.,  4.,  5.,  4.,  8.,  8.,  8.,  8.,  8.,  5.,  3.,  8.,
        4.,  5.,  4.,  8., 13.,  5., 10.,  5.,  5., 13.,  4.,  7.,  5.,
       10.,  7.,  4.,  3.,  4.,  5., 13., 13.,  8.,  5.,  8., 13.,  5.,
        4.,  3.,  5.,  5., 13., 13.,  4.,  8.,  7.,  5.,  4.,  8.,  7.,
       11.,  4., 10., 13.,  4.,  4.,  5.,  7.,  3.,  5.,  4., 13., 11.,
        8., 13., 11., 13., 13.,  8., 10.,  4., 13.,  8.,  7., 10.,  7.,
        5.,  5.,  8.,  5.,  5.,  8.,  8., 13., 10., 13., 13.,  5.,  4.,
        7.,  5.,  5.,  4., 13.,  3.,  4.,  4., 13.,  1.,  8.,  4.,  3.,
        8.,  3.,  7., 13.,  4.,  4., 13.,  4.,  5.,  8.,  8.,  5., 11.,
        5., 11.,  5., 10.,  7.,  5., 13.,  8.,  5.,  8.,  5.,  4.,  5.,
        7.,  4., 10.,  5.,  8.,  7., 10.,  4.,  8., 13.,  4., 10., 13.,
        8.,  4.,  8.,  4.,  5.,  5.,  8.,  7.,  8., 10.,  4.,  4

# Model Evaluation

In [139]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.45876887340301975


In [140]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([0.04290899, 0.04920765, 0.0335598 , 0.02842143, 0.02915993,
       0.02530337, 0.02898101, 0.01293977, 0.01364331, 0.01417236,
       0.01356199, 0.01334987, 0.00997996, 0.01118537, 0.01039962,
       0.01115894, 0.01149039, 0.01463172, 0.01072608, 0.00886504,
       0.01056742, 0.00996549, 0.00893309, 0.00950364, 0.0094527 ,
       0.0106991 , 0.00759391, 0.00763332, 0.00875018, 0.00896561,
       0.00729331, 0.00848379, 0.00686183, 0.00730961, 0.00725927,
       0.00914373, 0.00788822, 0.00621423, 0.00669363, 0.0062641 ,
       0.00887075, 0.01148827, 0.00585503, 0.00630596, 0.0095139 ,
       0.00827613, 0.0054218 , 0.00735251, 0.00668796, 0.00552145,
       0.00534294, 0.00515001, 0.00459441, 0.00612318, 0.00676322,
       0.00625312, 0.00643701, 0.00311644, 0.00598721, 0.00594406,
       0.00580684, 0.0056959 , 0.00546445, 0.0059808 , 0.00709179,
       0.00562711, 0.00631502, 0.00526508, 0.00608759, 0.00486613,
       0.00464449, 0.00501282, 0.00535472, 0.00314016, 0.00479

In [141]:
# Sort the features by importance
sorted_features = sorted(zip(importances, X.columns), reverse=True)
sorted_features

[(0.0492076462187857, 'audio_ft_duration_ms'),
 (0.04290898889754002, 'audio_ft_danceability'),
 (0.03355980038707812, 'audio_ft_acousticness'),
 (0.029159927820578024, 'audio_ft_speechiness'),
 (0.028981013836630206, 'audio_ft_valence'),
 (0.02842143214218124, 'audio_ft_instrumentalness'),
 (0.02530337486970485, 'audio_ft_energy'),
 (0.01463172065076934, 'nigga'),
 (0.014172357670675633, 'know'),
 (0.013643310468303879, 'love'),
 (0.013561988800401657, 'yeah'),
 (0.013349873154424562, 'audio_ft_tempo'),
 (0.01293977473905827, 'oh'),
 (0.011490389300540624, 'im'),
 (0.011488274645624996, 'feel'),
 (0.011185368702652078, 'aint'),
 (0.011158941983525374, 'dont'),
 (0.010726075186303497, 'gon'),
 (0.01069909778611902, 'your'),
 (0.010567418830040393, 'time'),
 (0.010399618163109811, 'like'),
 (0.009979963440806601, 'babi'),
 (0.009965494184957369, 'audio_ft_liveness'),
 (0.009513897597668505, 'english'),
 (0.00950363905950091, 'one'),
 (0.009452695652422516, 'got'),
 (0.009143734437059651

# Dropping Lower Ranked Features

In [146]:
# Create a dictionary of features and accuracy scores
features_accuracy = {}
feature_percents = [0.3, 0.4, 0.5]
for percent in feature_percents:
    # Define the features and target sets
    important_features = []
    sum = 0
    count = 0
    while sum < percent:
        sum = sum + sorted_features[count][0]
        important_features.append(sorted_features[count][1])
        count+=1
    X = filtered_lyric_SF_df2[important_features]
    y = filtered_lyric_SF_df2['category_id'].ravel()
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.time()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.time()
    print(f'Run time to fit model with {percent*100}% of total features: {t1-t0} seconds')
    # Make predictions
    predictions = rf_model.predict(X_test_scaled)
    # Evaluate the model
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=['Actual classical', 'Actual country', 'Actual funk',
               'Actual hiphop', 'Actual indie_alt', 'Actual jazz', 'Actual metal',
               'Actual pop', 'Actual rnb', 'Actual rock', 'Actual soul'],
        columns=['Predicted classical', 'Predicted country',
                 'Predicted funk', 'Predicted hiphop', 'Predicted indie_alt', 
                 'Predicted jazz', 'Predicted metal', 'Predicted pop',
                 'Predicted rnb', 'Predicted rock', 'Predicted soul'])
    acc_score = accuracy_score(y_test, predictions)
    print('Confusion Matrix')
    display(cm_df)
    print(f'Accuracy Score: {acc_score}')
    print('Classification Report')
    print(classification_report(y_test, predictions))
    features_accuracy[percent] = acc_score

Run time to fit model with 30.0% of total features: 3.5804247856140137 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,4,0,0,0,2,0,2,0,0,0,0
Actual country,1,216,0,0,10,0,3,6,2,2,1
Actual funk,0,3,22,3,10,0,1,0,2,1,1
Actual hiphop,0,1,0,109,0,0,0,8,4,1,0
Actual indie_alt,1,20,1,1,41,0,13,2,0,4,0
Actual jazz,2,7,1,0,8,7,0,0,1,2,2
Actual metal,0,2,0,0,5,0,72,0,0,3,0
Actual pop,0,55,0,9,5,0,0,25,3,0,1
Actual rnb,0,10,3,7,0,0,1,6,17,2,0
Actual rock,0,29,4,0,5,0,7,1,2,16,0


Accuracy Score: 0.6236933797909407
Classification Report
              precision    recall  f1-score   support

         1.0       0.50      0.50      0.50         8
         2.0       0.60      0.90      0.72       241
         3.0       0.61      0.51      0.56        43
         4.0       0.84      0.89      0.86       123
         5.0       0.47      0.49      0.48        83
         6.0       1.00      0.23      0.38        30
         7.0       0.73      0.88      0.80        82
         8.0       0.51      0.26      0.34        98
        10.0       0.49      0.37      0.42        46
        11.0       0.46      0.25      0.32        64
        13.0       0.62      0.19      0.29        43

    accuracy                           0.62       861
   macro avg       0.62      0.50      0.51       861
weighted avg       0.62      0.62      0.59       861

Run time to fit model with 40.0% of total features: 3.7705259323120117 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,4,0,0,0,2,0,2,0,0,0,0
Actual country,1,222,0,1,9,0,3,2,2,1,0
Actual funk,0,7,24,3,6,0,0,0,1,0,2
Actual hiphop,0,1,1,114,2,0,0,4,1,0,0
Actual indie_alt,0,20,1,1,44,0,10,3,0,4,0
Actual jazz,2,7,1,1,8,7,0,0,1,2,1
Actual metal,0,4,0,0,4,0,72,0,0,2,0
Actual pop,0,48,0,7,10,0,0,33,0,0,0
Actual rnb,0,8,3,6,1,0,1,9,18,0,0
Actual rock,0,31,4,0,7,0,8,0,0,14,0


Accuracy Score: 0.6480836236933798
Classification Report
              precision    recall  f1-score   support

         1.0       0.57      0.50      0.53         8
         2.0       0.60      0.92      0.73       241
         3.0       0.60      0.56      0.58        43
         4.0       0.86      0.93      0.89       123
         5.0       0.46      0.53      0.49        83
         6.0       1.00      0.23      0.38        30
         7.0       0.75      0.88      0.81        82
         8.0       0.65      0.34      0.44        98
        10.0       0.64      0.39      0.49        46
        11.0       0.56      0.22      0.31        64
        13.0       0.67      0.14      0.23        43

    accuracy                           0.65       861
   macro avg       0.67      0.51      0.53       861
weighted avg       0.66      0.65      0.61       861

Run time to fit model with 50.0% of total features: 3.8400771617889404 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted indie_alt,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,4,0,0,0,3,0,1,0,0,0,0
Actual country,1,224,0,0,8,0,4,2,2,0,0
Actual funk,0,6,21,3,8,0,0,0,3,1,1
Actual hiphop,0,0,0,115,3,0,0,3,2,0,0
Actual indie_alt,0,23,0,1,45,0,10,1,0,3,0
Actual jazz,3,7,1,1,8,8,0,0,0,2,0
Actual metal,0,4,0,0,4,0,72,0,0,2,0
Actual pop,0,50,0,8,10,0,0,29,1,0,0
Actual rnb,0,11,3,8,2,0,0,8,14,0,0
Actual rock,0,30,2,1,6,0,9,0,0,16,0


Accuracy Score: 0.6445993031358885
Classification Report
              precision    recall  f1-score   support

         1.0       0.50      0.50      0.50         8
         2.0       0.60      0.93      0.73       241
         3.0       0.64      0.49      0.55        43
         4.0       0.84      0.93      0.88       123
         5.0       0.45      0.54      0.49        83
         6.0       1.00      0.27      0.42        30
         7.0       0.75      0.88      0.81        82
         8.0       0.67      0.30      0.41        98
        10.0       0.52      0.30      0.38        46
        11.0       0.59      0.25      0.35        64
        13.0       0.88      0.16      0.27        43

    accuracy                           0.64       861
   macro avg       0.68      0.50      0.53       861
weighted avg       0.67      0.64      0.61       861



In [147]:
# Print feature percents and accuracy scores
for percent in features_accuracy:
    acc_score = features_accuracy[percent]
    print(f'Accuracy Score for {percent*100}% of total features: {acc_score}')

Accuracy Score for 30.0% of total features: 0.6236933797909407
Accuracy Score for 40.0% of total features: 0.6480836236933798
Accuracy Score for 50.0% of total features: 0.6445993031358885


In [148]:
# Get the highest accuracy score
best_percent = max(features_accuracy, key=features_accuracy.get)
best_acc = features_accuracy[best_percent]
print(f'The highest accuracy score of {best_acc} was achieved using {best_percent*100}% of all features.')

The highest accuracy score of 0.6480836236933798 was achieved using 40.0% of all features.


# Accuracy score for SMOTEENN
- The highest accuracy score of 0.6434378629500581 was achieved using 20.0% of all features.

# Accuracy scores with different data
- Original <br>
The highest accuracy score of 0.5296296296296297 was achieved using 30.0% of all features.
- Remove romance category <br>
The highest accuracy score of 0.5805843543826579 was achieved using 30.0% of all features.
- Remove punksteam genre from punk category <br>
The highest accuracy score of 0.5872865275142315 was achieved using 40.0% of all features.
<br> [0.2 - 0.5] <br>
- Remove electropop genre from pop category <br>
The highest accuracy score of 0.5916030534351145 was achieved using 20.0% of all features. 
- Remove motown genre from soul category <br>
The highest accuracy score of 0.6032660902977905 was achieved using 40.0% of all features.
- Add dance genre to classical category <br>
The highest accuracy score of 0.6120689655172413 was achieved using 20.0% of all features.
<br> [0.1 - 0.5] <br>
- Add trap genre to hiphop category <br>
The highest accuracy score of 0.6213408876298395 was achieved using 30.0% of all features.
- Add nashville genre to country category <br>
The highest accuracy score of 0.6053604436229205 was achieved using 40.0% of all features. ??
- Replace rock genre with classic <br>
The highest accuracy score of 0.639344262295082 was achieved using 20.0% of all features.
- Replace blues genre with classic <br>
The highest accuracy score of 0.650319829424307 was achieved using 20.0% of all features.
<br> [0.1 - 0.4] <br>
- Replace blues genre with modern <br>
The highest accuracy score of 0.6509433962264151 was achieved using 30.0% of all features.

<br>
The highest accuracy score of 0.6119402985074627 was achieved using 40.0% of all features.
<br>
- Remove punk category <br>
The highest accuracy score of 0.6245847176079734 was achieved using 20.0% of all features. <br>
- Add blues genre <br>
The highest accuracy score of 0.6446808510638298 was achieved using 20.0% of all features. <br>
- Remove blues category <br>
The highest accuracy score of 0.6515679442508711 was achieved using 20.0% of all features. <br>
- Remove indie category <br>
The highest accuracy score of 0.7044270833333334 was achieved using 30.0% of all features. <br>

