In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Create DataFrames from CSV files
t0 = time.time()
filtered_lyric_TF_df = pd.read_csv('../../Data/filtered_lyric_TF.csv')
filtered_lyric_TF_df = filtered_lyric_TF_df.drop([0,1])
filtered_lyric_TF_df.index = filtered_lyric_TF_df.index - 2
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_TF_df.head(3)

Run time: 55.6153769493103 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Create a dictionary of category names and ids
cat_name_id = {}
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    cat_id = list(filtered_lyric_TF_df[filtered_lyric_TF_df['category_name']==cat]['category_id'])[0]
    cat_name_id[cat] = cat_id
    print(cat, cat_id)

blues 0.0
classical 1.0
country 2.0
funk 3.0
hiphop 4.0
indie_alt 5.0
jazz 6.0
metal 7.0
pop 8.0
punk 9.0
rnb 10.0
rock 11.0
romance 12.0
soul 13.0


In [4]:
# Create a genre lists column
t0 = time.time()
genre_lists = []
for index, row in filtered_lyric_TF_df.iterrows():
    genres = row['genre_list'].replace('[', '').replace(']', '').replace("'", '').split(', ')
    genre_lists.append(genres)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(genre_lists)

Run time: 43.99636101722717 seconds


8041

In [5]:
# Replace the genre_list column
filtered_lyric_TF_df['genre_list'] = genre_lists

In [6]:
# Print how many songs there are in each category
for cat in category_list:
    song_count = len(filtered_lyric_TF_df[filtered_lyric_TF_df['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 424 songs in the blues category.
There are 95 songs in the classical category.
There are 1257 songs in the country category.
There are 282 songs in the funk category.
There are 683 songs in the hiphop category.
There are 660 songs in the indie_alt category.
There are 198 songs in the jazz category.
There are 898 songs in the metal category.
There are 693 songs in the pop category.
There are 569 songs in the punk category.
There are 362 songs in the rnb category.
There are 1229 songs in the rock category.
There are 280 songs in the romance category.
There are 411 songs in the soul category.


In [7]:
# Get the most popular genres for each category
t0 = time.time()
category_genres = {}
for category in category_list:
    genres = []
    for index, row in filtered_lyric_TF_df[filtered_lyric_TF_df['category_name']==category].iterrows():
        genres.extend(row['genre_list'])
    unique_genres = list(set(genres))
    genre_counts = []
    for genre in unique_genres:
        genre_counts.append((genre, genres.count(genre)))
    category_genres[category] = sorted(genre_counts, key = lambda x: x[1], reverse=True)[0:5]
    print(category, category_genres[category])
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

blues [('blues', 260), ('electric', 190), ('modern', 168), ('traditional', 146), ('classic', 106)]
classical [('classical', 17), ('operatic', 14), ('dance', 12), ('', 9), ('soundtrack', 9)]
country [('country', 977), ('contemporary', 847), ('modern', 277), ('pop', 88), ('nashville', 78)]
funk [('funk', 133), ('soul', 118), ('motown', 97), ('quiet', 82), ('disco', 79)]
hiphop [('rap', 346), ('hip', 305), ('pop', 228), ('trap', 192), ('southern', 143)]
indie_alt [('indie', 291), ('alternative', 181), ('new', 155), ('modern', 148), ('art', 144)]
jazz [('vocal', 93), ('contemporary', 83), ('jazz', 79), ('adult', 64), ('neo', 38)]
metal [('metal', 368), ('alternative', 286), ('nu', 222), ('rock', 221), ('hard', 207)]
pop [('pop', 344), ('indie', 180), ('dance', 164), ('electropop', 129), ('post-teen', 129)]
punk [('pop', 195), ('punk', 138), ('skate', 94), ('alternative', 84), ('melodic', 72)]
rnb [('pop', 219), ('r&b', 213), ('urban', 200), ('dance', 169), ('hip', 136)]
rock [('rock', 638)

In [8]:
# Find indeces to keep and indeces to remove
t0 = time.time()
cat_genre_dict = {'pop': ['pop'], 
                  'hiphop': ['rap', 'hip', 'trap'],
                  'metal': ['metal'], 
                  'rock': ['classic'], 
                  'jazz': ['jazz'], 
                  'rnb': ['r&b'], 
                  'romance': ['soft'], 
                  'soul': ['soul'], 
                  'indie_alt': ['indie', 'alternative'],
                  'classical': ['classical', 'operatic', 'dance'], 
                  'blues': ['modern', 'blues'],
                  'funk': ['funk', 'disco'], 
                  'punk': ['punk'],
                  'country': ['country']} 
indeces_to_keep = []
indeces_to_remove = []
for index, row in filtered_lyric_TF_df.iterrows():
    keep = False
    category = row['category_name']
    for genre in cat_genre_dict[category]:
        if genre in row['genre_list']:
            keep = True
            break
    if keep == True:
        indeces_to_keep.append(index)
    else:
        indeces_to_remove.append(index)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
print(len(indeces_to_keep))
len(indeces_to_remove)

Run time: 64.14870500564575 seconds
3990


4051

In [29]:
# Create non destructive copy and remove the unnecessary indeces
t0 = time.time()
filtered_lyric_TF_df2 = filtered_lyric_TF_df.drop(indeces_to_remove)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_lyric_TF_df2.head(3)

Run time: 1.8691949844360352 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,willow,Taylor Swift,pop,8.0,"[dance, pop]",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"[indie, pop, electropop]",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,r u ok,Tate McRae,pop,8.0,"[dance, pop, electropop, post-teen]",0.666,0.593,2.0,1.0,0.0373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Remove categories
filtered_lyric_TF_df2 = filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']!='romance']
filtered_lyric_TF_df2 = filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']!='punk']
filtered_lyric_TF_df2 = filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']!='blues']
filtered_lyric_TF_df2 = filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']!='indie_alt']
len(filtered_lyric_TF_df2)

3082

In [51]:
# Print how many songs there are in each category 
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    song_count = len(filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 0 songs in the blues category.
There are 38 songs in the classical category.
There are 977 songs in the country category.
There are 163 songs in the funk category.
There are 447 songs in the hiphop category.
There are 0 songs in the indie_alt category.
There are 79 songs in the jazz category.
There are 368 songs in the metal category.
There are 344 songs in the pop category.
There are 0 songs in the punk category.
There are 213 songs in the rnb category.
There are 245 songs in the rock category.
There are 0 songs in the romance category.
There are 208 songs in the soul category.


In [52]:
# Define the features set
X = filtered_lyric_TF_df2.copy()
X = X.drop(['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list'], axis=1)
X.head(3)

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,professed,plottin,sideline,sufficient,girly,reek,duffel,bitter,staff,eighth
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Define the target set
y = filtered_lyric_TF_df2['category_id'].ravel()
y[:5]

array([8., 8., 8., 8., 8.])

In [54]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2311, 12081)
(771, 12081)
(2311,)
(771,)


In [55]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fitting the Model

In [56]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [57]:
# Fit the model
t0 = time.time()
rf_model = rf_model.fit(X_train_scaled, y_train)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 32.57457399368286 seconds


# Making Predictions

In [58]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([ 2.,  7.,  2.,  4.,  2.,  2., 11.,  2.,  2.,  2.,  3.,  2.,  4.,
        4.,  2.,  2.,  2.,  2.,  2.,  7.,  2.,  2.,  7.,  4.,  2.,  2.,
        7.,  2.,  2.,  2.,  8.,  4.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  4.,  2., 11.,  2., 13.,  4.,  2.,  7.,  4.,  2.,  2.,
        7.,  2.,  2.,  7.,  4.,  2.,  8.,  7.,  2.,  2.,  2.,  2.,  4.,
        2.,  2.,  7.,  2.,  4.,  7.,  2.,  2.,  4.,  4.,  2.,  7.,  2.,
        2.,  2.,  4.,  2.,  2.,  2.,  2.,  7.,  7.,  4.,  2.,  4.,  4.,
        2.,  2.,  4.,  2.,  4.,  2.,  2.,  7.,  7.,  4.,  4., 13., 11.,
        2.,  2.,  2.,  4., 10.,  2.,  2.,  2.,  2.,  4.,  8.,  2.,  2.,
        2.,  2., 13.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  4.,  4.,  4.,
        7.,  2.,  2.,  2.,  2.,  4.,  2.,  4.,  3.,  2.,  2.,  4.,  4.,
        2.,  2.,  4.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  4.,  2.,  7.,
        2.,  2.,  2.,  2.,  2.,  7.,  2.,  7.,  2.,  2.,  4.,  2.,  3.,
        2.,  2.,  2.,  7.,  2.,  2.,  2.,  2.,  2.,  7.,  4.,  2

# Model Evaluation

In [60]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from cm
cm_df = pd.DataFrame(
    cm, index=['Actual classical', 'Actual country', 'Actual funk',
               'Actual hiphop', 'Actual jazz', 'Actual metal',
               'Actual pop', 'Actual rnb', 'Actual rock', 'Actual soul'],
        columns=['Predicted classical', 'Predicted country',
                 'Predicted funk', 'Predicted hiphop',
                 'Predicted jazz', 'Predicted metal', 'Predicted pop',
                 'Predicted rnb', 'Predicted rock', 'Predicted soul'])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [61]:
# Display the results
print('Confusion Matrix')
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print('Classification Report')
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,0,4,0,0,0,3,0,0,0,0
Actual country,0,251,0,0,0,0,0,0,0,0
Actual funk,0,17,5,2,0,2,0,0,0,2
Actual hiphop,0,5,0,112,0,0,0,0,0,0
Actual jazz,0,21,0,2,2,1,0,0,0,0
Actual metal,0,11,0,0,0,72,0,0,0,0
Actual pop,0,84,0,7,0,0,8,0,0,0
Actual rnb,0,26,1,16,0,0,0,7,0,0
Actual rock,0,50,0,0,0,5,0,0,5,0
Actual soul,0,33,4,0,1,1,0,1,0,10


Accuracy Score: 0.6121919584954605
Classification Report
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         7
         2.0       0.50      1.00      0.67       251
         3.0       0.50      0.18      0.26        28
         4.0       0.81      0.96      0.88       117
         6.0       0.67      0.08      0.14        26
         7.0       0.86      0.87      0.86        83
         8.0       1.00      0.08      0.15        99
        10.0       0.88      0.14      0.24        50
        11.0       1.00      0.08      0.15        60
        13.0       0.83      0.20      0.32        50

    accuracy                           0.61       771
   macro avg       0.70      0.36      0.37       771
weighted avg       0.73      0.61      0.52       771



  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([1.53733969e-02, 1.72578062e-02, 3.64715267e-03, ...,
       1.71522061e-04, 1.11989092e-05, 0.00000000e+00])

In [63]:
# Sort the features by importance
sorted_features = sorted(zip(importances, X.columns), reverse=True)
sorted_features

[(0.020251366926751647, 'audio_ft_acousticness'),
 (0.0181206103294219, 'audio_ft_speechiness'),
 (0.017257806210607494, 'audio_ft_energy'),
 (0.015373396868211937, 'audio_ft_danceability'),
 (0.013884911382525738, 'audio_ft_instrumentalness'),
 (0.012851121826203299, 'audio_ft_duration_ms'),
 (0.010879247577774465, 'audio_ft_valence'),
 (0.006586959890248623, 'audio_ft_mode'),
 (0.006447175530099414, 'audio_ft_liveness'),
 (0.006003781057984915, 'love'),
 (0.005937995689179072, 'audio_ft_tempo'),
 (0.005925932685882083, 'shit'),
 (0.005731212981330154, 'niggas'),
 (0.005445124829506516, '?'),
 (0.005428653824782276, 'im'),
 (0.005410813370576549, 'oh'),
 (0.005361101231935757, 'nigga'),
 (0.005302438294613615, 'like'),
 (0.004884859202791989, 'baby'),
 (0.004767016819806854, 'know'),
 (0.004672300242319825, 'aint'),
 (0.004574692609614122, 'yeah'),
 (0.004536695151928711, 'fuck'),
 (0.004354246999593807, 'got'),
 (0.004225705510508866, 'dont'),
 (0.004177679929690217, 'bitch'),
 (0.00

# Dropping Lower Ranked Features

In [64]:
# Create a dictionary of features and accuracy scores
features_accuracy = {}
feature_percents = [0.1, 0.2, 0.3]
for percent in feature_percents:
    # Define the features and target sets
    important_features = []
    sum = 0
    count = 0
    while sum < percent:
        sum = sum + sorted_features[count][0]
        important_features.append(sorted_features[count][1])
        count+=1
    X = filtered_lyric_TF_df2[important_features]
    y = filtered_lyric_TF_df2['category_id'].ravel()
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.time()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.time()
    print(f'Run time to fit model with {percent*100}% of total features: {t1-t0} seconds')
    # Make predictions
    predictions = rf_model.predict(X_test_scaled)
    # Evaluations
    cm = confusion_matrix(y_test, predictions)
    cm_df = pd.DataFrame(
        cm, index=['Actual classical', 'Actual country', 'Actual funk',
                   'Actual hiphop', 'Actual jazz', 'Actual metal',
                   'Actual pop', 'Actual rnb', 'Actual rock', 'Actual soul'],
            columns=['Predicted classical', 'Predicted country',
                     'Predicted funk', 'Predicted hiphop',
                     'Predicted jazz', 'Predicted metal', 'Predicted pop',
                     'Predicted rnb', 'Predicted rock', 'Predicted soul'])
    acc_score = accuracy_score(y_test, predictions)
    print('Confusion Matrix')
    display(cm_df)
    print(f'Accuracy Score: {acc_score}')
    print('Classification Report')
    print(classification_report(y_test, predictions))
    features_accuracy[percent] = acc_score

Run time to fit model with 10.0% of total features: 3.047375202178955 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,3,0,0,0,1,2,0,0,1,0
Actual country,0,222,2,5,1,2,7,3,4,5
Actual funk,0,7,13,2,1,0,1,2,1,1
Actual hiphop,0,3,3,96,0,0,10,3,1,1
Actual jazz,2,6,2,1,12,0,1,1,1,0
Actual metal,0,8,0,0,0,70,0,1,4,0
Actual pop,0,43,1,13,1,0,35,4,0,2
Actual rnb,0,10,4,11,0,0,9,13,3,0
Actual rock,2,26,4,3,0,5,2,1,15,2
Actual soul,1,24,4,1,1,0,2,4,5,8


Accuracy Score: 0.6316472114137484
Classification Report
              precision    recall  f1-score   support

         1.0       0.38      0.43      0.40         7
         2.0       0.64      0.88      0.74       251
         3.0       0.39      0.46      0.43        28
         4.0       0.73      0.82      0.77       117
         6.0       0.71      0.46      0.56        26
         7.0       0.89      0.84      0.86        83
         8.0       0.52      0.35      0.42        99
        10.0       0.41      0.26      0.32        50
        11.0       0.43      0.25      0.32        60
        13.0       0.42      0.16      0.23        50

    accuracy                           0.63       771
   macro avg       0.55      0.49      0.50       771
weighted avg       0.61      0.63      0.60       771

Run time to fit model with 20.0% of total features: 2.949061870574951 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,3,1,0,0,0,2,0,0,1,0
Actual country,0,236,1,0,2,3,4,2,2,1
Actual funk,0,5,17,0,0,0,0,1,2,3
Actual hiphop,0,1,0,109,0,0,7,0,0,0
Actual jazz,2,8,1,2,11,0,0,1,1,0
Actual metal,0,7,0,0,0,71,1,0,4,0
Actual pop,0,37,3,6,1,0,50,2,0,0
Actual rnb,0,4,3,8,0,0,10,22,2,1
Actual rock,1,27,6,0,1,5,1,1,17,1
Actual soul,0,26,6,0,1,0,1,3,4,9


Accuracy Score: 0.7068741893644618
Classification Report
              precision    recall  f1-score   support

         1.0       0.50      0.43      0.46         7
         2.0       0.67      0.94      0.78       251
         3.0       0.46      0.61      0.52        28
         4.0       0.87      0.93      0.90       117
         6.0       0.69      0.42      0.52        26
         7.0       0.88      0.86      0.87        83
         8.0       0.68      0.51      0.58        99
        10.0       0.69      0.44      0.54        50
        11.0       0.52      0.28      0.37        60
        13.0       0.60      0.18      0.28        50

    accuracy                           0.71       771
   macro avg       0.65      0.56      0.58       771
weighted avg       0.70      0.71      0.68       771

Run time to fit model with 30.0% of total features: 3.130047082901001 seconds
Confusion Matrix


Unnamed: 0,Predicted classical,Predicted country,Predicted funk,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted rnb,Predicted rock,Predicted soul
Actual classical,3,1,0,0,0,3,0,0,0,0
Actual country,0,242,1,0,2,2,4,0,0,0
Actual funk,0,6,14,0,0,1,1,1,2,3
Actual hiphop,0,2,0,112,0,0,3,0,0,0
Actual jazz,1,9,1,2,11,0,0,0,2,0
Actual metal,0,5,0,0,0,75,1,0,2,0
Actual pop,0,46,1,7,1,1,42,1,0,0
Actual rnb,0,8,3,13,0,0,7,19,0,0
Actual rock,2,30,6,0,1,5,0,0,15,1
Actual soul,1,29,3,0,1,0,1,3,3,9


Accuracy Score: 0.7029831387808041
Classification Report
              precision    recall  f1-score   support

         1.0       0.43      0.43      0.43         7
         2.0       0.64      0.96      0.77       251
         3.0       0.48      0.50      0.49        28
         4.0       0.84      0.96      0.89       117
         6.0       0.69      0.42      0.52        26
         7.0       0.86      0.90      0.88        83
         8.0       0.71      0.42      0.53        99
        10.0       0.79      0.38      0.51        50
        11.0       0.62      0.25      0.36        60
        13.0       0.69      0.18      0.29        50

    accuracy                           0.70       771
   macro avg       0.68      0.54      0.57       771
weighted avg       0.71      0.70      0.67       771



In [65]:
# Print feature percents and accuracy scores
for percent in features_accuracy:
    acc_score = features_accuracy[percent]
    print(f'Accuracy Score for {percent*100}% of total features: {acc_score}')

Accuracy Score for 10.0% of total features: 0.6316472114137484
Accuracy Score for 20.0% of total features: 0.7068741893644618
Accuracy Score for 30.0% of total features: 0.7029831387808041


In [66]:
# Get the highest accuracy score
best_percent = max(features_accuracy, key=features_accuracy.get)
best_acc = features_accuracy[best_percent]
print(f'The highest accuracy score of {best_acc} was achieved using {best_percent*100}% of all features.')

The highest accuracy score of 0.7068741893644618 was achieved using 20.0% of all features.
