In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import time
import operator
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Create DataFrames from CSV files
t0 = time.time()
filtered_coocurring_df = pd.read_csv('../Data/filtered_coocurring_TF.csv')
filtered_coocurring_df = filtered_coocurring_df.drop([0])
filtered_coocurring_df.index = filtered_coocurring_df.index - 1
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_coocurring_df.head(3)

Run time: 58.270630836486816 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,good time,dont mind,last night,cause youre,dont worry,brand new,like dont,know like,im feeling,feel good
0,willow,Taylor Swift,pop,8.0,"['dance', 'pop']",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"['indie', 'pop', 'electropop']",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WITHOUT YOU,The Kid LAROI,pop,8.0,['australian'],0.662,0.413,0.0,1.0,0.0299,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Create a dictionary of category names and ids
cat_name_id = {}
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    cat_id = list(filtered_coocurring_df[filtered_coocurring_df['category_name']==cat]['category_id'])[0]
    cat_name_id[cat] = cat_id
    print(cat, cat_id)

blues 0.0
classical 1.0
country 2.0
funk 3.0
hiphop 4.0
indie_alt 5.0
jazz 6.0
metal 7.0
pop 8.0
punk 9.0
rnb 10.0
rock 11.0
romance 12.0
soul 13.0


In [5]:
# Create a genre lists column
t0 = time.time()
genre_lists = []
for index, row in filtered_coocurring_df.iterrows():
    genres = row['genre_list'].replace('[', '').replace(']', '').replace("'", '').split(', ')
    genre_lists.append(genres)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(genre_lists)

Run time: 59.471962690353394 seconds


8014

In [6]:
# Replace the genre_list column
filtered_coocurring_df['genre_list'] = genre_lists

In [7]:
# Print how many songs there are in each category
for cat in category_list:
    song_count = len(filtered_coocurring_df[filtered_coocurring_df['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 423 songs in the blues category.
There are 83 songs in the classical category.
There are 1257 songs in the country category.
There are 281 songs in the funk category.
There are 683 songs in the hiphop category.
There are 657 songs in the indie_alt category.
There are 196 songs in the jazz category.
There are 892 songs in the metal category.
There are 691 songs in the pop category.
There are 569 songs in the punk category.
There are 362 songs in the rnb category.
There are 1229 songs in the rock category.
There are 280 songs in the romance category.
There are 411 songs in the soul category.


In [8]:
# Get the most popular genres for each category
t0 = time.time()
category_genres = {}
for category in category_list:
    genres = []
    for index, row in filtered_coocurring_df[filtered_coocurring_df['category_name']==category].iterrows():
        genres.extend(row['genre_list'])
    unique_genres = list(set(genres))
    genre_counts = []
    for genre in unique_genres:
        genre_counts.append((genre, genres.count(genre)))
    category_genres[category] = sorted(genre_counts, key = lambda x: x[1], reverse=True)[0:5]
    print(category, category_genres[category])
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

blues [('blues', 259), ('electric', 189), ('modern', 168), ('traditional', 145), ('classic', 106)]
classical [('dance', 12), ('classical', 12), ('operatic', 11), ('', 9), ('pop', 9)]
country [('country', 977), ('contemporary', 847), ('modern', 277), ('pop', 88), ('nashville', 78)]
funk [('funk', 133), ('soul', 118), ('motown', 97), ('quiet', 82), ('disco', 79)]
hiphop [('rap', 345), ('hip', 304), ('pop', 227), ('trap', 192), ('southern', 142)]
indie_alt [('indie', 290), ('alternative', 181), ('new', 155), ('modern', 148), ('art', 142)]
jazz [('vocal', 92), ('contemporary', 82), ('jazz', 77), ('adult', 63), ('neo', 38)]
metal [('metal', 366), ('alternative', 284), ('nu', 221), ('rock', 220), ('hard', 207)]
pop [('pop', 343), ('indie', 180), ('dance', 163), ('electropop', 129), ('post-teen', 128)]
punk [('pop', 195), ('punk', 138), ('skate', 94), ('alternative', 84), ('melodic', 72)]
rnb [('pop', 219), ('r&b', 213), ('urban', 200), ('dance', 169), ('hip', 136)]
rock [('rock', 638), ('mod

In [9]:
# Find indeces to keep and indeces to remove
t0 = time.time()
cat_genre_dict = {'pop': ['pop'], 
                  'hiphop': ['rap', 'hip', 'trap'],
                  'metal': ['metal'], 
                  'rock': ['classic'], 
                  'jazz': ['jazz'], 
                  'rnb': ['r&b'], 
                  'romance': ['soft'], 
                  'soul': ['soul'], 
                  'indie_alt': ['indie', 'alternative'],
                  'classical': ['classical', 'operatic', 'dance'], 
                  'blues': ['modern', 'blues'],
                  'funk': ['funk', 'disco'], 
                  'punk': ['punk'],
                  'country': ['country']} 
indeces_to_keep = []
indeces_to_remove = []
for index, row in filtered_coocurring_df.iterrows():
    keep = False
    category = row['category_name']
    for genre in cat_genre_dict[category]:
        if genre in row['genre_list']:
            keep = True
            break
    if keep == True:
        indeces_to_keep.append(index)
    else:
        indeces_to_remove.append(index)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
print(len(indeces_to_keep))
len(indeces_to_remove)

Run time: 43.56559991836548 seconds
3975


4039

In [10]:
# Create non destructive copy and remove the unnecessary indeces
t0 = time.time()
filtered_coocurring_df2 = filtered_coocurring_df.drop(indeces_to_remove)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
filtered_coocurring_df2.head(3)

Run time: 0.37855100631713867 seconds


Unnamed: 0,song_name,artist_name,category_name,category_id,genre_list,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,...,good time,dont mind,last night,cause youre,dont worry,brand new,like dont,know like,im feeling,feel good
0,willow,Taylor Swift,pop,8.0,"[dance, pop]",0.392,0.574,7.0,1.0,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Stay Next To Me (with Chelsea Cutler),Quinn XCII,pop,8.0,"[indie, pop, electropop]",0.581,0.584,2.0,1.0,0.284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,r u ok,Tate McRae,pop,8.0,"[dance, pop, electropop, post-teen]",0.666,0.593,2.0,1.0,0.0373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,BOYSHIT,Madison Beer,pop,8.0,"[dance, pop, electropop, post-teen]",0.638,0.699,1.0,1.0,0.0892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Safe With Me (with Audrey Mika),Gryffin,pop,8.0,"[dance, pop, tropical, edm]",0.497,0.757,11.0,1.0,0.127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Remove the romance punk, and jazz categories
filtered_coocurring_df2 = filtered_coocurring_df2[filtered_coocurring_df2['category_name']!='romance']
filtered_coocurring_df2 = filtered_coocurring_df2[filtered_coocurring_df2['category_name']!='punk']
filtered_lyric_TF_df2 = filtered_lyric_TF_df2[filtered_lyric_TF_df2['category_name']!='blues']
len(filtered_lyric_TF_df2)

In [12]:
# Print how many songs there are in each category 
category_list = ['blues', 'classical', 'country', 'funk', 'hiphop', 'indie_alt', 'jazz', 
                 'metal', 'pop', 'punk', 'rnb', 'rock', 'romance', 'soul']
for cat in category_list:
    song_count = len(filtered_coocurring_df2[filtered_coocurring_df2['category_name']==cat])
    print(f'There are {song_count} songs in the {cat} category.')

There are 315 songs in the blues category.
There are 31 songs in the classical category.
There are 977 songs in the country category.
There are 163 songs in the funk category.
There are 446 songs in the hiphop category.
There are 375 songs in the indie_alt category.
There are 77 songs in the jazz category.
There are 366 songs in the metal category.
There are 343 songs in the pop category.
There are 0 songs in the punk category.
There are 213 songs in the rnb category.
There are 245 songs in the rock category.
There are 0 songs in the romance category.
There are 208 songs in the soul category.


In [13]:
# Create a list of all genres
t0 = time.time()
genres_list = []
genre_column = list(filtered_coocurring_df2['genre_list'])
for artist_genres in genre_column:
    genres_list.extend(artist_genres)
genres_list = list(set(genres_list))
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(genres_list)

Run time: 0.003610849380493164 seconds


425

In [14]:
# Find genres that are unique to one category
t0 = time.time()
unique_genres = []
for genre in genres_list:
    genre_df = filtered_coocurring_df2[pd.DataFrame(filtered_coocurring_df2['genre_list'].tolist()).isin(np.tile(genre, len(filtered_coocurring_df2))).any(1).values]
    if len(set(genre_df['category'])) == 1:
        unique_genres.append(genre)
unique_genres = list(set(unique_genres))
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
len(unique_genres)

Run time: 7.496011018753052 seconds


422

In [16]:
# Define the features set
X = filtered_coocurring_df2.copy()
X = X.drop(['song_name', 'artist_name', 'category_name', 'category_id', 'genre_list'], axis=1)
X.head()

Unnamed: 0,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,...,good time,dont mind,last night,cause youre,dont worry,brand new,like dont,know like,im feeling,feel good
0,0.392,0.574,7.0,1.0,0.17,0.833,0.00179,0.145,0.529,81.112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.581,0.584,2.0,1.0,0.284,0.0805,0.0,0.366,0.756,179.954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666,0.593,2.0,1.0,0.0373,0.318,0.0,0.414,0.329,140.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.638,0.699,1.0,1.0,0.0892,0.0203,0.0,0.0879,0.57,121.122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.497,0.757,11.0,1.0,0.127,0.0449,0.0,0.119,0.507,169.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Define the target set
y = filtered_coocurring_df2['category_id'].ravel()
y[:5]

array([8., 8., 8., 8., 8.])

In [18]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2819, 14171)
(940, 14171)
(2819,)
(940,)


In [19]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fitting the Model

In [20]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [21]:
# Fit the model
t0 = time.time()
rf_model = rf_model.fit(X_train_scaled, y_train)
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

Run time: 48.817034006118774 seconds


# Making Predictions

In [22]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([ 7.,  7.,  2.,  2.,  5.,  5.,  2.,  4.,  2.,  4.,  2., 11.,  8.,
        2.,  2.,  2.,  4.,  3.,  4.,  2.,  2.,  2.,  2.,  5.,  4.,  2.,
        7.,  4.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  4.,  5.,
        4.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  7.,  2.,  2.,  2.,  2.,
        2.,  2.,  5., 11.,  2., 10.,  2.,  2.,  2.,  2.,  2.,  0.,  2.,
        4.,  2.,  5.,  7.,  2.,  5.,  2.,  2.,  2.,  2.,  2.,  2.,  7.,
        4.,  4.,  2.,  2.,  2.,  2.,  2.,  2.,  6.,  2.,  2.,  2.,  4.,
        2.,  2.,  2.,  2.,  7.,  5.,  2.,  2.,  7., 13.,  2.,  8.,  2.,
        4.,  2.,  2.,  2.,  2.,  2.,  0.,  5.,  4.,  2.,  2.,  2.,  2.,
        7.,  7.,  8.,  4.,  5.,  7.,  2.,  2.,  4.,  7.,  2.,  2.,  0.,
        2.,  2.,  7.,  7.,  4.,  2.,  2.,  2., 10.,  2.,  2.,  2.,  2.,
        2.,  3.,  5.,  4., 13.,  2.,  2.,  2.,  5.,  2.,  7.,  2.,  7.,
        2.,  2.,  5.,  7.,  2.,  2.,  2.,  5.,  2.,  2.,  2.,  4., 10.,
        2.,  7.,  2.,  2.,  2.,  4.,  0.,  2.,  4.,  2.,  2.,  2

In [107]:
# Create a dictionary of category names and percent of correct predictions
classical = 0
incorrect_classical = []
country = 0
incorrect_country = []
funk = 0
incorrect_funk = []
hiphop = 0
incorrect_hiphop = []
#indie_alt = 0
#incorrect_indie_alt = []
jazz = 0
incorrect_jazz = []
metal = 0
incorrect_metal = []
pop = 0
incorrect_pop = []
rnb = 0
incorrect_rnb = []
rock = 0
incorrect_rock = []
soul = 0
incorrect_soul = []
for i in range(len(y_test)):
    if predictions[i] != y_test[i]:
        if y_test[i] == 1:
            classical+=1
            incorrect_classical.append(predictions[i])
        if y_test[i] == 2:
            country+=1
            incorrect_country.append(predictions[i])
        if y_test[i] == 3:
            funk+=1
            incorrect_funk.append(predictions[i])
        if y_test[i] == 4:
            hiphop+=1
            incorrect_hiphop.append(predictions[i])
        #if y_test[i] == 5:
        #    indie_alt+=1
        #    incorrect_indie_alt.append(predictions[i])
        if y_test[i] == 6:
            jazz+=1
            incorrect_jazz.append(predictions[i])
        if y_test[i] == 7:
            metal+=1
            incorrect_metal.append(predictions[i])
        if y_test[i] == 8:
            pop+=1
            incorrect_pop.append(predictions[i])
        if y_test[i] == 10:
            rnb+=1
            incorrect_rnb.append(predictions[i])
        if y_test[i] == 11:
            rock+=1
            incorrect_rock.append(predictions[i])
        if y_test[i] == 13:
            soul+=1
            incorrect_soul.append(predictions[i])
cat_percents = {}
cat_percents['classical'] = (31-classical)/31*100
cat_percents['country'] = (977-country)/977*100
cat_percents['funk'] = (163-funk)/163*100
cat_percents['hiphop'] = (446-hiphop)/446*100
#cat_percents['indie_alt'] = (375-indie_alt)/375*100
cat_percents['jazz'] = (77-jazz)/77*100
cat_percents['metal'] = (366-metal)/366*100
cat_percents['pop'] = (343-pop)/343*100
cat_percents['rnb'] = (213-rnb)/213*100
cat_percents['rock'] = (245-rock)/245*100
cat_percents['soul'] = (208-soul)/208*100
cat_percents = {key: val for key, val in sorted(cat_percents.items(), key=lambda item: item[1], reverse=True)}
cat_percents

{'country': 99.79529170931423,
 'hiphop': 99.10313901345292,
 'metal': 98.36065573770492,
 'funk': 91.41104294478528,
 'classical': 87.09677419354838,
 'rnb': 82.15962441314554,
 'pop': 81.92419825072886,
 'rock': 80.81632653061224,
 'soul': 78.36538461538461,
 'jazz': 77.92207792207793}

In [108]:
# Get the most popular incorrect predictions for each category
incorrect_preds = {}

classical_preds = []
for pred in incorrect_classical:
    classical_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(classical_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, classical_preds.count(pred)))
incorrect_preds['classical'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

country_preds = []
for pred in incorrect_country:
    country_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(country_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, country_preds.count(pred)))
incorrect_preds['country'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

funk_preds = []
for pred in incorrect_funk:
    funk_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(funk_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, funk_preds.count(pred)))
incorrect_preds['funk'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

hiphop_preds = []
for pred in incorrect_hiphop:
    hiphop_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(hiphop_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, hiphop_preds.count(pred)))
incorrect_preds['hiphop'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

#indie_alt_preds = []
#for pred in incorrect_indie_alt:
#    indie_alt_preds.append(list(cat_name_id.keys())[int(pred)])
#unique_preds = list(set(indie_alt_preds))
#pred_counts = []
#for pred in unique_preds:
#    pred_counts.append((pred, indie_alt_preds.count(pred)))
#incorrect_preds['indie_alt'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

jazz_preds = []
for pred in incorrect_jazz:
    jazz_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(jazz_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, jazz_preds.count(pred)))
incorrect_preds['jazz'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

metal_preds = []
for pred in incorrect_metal:
    metal_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(metal_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, metal_preds.count(pred)))
incorrect_preds['metal'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

pop_preds = []
for pred in incorrect_pop:
    pop_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(pop_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, pop_preds.count(pred)))
incorrect_preds['pop'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

rnb_preds = []
for pred in incorrect_rnb:
    rnb_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(rnb_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, rnb_preds.count(pred)))
incorrect_preds['rnb'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

rock_preds = []
for pred in incorrect_rock:
    rock_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(rock_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, rock_preds.count(pred)))
incorrect_preds['rock'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]

soul_preds = []
for pred in incorrect_soul:
    soul_preds.append(list(cat_name_id.keys())[int(pred)])
unique_preds = list(set(soul_preds))
pred_counts = []
for pred in unique_preds:
    pred_counts.append((pred, soul_preds.count(pred)))
incorrect_preds['soul'] = sorted(pred_counts, key = lambda x: x[1], reverse=True)[0:5]


In [109]:
incorrect_preds

{'classical': [('country', 2), ('funk', 1), ('rock', 1)],
 'country': [('metal', 1), ('rock', 1)],
 'funk': [('country', 10), ('metal', 2), ('hiphop', 2)],
 'hiphop': [('country', 3), ('pop', 1)],
 'jazz': [('country', 12), ('rock', 3), ('hiphop', 1), ('classical', 1)],
 'metal': [('country', 3), ('hiphop', 2), ('rock', 1)],
 'pop': [('country', 50), ('hiphop', 8), ('rnb', 3), ('metal', 1)],
 'rnb': [('hiphop', 14), ('country', 14), ('pop', 9), ('funk', 1)],
 'rock': [('country', 33), ('metal', 8), ('funk', 5), ('pop', 1)],
 'soul': [('country', 27), ('funk', 7), ('rnb', 3), ('rock', 3), ('jazz', 2)]}

In [110]:
# Print the best and worst categories of predictions
country_percent = cat_percents['country']
jazz_percent = cat_percents['jazz']
print(f'The country category performed the best with {country_percent}% of predictions being correct.')
print(f'The jazz category performed the worst with {jazz_percent}% of predictions being correct.')

The country category performed the best with 99.79529170931423% of predictions being correct.
The jazz category performed the worst with 77.92207792207793% of predictions being correct.


# Model Evaluation

In [23]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.5670212765957446


In [24]:
# Calculate the feature importance in the Random Forest Model
importances = rf_model.feature_importances_
importances

array([0.01228562, 0.01392564, 0.00405643, ..., 0.00028572, 0.00014918,
       0.00021985])

In [25]:
# Sort the features by importance
sorted_features = sorted(zip(importances, X.columns), reverse=True)
sorted_features

[(0.014611802614889027, 'audio_ft_acousticness'),
 (0.014387978927179574, 'audio_ft_instrumentalness'),
 (0.013932207649325158, 'audio_ft_speechiness'),
 (0.013925640596728689, 'audio_ft_energy'),
 (0.012285623362592833, 'audio_ft_danceability'),
 (0.010996758240428577, 'audio_ft_duration_ms'),
 (0.009083128018462007, 'audio_ft_valence'),
 (0.0059499964730990885, 'audio_ft_liveness'),
 (0.0058990065620330165, 'audio_ft_tempo'),
 (0.005350804961876678, 'like'),
 (0.005180009339749748, 'baby'),
 (0.00489154054335957, 'love'),
 (0.004866594023553748, 'aint'),
 (0.004820937635935022, 'im'),
 (0.0046426921909782275, 'yeah'),
 (0.004580014259630781, '?'),
 (0.004462316717936589, 'niggas'),
 (0.004451687634574958, 'oh'),
 (0.004353805488105356, 'got'),
 (0.004295923146260811, 'audio_ft_mode'),
 (0.004201020913605584, 'know'),
 (0.004056430375974844, 'audio_ft_key'),
 (0.003874710444906292, 'shit'),
 (0.0037718204875979706, 'dont'),
 (0.0037026908663942734, 'bitch'),
 (0.0036309214613475366, '

In [36]:
# Get frequencies of top 50% features
percent = 0.5

# Define the features and target sets
important_features = []
sum = 0
count = 0
while sum < percent:
    sum = sum + sorted_features[count][0]
    important_features.append(sorted_features[count][1])
    count+=1
X = filtered_coocurring_df2[important_features]
X.loc[-1] = X.sum(numeric_only=True)
X.index = X.index + 1  
X = X.sort_index()
X = X.sort_values(by=[0], axis=1, ascending=False)
X.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,audio_ft_duration_ms,audio_ft_tempo,audio_ft_key,audio_ft_time_signature,im,dont,like,yeah,?,oh,...,hoes,? oh,truck,audio_ft_instrumentalness,bar,g,rap,whiskey,love baby,bust
0,871566987.0,451081.841,20138.0,14798.0,11708.0,8653.0,8451.0,8279.0,7749.0,7572.0,...,252.0,249.0,223.0,211.328835,210.0,189.0,185.0,176.0,162.0,152.0
1,214707.0,81.112,7.0,4.0,10.0,0.0,7.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.00179,0.0,0.0,0.0,0.0,0.0,0.0
2,206046.0,179.954,2.0,4.0,0.0,9.0,0.0,10.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Dropping Lower Ranked Features

In [27]:
# Create a dictionary of features and accuracy scores
features_accuracy = {}
feature_percents = [0.1, 0.2, 0.3, 0.4]
for percent in feature_percents:
    # Define the features and target sets
    important_features = []
    sum = 0
    count = 0
    while sum < percent:
        sum = sum + sorted_features[count][0]
        important_features.append(sorted_features[count][1])
        count+=1
    X = filtered_coocurring_df2[important_features]
    y = filtered_coocurring_df2['category_id'].ravel()
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    # Fit the random forest model
    rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
    t0 = time.time()
    rf_model = rf_model.fit(X_train_scaled, y_train)
    t1 = time.time()
    print(f'Run time to fit model with {percent*100}% of total features: {t1-t0} seconds')
    # Make predictions
    predictions = rf_model.predict(X_test_scaled)
    acc_score = accuracy_score(y_test, predictions)
    features_accuracy[percent] = acc_score

Run time to fit model with 10.0% of total features: 4.38977575302124 seconds
Run time to fit model with 20.0% of total features: 4.264497995376587 seconds
Run time to fit model with 30.0% of total features: 4.907401084899902 seconds
Run time to fit model with 40.0% of total features: 5.12242317199707 seconds


In [28]:
# Print feature percents and accuracy scores
for percent in features_accuracy:
    acc_score = features_accuracy[percent]
    print(f'Accuracy Score for {percent*100}% of total features: {acc_score}')

Accuracy Score for 10.0% of total features: 0.5765957446808511
Accuracy Score for 20.0% of total features: 0.6361702127659574
Accuracy Score for 30.0% of total features: 0.6297872340425532
Accuracy Score for 40.0% of total features: 0.6319148936170212


In [29]:
# Get the highest accuracy score
best_percent = max(features_accuracy, key=features_accuracy.get)
best_acc = features_accuracy[best_percent]
print(f'The highest accuracy score of {best_acc} was achieved using {best_percent*100}% of all features.')

The highest accuracy score of 0.6361702127659574 was achieved using 20.0% of all features.
