In [148]:
import pandas as pd
import numpy as np

# Test set generation
To generate a test set of new songs from years the original model has not seen (2020 onwards) we combine data from two additional Kaggle datasets. The first dataset has song attributes from 1 million tracks from Spotify. The second dataset, made in 2023, contains all the billboard hot 100 songs going back decades (making this list is the main criteria for hit vs flop). If a song is in the latter dataset, the "popularity" column is set to true; else false, only including those songs which are from 2020 and onward. The data is also preprocessed almost identically to the train and validation dataset.

Sources: 
https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs


In [None]:
# read in the data
dataset = pd.read_csv('spotify_data.csv')
charts = pd.read_csv('charts.csv')

In [150]:
print(dataset.head())

   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0429      

In [151]:

print(charts.head())

         date  rank           song                         artist  last-week  \
0  2021-11-06     1     Easy On Me                          Adele        1.0   
1  2021-11-06     2           Stay  The Kid LAROI & Justin Bieber        2.0   
2  2021-11-06     3  Industry Baby        Lil Nas X & Jack Harlow        3.0   
3  2021-11-06     4     Fancy Like                   Walker Hayes        4.0   
4  2021-11-06     5     Bad Habits                     Ed Sheeran        5.0   

   peak-rank  weeks-on-board  
0          1               3  
1          1              16  
2          1              14  
3          3              19  
4          2              18  


In [152]:
print(dataset["genre"].unique())

['acoustic' 'afrobeat' 'alt-rock' 'ambient' 'black-metal' 'blues'
 'breakbeat' 'cantopop' 'chicago-house' 'chill' 'classical' 'club'
 'comedy' 'country' 'dance' 'dancehall' 'death-metal' 'deep-house'
 'detroit-techno' 'disco' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'guitar' 'hard-rock' 'hardcore'
 'hardstyle' 'heavy-metal' 'hip-hop' 'house' 'indian' 'indie-pop'
 'industrial' 'jazz' 'k-pop' 'metal' 'metalcore' 'minimal-techno'
 'new-age' 'opera' 'party' 'piano' 'pop' 'pop-film' 'power-pop'
 'progressive-house' 'psych-rock' 'punk' 'punk-rock' 'rock' 'rock-n-roll'
 'romance' 'sad' 'salsa' 'samba' 'sertanejo' 'show-tunes'
 'singer-songwriter' 'ska' 'sleep' 'songwriter' 'soul' 'spanish' 'swedish'
 'tango' 'techno' 'trance' 'trip-hop']


Since the genre labels are slightly different than those in our original hit predictor dataset (a bit more fine grained), we consolidate a few genres together into larger labels (i.e. 'electronic' should be classified as edm).

In [None]:
# focus on the last 5 years of data, which have been unseen by the hit predictor model
dataset['years_to_2025'] = 2025 - dataset['year']
dataset = dataset[dataset["years_to_2025"] < 6]

# consolidate genres into the larger labels used in the hit predictor dataset 
dataset["genre"] = dataset["genre"].replace({'salsa': 'latin', 
                                             'samba': 'latin', 
                                             'hip-hop': 'rap',
                                             'soul': 'r&b',
                                             'funk': 'r&b',
                                             'electronic': 'edm'})
dataset = dataset[dataset["genre"].isin(['pop', 'r&b', 'rock', 'latin', 'rap', 'edm'])]

# same preprocessing applied to the hit predictor dataset
dataset['key_sin'] = np.sin(2 * np.pi * dataset['key'] / 12)
dataset['key_cos'] = np.cos(2 * np.pi * dataset['key'] / 12)
dataset['duration_s'] = dataset['duration_ms'].apply(lambda x: x / 1000) # convert ms to seconds

dataset.head()


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,years_to_2025,key_sin,key_cos,duration_s
439400,439400,BRONSON,CALL OUT,4WIAD59mZw4c4j36fjTvFo,57,2020,edm,0.512,0.894,0,...,0.727,0.165,0.18,105.024,179453,4,5,0.0,1.0,179.453
439401,439401,Rezz,Someone Else,4hR3hQKELrARHNZJ51D14s,58,2020,edm,0.644,0.349,2,...,3.4e-05,0.125,0.246,94.971,198947,4,5,0.866025,0.5,198.947
439402,439402,Cadmium,No Friends,4U7G8dgUodMEVSv96QRcDb,60,2020,edm,0.771,0.644,4,...,7e-06,0.076,0.831,145.029,235024,4,5,0.866025,-0.5,235.024
439403,439403,Lane 8,Road,6kckNpuM5oXZrObLExRg6d,60,2020,edm,0.689,0.514,11,...,0.449,0.0952,0.0963,121.985,238033,4,5,-0.5,0.866025,238.033
439404,439404,Over Easy,Don't Know You (Gabe Ceribelli Remix),04JNdQ6NMhrCivy2SmDzTT,56,2020,edm,0.596,0.834,7,...,0.0235,0.105,0.674,123.97,156801,4,5,-0.5,-0.866025,156.801


This is a custom function for checking if the song name and artist appears in the Billboard hot 100.

In [154]:
def is_in_charts(row):
    if row.name%100 == 0:
        print(f"Processing row {row.name}")
    return ((charts['artist'] == row['artist_name']) & (charts['song'] == row['track_name'])).any()
dataset['popularity'] = dataset.apply(is_in_charts, axis=1)



Processing row 439400
Processing row 439500
Processing row 439600
Processing row 439700
Processing row 439800
Processing row 440200
Processing row 440300
Processing row 440400
Processing row 440500
Processing row 440600
Processing row 440700
Processing row 444100
Processing row 444200
Processing row 444300
Processing row 444400
Processing row 444500
Processing row 444600
Processing row 444700
Processing row 444800
Processing row 452900
Processing row 453000
Processing row 453100
Processing row 453200
Processing row 453300
Processing row 453400
Processing row 453500
Processing row 453600
Processing row 462000
Processing row 462100
Processing row 462200
Processing row 462300
Processing row 465500
Processing row 467300
Processing row 467400
Processing row 467500
Processing row 467600
Processing row 467700
Processing row 467800
Processing row 467900
Processing row 468000
Processing row 468100
Processing row 468200
Processing row 468300
Processing row 468400
Processing row 468500
Processing

In [155]:
dataset.drop(columns=['key', 'duration_ms', 'Unnamed: 0', 'track_id', 'year', 'track_name', 'artist_name'], inplace=True)


Our test dataset is imbalanced, with only about 1.3% of songs being hits, but it doesn't particularly matter since we're not training on it.

In [163]:

dataset['popularity'] = dataset['popularity'].astype(int)
dataset.describe()

Unnamed: 0,popularity,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,years_to_2025,key_sin,key_cos,duration_s
count,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0,18742.0
mean,0.012912,0.656593,0.659796,-7.341629,0.556824,0.110551,0.309014,0.119971,0.205339,0.555688,121.591726,3.645129,-0.030245,0.05018285,219.549177
std,0.112899,0.139296,0.190526,3.315528,0.496774,0.111299,0.281869,0.268313,0.178614,0.257109,29.168149,1.069206,0.661324,0.7478444,82.407533
min,0.0,0.0,0.0213,-29.511,0.0,0.0,2e-06,0.0,0.0134,0.0,0.0,2.0,-1.0,-1.0,18.919
25%,0.0,0.566,0.537,-8.929,0.0,0.0411,0.048,0.0,0.0975,0.35,97.2065,3.0,-0.5,-0.8660254,168.4755
50%,0.0,0.667,0.68,-6.884,1.0,0.0598,0.232,1.9e-05,0.131,0.573,120.0895,4.0,0.0,6.123234000000001e-17,207.1905
75%,0.0,0.756,0.807,-5.187,1.0,0.132,0.528,0.0145,0.26,0.774,140.002,5.0,0.5,0.8660254,255.662
max,1.0,0.985,0.999,3.795,1.0,0.958,0.996,0.994,0.993,0.988,227.895,5.0,1.0,1.0,1675.444


In [158]:
dataset.to_csv("new_songs.csv", index=False)