In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [2]:
# Load the Spotify data from the CSV hosted in the project's GitHub repository
url = 'https://media.githubusercontent.com/media/jossharlequin/spotify-popularity-project/main/Resources/data.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to confirm the file parsed as expected
df.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [3]:
# Build the modelling frame in one chain:
#   1. drop the identifier / text / date columns that are not used as features
#   2. add the track length in seconds, derived from duration_ms
# `df` itself is left untouched, so this cell can be re-run safely.
spotify_df = (
    df
    .drop(columns=['artists', 'name', 'id', 'release_date', 'year'])
    .assign(seconds=lambda frame: frame['duration_ms'] / 1000)
)
spotify_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,seconds
0,0.995,0.708,158648,0.195,0,0.563,10,0.151,-12.428,1,0,0.0506,118.469,0.779,158.648
1,0.994,0.379,282133,0.0135,0,0.901,8,0.0763,-28.454,1,0,0.0462,83.972,0.0767,282.133
2,0.604,0.749,104300,0.22,0,0.0,5,0.119,-19.924,0,0,0.929,107.177,0.88,104.3
3,0.995,0.781,180760,0.13,0,0.887,1,0.111,-14.734,0,0,0.0926,108.003,0.72,180.76
4,0.99,0.21,687733,0.204,0,0.908,11,0.098,-16.829,1,1,0.0424,62.149,0.0693,687.733


In [4]:
# Bin the popularity score into five equal-width groups (the bin edges span 0-100
# in steps of 20), labelled 0, .2, .4, .6, .8.
# The lowest edge is -1 because pd.cut intervals are right-closed by default,
# so a score of exactly 0 would otherwise fall outside the first bin.
bins = [-1,20,40,60,80,100]
labels = (0,.2,.4,.6,.8)

# Bin from the untouched raw column in `df` rather than from spotify_df itself.
# Overwriting spotify_df['popularity'] with a value derived from its own current
# contents meant a second run of this cell operated on the already-binned labels
# instead of the original scores. Reading from `df` (same row index, never
# mutated) keeps the cell idempotent.
spotify_df['popularity'] = pd.cut(df['popularity'], bins=bins, labels=labels)
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169909 entries, 0 to 169908
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   acousticness      169909 non-null  float64 
 1   danceability      169909 non-null  float64 
 2   duration_ms       169909 non-null  int64   
 3   energy            169909 non-null  float64 
 4   explicit          169909 non-null  int64   
 5   instrumentalness  169909 non-null  float64 
 6   key               169909 non-null  int64   
 7   liveness          169909 non-null  float64 
 8   loudness          169909 non-null  float64 
 9   mode              169909 non-null  int64   
 10  popularity        169909 non-null  category
 11  speechiness       169909 non-null  float64 
 12  tempo             169909 non-null  float64 
 13  valence           169909 non-null  float64 
 14  seconds           169909 non-null  float64 
dtypes: category(1), float64(10), int64(4)
memory usage:

In [5]:
# Target: the binned popularity label.
# The column is categorical after binning, so `.values` would hand back a pandas
# Categorical rather than an ndarray; `.to_numpy()` gives a plain NumPy array.
y = spotify_df['popularity'].to_numpy()

# Features: every remaining column, as a plain NumPy array.
X = spotify_df.drop(columns='popularity').to_numpy()

# Split into training/test sets (default split proportions); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [6]:
# Standardize the features with StandardScaler before model fitting.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split are used during fitting.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Build a 100-tree random forest regressor (seeded for reproducibility) and fit
# it on the scaled training data; fit() returns the fitted estimator, so the
# construction and fit are chained into a single assignment.
# NOTE(review): the notebook imports classification metrics (confusion_matrix,
# accuracy_score, classification_report) but this is a regressor, which yields
# continuous predictions - confirm which kind of model/evaluation is intended.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

In [None]:
# Run the fitted forest on the scaled test features to get predictions
predictions = rf_model.predict(X=X_test_scaled)