In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [68]:
df = pd.read_csv("SpotifyFeatures.csv")

In [69]:
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [70]:
df.isnull().sum()

genre               0
artist_name         0
track_name          1
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [71]:
df.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [72]:
# Add artist_avg_popularity feature
df['artist_avg_popularity'] = df['artist_name'].map(df.groupby('artist_name')['popularity'].mean())

In [73]:
df.head(2)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artist_avg_popularity
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814,4.651899
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816,3.0


In [78]:
df= df.drop(['track_id', 'track_name', 'artist_name'], axis=1)

In [79]:
for col in ['key', 'mode', 'time_signature']:
    df[col] = LabelEncoder().fit_transform(df[col])

In [80]:
df['danceability_per_loudness'] = df['danceability'] / (df['loudness'] + 1e-5)
df['energy_valence'] = df['energy'] * df['valence']
df['duration_mins'] = df['duration_ms'] / 60000

In [84]:
genre_avg_pop = df.groupby('genre')['popularity'].mean()
df['genre_avg_popularity'] = df['genre'].map(genre_avg_pop)

In [86]:
df= df.drop(['genre','duration_ms'], axis=1)

In [87]:
# One-hot encode categorical features
#df_encoded = pd.get_dummies(df_model, columns=['genre', 'key', 'mode', 'time_signature'],dtype=int, drop_first=True)

In [94]:
df.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artist_avg_popularity,danceability_per_loudness,energy_valence,duration_mins,genre_avg_popularity
0,0,0.611,0.389,0.91,0.0,4,0.346,-1.828,0,0.0525,166.969,3,0.814,4.651899,-0.212802,0.74074,1.656217,12.174097
1,1,0.246,0.59,0.737,0.0,9,0.151,-5.559,1,0.0868,174.003,3,0.816,3.0,-0.106134,0.601392,2.28955,12.174097
2,3,0.952,0.663,0.131,0.0,3,0.103,-13.879,1,0.0362,99.488,4,0.368,9.076923,-0.04777,0.048208,2.837783,12.174097
3,0,0.703,0.24,0.326,0.0,4,0.0985,-12.178,0,0.0395,171.758,3,0.227,4.651899,-0.019708,0.074002,2.54045,12.174097
4,4,0.95,0.331,0.225,0.123,8,0.202,-21.15,0,0.0456,140.576,3,0.39,8.7,-0.01565,0.08775,1.377083,12.174097


In [89]:
X = df.drop('popularity', axis=1)
y = df['popularity']

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [91]:
rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [92]:
#y_pred = rf.predict(X_test)
#rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#r2 = r2_score(y_test, y_pred)

In [93]:
#print(f"✅ RMSE: {rmse:.2f}")
#print(f"✅ R² Score: {r2:.4f}")

✅ RMSE: 7.58
✅ R² Score: 0.8274


In [97]:
import pickle
# open a file, where you ant to store the data
file = open('spotify_model.pkl', 'wb')

# dump information to that file
pickle.dump(rf, file)

In [98]:
df.columns

Index(['popularity', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence',
       'artist_avg_popularity', 'danceability_per_loudness', 'energy_valence',
       'duration_mins', 'genre_avg_popularity'],
      dtype='object')