In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import TargetEncoder
from xgboost import XGBRegressor

In [19]:
# Read the exported dataset from the data directory that sits next to this notebook's folder.
spotify_csv_path = '../data/Spotify dataset export 2025-02-10 09-00-03.csv'
spotify = pd.read_csv(spotify_csv_path)

In [20]:
# Preview the first five rows to sanity-check that the CSV loaded as expected.
spotify.head()

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221,6220896,169907,True,True,1040234854,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645,1079128,31003,True,True,310083733,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055,282142,7399,True,True,63063467,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952,1788577,55229,True,True,434663559,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958,6197318,155930,True,True,617259738,9.942693,Youtube


In [21]:
# List every column name so features and target can be picked by name below.
spotify.columns

Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Duration_min', 'Title', 'Channel',
       'Views', 'Likes', 'Comments', 'Licensed', 'official_video', 'Stream',
       'EnergyLiveness', 'most_playedon'],
      dtype='object')

In [22]:
# Target: the 'Stream' column.
y = spotify['Stream']

# Columns kept out of the feature frame: the target itself plus identifier and
# engagement columns.
# NOTE(review): presumably 'Likes'/'Views'/'Comments' are excluded because they
# would leak popularity into the model — confirm.
excluded_columns = ['Stream', 'Track', 'Channel', 'most_playedon', 'Likes', 'Views', 'Comments']
X = spotify.drop(columns=excluded_columns)

In [23]:
# Display the feature frame; pandas truncates the middle rows in the rendered output.
X

Unnamed: 0,Artist,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Title,Licensed,official_video,EnergyLiveness
0,Gorillaz,Demon Days,album,0.818,0.705,-6.679,0.1770,0.008360,0.002330,0.6130,0.7720,138.559,3.710667,Gorillaz - Feel Good Inc. (Official Video),True,True,1.150082
1,Gorillaz,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.086900,0.000687,0.0463,0.8520,92.761,3.336217,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,True,True,15.183585
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.930,0.0522,0.042500,0.046900,0.1160,0.5510,108.014,3.585833,Gorillaz - New Gold ft. Tame Impala & Bootie B...,True,True,7.956897
3,Gorillaz,Plastic Beach,album,0.689,0.739,-5.810,0.0260,0.000015,0.509000,0.0640,0.5780,120.423,3.897783,Gorillaz - On Melancholy Hill (Official Video),True,True,11.546875
4,Gorillaz,Gorillaz,album,0.663,0.694,-8.627,0.1710,0.025300,0.000000,0.0698,0.5250,167.953,5.682000,Gorillaz - Clint Eastwood (Official Video),True,True,9.942693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20589,SICK LEGEND,JUST DANCE HARDSTYLE,single,0.582,0.926,-6.344,0.0328,0.448000,0.000000,0.0839,0.6580,90.002,1.577783,JUST DANCE HARDSTYLE,True,True,11.036949
20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE,single,0.531,0.936,-1.786,0.1370,0.028000,0.000000,0.0923,0.6570,174.869,2.514283,SET FIRE TO THE RAIN HARDSTYLE,True,True,10.140845
20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP,single,0.443,0.830,-4.679,0.0647,0.024300,0.000000,0.1540,0.4190,168.388,2.280700,OUTSIDE HARDSTYLE SPED UP,True,True,5.389610
20592,SICK LEGEND,ONLY GIRL HARDSTYLE,single,0.417,0.767,-4.004,0.4190,0.356000,0.018400,0.1080,0.5390,155.378,1.806450,ONLY GIRL HARDSTYLE,True,True,7.101852


In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric audio descriptors that are standardised before modelling.
numeric_features = [
    'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness',
    'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_min',
]

# Per-column preprocessing. Columns of X that are not listed in any transformer
# ('Licensed', 'official_video', 'EnergyLiveness') never reach the model, because
# ColumnTransformer's default is remainder='drop'.
# NOTE(review): confirm that leaving those three columns out is intentional.
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat_album', OneHotEncoder(handle_unknown='ignore'), ['Album_type']),
        # max_categories caps the number of one-hot output columns for the artist feature.
        ('cat_artist', OneHotEncoder(handle_unknown='ignore', sparse_output=True, max_categories=50), ['Artist']),
        # A bare string (not a list) hands the vectorizer a 1-D column of text.
        ('text_title', TfidfVectorizer(stop_words='english', max_features=100), 'Title'),
        ('text_album', TfidfVectorizer(stop_words='english', max_features=100), 'Album'),
    ]
)

# Chain preprocessing and the regressor so both are fitted on training rows only.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = make_pipeline(transformer, forest_regressor)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Score the predictions on the held-out 20%.
holdout_mse = mean_squared_error(y_test, y_pred)
holdout_r2 = r2_score(y_test, y_pred)
print('MSE:', holdout_mse)
print('R2:', holdout_r2)

MSE: 4.351113411044235e+16
R2: 0.23143422171098593


In [None]:
# Cross-validate the full preprocessing + model pipeline on the whole dataset.
#
# Passing an integer `cv` for a regressor makes scikit-learn build
# KFold(shuffle=False). The rows displayed earlier appear grouped by artist
# (Gorillaz at the top, SICK LEGEND at the bottom), so unshuffled folds would be
# contiguous artist blocks and their scores would not be comparable to the
# shuffled hold-out split used for the single train/test evaluation.
# An explicit, seeded, shuffled splitter gives random yet reproducible folds.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv_splitter, scoring='r2')
print("Cross-Validation R2 Scores:", scores)
print("Mean R2 Score:", scores.mean())