In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import TargetEncoder
import numpy as np

In [37]:
spotify = pd.read_csv('../data/Spotify dataset export 2025-02-10 09-00-03.csv')

In [38]:
spotify.head()

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221,6220896,169907,True,True,1040234854,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645,1079128,31003,True,True,310083733,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055,282142,7399,True,True,63063467,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952,1788577,55229,True,True,434663559,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958,6197318,155930,True,True,617259738,9.942693,Youtube


In [39]:
spotify.columns

Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Duration_min', 'Title', 'Channel',
       'Views', 'Likes', 'Comments', 'Licensed', 'official_video', 'Stream',
       'EnergyLiveness', 'most_playedon'],
      dtype='object')

In [44]:
X = spotify.drop(columns=['Stream', 'Track', 'Channel', 'most_playedon', 'Likes', 'Views', 'Comments'])
y = np.log1p(spotify['Stream'])

In [45]:
transformer = ColumnTransformer(transformers=[
   ('num', StandardScaler(), ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 
                               'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_min']),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Album_type', 'Artist']),
    ('text_title', TfidfVectorizer(stop_words='english', max_features=100), 'Title'),
    ('text_album', TfidfVectorizer(stop_words='english', max_features=100), 'Album')
])

pipeline = make_pipeline(transformer, 
                         RandomForestRegressor(n_estimators=100, random_state=42))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('MSE:', mean_squared_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

MSE: 7.331437529213359
R2: 0.33172327031076165


In [46]:
np.expm1(mean_squared_error(y_test, y_pred))

1526.5761296980882

In [35]:
scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print("Cross-Validation R2 Scores:", scores)
print("Mean R2 Score:", scores.mean())

KeyboardInterrupt: 

In [None]:
X