In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PowerTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer

df = pd.read_csv(r"spotify_songs.csv")

df['track_album_release_date'] = pd.to_datetime(df['track_album_release_date'], errors='coerce')
df = df.dropna(subset=['track_album_release_date'])
df.set_index('track_album_release_date', inplace=True)

df = df.drop_duplicates()

df = df.dropna()

df['duration_min'] = df['duration_ms'] / 60000

numeric_columns = ['track_popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                   'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_min']

pt = PowerTransformer(method='yeo-johnson')
df[numeric_columns] = pt.fit_transform(df[numeric_columns])
skewness = df[['track_popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 
               'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 
               'tempo', 'duration_ms']].skew()

df_cleaned = df.drop(columns=['track_id', 'track_album_id', 'playlist_name', 'playlist_id', 'playlist_genre'])

categorical_columns = ['track_name', 'track_artist', 'track_album_name', 'playlist_subgenre']
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_columns, drop_first=True)

df_encoded.head()

skewness

track_popularity   -0.481222
danceability       -0.043595
energy             -0.099639
key                -0.248088
loudness           -0.003601
mode               -0.245655
speechiness         0.501093
acousticness        0.401657
instrumentalness    1.731268
liveness            0.306455
valence            -0.037829
tempo               0.020758
duration_ms         1.169463
dtype: float64

In [49]:
numeric_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'track_popularity']

scaler = StandardScaler()
data = scaler.fit_transform(df[numeric_columns])


In [41]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [42]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

In [43]:
#for kernel regression
kernels = [ConstantKernel(constant_value=1.0) + WhiteKernel(noise_level=1.0),
           ConstantKernel() * RBF(length_scale=1.0),
           ConstantKernel() * RBF() + WhiteKernel(),
           None #default added back
          ]

In [44]:
classifiers = [linear_model.LinearRegression(),
               linear_model.Lasso(), 
               linear_model.Ridge(), 
               KNeighborsRegressor()]
parameters = [{'fit_intercept':[True, False], 'positive':[True, False]},
            {'alpha':[1, 5, 10, 50, 100, 250, 500, 1000]}, 
              {'alpha':[0.5, 1, 5, 10, 50, 100, 250, 500]}, 
              {'n_neighbors':[1, 3, 5, 7, 9, 11], 'p':[1, 2, 3], 'weights':('uniform', 'distance')}]

In [45]:
model_tracking = None

for i in np.arange(len(classifiers)):
    reg = GridSearchCV(classifiers[i], parameters[i], cv = 5)
    reg.fit(X_train, y_train)
    training_score = reg.best_score_
    test_score = reg.score(X_test, y_test)
    best_pars = str(reg.best_params_)
    test_predictions = reg.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    
    new_row = pd.DataFrame({
    'Cross Val Training Score': training_score,
    'Test Score': test_score,
    'MSE': test_mse,
    'Paramaters': best_pars},index=[classifiers[i]])
    model_tracking=pd.concat([model_tracking, new_row])

In [46]:
model_tracking

Unnamed: 0,Cross Val Training Score,Test Score,MSE,Paramaters
LinearRegression(),0.076886,0.077402,0.945369,"{'fit_intercept': False, 'positive': False}"
Lasso(),-0.000451,-0.00021,1.024896,{'alpha': 1}
Ridge(),0.076769,0.077252,0.945522,{'alpha': 50}
KNeighborsRegressor(),0.163024,0.149977,0.871002,"{'n_neighbors': 11, 'p': 3, 'weights': 'distan..."


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters from Grid Search:", best_params)

best_rf = RandomForestRegressor(**best_params, random_state=42)
best_rf.fit(X_train, y_train)

test_predictions = best_rf.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
print("Test MSE of the best Random Forest model:", test_mse)

importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': numeric_columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
