In [1]:
# Step 1 Import Libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib
import pandas as pd
import time

print("Imports completed")

Imports completed


In [2]:
# Step 2 Load and Prepare Data
mySpotify_DF = pd.read_csv('/Users/juanhuerta/Desktop/INFS3325/spotify_songs.csv') 
X = mySpotify_DF[['danceability', 'energy', 'key', 'loudness', 'speechiness', 
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
y = mySpotify_DF['track_popularity']

print("Data loaded and prepared")

Data loaded and prepared


In [3]:
# Step 3 Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("Data split into training and test sets")

Data split into training and test sets


In [4]:
# Step 4 
spotify_n_jobs = -1  # Utilize all available cores
rfr_Spotify = RandomForestRegressor(n_estimators=50, random_state=0, n_jobs=spotify_n_jobs)
start_time8 = time.time()
rfr_Spotify.fit(X_train, y_train)

print("Random Forest Regressor initialized and trained")

Random Forest Regressor initialized and trained


In [5]:
# Step 5 Make Predictions and Evaluate Model

y_predictions = rfr_Spotify.predict(X_test)
mae = mean_absolute_error(y_test, y_predictions)
mse = mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_predictions)
stop_time8 = time.time()
elapsed_time8 = stop_time8 - start_time8

print(f"Model evaluation:\nMean Absolute Error:"
      f"{mae}\nMean Squared Error: {mse}\nRoot Mean Squared Error:" 
      f"{rmse}\nR-squared: {r2}\nElapsed time: {elapsed_time8}")

Model evaluation:
Mean Absolute Error:16.78021944555627
Mean Squared Error: 456.76089318521275
Root Mean Squared Error:21.371965122215897
R-squared: 0.26627596745767834
Elapsed time: 8.244935035705566


In [6]:
# Step 6 Create Baseline for Comparison
y_train_mean = np.mean(y_train)
y_predictions_mean_only = np.full_like(y_test, fill_value=y_train_mean)
mse_mean_only = mean_squared_error(y_test, y_predictions_mean_only)
rmse_mean_only = np.sqrt(mse_mean_only)
r2_mean_only = r2_score(y_test, y_predictions_mean_only)

print(f"MSE mean only baseline: {mse_mean_only}\nRMSE mean only baseline:" 
      f"{rmse_mean_only}\nR-squared for the baseline model (mean only): {r2_mean_only}")

MSE mean only baseline: 622.6485564624193
RMSE mean only baseline:24.952926811546963
R-squared for the baseline model (mean only): -0.00019992192941953313


In [7]:
# Step 7 Perform K - Fold Cross Validation
np.random.seed(42)
k = 10
start_time9 = time.time()
cross_val_scores = cross_val_score(rfr_Spotify, X, y, cv=k, scoring='neg_mean_squared_error')
mse_scores = -cross_val_scores
rmse_scores = np.sqrt(mse_scores)
stop_time9 = time.time()
elapsed_time9 = stop_time9 - start_time9

print(f"Crossfold MSE Scores: {mse_scores}\nAverage of Crossfold MSE's:" 
      f"{mse_scores.mean()}\nCrossfold RMSE Scores: {rmse_scores}\nAverage of Crossfold RMSE's:" 
      f"{rmse_scores.mean()}\nElapsed time: {elapsed_time9}")

Crossfold MSE Scores: [434.42046706 450.21384025 578.18684602 450.2472655  570.13889335
 392.10260948 436.41443747 526.61803891 352.66116844 428.95395555]
Average of Crossfold MSE's:461.9957522024624
Crossfold RMSE Scores: [20.84275575 21.2182431  24.04551613 21.21903074 23.8775814  19.80158098
 20.89053464 22.94815982 18.77927497 20.71120362]
Average of Crossfold RMSE's:21.433388114487464
Elapsed time: 87.3809289932251


In [8]:
# Step 8 Additional Cross Validation for MAE
start_time10 = time.time()
mae_scores = cross_val_score(rfr_Spotify, X, y, cv=k, scoring='neg_mean_absolute_error')
stop_time10 = time.time()
elapsed_time10 = stop_time10 - start_time10

print(f"Crossfold MAE Scores: {mae_scores}\nAverage of Crossfold MAE's:" 
      f"{mae_scores.mean()}\nElapsed time: {elapsed_time10}")

Crossfold MAE Scores: [-16.22951517 -16.83639239 -18.73010842 -16.80298946 -20.1308226
 -15.07710206 -16.07900977 -18.04646326 -14.3576821  -15.78747336]
Average of Crossfold MAE's:-16.807755859308987
Elapsed time: 88.58501696586609


In [9]:
# Step 9 Display Numbers of CPU Cores Used
n_cores_used = joblib.effective_n_jobs(rfr_Spotify.n_jobs)

print(f"Number of CPU cores used: {n_cores_used}")

Number of CPU cores used: 4
