In [1]:
import pandas as pd

# Location of the feature-engineered dataset (relative to the notebook's working directory)
file_path = 'resources/feature_engineered_music_dataset.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(n=5)


Unnamed: 0,track_uri,name,artists_names,is_playable,artists_uris,playlist_uris,danceability,energy,key,loudness,...,is_not_live,time_signature_4_4,time_signature_other,release_year,release_day_of_week,release_days_since,popularity_log,artists_popularities_avg,artists_followers_avg,main_genre
0,spotify:track:0GYpisgsrUSd8B4UqksmfY,新緑,['上野大樹'],True,['spotify:artist:5YPkOSqagMwjOWf7PLjHNF'],['spotify:playlist:37i9dQZF1DWSt89CX9de4L'],0.218212,-0.431011,2.0,0.138033,...,1,1,0,2023.0,2.0,424.0,3.258097,48.0,69413.0,Pop
1,spotify:track:3b9eglykqfxtTaSpSidP9u,またね,['Lucky Kilimanjaro'],True,['spotify:artist:2V8UZPMR1EbkXhzvEGBTrV'],['spotify:playlist:37i9dQZF1DXahYFr91pFvG'],1.335658,0.548891,5.0,0.833688,...,1,1,0,2023.0,2.0,424.0,3.871201,49.0,89318.0,R&B/Soul/Funk
2,spotify:track:2iOn50LwBLQxhL5pMoIMpW,Sueño Flamenco - Original Mix,['Marksman'],True,['spotify:artist:59ggRM2BowvTe9nlnwGlBT'],['spotify:playlist:1ILlyqabDcTv0mbnTxE7Od'],0.228754,0.930161,6.0,0.425916,...,1,1,0,2023.0,1.0,425.0,1.791759,26.0,6409.0,Miscellaneous
3,spotify:track:68aueb4O4xxqwsBPiP7dLS,Nightshift,['Khainz'],True,['spotify:artist:71yD5VENn9Wy1IECnpYWvX'],['spotify:playlist:37i9dQZF1DX8AliSIsGeKd'],1.07211,1.282926,7.0,0.424298,...,1,1,0,2023.0,4.0,429.0,3.332205,28.0,12983.0,Electronic/Dance
4,spotify:track:2lRBoCWxkUcEicwVQjQugG,The Spider's thread,['Penthouse'],True,['spotify:artist:50QaWH5OLY3Pkt1XNCGk6L'],['spotify:playlist:37i9dQZF1DWSt89CX9de4L'],0.460677,0.730618,1.0,0.792352,...,1,1,0,2023.0,2.0,431.0,2.772589,52.0,66153.0,Pop


In [2]:
# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Columns removed before modelling (they are dropped, not encoded)
non_numeric_columns = ['track_uri', 'name', 'artists_names', 'artists_uris', 'playlist_uris', 'analysis_url', 'artists_genres']

# Drop them by rebinding `df` instead of mutating in place.
# errors='ignore' keeps this cell idempotent: on a second run the columns are
# already gone, and a plain drop would raise KeyError.
df = df.drop(columns=non_numeric_columns, errors='ignore')

# Whatever is still object-typed has to be encoded before it can be used as a feature
remaining_non_numeric_columns = df.select_dtypes(include=['object']).columns
print("Remaining non-numeric columns:", remaining_non_numeric_columns)

# One-hot encode the remaining object columns; drop_first=True omits the first
# level of each column. On a re-run there is nothing left to encode and the
# frame is returned unchanged.
df = pd.get_dummies(df, columns=remaining_non_numeric_columns, drop_first=True)

# Show the resulting dtypes so any leftover object column is visible
print("Data types after processing:", df.dtypes)

Remaining non-numeric columns: Index(['pitch_names', 'main_genre'], dtype='object')
Data types after processing: is_playable                          bool
danceability                      float64
energy                            float64
key                               float64
loudness                          float64
                                   ...   
main_genre_R&B/Soul/Funk             bool
main_genre_Rock                      bool
main_genre_Seasonal/Holiday          bool
main_genre_Soundtrack/Theme          bool
main_genre_World/International       bool
Length: 96, dtype: object


In [3]:
# Display the column labels of df as they stand after the drop / one-hot encoding step
df.columns

Index(['is_playable', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'album', 'compilation', 'single',
       'release_dayofweek', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday', 'release_month', 'January', 'February',
       'March', 'April', 'May', 'June', 'July', 'August', 'September',
       'October', 'November', 'December', 'time_since_release', 'C', 'C♯/D♭',
       'D', 'D♯/E♭', 'E', 'F', 'F♯/G♭', 'G', 'G♯/A♭', 'A', 'A♯/B♭', 'B',
       'mode_minor', 'mode_major', 'speechiness_mixed',
       'speechiness_non-speech', 'speechiness_speech',
       'instrumentalness_above_0.5', 'instrumentalness_below_0.5', 'is_live',
       'is_not_live', 'time_signature_4_4', 'time_signature_other',
       'release_year', 'release_day_of_week', 'release_days_since',
       'popularity_log', 'artists_popularities_avg', 'artists_followers_avg',
       'p

In [4]:
# Persist the current state of df to a new CSV file; the row index is not written
new_feature_engineered_file_path = 'resources/new_feature_engineered_music_dataset.csv'
df.to_csv(path_or_buf=new_feature_engineered_file_path, index=False)

In [5]:
# Feature matrix: every column of df except the target
X = df.drop(columns='popularity_log')

# Target vector: the 'popularity_log' column
y = df.loc[:, 'popularity_log']

# Print the feature dtypes so any non-numeric column that slipped through is visible
print("Features (X) data types:", X.dtypes, sep="\n")


Features (X) data types:
is_playable                          bool
danceability                      float64
energy                            float64
key                               float64
loudness                          float64
                                   ...   
main_genre_R&B/Soul/Funk             bool
main_genre_Rock                      bool
main_genre_Seasonal/Holiday          bool
main_genre_Soundtrack/Theme          bool
main_genre_World/International       bool
Length: 95, dtype: object


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (181520, 95)
Testing data shape: (45381, 95)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest with 100 trees and a fixed seed for reproducibility.
# n_jobs=-1 builds the trees (and predicts) on all available CPU cores; the
# default is single-threaded, which is very slow on a training set this large.
# The seed is unchanged, so the fitted model should not depend on n_jobs.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit on the training partition
model.fit(X_train, y_train)

# Predict the target for the held-out test partition
y_pred = model.predict(X_test)

# Score the predictions against the true test targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
