In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sklearn as skl
import tensorflow as tf
import keras_tuner as kt

# Load the dataset
file_path = 'resources/feature_engineered_music_dataset.csv'
df = pd.read_csv(file_path)


In [2]:
# Drop non-numeric columns
non_numeric_columns = ['track_uri', 'name', 'artists_names', 'artists_uris', 'playlist_uris', 'analysis_url', 'artists_genres']
df.drop(columns=non_numeric_columns, inplace=True)

# One-hot encode remaining non-numeric columns if any
remaining_non_numeric_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=remaining_non_numeric_columns, drop_first=True)

# Check for NaN and infinite values
print("NaN values in dataset:\n", df.isna().sum())
print("Infinite values in dataset:\n", np.isfinite(df).all())

# Handle NaN values if any
df.fillna(df.mean(), inplace=True)  # or other imputation method

# Replace infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.mean(), inplace=True)

# Define X (features) and y (target)
X_features = df.drop(columns=['popularity_log'])  # Use the appropriate target column name
y_popularity = df['popularity_log']  # Use the appropriate target column name

# Check the distribution of the target variable
print("Target variable distribution:\n", y_popularity.describe())


NaN values in dataset:
 is_playable                       0
danceability                      0
energy                            0
key                               0
loudness                          0
                                 ..
main_genre_R&B/Soul/Funk          0
main_genre_Rock                   0
main_genre_Seasonal/Holiday       0
main_genre_Soundtrack/Theme       0
main_genre_World/International    0
Length: 96, dtype: int64
Infinite values in dataset:
 is_playable                       True
danceability                      True
energy                            True
key                               True
loudness                          True
                                  ... 
main_genre_R&B/Soul/Funk          True
main_genre_Rock                   True
main_genre_Seasonal/Holiday       True
main_genre_Soundtrack/Theme       True
main_genre_World/International    True
Length: 96, dtype: bool
Target variable distribution:
 count    226901.000000
mean          3.363

In [3]:
# Use sklearn to split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, y_popularity, random_state=78, test_size=0.2)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Verify scaling
print("Finite values in scaled training data:", np.isfinite(X_train_scaled).all())
print("Finite values in scaled test data:", np.isfinite(X_test_scaled).all())


Finite values in scaled training data: True
Finite values in scaled test data: True


In [4]:
# Create a simple model to start with
def create_simple_model():
    nn_model = tf.keras.models.Sequential()
    
    # Input layer
    nn_model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))
    
    # Single hidden layer
    nn_model.add(tf.keras.layers.Dense(units=16, activation='relu', kernel_initializer='he_normal'))
    
    nn_model.add(tf.keras.layers.Dense(units=1))

    # Compile the model with Adam optimizer
    nn_model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=["mean_squared_error"])
    
    return nn_model

# Create and train the simple model
simple_model = create_simple_model()
simple_model.fit(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test))


Epoch 1/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.7825 - mean_squared_error: 0.7825 - val_loss: 0.3324 - val_mean_squared_error: 0.3324
Epoch 2/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.3237 - mean_squared_error: 0.3237 - val_loss: 0.3237 - val_mean_squared_error: 0.3237
Epoch 3/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.3147 - mean_squared_error: 0.3147 - val_loss: 0.3160 - val_mean_squared_error: 0.3160
Epoch 4/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.3115 - mean_squared_error: 0.3115 - val_loss: 0.3132 - val_mean_squared_error: 0.3132
Epoch 5/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 994us/step - loss: 0.3085 - mean_squared_error: 0.3085 - val_loss: 0.3128 - val_mean_squared_error: 0.3128
Epoch 6/10
[1m5673/5673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m

<keras.src.callbacks.history.History at 0x27f995e1700>

In [5]:
# Evaluate the simple model
model_loss, model_accuracy = simple_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Simple Model Loss: {model_loss}, Mean Squared Error: {model_accuracy}")


1419/1419 - 1s - 665us/step - loss: 0.3090 - mean_squared_error: 0.3090
Simple Model Loss: 0.3090307414531708, Mean Squared Error: 0.3090307414531708


In [6]:
# Define a learning rate scheduler
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

callback = tf.keras.callbacks.LearningRateScheduler(scheduler)


In [7]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()
    
    # Input layer
    nn_model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))
    
    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=8,
        max_value=16,
        step=4), activation=activation, kernel_initializer='he_normal'))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 2)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=8,
            max_value=16,
            step=4),
            activation=activation, kernel_initializer='he_normal'))
    
    nn_model.add(tf.keras.layers.Dense(units=1))

    # Compile the model with Adam optimizer
    nn_model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=["mean_squared_error"])
    
    return nn_model


In [None]:
# Import the kerastuner library
tuner = kt.Hyperband(
    create_model,
    objective="val_mean_squared_error",
    max_epochs=20,
    hyperband_iterations=2)

# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test), callbacks=[callback])

# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
print(best_hyper.values)


Trial 9 Complete [00h 00m 34s]
val_mean_squared_error: 0.3084581792354584

Best val_mean_squared_error So Far: 0.30634915828704834
Total elapsed time: 00h 04m 29s

Search: Running Trial #10

Value             |Best Value So Far |Hyperparameter
tanh              |tanh              |activation
8                 |16                |first_units
2                 |2                 |num_layers
8                 |12                |units_0
12                |8                 |units_1
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3


In [None]:
# Reinitialize the best model with best hyperparameters
def create_best_model(hp):
    nn_model = tf.keras.models.Sequential()
    
    # Input layer
    nn_model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))
    
    # Hidden layers with best hyperparameters
    activation = hp['activation']
    nn_model.add(tf.keras.layers.Dense(units=hp['first_units'], activation=activation, kernel_initializer='he_normal'))
    for i in range(hp['num_layers']):
        nn_model.add(tf.keras.layers.Dense(units=hp[f'units_{i}'], activation=activation, kernel_initializer='he_normal'))
    
    nn_model.add(tf.keras.layers.Dense(units=1))

    # Compile the model with Adam optimizer
    nn_model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=["mean_squared_error"])
    
    return nn_model

# Create the best model with the best hyperparameters
best_model = create_best_model(best_hyper)
best_model.fit(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test))

# Evaluate the best model
model_loss, model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Tuned Model Loss: {model_loss}, Mean Squared Error: {model_accuracy}")


In [None]:
import matplotlib.pyplot as plt

# Predict the popularity on the test set
y_pred = best_model.predict(X_test_scaled)

# Scatter plot of actual vs. predicted values
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('Actual Popularity')
plt.ylabel('Predicted Popularity')
plt.title('Actual vs. Predicted Popularity')
plt.show()

# Calculate additional metrics
from sklearn.metrics import mean_absolute_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


My model has a Mean Absolute Error (MAE) of about 0.396, meaning its predictions are, on average, 0.396 units off from the actual popularity. The R-squared value is around 0.42, indicating that the model explains 42% of the variation in the actual popularity. This shows a moderate level of accuracy: the model captures some trends but is not highly precise. The scatter plot reveals that while predictions generally follow the actual values, there's a significant spread, suggesting variability in predictions. To improve, you could refine features, try more complex models, fine-tune parameters, handle outliers better, and use cross-validation for more robust results.