Find what my liked songs have in common

In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming you have a DataFrame df containing your songs and API metadata
df = pd.read_csv('spotify-datasets/kaggle-nelgiriyewithana-dataset/spotify-2023_Like_column_JL.csv', encoding='latin-1')

In [45]:
df.columns

Index(['track_name', 'artist(s)_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'in_spotify_playlists',
       'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts',
       'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm',
       'key', 'mode', 'danceability_%', 'valence_%', 'energy_%',
       'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%',
       'Reggaeton'],
      dtype='object')

In [50]:

# Define the features and target variable
features = ['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%'] # feature columns
target = 'Reggaeton'  # target variable

In [51]:
features

['bpm',
 'danceability_%',
 'valence_%',
 'energy_%',
 'acousticness_%',
 'instrumentalness_%',
 'liveness_%',
 'speechiness_%']

In [96]:
# Separate features and target variable
X = df[features]
y = df[target]

In [97]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [121]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42) # scaled X

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # unscaled X


In [122]:
X_train

array([[-0.83943709,  0.82270986,  0.53556182, ..., -0.188132  ,
        -0.52634245,  0.3904883 ],
       [-1.12471249,  0.68593839,  0.3651192 , ..., -0.188132  ,
        -0.74525619, -0.51789743],
       [-0.51850226,  1.16463856,  0.15206593, ..., -0.188132  ,
         0.13039878,  5.43707569],
       ...,
       [ 0.40864279, -1.50240527, -1.46713891, ..., -0.188132  ,
         1.07902499, -0.61882917],
       [-0.26888628,  1.91688169, -0.35926191, ..., -0.188132  ,
        -0.67228495,  2.10632802],
       [ 0.33732394,  1.71172447,  0.06684462, ..., -0.188132  ,
         3.26816242, -0.01323869]])

In [123]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score


def get_feature_importance(model, features):
    model.fit(X_train, y_train)
    feature_importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    
    # Calculate model metrics
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    
    return importance_df, accuracy, recall, precision

# Try RandomForestClassifier with default parameters
rf_classifier = RandomForestClassifier(random_state=42)
rf_importance_df, rf_accuracy, rf_recall, rf_precision = get_feature_importance(rf_classifier, features)
print("Random Forest Feature Importance:")
print(rf_importance_df)
print(f"Model Accuracy: {rf_accuracy:.4f}")
print(f"Model Recall: {rf_recall:.4f}")
print(f"Model Precision: {rf_precision:.4f}")

# Try GradientBoostingClassifier with different hyperparameters
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_importance_df, gb_accuracy, gb_recall, gb_precision = get_feature_importance(gb_classifier, features)
print("\nGradient Boosting Feature Importance:")
print(gb_importance_df)
print(f"Model Accuracy: {gb_accuracy:.4f}")
print(f"Model Recall: {gb_recall:.4f}")
print(f"Model Precision: {gb_precision:.4f}")

Random Forest Feature Importance:
              Feature  Importance
0                 bpm    0.228172
1      danceability_%    0.175050
3            energy_%    0.131409
2           valence_%    0.122113
4      acousticness_%    0.119642
7       speechiness_%    0.117827
6          liveness_%    0.093578
5  instrumentalness_%    0.012209
Model Accuracy: 0.8901
Model Recall: 0.1429
Model Precision: 0.5000

Gradient Boosting Feature Importance:
              Feature  Importance
0                 bpm    0.391494
1      danceability_%    0.235086
3            energy_%    0.102232
2           valence_%    0.082278
4      acousticness_%    0.079896
7       speechiness_%    0.060449
6          liveness_%    0.046181
5  instrumentalness_%    0.002383
Model Accuracy: 0.8953
Model Recall: 0.2381
Model Precision: 0.5556


In [125]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

# Build a simple neural network model
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Get the weights of the first layer
weights_first_layer = model.layers[0].get_weights()[0]

# Calculate feature importances based on weights
feature_importances = np.mean(np.abs(weights_first_layer), axis=1)

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Sort features by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Calculate model metrics
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  # Threshold predictions to binary values

nn_accuracy = accuracy_score(y_test, y_pred)
nn_recall = recall_score(y_test, y_pred)
nn_precision = precision_score(y_test, y_pred)

# Display the sorted DataFrame and model metrics
print("\nNeural Network Feature Importance:")
print(importance_df)
print(f"Model Accuracy: {nn_accuracy:.4f}")
print(f"Model Recall: {nn_recall:.4f}")
print(f"Model Precision: {nn_precision:.4f}")


Neural Network Feature Importance:
              Feature  Importance
1      danceability_%    0.173602
2           valence_%    0.170540
3            energy_%    0.162433
4      acousticness_%    0.162214
5  instrumentalness_%    0.159262
0                 bpm    0.155945
7       speechiness_%    0.136558
6          liveness_%    0.131220
Model Accuracy: 0.8901
Model Recall: 0.0000
Model Precision: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


- Need to look into how to improve Model accuracy, recall and precision
- Overall, the importan features are BPM, Dancability, and energy