In [1]:
# dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import tensorflow as tf
import numpy as np
from sklearn.svm import SVC

# Render every column of wide DataFrames instead of eliding the middle ones
pd.set_option("display.max_columns", None)

# Read the per-track feature table straight from its S3 URL
PLATINUM_FEATURES_URL = "https://platinum-lyric-bucket.s3.us-east-2.amazonaws.com/platinum_features.csv"
platinum_features_df = pd.read_csv(PLATINUM_FEATURES_URL)

In [2]:
# Preview the first five rows to sanity-check the load
platinum_features_df.head(5)

Unnamed: 0.1,Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,feature_tempo,feature_mode,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit,target_success,target_weeks,target_peak
0,0,TRRBOBU128F4293068,texas,i don't want a lover,1989,country,61,300600,7,0.196,0.000487,120.484,1,0.756,0.47,0.126,-12.615,0.0394,0.43,0,1,77,6
1,1,TRVCPQS128F4285928,the youngbloods,ride the wind,1988,rock,20,396600,2,0.91,0.651,119.033,1,0.558,0.307,0.0866,-20.492,0.0343,0.674,0,0,0,0
2,2,TRZRMWW128F426E797,babyface,tender lover,1990,pop,37,259267,5,0.226,0.000422,102.459,1,0.743,0.86,0.0513,-6.346,0.0445,0.687,0,1,17,89
3,3,TRVSRVI128F4261843,reo speedwagon,one lonely night,1984,country,31,201467,5,0.0561,0.000149,76.051,1,0.408,0.579,0.0712,-10.277,0.026,0.397,0,1,19,16
4,4,TRSHXOI128F146B1AE,john waite,change,1982,rock,49,196693,11,0.246,0.0,149.028,1,0.403,0.848,0.616,-11.615,0.0625,0.622,0,1,54,10


In [3]:
# Columns excluded from modelling: the stray index column, identifiers and
# free-text fields, release year, popularity, and the two target_* columns
# that are not the binary label.
COLUMNS_TO_DROP = [
    "feature_popularity",
    "Unnamed: 0",
    "track_id",
    "artist_name",
    "song_title",
    "song_year",
    "target_weeks",
    "target_peak",
]

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
features_df = platinum_features_df.drop(columns=COLUMNS_TO_DROP)

# One-hot encode the single categorical column. The dtype is pinned to uint8
# (the pre-2.0 default) so the indicator columns stay 0/1 on every pandas
# version instead of becoming booleans.
features_df = pd.get_dummies(features_df, columns=["feature_genre"], dtype="uint8")
features_df

Unnamed: 0,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,feature_tempo,feature_mode,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit,target_success,feature_genre_blues,feature_genre_country,feature_genre_edm,feature_genre_electronic,feature_genre_folk,feature_genre_hip hop,feature_genre_jam,feature_genre_jazz,feature_genre_latin,feature_genre_metal,feature_genre_pop,feature_genre_punk,feature_genre_rap,feature_genre_reggae,feature_genre_rhythm and blues,feature_genre_rock
0,300600,7,0.19600,0.000487,120.484,1,0.756,0.470,0.1260,-12.615,0.0394,0.430,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,396600,2,0.91000,0.651000,119.033,1,0.558,0.307,0.0866,-20.492,0.0343,0.674,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,259267,5,0.22600,0.000422,102.459,1,0.743,0.860,0.0513,-6.346,0.0445,0.687,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,201467,5,0.05610,0.000149,76.051,1,0.408,0.579,0.0712,-10.277,0.0260,0.397,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,196693,11,0.24600,0.000000,149.028,1,0.403,0.848,0.6160,-11.615,0.0625,0.622,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8687,199067,9,0.00962,0.000000,140.219,1,0.395,0.955,0.3600,-3.799,0.0488,0.658,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8688,288080,4,0.02280,0.224000,113.250,0,0.704,0.709,0.0832,-10.453,0.0589,0.848,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8689,318773,11,0.31500,0.000006,177.872,1,0.391,0.514,0.0567,-6.398,0.0447,0.437,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8690,202760,7,0.12300,0.000031,185.034,1,0.443,0.885,0.2800,-6.172,0.0414,0.536,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Separate the predictor matrix (X) from the binary label (y)
TARGET_COLUMN = "target_success"
X = features_df.drop(columns=[TARGET_COLUMN])
y = features_df[TARGET_COLUMN]

# Stratified train/test partition with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=78,
)
X_train.shape

(6519, 29)

In [5]:
# Fit the standardiser on the training split only, so no statistics from the
# test split influence the scaling
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-split statistics to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Random forest model

In [6]:
# Build a 500-tree random forest classifier with a fixed seed
rf_model = RandomForestClassifier(random_state=78, n_estimators=500)

# Train in place on the scaled training split (fit returns the estimator itself)
rf_model.fit(X_train_scaled, y_train)

# Predict labels for the held-out split
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, ..., 1, 1, 1])

In [7]:
# Confusion matrix for the random forest, wrapped in a labelled frame
# (rows = actual class, columns = predicted class).
# NOTE(review): cm_df is built here but not displayed by this cell.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=predictions)

0.6907501150483203

In [8]:
# Per-feature importance scores from the fitted random forest
importances = rf_model.feature_importances_

# Pair each score with its column name and list them from most to least important
sorted(zip(importances, X.columns), reverse=True)

[(0.09392521343331216, 'feature_duration'),
 (0.09235527882533268, 'feature_danceability'),
 (0.08953662835838501, 'feature_acousticness'),
 (0.0893009669759923, 'feature_speechiness'),
 (0.08719692946330021, 'feature_energy'),
 (0.0830058329403259, 'feature_instrumentalness'),
 (0.08152161747988391, 'feature_valence'),
 (0.0797540730427805, 'feature_loudness'),
 (0.07812576923779825, 'feature_liveness'),
 (0.07780376721539606, 'feature_tempo'),
 (0.04629553008605502, 'feature_key'),
 (0.02684385150305692, 'feature_genre_pop'),
 (0.011473464590204966, 'feature_mode'),
 (0.009107536240150635, 'feature_genre_rock'),
 (0.00870601724103195, 'feature_genre_metal'),
 (0.008407860328712576, 'feature_genre_country'),
 (0.0054140394897028874, 'feature_explicit'),
 (0.004548498373635888, 'feature_genre_latin'),
 (0.004412828664016654, 'feature_genre_folk'),
 (0.0043432725409339335, 'feature_genre_blues'),
 (0.0035536411588197227, 'feature_genre_rhythm and blues'),
 (0.0033575078920412213, 'featu

# SVC Model

In [9]:
# Linear-kernel support vector classifier
model = SVC(kernel="linear")

# Train on the scaled training split
model.fit(X_train_scaled, y_train)

# Predict the held-out split and line the predictions up against the true labels
y_pred = model.predict(X_test_scaled)
comparison = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
# Discard the shuffled original row labels in favour of a clean 0..n-1 index
results = comparison.reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,0,1
3,0,0
4,1,1


In [10]:
# Test-set accuracy of the linear SVC
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6576161988034974

# NN Model

In [11]:
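# Disabled alternative: rescale with MinMaxScaler instead of StandardScaler.
# Uncommenting this cell overwrites X_train_scaled / X_test_scaled.
# # Creating MinMaxScaler instance
# scaler = MinMaxScaler()

# # Fitting the MinMax scaler
# X_scaler = scaler.fit(X_train)

# # Scaling data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)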

In [13]:
# Seed NumPy and TensorFlow before building the network so weight
# initialisation and batch shuffling start from a fixed state, using the same
# seed (78) as the train/test split and the random forest.
# NOTE(review): bit-for-bit repeatability may still depend on hardware and
# TensorFlow version.
np.random.seed(78)
tf.random.set_seed(78)

# Define the model - deep neural net with two hidden layers, each as wide as
# the input. The width is read from the array shape rather than the first row.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = number_input_features
hidden_nodes_layer2 = number_input_features

nn = tf.keras.models.Sequential()

# Declare the input shape explicitly instead of passing the legacy `input_dim`
# keyword to the first Dense layer
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary label
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model (labels converted from a pandas Series to a plain array)
fit_model = nn.fit(X_train_scaled, np.asarray(y_train), epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, np.asarray(y_test), verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Train on 6519 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
2173/1 - 0s - loss: 0.6837 - accuracy: 0.6452
Loss: 0.6507526129772188, Accuracy: 0.645190954208374
