In [50]:
# Import packages 
import json
import os
import pandas as pd
import numpy as np

# ML packages
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [94]:
# Read json file
original_df = pd.read_json("/Users/bach/Documents/MP3-Project/complete_df_creation/music_vector_metadata.json")

# Because there are duplicates in column name so we will drop it
original_df.drop_duplicates(subset=['track_id'],inplace=True)

# We only take vector and characteristic columns
unclean_df = original_df[['vector', 'characteristic']]

In [52]:
# Check for missing values 
unclean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57712 entries, 0 to 66530
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   vector          57712 non-null  object
 1   characteristic  46329 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [95]:
# Drop N/A values in the characteristic
unclean_df.dropna(subset=['characteristic'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclean_df.dropna(subset=['characteristic'], inplace = True)


In [37]:
# Recheck df 
unclean_df.info()
unclean_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 46329 entries, 4 to 66530
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   vector          46329 non-null  object
 1   characteristic  46329 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


Unnamed: 0,vector,characteristic
4,"[-0.8194460272789, 0.10938329249620402, 0.4214...","quirky, eclectic, abstract, energetic, passion..."
5,"[-0.823113977909088, -0.23747463524341503, 0.2...","introspective, melancholic, energetic, summer,..."
8,"[-0.8000821471214291, -0.11462888121604901, 0....",
9,"[-0.660261571407318, -0.089793100953102, 0.160...",
10,"[-0.722541749477386, 0.068502597510814, 0.2457...",
11,"[-0.8456231951713561, -0.12404702603816901, 0....",
12,"[-0.742057025432586, 0.010221602395176001, 0.2...",
13,"[-0.8483147025108331, -0.09521148353815001, 0....",
14,"[-0.7372666597366331, 0.06824257969856201, 0.4...",
15,"[-0.7887574434280391, -0.095258809626102, 0.09...",


In [96]:
# We can see that some records has empty value so we need to process them also
clean_df = unclean_df[unclean_df['characteristic']!='']

In [97]:
import pandas as pd

# Assume clean_df is your cleaned DataFrame ready to be processed

# Define mood groups
mood_groups = {
    'aggressive': 'Energetic/Excited', 'energetic': 'Energetic/Excited', 'epic': 'Energetic/Excited', 'noisy': 'Energetic/Excited', 'passionate': 'Energetic/Excited',
    'happy': 'Happy/Positive', 'optimistic': 'Happy/Positive', 'playful': 'Energetic/Excited', 'uplifting': 'Happy/Positive',
    'calm': 'Calm/Peaceful', 'peaceful': 'Calm/Peaceful', 'soothing': 'Calm/Peaceful', 'meditative': 'Calm/Peaceful', 'soft': 'Calm/Peaceful',
    'sad': 'Sad/Negative', 'depressive': 'Sad/Negative', 'melancholic': 'Sad/Negative', 'sombre': 'Sad/Negative', 'pessimistic': 'Sad/Negative', 'lonely': 'Sad/Negative', 'longing': 'Sad/Negative',
    'dark': 'Dark/Intense', 'scary': 'Dark/Intense', 'ominous': 'Dark/Intense', 'suspenseful': 'Dark/Intense', 'chaotic': 'Dark/Intense',
    'romantic': 'Romantic/Emotional', 'love': 'Romantic/Emotional', 'sensual': 'Romantic/Emotional', 'sentimental': 'Romantic/Emotional', 'sexual': 'Romantic/Emotional',
    'introspective': 'Thoughtful/Contemplative', 'existential': 'Thoughtful/Contemplative', 'conscious': 'Thoughtful/Contemplative',
    'mysterious': 'Mysterious/Abstract', 'surreal': 'Mysterious/Abstract', 'ethereal': 'Mysterious/Abstract', 'hypnotic': 'Mysterious/Abstract',
        'mellow': 'Calm/Peaceful', 'cold': 'Dark/Intense', 'manic': 'Energetic/Excited',  'bittersweet': 'Sad/Negative', 'anxious': 'Dark/Intense','angry': 'Dark/Intense',    'heavy': 'Dark/Intense',    'lush': 'Romantic/Emotional',  'warm': 'Happy/Positive',  'lethargic': 'Sad/Negative', 'eclectic': 'Mysterious/Abstract'
}

# Define moods to exclude before grouping
moods_to_exclude = ['abstract', 'anthemic', 'aquatic', 'boastful', 'breakup', 'cryptic', 'death', 'dense', 'dissonant', 'drugs', 'fantasy', 'futuristic', 'hedonistic',
                    'humorous', 'mechanical', 'nature', 'nocturnal', 'orchestral', 'party', 'pastoral', 'poetic', 'psychedelic', 'quirky', 'raw', 'rebellious', 'sarcastic',
                    'sparse', 'spiritual', 'spring', 'summer', 'triumphant']

# Function to filter and map moods to groups
def filter_and_map_moods(moods):
    filtered_moods = []
    for mood in moods.split(', '):
        if mood not in moods_to_exclude and mood in mood_groups:
            filtered_moods.append(mood_groups[mood])
    result = ', '.join(set(filtered_moods))  # Remove duplicates and convert to string
    return result

# Apply filtering and mapping to the 'characteristic' column
clean_df['characteristic'] = clean_df['characteristic'].apply(filter_and_map_moods)

# Check what the 'characteristic' column contains now
print("Processed Characteristics:")
print(clean_df['characteristic'])
# Data is clean, now we will need to turn each of the characteristic into a separate column
expanded = clean_df['characteristic'].str.get_dummies(sep=', ')
# Merge the expanded characteristic into the old df
df = pd.concat([clean_df.drop('characteristic', axis=1), expanded], axis=1)

# Reset index
df = df.reset_index(drop=True)
df.to_csv("full_df.csv")
print("Final DataFrame:")
print(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['characteristic'] = clean_df['characteristic'].apply(filter_and_map_moods)


Processed Characteristics:
4                   Energetic/Excited, Mysterious/Abstract
5        Sad/Negative, Energetic/Excited, Thoughtful/Co...
23       Calm/Peaceful, Romantic/Emotional, Mysterious/...
24       Calm/Peaceful, Romantic/Emotional, Mysterious/...
25       Calm/Peaceful, Romantic/Emotional, Mysterious/...
                               ...                        
66526                                                     
66527                                                     
66528                                                     
66529                                                     
66530                                                     
Name: characteristic, Length: 39697, dtype: object
Final DataFrame:
                                                  vector  Calm/Peaceful  \
0      [-0.8194460272789, 0.10938329249620402, 0.4214...              0   
1      [-0.823113977909088, -0.23747463524341503, 0.2...              0   
2      [-0.7328528761863701, 0.

In [39]:
# Data is clean, now we will need to turn each of the characteristic into a seperate column
expanded = clean_df['characteristic'].str.get_dummies(sep=', ')

# Merge the expanded characteristic into the old df
df = pd.concat([clean_df.drop('characteristic', axis=1), expanded], axis=1)

# Reset index
df = df.reset_index(drop=True)

# Choose neccessary columns

columns_to_exclude = ['abstract', 'anthemic', 'aquatic', 'boastful', 'breakup', 'cryptic', 'death', 'dense', 'dissonant', 'drugs', 'fantasy', 'futuristic', 'hedonistic',
                      'humorous', 'mechanical', 'nature', 'nocturnal', 'orchestral', 'party', 'pastoral', 'poetic', 'psychedelic', 'quirky', 'raw', 'rebellious', 'sarcastic',
                      'sparse', 'spiritual', 'spring', 'summer', 'triumphant']

df = df.drop(columns=columns_to_exclude,axis=1)

print(df.columns)



Index(['vector', 'aggressive', 'angry', 'anxious', 'bittersweet', 'calm',
       'chaotic', 'cold', 'conscious', 'dark', 'depressive', 'eclectic',
       'energetic', 'epic', 'ethereal', 'existential', 'happy', 'heavy',
       'hypnotic', 'introspective', 'lethargic', 'lonely', 'longing', 'love',
       'lush', 'manic', 'meditative', 'melancholic', 'mellow', 'mysterious',
       'noisy', 'ominous', 'optimistic', 'passionate', 'peaceful',
       'pessimistic', 'playful', 'romantic', 'sad', 'scary', 'sensual',
       'sentimental', 'sexual', 'soft', 'sombre', 'soothing', 'surreal',
       'suspenseful', 'uplifting', 'warm'],
      dtype='object')


# Machine Learning

In [71]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assume df['vector'] contains lists of vectors and df.iloc[:,1:] contains the labels

# Convert lists of vectors into a numpy array
X = np.stack(df['vector'].values)

# Extract labels
y = df.iloc[:,1:].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
#X_train = scaler.fit_transform(X_train)

# Transform the test data with the same scaler
#X_test = scaler.transform(X_test)

# Your data is now normalized and ready for training


In [85]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE
X_smote, y_smote = smote.fit_resample(X_train, y_train)


ValueError: Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

classifier = DecisionTreeClassifier(random_state=42)

# Wrap the classifier in MultiOutputClassifier for multi-label classification
multi_label_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

# Train the multi-label classifier
multi_label_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = multi_label_classifier.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.05327455919395466


In [None]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', probability=True, random_state = 42)

# Wrap the classifier in MultiOutputClassifier for multi-label classification
svm_multi_label_classifier = MultiOutputClassifier(svm_classifier, n_jobs=-1)

# Train the multi-label classifier
svm_multi_label_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_multi_label_classifier.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy_svm)

Accuracy: 0.0681360201511335


In [None]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(SVC(kernel='linear', random_state = 42))

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

accuracy_score(y_test,predictions)

0.11309823677581864

In [11]:
from sklearn.svm import SVC
from skmultilearn.problem_transform import LabelPowerset

# Initialize SVM classifier with RBF kernel
svm_classifier = SVC(kernel='rbf', probability=True, random_state=42)

# Initialize Label Powerset multi-label classifier with the SVM classifier
lp_classifier = LabelPowerset(classifier=svm_classifier)

# Train the classifier on the training data
lp_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = lp_classifier.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with RBF kernel:", accuracy)

Accuracy with RBF kernel: 0.10188916876574307


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Define the parameter distributions rather than a parameter grid
param_distributions = {
    'classifier__C': reciprocal(0.1, 100),
    'classifier__gamma': expon(scale=1.0)
}

# Initialize the randomized search
random_search = RandomizedSearchCV(
    estimator=lp_classifier, 
    param_distributions=param_distributions, 
    n_iter=5,  # The number of parameter settings that are sampled, reduce if necessary
    cv=3, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1
)

# Perform randomized search on the training data
random_search.fit(X_train, y_train)

# Get the best parameters and score
best_parameters = random_search.best_params_
best_score = random_search.best_score_
print("Best Parameters:", best_parameters)
print("Best Score:", best_score)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END classifier__C=0.2938027938703535, classifier__gamma=0.16959629191460518; total time= 7.2min
[CV] END classifier__C=0.2938027938703535, classifier__gamma=0.16959629191460518; total time= 7.2min
[CV] END classifier__C=0.2938027938703535, classifier__gamma=0.16959629191460518; total time= 7.2min
[CV] END classifier__C=15.702970884055382, classifier__gamma=0.9129425537759532; total time=12.6min
[CV] END classifier__C=15.702970884055382, classifier__gamma=0.9129425537759532; total time=12.6min
[CV] END classifier__C=15.702970884055382, classifier__gamma=0.9129425537759532; total time=12.7min
[CV] END classifier__C=0.14936568554617632, classifier__gamma=2.0112308644799395; total time=13.0min
[CV] END classifier__C=1.3292918943162166, classifier__gamma=3.010121430917521; total time=13.3min
[CV] END classifier__C=1.3292918943162166, classifier__gamma=3.010121430917521; total time=13.3min
[CV] END classifier__C=1.3292918943162

Neural Network

In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(128, activation='relu', input_shape=(128,)),  # First hidden layer
    Dense(64, activation='relu'),                        # Second hidden layer
    Dense(8, activation='sigmoid')                       # Output layer with 8 nodes
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [79]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Dense(256, activation='relu', input_shape=(128,)),  # Increase neurons
    BatchNormalization(),
    Dropout(0.3),  # Adjust dropout rate
    
    Dense(128, activation='relu'),  # Add an additional layer
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(64, activation='relu'),  # Existing layer
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(32, activation='relu'),  # Add an additional layer
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(8, activation='sigmoid')  # Output layer with 8 nodes
])

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [86]:
from sklearn.utils import class_weight

# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train.flatten())

# Convert class weights to a dictionary to pass to Keras
class_weight_dict = dict(enumerate(class_weights))

# Use class weights in model training
model.fit(X_train, y_train, class_weight=class_weight_dict, epochs=50, validation_data=(X_test, y_test))


Epoch 1/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637us/step - accuracy: 0.2681 - loss: 0.5043 - val_accuracy: 0.2271 - val_loss: 0.5568
Epoch 2/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 622us/step - accuracy: 0.2669 - loss: 0.5075 - val_accuracy: 0.2389 - val_loss: 0.5522
Epoch 3/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 591us/step - accuracy: 0.2669 - loss: 0.5037 - val_accuracy: 0.2293 - val_loss: 0.5567
Epoch 4/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 621us/step - accuracy: 0.2714 - loss: 0.5048 - val_accuracy: 0.2275 - val_loss: 0.5529
Epoch 5/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 606us/step - accuracy: 0.2651 - loss: 0.5047 - val_accuracy: 0.2270 - val_loss: 0.5575
Epoch 6/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 618us/step - accuracy: 0.2710 - loss: 0.5026 - val_accuracy: 0.2345 - val_loss: 0.5538
Epoch 7/50
[1m9

<keras.src.callbacks.history.History at 0x38901d910>

In [60]:
# Create labels list to match with the array

columns_list = list(df.columns)

labels = columns_list[1:]

In [92]:
from sklearn.metrics import classification_report

# Predict labels for the validation set
predictions = model.predict(X_test) > 0.5  # Apply threshold to get binary outputs

# Print detailed classification report
print(classification_report(y_test, predictions, target_names=labels))


[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304us/step
                          precision    recall  f1-score   support

           Calm/Peaceful       0.65      0.30      0.41      2390
            Dark/Intense       0.48      0.38      0.42      1581
       Energetic/Excited       0.62      0.53      0.58      2743
          Happy/Positive       0.55      0.43      0.48      2862
     Mysterious/Abstract       0.57      0.37      0.45      2691
      Romantic/Emotional       0.59      0.56      0.58      3223
            Sad/Negative       0.57      0.46      0.51      2937
Thoughtful/Contemplative       0.53      0.19      0.28      1822

               micro avg       0.58      0.42      0.49     20249
               macro avg       0.57      0.40      0.46     20249
            weighted avg       0.58      0.42      0.48     20249
             samples avg       0.55      0.43      0.45     20249



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
# Check the first record in test data
flattened_array=predictions[2].flatten()

In [64]:
print(labels)

['Calm/Peaceful', 'Dark/Intense', 'Energetic/Excited', 'Happy/Positive', 'Mysterious/Abstract', 'Romantic/Emotional', 'Sad/Negative', 'Thoughtful/Contemplative']


In [89]:
# Choose index where value is 1
indices_with_ones = [index for index, value in enumerate(flattened_array) if value == 1]

# Map indices to labels
selected_labels = [labels[index] for index in indices_with_ones]

print(selected_labels)

['Happy/Positive', 'Sad/Negative']


Based on the accuracy, we will use the LabelPowerSet transform and SVM as the model to predict songs' characteristic

In [70]:
from joblib import dump, load
dump(classifier, 'multi_char_classifier.joblib')

['multi_char_classifier.joblib']

In [90]:
import pickle

with open('nn_model.pkl', 'wb') as file:
    pickle.dump(model, file)

AttributeError: Can't pickle local object 'Layer._initialize_tracker.<locals>.<lambda>'

In [91]:
model.save('my_model.h5')

