In [132]:
# Import packages 
import json
import os
import pandas as pd
import numpy as np

# ML packages
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [138]:
# Read json file
original_df = pd.read_json("/Users/bach/Documents/MP3-Project/complete_df_creation/music_vector_metadata.json")

# Because there are duplicates in column name so we will drop it
original_df.drop_duplicates(subset=['track_id'],inplace=True)

# We only take vector and characteristic columns
unclean_df = original_df[['vector', 'characteristic']]

In [139]:
# Drop N/A values in the characteristic
unclean_df.dropna(subset=['characteristic'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclean_df.dropna(subset=['characteristic'], inplace = True)


In [140]:
# We can see that some records has empty value so we need to process them also
clean_df = unclean_df[unclean_df['characteristic']!='']

In [141]:
import pandas as pd

# Assume clean_df is your cleaned DataFrame ready to be processed

# Define mood groups
mood_groups = {
    'aggressive': 'Energetic/Excited', 'energetic': 'Energetic/Excited', 'epic': 'Energetic/Excited', 'noisy': 'Energetic/Excited', 'passionate': 'Energetic/Excited',
    'happy': 'Happy/Positive', 'optimistic': 'Happy/Positive', 'playful': 'Energetic/Excited', 'uplifting': 'Happy/Positive',
    'calm': 'Calm/Peaceful', 'peaceful': 'Calm/Peaceful', 'soothing': 'Calm/Peaceful', 'meditative': 'Calm/Peaceful', 'soft': 'Calm/Peaceful',
    'sad': 'Sad/Negative', 'depressive': 'Sad/Negative', 'melancholic': 'Sad/Negative', 'sombre': 'Sad/Negative', 'pessimistic': 'Sad/Negative', 'lonely': 'Sad/Negative', 'longing': 'Sad/Negative',
    'dark': 'Dark/Intense', 'scary': 'Dark/Intense', 'ominous': 'Dark/Intense', 'suspenseful': 'Dark/Intense', 'chaotic': 'Dark/Intense',
    'romantic': 'Romantic/Emotional', 'love': 'Romantic/Emotional', 'sensual': 'Romantic/Emotional', 'sentimental': 'Romantic/Emotional', 'sexual': 'Romantic/Emotional',
    'introspective': 'Thoughtful/Contemplative', 'existential': 'Thoughtful/Contemplative', 'conscious': 'Thoughtful/Contemplative',
    'mysterious': 'Mysterious/Abstract', 'surreal': 'Mysterious/Abstract', 'ethereal': 'Mysterious/Abstract', 'hypnotic': 'Mysterious/Abstract',
        'mellow': 'Calm/Peaceful', 'cold': 'Dark/Intense', 'manic': 'Energetic/Excited',  'bittersweet': 'Sad/Negative', 'anxious': 'Dark/Intense','angry': 'Dark/Intense',    'heavy': 'Dark/Intense',    'lush': 'Romantic/Emotional',  'warm': 'Happy/Positive',  'lethargic': 'Sad/Negative', 'eclectic': 'Mysterious/Abstract'
}

# Define moods to exclude before grouping
moods_to_exclude = ['abstract', 'anthemic', 'aquatic', 'boastful', 'breakup', 'cryptic', 'death', 'dense', 'dissonant', 'drugs', 'fantasy', 'futuristic', 'hedonistic',
                    'humorous', 'mechanical', 'nature', 'nocturnal', 'orchestral', 'party', 'pastoral', 'poetic', 'psychedelic', 'quirky', 'raw', 'rebellious', 'sarcastic',
                    'sparse', 'spiritual', 'spring', 'summer', 'triumphant']

# Function to filter and map moods to groups
def filter_and_map_moods(moods):
    filtered_moods = []
    for mood in moods.split(', '):
        if mood not in moods_to_exclude and mood in mood_groups:
            filtered_moods.append(mood_groups[mood])
    result = ', '.join(set(filtered_moods))  # Remove duplicates and convert to string
    return result

# Apply filtering and mapping to the 'characteristic' column
clean_df['characteristic'] = clean_df['characteristic'].apply(filter_and_map_moods)

# Check what the 'characteristic' column contains now
print("Processed Characteristics:")
print(clean_df['characteristic'])
# Data is clean, now we will need to turn each of the characteristic into a separate column
expanded = clean_df['characteristic'].str.get_dummies(sep=', ')
# Merge the expanded characteristic into the old df
df = pd.concat([clean_df.drop('characteristic', axis=1), expanded], axis=1)

# Reset index
df = df.reset_index(drop=True)
print("Final DataFrame:")
print(df)

Processed Characteristics:
4                   Energetic/Excited, Mysterious/Abstract
5        Sad/Negative, Energetic/Excited, Thoughtful/Co...
23       Calm/Peaceful, Romantic/Emotional, Mysterious/...
24       Calm/Peaceful, Romantic/Emotional, Mysterious/...
25       Calm/Peaceful, Romantic/Emotional, Mysterious/...
                               ...                        
66526                                                     
66527                                                     
66528                                                     
66529                                                     
66530                                                     
Name: characteristic, Length: 39697, dtype: object
Final DataFrame:
                                                  vector  Calm/Peaceful  \
0      [-0.8194460272789, 0.10938329249620402, 0.4214...              0   
1      [-0.823113977909088, -0.23747463524341503, 0.2...              0   
2      [-0.7328528761863701, 0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['characteristic'] = clean_df['characteristic'].apply(filter_and_map_moods)


# Machine Learning

In [142]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assume df['vector'] contains lists of vectors and df.iloc[:,1:] contains the labels

# Convert lists of vectors into a numpy array
X = np.stack(df['vector'].values)

# Extract labels
y = df.iloc[:,1:].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Neural Network

In [143]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(128, activation='relu', input_shape=(128,)),  # First hidden layer
    Dense(64, activation='relu'),                        # Second hidden layer
    Dense(8, activation='sigmoid')                       # Output layer with 8 nodes
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [144]:
def calculate_sample_weights(y, class_weights):
    # Initialize sample weights with ones
    sample_weights = np.ones(shape=(y.shape[0],))
    
    # Iterate through all the class weights and apply them to the sample weights
    for class_index, weight in class_weights.items():
        # Apply class weight by multiplying the sample weight by the class weight where the label is present
        sample_weights *= np.where(y[:, class_index] == 1, weight, 1)
    
    return sample_weights

In [145]:
# Your pre-computed class weights
pre_computed_class_weights = {
    'Calm/Peaceful': 1.3485355648535564, 
    'Dark/Intense': 2.038583175205566, 
    'Energetic/Excited': 1.1749908858913598, 
    'Happy/Positive': 1.126135569531796, 
    'Mysterious/Abstract': 1.1976960237829803, 
    'Romantic/Emotional': 1.0, 
    'Sad/Negative': 1.097378277153558, 
    'Thoughtful/Contemplative': 1.7689352360043908
}

# Convert the class weights to indices (assuming the order of your output layer matches the dictionary keys)
class_weight_indices = {i: pre_computed_class_weights[label] for i, label in enumerate(pre_computed_class_weights)}

# Calculate sample weights
sample_weights = calculate_sample_weights(y_train, class_weight_indices)

# Now fit the model with sample weights
history = model.fit(
    X_train, y_train, 
    sample_weight=sample_weights, 
    epochs=50, 
    validation_data=(X_test, y_test)
)

Epoch 1/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755us/step - accuracy: 0.2624 - loss: 1.1382 - val_accuracy: 0.2399 - val_loss: 0.5613
Epoch 2/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588us/step - accuracy: 0.2791 - loss: 1.0837 - val_accuracy: 0.3057 - val_loss: 0.5556
Epoch 3/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 578us/step - accuracy: 0.2815 - loss: 1.0795 - val_accuracy: 0.2846 - val_loss: 0.5608
Epoch 4/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615us/step - accuracy: 0.2835 - loss: 1.0709 - val_accuracy: 0.2901 - val_loss: 0.5526
Epoch 5/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 577us/step - accuracy: 0.2789 - loss: 1.0767 - val_accuracy: 0.2966 - val_loss: 0.5560
Epoch 6/50
[1m993/993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 560us/step - accuracy: 0.2807 - loss: 1.0627 - val_accuracy: 0.2940 - val_loss: 0.5624
Epoch 7/50
[1m9

In [146]:
# Create labels list to match with the array

columns_list = list(df.columns)

labels = columns_list[1:]

In [147]:
from sklearn.metrics import classification_report

# Predict labels for the validation set
predictions = model.predict(X_test) > 0.5  # Apply threshold to get binary outputs

# Print detailed classification report
print(classification_report(y_test, predictions, target_names=labels))


[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341us/step
                          precision    recall  f1-score   support

           Calm/Peaceful       0.62      0.38      0.48      2390
            Dark/Intense       0.46      0.44      0.45      1581
       Energetic/Excited       0.68      0.56      0.62      3464
          Happy/Positive       0.53      0.12      0.19      2097
     Mysterious/Abstract       0.55      0.45      0.49      2691
      Romantic/Emotional       0.60      0.52      0.56      3223
            Sad/Negative       0.52      0.62      0.56      2937
Thoughtful/Contemplative       0.38      0.42      0.40      1822

               micro avg       0.55      0.46      0.50     20205
               macro avg       0.54      0.44      0.47     20205
            weighted avg       0.56      0.46      0.49     20205
             samples avg       0.54      0.47      0.47     20205



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [151]:
# Check the first record in test data
flattened_array=predictions[0].flatten()

In [149]:
print(labels)

['Calm/Peaceful', 'Dark/Intense', 'Energetic/Excited', 'Happy/Positive', 'Mysterious/Abstract', 'Romantic/Emotional', 'Sad/Negative', 'Thoughtful/Contemplative']


In [152]:
# Choose index where value is 1
indices_with_ones = [index for index, value in enumerate(flattened_array) if value == 1]

# Map indices to labels
selected_labels = [labels[index] for index in indices_with_ones]

print(selected_labels)

['Energetic/Excited', 'Mysterious/Abstract', 'Sad/Negative', 'Thoughtful/Contemplative']


Based on the accuracy, we will use the LabelPowerSet transform and SVM as the model to predict songs' characteristic

In [153]:
model.save('data/NN_feature2.h5')

