In [2]:
# Import packages 
import json
import os
import pandas as pd
import numpy as np

# ML packages
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [66]:
# Read json file
original_df = pd.read_json("/data/music_vector_metadata.json")

# Because there are duplicates in column name so we will drop it
original_df.drop_duplicates(subset=['track_id'],inplace=True)

# We only take vector and characteristic columns
unclean_df = original_df[['vector', 'characteristic']]

In [67]:
# Check for missing values 
unclean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57712 entries, 0 to 66530
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   vector          57712 non-null  object
 1   characteristic  46329 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [68]:
# Drop N/A values in the characteristic
unclean_df.dropna(subset=['characteristic'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unclean_df.dropna(subset=['characteristic'], inplace = True)


In [69]:
# Recheck df 
unclean_df.info()
unclean_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 46329 entries, 4 to 66530
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   vector          46329 non-null  object
 1   characteristic  46329 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


Unnamed: 0,vector,characteristic
4,"[-0.8194460272789, 0.10938329249620402, 0.4214...","quirky, eclectic, abstract, energetic, passion..."
5,"[-0.823113977909088, -0.23747463524341503, 0.2...","introspective, melancholic, energetic, summer,..."
8,"[-0.8000821471214291, -0.11462888121604901, 0....",
9,"[-0.660261571407318, -0.089793100953102, 0.160...",
10,"[-0.722541749477386, 0.068502597510814, 0.2457...",
11,"[-0.8456231951713561, -0.12404702603816901, 0....",
12,"[-0.742057025432586, 0.010221602395176001, 0.2...",
13,"[-0.8483147025108331, -0.09521148353815001, 0....",
14,"[-0.7372666597366331, 0.06824257969856201, 0.4...",
15,"[-0.7887574434280391, -0.095258809626102, 0.09...",


In [70]:
# We can see that some records has empty value so we need to process them also
clean_df = unclean_df[unclean_df['characteristic']!='']

In [71]:
# Printing head of the df again
clean_df.head(10)

Unnamed: 0,vector,characteristic
4,"[-0.8194460272789, 0.10938329249620402, 0.4214...","quirky, eclectic, abstract, energetic, passion..."
5,"[-0.823113977909088, -0.23747463524341503, 0.2...","introspective, melancholic, energetic, summer,..."
23,"[-0.7328528761863701, 0.077868662774562, 0.301...","introspective, calm, ethereal, nature, lush, p..."
24,"[-0.551648676395416, 0.077182106673717, 0.1172...","introspective, calm, ethereal, nature, lush, p..."
25,"[-0.7803764939308161, 0.030515028163790002, 0....","introspective, calm, ethereal, nature, lush, p..."
26,"[-0.601978361606597, -0.017637291923165002, 0....","introspective, calm, ethereal, nature, lush, p..."
27,"[-0.6483022570610041, 0.398851931095123, 0.259...","introspective, calm, ethereal, nature, lush, p..."
28,"[-0.6882724761962891, -0.07933159172534901, 0....","introspective, calm, ethereal, nature, lush, p..."
29,"[-0.6760290861129761, 0.14368520677089602, 0.0...","introspective, calm, ethereal, nature, lush, p..."
30,"[-0.809983432292938, 0.132193565368652, 0.2557...","introspective, calm, ethereal, nature, lush, p..."


In [169]:
# Data is clean, now we will need to turn each of the characteristic into a seperate column
expanded = clean_df['characteristic'].str.get_dummies(sep=', ')

# Merge the expanded characteristic into the old df
df = pd.concat([clean_df.drop('characteristic', axis=1), expanded], axis=1)

# Reset index
df = df.reset_index(drop=True)

# Choose neccessary columns

columns_to_exclude = ['abstract', 'anthemic', 'aquatic', 'boastful', 'breakup', 'cryptic', 'death', 'dense', 'dissonant', 'drugs', 'fantasy', 'futuristic', 'hedonistic',
                      'humorous', 'mechanical', 'nature', 'nocturnal', 'orchestral', 'party', 'pastoral', 'poetic', 'psychedelic', 'quirky', 'raw', 'rebellious', 'sarcastic',
                      'sparse', 'spiritual', 'spring', 'summer', 'triumphant']

df = df.drop(columns=columns_to_exclude,axis=1)



# Machine Learning

In [170]:
# Define X and Y 

X = np.stack(df['vector'].values) # Use np to turn list to array
y = df.iloc[:,1:].values

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [178]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

classifier = DecisionTreeClassifier(random_state=42)

# Wrap the classifier in MultiOutputClassifier for multi-label classification
multi_label_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

# Train the multi-label classifier
multi_label_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = multi_label_classifier.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.00831234256926952


In [181]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', probability=True, random_state = 42)

# Wrap the classifier in MultiOutputClassifier for multi-label classification
svm_multi_label_classifier = MultiOutputClassifier(svm_classifier, n_jobs=-1)

# Train the multi-label classifier
svm_multi_label_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_multi_label_classifier.predict(X_test)

# Evaluate the accuracy of the predictions
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy_svm)

Accuracy: 0.023551637279596978


In [227]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(SVC(kernel='linear', random_state = 42))

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

accuracy_score(y_test,predictions)

0.11120906801007557

In [235]:
# Check the first record in test data
flattened_array=predictions[0].toarray().flatten()

In [236]:
# Create labels list to match with the array

columns_list = list(df.columns)

labels = columns_list[1:]

In [237]:
# Choose index where value is 1
indices_with_ones = [index for index, value in enumerate(flattened_array) if value == 1]

# Map indices to labels
selected_labels = [labels[index] for index in indices_with_ones]

print(selected_labels)

['eclectic', 'lush', 'surreal']


Based on the accuracy, we will use the LabelPowerSet transform and SVM as the model to predict songs' characteristic

In [228]:
from joblib import dump, load
dump(classifier, 'multi_char_classifier.joblib')

['multi_char_classifier.joblib']

In [296]:
import pickle

with open('multi_char_classifier.pkl', 'wb') as file:
    pickle.dump(classifier, file)