In [9]:
# 155 GUARDIAN
import pathlib
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from util import audio_feature_util, clustering_util, preprocessing_util

In [10]:
# Shared settings: data locations and the seed handed to PCA and Kmeans below.
AUDIO_PATH: pathlib.Path = pathlib.Path("../data/custom_data/audio_files")
SPLIT_AUDIO_PATH: pathlib.Path = pathlib.Path("../data/custom_data/raw_features_based/split_audio_files")
RANDOM_STATE: int = 42

# Stage 0 (preprocessing) settings.
PERCENTAGE_FOR_TESTING: int = 5    # passed to split_model_resources as percentage_for_testing
DISCARD_SHORT_AUDIO: bool = True   # passed to load_audio_features as discard_short_audio
AUDIO_DURATION: int = 45           # passed as max_duration; presumably seconds -- TODO confirm

# Stage 1 (recommendation) settings.
CLUSTER_COUNT: int = 10            # number of Kmeans clusters (n_clusters)

In [11]:
"""
STAGE 0: PREPROCESSING

Generate audio features and prepare data.
"""

# Distribute the source audio into SPLIT_AUDIO_PATH; no share is reserved for validation.
# NOTE(review): presumably this creates the "train" and "test" subfolders read later -- confirm.
preprocessing_util.split_model_resources(
  AUDIO_PATH,
  SPLIT_AUDIO_PATH,
  percentage_for_validation=0,
  percentage_for_testing=PERCENTAGE_FOR_TESTING
)

# Extract features for the training split only (melspectrograms are not requested).
train_audio_dir = SPLIT_AUDIO_PATH / "train"
features = audio_feature_util.load_audio_features(
  train_audio_dir,
  max_duration=AUDIO_DURATION,
  discard_short_audio=DISCARD_SHORT_AUDIO,
  with_melspectrograms=False,
  thread_pool_size=10
)

# Persist the training features; the next cell reloads them from this CSV.
features_csv_path = SPLIT_AUDIO_PATH / "audio_features.csv"
audio_feature_util.save_audio_features_as_csv(features, features_csv_path)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  2 out of 10 | elapsed:    8.8s remaining:   35.5s
[Parallel(n_jobs=10)]: Done 10 out of 10 | elapsed:   10.7s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  2 out of 10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done 10 out of 10 | elapsed:    1.9s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  2 out of 10 | elapsed:    2.7s remaining:   11.1s
[Parallel(n_jobs=10)]: Done 10 out of 10 | elapsed:    2.8s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  2 out of 10 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=10)]: Done 10 out of 10 | elapsed:    1.8s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: 

In [12]:
# Reload the training features written by the preprocessing stage, then normalize them.
training_features_csv = SPLIT_AUDIO_PATH / "audio_features.csv"
audio_feature_df = pd.read_csv(training_features_csv)
normalized_audio_feature_df = audio_feature_util.normalize_audio_features(audio_feature_df)

In [13]:
# Load features for the test split with the same settings as the training load.
# discard_short_audio follows the shared DISCARD_SHORT_AUDIO setting (previously hard-coded
# to True) so that changing the config cannot filter the two splits differently.
input_features = audio_feature_util.load_audio_features(
  SPLIT_AUDIO_PATH / "test",
  max_duration=AUDIO_DURATION,
  thread_pool_size=10,
  discard_short_audio=DISCARD_SHORT_AUDIO,
  with_melspectrograms=False
)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  2 out of 10 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=10)]: Done 10 out of 10 | elapsed:    1.5s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 2 out of 6 | elapsed:    6.9s remaining:   14.0s
[Parallel(n_jobs=6)]: Done 6 out of 6 | elapsed:    8.4s finished


In [14]:
# Tabulate the test features and normalize them relative to the (un-normalized) training
# features, with the non-numeric "filename" column removed from the reference frame.
input_features_df = audio_feature_util.audio_features_as_dataframe(input_features)
training_reference_df = audio_feature_df.drop(columns="filename")
normalized_input_features = audio_feature_util.normalize_audio_features(
  input_features_df,
  relative_to=training_reference_df
)

In [15]:
"""
STAGE 1: RECOMMENDATION

Take the normalized audio features, reduce their dimensionality with PCA,
identify clusters with Kmeans, and use euclidean distance to find the best recommendation.
"""

# Fit PCA on the training features only, then project the test features with the same fit.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
main_dataset_pca = pd.DataFrame(
  pca.fit_transform(normalized_audio_feature_df.drop("filename", axis=1)),
  columns=["pca1", "pca2"]
)
input_dataset_pca = pd.DataFrame(
  pca.transform(normalized_input_features.drop("filename", axis=1)),
  columns=["pca1", "pca2"]
)

# Cluster with Kmeans, then find closest clusters to the testing data.
clusters_pca, centroids_pca = clustering_util.kmeans_clustering(main_dataset_pca, n_clusters=CLUSTER_COUNT, random_state=RANDOM_STATE)
allocated_clusters = clustering_util.closest_clusters_euclidean(input_dataset_pca, centroids_pca)

# Add labels to datasets for identification.
# The PCA frames were built from plain arrays and carry a fresh 0..n-1 index. Inserting a
# Series would align on index labels, so pass the values as arrays to assign them by row
# position, which is what the PCA output order corresponds to.
input_dataset_pca.insert(loc=0, column="filename", value=normalized_input_features["filename"].to_numpy())
main_dataset_pca.insert(loc=0, column="filename", value=normalized_audio_feature_df["filename"].to_numpy())

# The numeric coordinates of the inputs do not change inside the loop; strip the label once.
input_points = input_dataset_pca.drop("filename", axis=1)

# Loop through clusters and print out the best matching song for each row of testing data.
# NOTE: from here on main_dataset_pca is a GroupBy keyed by cluster label, not a DataFrame.
main_dataset_pca = main_dataset_pca.groupby(clusters_pca)
for i in range(len(allocated_clusters)):
  row = input_points.iloc[i]
  # Only candidates from the cluster allocated to this input are considered.
  group = main_dataset_pca.get_group(allocated_clusters[i])

  # Position (within the group) of the training point nearest to the input row.
  closest_point = clustering_util.closest_points_euclidean([row], group.drop("filename", axis=1)).argmin()
  print(f"Input:         {input_dataset_pca.iloc[i]['filename']}")
  print(f"Best Match:    {group.iloc[closest_point]['filename']}")
  print("----------------------------------")


Input:         001 ANOTHER HIM
Best Match:    076 The Dark Truth
----------------------------------
Input:         002 Beginning
Best Match:    020 Thrash Machine
----------------------------------
Input:         003 School
Best Match:    093 Adventure Board
----------------------------------
Input:         008 The Legend
Best Match:    41 - Credits
----------------------------------
Input:         009 Lancer
Best Match:    toby fox - UNDERTALE Soundtrack - 65 CORE
----------------------------------
Input:         010 Rude Buster
Best Match:    153 Crumbling Tower
----------------------------------
Input:         011 Empty Town
Best Match:    104 Raft Ride
----------------------------------
Input:         013 Field of Hopes and Dreams
Best Match:    137 From Now On (Battle 2)
----------------------------------
Input:         015 Lantern
Best Match:    056 Faint Courage (Game Over)
----------------------------------
