# Data Exploration

In [13]:
import math

import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

### Storage Variables

These variables point the program to the directory containing the pre-processed audio data and to the temporary files in which the current DataFrames are saved, so that consecutive runs can pick up where the previous run left off.

In [14]:
# Wildcard import of the project's storage settings.
# NOTE(review): `extracted_dir`, used when loading the dataset below, presumably
# comes from here — confirm, and consider importing the required names explicitly.
from config import *

### Universal Variables

These are used for computations that are repeated throughout the notebook, e.g. normalisation.

In [15]:
from sklearn.preprocessing import StandardScaler
# Single StandardScaler instance, created once; intended for the repeated
# normalisation steps mentioned in this section's description.
scaler = StandardScaler()

## Loading in the Dataset

Data is loaded from the `./features.pkl.pbz2` files located in their respective `data/extracted/playlist-name` directories. All data is concatenated into a single DataFrame, with the `playlist` column indicating which folder each row came from.

In [16]:
from src.helpers import PandasAudioRepository
# Load the extracted feature datasets found under `extracted_dir` into `dataset`.
# NOTE(review): `extracted_dir` is not defined in any visible cell; presumably it
# is provided by `from config import *` — confirm.
dataset = PandasAudioRepository.load_all_feature_datasets(extracted_dir)

We will only visualise numeric data. Higher-dimensional data, such as the trajectory and tonnetz representations (here the `chord_trajectory` and `note_trajectory` columns), are dropped.

In [17]:
# Drop the trajectory columns before visualisation.
# This cell reassigns `dataset`, so re-running it after the columns are already
# gone would raise a KeyError; `errors='ignore'` keeps the cell idempotent.
dataset = dataset.drop(
    columns=['chord_trajectory', 'note_trajectory'],
    errors='ignore',
)
dataset

Unnamed: 0,song_name,artist,playlist,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,...,mfcc_mean_6,mfcc_var_6,mfcc_mean_7,mfcc_var_7,mfcc_mean_8,mfcc_var_8,mfcc_mean_9,mfcc_var_9,mfcc_mean_10,mfcc_var_10
0,Ivan Sings,Aram Khachaturian,kino,0.030815,0.029865,143.554688,728.505121,164591.144472,1044.706810,1.492931e+06,...,-1.621859,92.088676,-5.985138,59.754135,-7.222707,53.279484,-6.638159,63.120949,-7.461281,58.827705
1,"Prélude in E Minor, Op. 28, No. 4",Frédéric Chopin,kino,0.028196,0.027401,103.359375,615.425486,95544.686241,892.440162,8.328338e+05,...,1.046833,60.844543,-3.295691,66.532372,-7.709404,67.271782,-8.138650,51.361748,-8.201083,51.866173
2,Above the Trees,Kino,kino,0.052121,0.049405,143.554688,1053.924804,248527.612506,1937.848230,1.480721e+06,...,-1.518074,97.376602,0.252557,113.514336,-4.448510,74.239128,-2.306997,76.906097,-2.640234,74.699959
3,All,Kino,kino,0.044240,0.042283,161.499023,619.260455,49458.448746,981.976649,2.497879e+05,...,-5.978751,111.920937,0.971603,109.742783,3.967615,61.414219,4.160887,66.464058,-0.379875,82.376328
4,Anew,Kino,kino,0.048969,0.046571,161.499023,677.808914,72961.813450,1098.534181,4.831254e+05,...,-5.143861,121.065163,-0.958290,75.191750,-0.318088,72.683647,-0.499459,70.440079,-3.557541,71.182747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,Tenderness - Woven Remix,Tony Anderson,tony-anderson,0.050498,0.047948,99.384014,1690.752815,996877.763828,3788.666509,6.661065e+06,...,0.559879,71.918671,6.266844,85.660057,-0.473025,55.165321,1.394155,64.965172,-3.817915,56.059143
421,Tenderness,Tony Anderson,tony-anderson,0.019365,0.018990,151.999081,500.969118,144860.433188,764.697603,1.021311e+06,...,10.077353,55.430840,7.092884,42.235184,1.522114,30.496746,-3.054044,27.486416,-6.413583,28.495449
422,Cambodia - Ross Lara Remix,Tony Anderson,tony-anderson,0.053039,0.050226,129.199219,1618.810406,687875.800819,3493.059627,5.250681e+06,...,2.584433,58.274635,2.368633,62.413822,-2.939895,52.945293,2.519716,61.408394,-2.011139,51.967083
423,Cambodia,Tony Anderson,tony-anderson,0.042857,0.041020,129.199219,1234.479968,276949.182834,2474.291523,1.718215e+06,...,2.503661,131.399948,1.199921,118.341034,-1.348665,86.386292,1.038753,90.882286,-4.584059,92.075935


## Data Exploration and Visualisation

### Basic Distribution Analysis

This section visualises the overall distribution of the data.

In [18]:
# Count songs per artist once and give both columns explicit names.
# The result of `value_counts()` is named differently across pandas versions
# (pandas >= 2.0 names the values 'count' and the index 'artist'), so referring
# to y='artist' on the raw Series is version-dependent. An explicit two-column
# frame plots the same bars on any version and labels the axes meaningfully.
artist_counts = (
    dataset['artist']
    .value_counts()
    .rename_axis('artist')
    .reset_index(name='count')
)
px.bar(
    artist_counts,
    x='artist',
    y='count',
    title='Artists Distribution'
)

In [19]:
# Count songs per playlist once and give both columns explicit names.
# The result of `value_counts()` is named differently across pandas versions
# (pandas >= 2.0 names the values 'count' and the index 'playlist'), so referring
# to y='playlist' on the raw Series is version-dependent. An explicit two-column
# frame plots the same bars on any version and labels the axes meaningfully.
playlist_counts = (
    dataset['playlist']
    .value_counts()
    .rename_axis('playlist')
    .reset_index(name='count')
)
px.bar(
    playlist_counts,
    x='playlist',
    y='count',
    title='Playlists Distribution'
)

In [20]:
# Histogram of `mfcc_var_1`, one facet per playlist, wrapped at four facets per row.
px.histogram(
    dataset,
    x='mfcc_var_1',
    facet_col='playlist',
    facet_col_wrap=4,
    title='mfcc_var_1 Grouped by Playlist',
)