# Adding Spotify Audio Analysis to Tables

In [1]:
import csv
import json
import numpy as np
import re
import pandas as pd
from sqlalchemy import create_engine

## What Do I Want to Keep?

In [2]:
with open('../data/audio_analysis/46n2EGFnPC3tzWCN1Aqe26.json', 'r') as f:
    example = json.load(f)

In [3]:
for k, v in example['sections'][0].items():
    print(k)

start
duration
confidence
loudness
tempo
tempo_confidence
key
key_confidence
mode
mode_confidence
time_signature
time_signature_confidence


In [24]:
df = pd.DataFrame(example['sections'])

## Summary Stats for Audio Analysis

### Sections

In [80]:
df.mean()

confidence                     0.692538
duration                      21.904615
key                            5.384615
key_confidence                 0.558000
loudness                      -7.889308
mode                           0.692308
mode_confidence                0.594923
start                        136.534450
tempo                        166.052538
tempo_confidence               0.426615
time_signature                 4.000000
time_signature_confidence      0.954385
dtype: float64

In [29]:
df.var()

confidence                      0.070495
duration                       45.829279
key                            15.923077
key_confidence                  0.038505
loudness                       15.730314
mode                            0.230769
mode_confidence                 0.013016
start                        7860.392668
tempo                           0.024245
tempo_confidence                0.011827
time_signature                  0.000000
time_signature_confidence       0.018383
dtype: float64

### Iterating Through json's to Determine Overall Shape

In [76]:
analysis_list = []

with open('../data/audio_analysis_list.csv', 'r') as f:
    analysis = csv.reader(f)
    for song in analysis:
        analysis_list.extend(song)

In [78]:
analysis_list[-1]

'7zvKFw17XyoBUx9mHiwzPy.json'

In [176]:
def analysis_sorter(lst):
    mean_dicts = []
    var_dicts = []
    count = 0
    for record in lst:
        with open('../data/audio_analysis/{}'.format(record), 'r') as f:
            analysis = json.load(f)
            if isinstance(analysis, dict):
                if 'sections' in analysis:
                    try:
                        mean_dict = {}
                        var_dict = {}
                        df = pd.DataFrame(analysis['sections'])
                        mean = df[['confidence', 'duration', 'loudness', 'mode', 'mode_confidence',
                                   'tempo', 'tempo_confidence']].mean().to_dict()
                        var = df[['confidence', 'duration', 'loudness', 'mode', 'mode_confidence',
                                   'tempo', 'tempo_confidence']].var().to_dict()
                        mean_dict[record.replace('.json', '')] = mean
                        var_dict[record.replace('.json', '')] = var
                        mean_dicts.append(mean_dict)
                        var_dicts.append(var_dict)
                    except:
                        mean_dict = {}
                        var_dict = {}
                        mean_dict[(record.replace('.json',''))] = 'Unable to calculate mean of section features'
                        var_dict[(record.replace('.json',''))] = 'Unable to calculate variance of section features'
                        mean_dicts.append(mean_dict)
                        var_dicts.append(var_dict)
            count += 1
            if count % 5000 == 0:
                print("Completed {} files".format(count))
            if count % 5000 == 0:
                with open('../data/section_var_summary_{}.json'.format(count), 'w') as f:
                    json.dump(var_dicts, f)
                    var_dicts.clear()
                with open('../data/section_mean_summary_{}.json'.format(count), 'w') as f:
                    json.dump(mean_dicts, f)
                    mean_dicts.clear()
    return mean_dicts, var_dicts

In [177]:
mean_dicts, var_dicts = analysis_sorter(analysis_list)

Completed 5000 files
Completed 10000 files
Completed 15000 files
Completed 20000 files


In [182]:
mean_dicts[0]

{'6k9L7kTBzjXY0GfazHYqCg': {'confidence': 0.6483,
  'duration': 20.578667000000003,
  'loudness': -9.3965,
  'mode': 0.8,
  'mode_confidence': 0.46769999999999995,
  'tempo': 142.13690000000003,
  'tempo_confidence': 0.2643}}

In [183]:
var_dicts[0]

{'6k9L7kTBzjXY0GfazHYqCg': {'confidence': 0.04827756666666666,
  'duration': 88.7212888207789,
  'loudness': 85.8691118333333,
  'mode': 0.17777777777777778,
  'mode_confidence': 0.03472067777777778,
  'tempo': 1.7909101000000023,
  'tempo_confidence': 0.007427788888888888}}

#### Combining Summary Stats Docs

In [184]:
#mean
with open('../data/section_mean_summary_5000.json', 'r') as f:
    section_mean_summary_5000 = json.load(f)
with open('../data/section_mean_summary_10000.json', 'r') as f:
    section_mean_summary_10000 = json.load(f)
with open('../data/section_mean_summary_15000.json', 'r') as f:
    section_mean_summary_15000 = json.load(f)
with open('../data/section_mean_summary_20000.json', 'r') as f:
    section_mean_summary_20000 = json.load(f)

#var
with open('../data/section_var_summary_5000.json', 'r') as f:
    section_var_summary_5000 = json.load(f)
with open('../data/section_var_summary_10000.json', 'r') as f:
    section_var_summary_10000 = json.load(f)
with open('../data/section_var_summary_15000.json', 'r') as f:
    section_var_summary_15000 = json.load(f)
with open('../data/section_var_summary_20000.json', 'r') as f:
    section_var_summary_20000 = json.load(f)

In [185]:
mean_dicts.extend(section_mean_summary_5000)
mean_dicts.extend(section_mean_summary_10000)
mean_dicts.extend(section_mean_summary_15000)
mean_dicts.extend(section_mean_summary_20000)

var_dicts.extend(section_var_summary_5000)
var_dicts.extend(section_var_summary_10000)
var_dicts.extend(section_var_summary_15000)
var_dicts.extend(section_var_summary_20000)

In [186]:
len(mean_dicts), len(var_dicts)

(23125, 23125)

In [188]:
for e in var_dicts[:2]:
    print(e.keys())

dict_keys(['6k9L7kTBzjXY0GfazHYqCg'])
dict_keys(['6kAS4yj3wHJXcLp93vr5aG'])


In [196]:
section_mean = pd.DataFrame(columns=['confidence', 'duration', 'loudness', 'mode',
                                    'mode_confidence', 'tempo', 'tempo_confidence'])

for e in mean_dicts:
    section_mean.loc[list(e.keys())[0]] = list(e.values())[0]

In [201]:
section_var = pd.DataFrame(columns=['confidence', 'duration', 'loudness', 'mode',
                                    'mode_confidence', 'tempo', 'tempo_confidence'])

for e in var_dicts:
    section_var.loc[list(e.keys())[0]] = list(e.values())[0]

In [199]:
section_mean.to_csv('../data/spotify_section_means.csv')

In [203]:
section_var.to_csv('../data/spotify_section_var.csv')

#### How Many Entries did not have Summary Stats?

In [6]:
len(analysis_list)

23124

In [6]:
len(section_analysis_summary)

23124

In [7]:
count = 0
for record in section_analysis_summary:
    for k, v in record.items():
#         for _, value in v.items():
        if 'Unable to calculate mean & variance of section features' in v.values():
            count += 1
print(count)

0


#### Section Analysis to df

# to-do

In [8]:
sec_df = pd.DataFrame(section_analysis_summary)

KeyboardInterrupt: 

Got memory error...gonna have to try something else.

### Segments

I need to separate `pitches` and `timbre` into separate matrices, considering I need to get the mean and variance of each element within each of those arrays. Beyond that, I don't think I'll need any additional values from this listing. Reasons being, that I'm not overly concered with the length of each segment, nor am I concerned with the loundess levels, since they're encapsulated within the `Sections` and audio features.

In [69]:
example['segments'][0]

{'start': 0.0,
 'duration': 0.43651,
 'confidence': 1.0,
 'loudness_start': -60.0,
 'loudness_max_time': 0.31593,
 'loudness_max': -15.215,
 'pitches': [0.972,
  0.851,
  0.534,
  0.398,
  0.574,
  0.824,
  0.931,
  0.95,
  0.95,
  0.895,
  0.936,
  1.0],
 'timbre': [5.394,
  132.335,
  40.451,
  -169.179,
  36.889,
  -87.156,
  48.281,
  -81.004,
  -28.624,
  11.046,
  76.899,
  -15.937]}

In [33]:
empty = np.array(None)
empty

array(None, dtype=object)

#### Testing Grabbing Pitches and Timbre from each Segment

In [64]:
pitches = np.array(example['segments'][0]['pitches'])
timbre = np.array(example['segments'][0]['timbre'])

for record in example['segments'][1:]:
    new_pitch = np.array(record['pitches'])
    new_timbre = np.array(record['timbre'])
    pitches = np.vstack((pitches, new_pitch))
    timbre = np.vstack((timbre, new_timbre))

pitch_means = np.mean(pitches, axis = 0)
timbre_means = np.mean(timbre, axis = 0)
pitch_var = np.var(pitches, axis = 0)
timbre_var = np.var(timbre, axis = 0)

In [65]:
timbre

array([[   5.394,  132.335,   40.451, ...,   11.046,   76.899,  -15.937],
       [  25.492,  -29.872,  -43.306, ...,  -11.308,   -0.346,   -8.5  ],
       [  30.574,   -4.817,  -20.123, ...,   19.59 ,   10.574,   -3.419],
       ..., 
       [  46.967,   61.795,  -93.704, ...,   -9.035,   10.121,    5.212],
       [  42.442,   81.447, -104.876, ...,   -2.364,   -1.035,    1.474],
       [  31.01 ,   83.512, -200.859, ...,  -28.773, -103.174,   17.048]])

In [66]:
pitches

array([[ 0.972,  0.851,  0.534, ...,  0.895,  0.936,  1.   ],
       [ 1.   ,  0.734,  0.05 , ...,  0.024,  0.007,  0.01 ],
       [ 0.101,  0.211,  1.   , ...,  0.288,  0.146,  0.027],
       ..., 
       [ 0.199,  0.019,  0.007, ...,  0.025,  0.147,  1.   ],
       [ 0.106,  0.103,  0.083, ...,  0.514,  1.   ,  0.399],
       [ 0.004,  0.004,  0.004, ...,  0.004,  0.004,  0.029]])

In [70]:
pitch_var

array([ 0.0816765 ,  0.10712059,  0.08919524,  0.03346195,  0.12406955,
        0.0444795 ,  0.09366547,  0.11206538,  0.08183722,  0.13351742,
        0.06099433,  0.09083536])

In [71]:
timbre_var

array([   44.79019062,  2508.65836209,  2092.29554919,  2116.87128229,
         818.87857916,   714.00372464,   591.76958189,   551.21101119,
         254.27142387,   258.69811317,   293.24686903,   222.48651461])

In [67]:
pitch_means

array([ 0.29137624,  0.37816535,  0.31408911,  0.19801782,  0.41874059,
        0.21457426,  0.3793198 ,  0.35321584,  0.32189307,  0.46579505,
        0.24158812,  0.28446238])

In [68]:
timbre_means

array([ 47.48974554,   4.37188713, -19.12441782, -13.57252772,
        32.99183267, -31.4813703 ,  -7.32949406,  -6.3684198 ,
       -10.72296535,  -0.08051782,  -8.91219307,  -3.10303465])

#### Sanity Check of Pitch / Timbre Means

In [63]:
sc_list_pitch = []
sc_list_timbre = []

for record in example['segments']:
    sc_list_timbre.append(record['timbre'][0])
    sc_list_pitch.append(record['pitches'][0])

print(np.mean(sc_list_pitch), np.mean(sc_list_timbre))

0.291376237624 47.4897455446


Looks correct, though I'm surprised that grabbing the mean / variance across the 0 access applies to columns.

#### Creating Functions to Grab Summary Pitch and Timbre Statistics for Each Song

In [96]:
def pt_grabber_sgl(song):
    pitches = np.array(song['segments'][0]['pitches'])
    timbre = np.array(song['segments'][0]['timbre'])
    
    for record in song['segments']:
        new_pitch = np.array(record['pitches'])
        new_timbre = np.array(record['timbre'])
        pitches = np.vstack((pitches, new_pitch))
        timbre = np.vstack((timbre, new_timbre))

    pitch_means = np.mean(pitches, axis = 0)
    timbre_means = np.mean(timbre, axis = 0)
    pitch_var = np.var(pitches, axis = 0)
    timbre_var = np.var(timbre, axis = 0)

    return pitch_means, timbre_means, pitch_var, timbre_var

In [111]:
def pt_grabber(lst):
    timbre_means = []
    timbre_var = []
    pitch_means = []
    pitch_var = []
    count = 0
    for record in lst:
        with open('../data/audio_analysis/{}'.format(record), 'r') as f:
            analysis = json.load(f)
            if isinstance(analysis, dict):
                if 'segments' in analysis:
                    try:
                        pm, tm, pv, tv = pt_grabber_sgl(analysis)
                    except:
                        response = "unable to gather summary stats for song {} pitch & timbre".format(record.replace(".json", ''))
        try:
            timbre_means.append(dict({record.replace(".json", ""):tm}))
            timbre_var.append(dict({record.replace(".json", ""):tv}))
            pitch_means.append(dict({record.replace(".json", ""):pm}))
            pitch_var.append(dict({record.replace(".json", ""):pv}))
        except:
            [lists.append(response) for lists in [timbre_means, timbre_var, 
                                                  pitch_means, pitch_var]]
        count += 1
        if count % 1000 == 0:
            print("grabbing {}".format(count + 1))
    return timbre_means, timbre_var, pitch_means, pitch_var

#### Grabbing Pitch / Timbre Summary Stats

In [112]:
timbre_means, timbre_var, pitch_means, pitch_var = pt_grabber(analysis_list)

grabbing 1001
grabbing 2001
grabbing 3001
grabbing 4001
grabbing 5001
grabbing 6001
grabbing 7001
grabbing 8001
grabbing 9001
grabbing 10001
grabbing 11001
grabbing 12001
grabbing 13001
grabbing 14001
grabbing 15001
grabbing 16001
grabbing 17001
grabbing 18001
grabbing 19001
grabbing 20001
grabbing 21001
grabbing 22001
grabbing 23001


In [114]:
len(timbre_means), len(timbre_var), len(pitch_means), len(pitch_var)

(23129, 23129, 23129, 23129)

#### Checking Lists for Lack of Summary Stats

In [119]:
count = 0
for e in pitch_var:
    if isinstance(e, str):
        count += 1
print(count)

0


#### Tossing Lists into DataFrames + `csv`

In [139]:
list(timbre_means[0].keys())[0]

'000xQL6tZNLJzIrtIgxqSl'

In [135]:
list(timbre_means[0].values())[0]

49.242242014742054

In [150]:
tm_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in timbre_means:
    tm_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [157]:
tv_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in timbre_var:
    tv_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [159]:
pm_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in pitch_means:
    pm_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [161]:
pv_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in pitch_means:
    pv_df.loc[list(e.keys())[0]] = list(e.values())[0]

#### Removing Duplicate Records

In [167]:
tm_df.shape, tv_df.shape, pm_df.shape, pv_df.shape

((23129, 12), (23129, 12), (23129, 12), (23129, 12))

In [168]:
tm_df.drop_duplicates(inplace = True)
tv_df.drop_duplicates(inplace = True)
pm_df.drop_duplicates(inplace = True)
pv_df.drop_duplicates(inplace = True)

In [169]:
tm_df.shape, tv_df.shape, pm_df.shape, pv_df.shape

((23124, 12), (23124, 12), (23124, 12), (23124, 12))

#### Outputting to csv

In [173]:
tm_df.to_csv('../data/timbre_means.csv')
tv_df.to_csv('../data/timbre_var.csv')
pm_df.to_csv('../data/pitch_means.csv')
pv_df.to_csv('../data/pitch_var.csv')

### Stats to Remove

I don't want to keep the following:

#### From Sections:
- Mean and Variance:
  - key, key confidence (these shouldn't be ranked numerically, at least I don't think so)
  - start
  - time_signature
  - time_signature confidence
  - tempo