# Adding Spotify Audio Analysis to Tables

In [1]:
import csv
import json
import numpy as np
import re
import pandas as pd
from sqlalchemy import create_engine

## What Do I Want to Keep?

### Stats to Remove

I don't want to keep the following:

#### From Sections:
- Mean and Variance:
  - key, key confidence (these shouldn't be ranked numerically, at least I don't think so)
  - start
  - time_signature
  - time_signature confidence
  - tempo

In [41]:
with open('../data/audio_analysis/000xQL6tZNLJzIrtIgxqSl.json', 'r') as f:
    example = json.load(f)

In [42]:
example.keys()

dict_keys(['meta', 'track', 'bars', 'beats', 'tatums', 'sections', 'segments'])

In [43]:
for k, v in example['sections'][0].items():
    print(k)

start
duration
confidence
loudness
tempo
tempo_confidence
key
key_confidence
mode
mode_confidence
time_signature
time_signature_confidence


In [24]:
df = pd.DataFrame(example['sections'])

In [210]:
# example['track']

## Summary Stats for Audio Analysis

### Sections

In [8]:
example['sections'][0]

{'start': 0.0,
 'duration': 19.11423,
 'confidence': 1.0,
 'loudness': -12.969,
 'tempo': 166.075,
 'tempo_confidence': 0.507,
 'key': 2,
 'key_confidence': 0.504,
 'mode': 1,
 'mode_confidence': 0.731,
 'time_signature': 4,
 'time_signature_confidence': 0.514}

In [80]:
df.mean()

confidence                     0.692538
duration                      21.904615
key                            5.384615
key_confidence                 0.558000
loudness                      -7.889308
mode                           0.692308
mode_confidence                0.594923
start                        136.534450
tempo                        166.052538
tempo_confidence               0.426615
time_signature                 4.000000
time_signature_confidence      0.954385
dtype: float64

In [29]:
df.var()

confidence                      0.070495
duration                       45.829279
key                            15.923077
key_confidence                  0.038505
loudness                       15.730314
mode                            0.230769
mode_confidence                 0.013016
start                        7860.392668
tempo                           0.024245
tempo_confidence                0.011827
time_signature                  0.000000
time_signature_confidence       0.018383
dtype: float64

### Iterating Through json's to Determine Overall Shape

In [17]:
analysis_list = []

with open('../data/audio_analysis_list.csv', 'r') as f:
    analysis = csv.reader(f)
    for song in analysis:
        analysis_list.extend(song)

In [18]:
analysis_list[-1]

'7zvKFw17XyoBUx9mHiwzPy.json'

In [11]:
example['sections'][0]

{'start': 0.0,
 'duration': 19.11423,
 'confidence': 1.0,
 'loudness': -12.969,
 'tempo': 166.075,
 'tempo_confidence': 0.507,
 'key': 2,
 'key_confidence': 0.504,
 'mode': 1,
 'mode_confidence': 0.731,
 'time_signature': 4,
 'time_signature_confidence': 0.514}

In [56]:
def kc_counter(lst):
    '''
    Return count of key changes throughout songs listed in individual json files
    lst = refers to .json song listing
    '''
    kc_list = []
    for record in lst:
        with open('../data/audio_analysis/{}'.format(record), 'r') as f:
            analysis = json.load(f)
            if isinstance(analysis, dict):
                if 'sections' in analysis:
                    try:
                        start_key = analysis['sections'][0]['key']
                        kc = 0
                        for item in analysis['sections']:
                            cur_key = item['key']
                            if cur_key != start_key:
                                start_key = cur_key
                                kc += 1
                        kc_list.append(dict({record.replace('.json', ''):kc}))
                    except:
                        kc_list.append(dict({record.replace('.json', ''): 'unable to record key changes'}))
    return kc_list

In [57]:
kc_list = kc_counter(analysis_list)

In [81]:
for _ in kc_list:
    for k, v in _.items():
        if v == 'unable to record key changes':
            print(k, v)

21VDF2xzLl8P1vDVr0nuQY unable to record key changes


I'll need to drop this record.

In [74]:
kc_df = pd.Series([val.values() for val in kc_list], index=[k.keys() for k in kc_list])

In [76]:
kc_df = pd.Series()
for _ in kc_list:
    for k, v in _.items():
        kc_df.loc[k] = v

#### Dropping record with no key change value

In [83]:
kc_df.drop(labels='21VDF2xzLl8P1vDVr0nuQY', inplace=True)

#### Dropping non-songs from key change listing

In [84]:
non_songs = pd.read_csv('../data/non_songs.csv', index_col = 0)

In [86]:
non_songs.head(2)

Unnamed: 0,s_song_id,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title
163,2xfcxlx0QRbqUhpVidqmOU,2013-05-28,1xlkcCr7PNHw2dRG1Gm6YF,Ron White,322322.0,True,A Little Unprofessional,L.A. Beautiful/You're Beautiful/The Yellow Blur
708,5LMcncchvV1jYHMG4hviSN,1998-04-04,0NnoRcD3WkqC9aouHyE8YY,Trey Parker,153026.0,False,Cannibal! The Musical (Original Motion Picture...,Overture


In [89]:
kc_df.drop(labels=non_songs['s_song_id'], inplace=True)

In [90]:
kc_df.shape

(22909,)

I have slightly fewer works then what's present in my `song_df`. Still enough to get a decent analysis with, however.

#### Getting Mean and Variance of Several Features within Song Sections

In [93]:
def analysis_sorter(lst):
    '''
    Iterate through individual song .json files for sections, and calculate the mean and variance for 
    'confidence', 'duration', 'loudness', 'mode', 'mode_confidence', 'tempo', and 'tempo_confidence' values. 
    
    Returns two lists of dictionaries, one containing each song's section mean values, the other containing 
    each song's section variance values.
    '''
    mean_dicts = []
    var_dicts = []
    count = 0
    for record in lst:
        with open('../data/audio_analysis/{}'.format(record), 'r') as f:
            analysis = json.load(f)
            if isinstance(analysis, dict):
                if 'sections' in analysis:
                    try:
                        mean_dict = {}
                        var_dict = {}
                        df = pd.DataFrame(analysis['sections'])
                        mean = df[['confidence', 'duration', 'loudness', 'mode', 'mode_confidence',
                                   'tempo', 'tempo_confidence']].mean().to_dict()
                        var = df[['confidence', 'duration', 'loudness', 'mode', 'mode_confidence',
                                   'tempo', 'tempo_confidence']].var().to_dict()
                        mean_dict[record.replace('.json', '')] = mean
                        var_dict[record.replace('.json', '')] = var
                        mean_dicts.append(mean_dict)
                        var_dicts.append(var_dict)
                    except:
                        mean_dict = {}
                        var_dict = {}
                        mean_dict[(record.replace('.json',''))] = 'Unable to calculate mean of section features'
                        var_dict[(record.replace('.json',''))] = 'Unable to calculate variance of section features'
                        mean_dicts.append(mean_dict)
                        var_dicts.append(var_dict)
            count += 1
            if count % 5000 == 0:
                print("Completed {} files".format(count))
            if count % 5000 == 0:
                with open('../data/section_var_summary_{}.json'.format(count), 'w') as f:
                    json.dump(var_dicts, f)
                    var_dicts.clear()
                with open('../data/section_mean_summary_{}.json'.format(count), 'w') as f:
                    json.dump(mean_dicts, f)
                    mean_dicts.clear()
    return mean_dicts, var_dicts

In [177]:
mean_dicts, var_dicts = analysis_sorter(analysis_list)

Completed 5000 files
Completed 10000 files
Completed 15000 files
Completed 20000 files


In [182]:
mean_dicts[0]

{'6k9L7kTBzjXY0GfazHYqCg': {'confidence': 0.6483,
  'duration': 20.578667000000003,
  'loudness': -9.3965,
  'mode': 0.8,
  'mode_confidence': 0.46769999999999995,
  'tempo': 142.13690000000003,
  'tempo_confidence': 0.2643}}

In [237]:
var_dicts[0]

{'6k9L7kTBzjXY0GfazHYqCg': {'confidence': 0.04827756666666666,
  'duration': 88.7212888207789,
  'loudness': 85.8691118333333,
  'mode': 0.17777777777777778,
  'mode_confidence': 0.03472067777777778,
  'tempo': 1.7909101000000023,
  'tempo_confidence': 0.007427788888888888}}

#### Combining Summary Stats Docs

In [184]:
#mean
with open('../data/section_mean_summary_5000.json', 'r') as f:
    section_mean_summary_5000 = json.load(f)
with open('../data/section_mean_summary_10000.json', 'r') as f:
    section_mean_summary_10000 = json.load(f)
with open('../data/section_mean_summary_15000.json', 'r') as f:
    section_mean_summary_15000 = json.load(f)
with open('../data/section_mean_summary_20000.json', 'r') as f:
    section_mean_summary_20000 = json.load(f)

#var
with open('../data/section_var_summary_5000.json', 'r') as f:
    section_var_summary_5000 = json.load(f)
with open('../data/section_var_summary_10000.json', 'r') as f:
    section_var_summary_10000 = json.load(f)
with open('../data/section_var_summary_15000.json', 'r') as f:
    section_var_summary_15000 = json.load(f)
with open('../data/section_var_summary_20000.json', 'r') as f:
    section_var_summary_20000 = json.load(f)

In [185]:
mean_dicts.extend(section_mean_summary_5000)
mean_dicts.extend(section_mean_summary_10000)
mean_dicts.extend(section_mean_summary_15000)
mean_dicts.extend(section_mean_summary_20000)

var_dicts.extend(section_var_summary_5000)
var_dicts.extend(section_var_summary_10000)
var_dicts.extend(section_var_summary_15000)
var_dicts.extend(section_var_summary_20000)

In [186]:
len(mean_dicts), len(var_dicts)

(23125, 23125)

In [188]:
for e in var_dicts[:2]:
    print(e.keys())

dict_keys(['6k9L7kTBzjXY0GfazHYqCg'])
dict_keys(['6kAS4yj3wHJXcLp93vr5aG'])


#### Tossing Section Means and Vars into DataFrame + `csv`

In [196]:
section_mean = pd.DataFrame(columns=['confidence', 'duration', 'loudness', 'mode',
                                    'mode_confidence', 'tempo', 'tempo_confidence'])

for e in mean_dicts:
    section_mean.loc[list(e.keys())[0]] = list(e.values())[0]

In [201]:
section_var = pd.DataFrame(columns=['confidence', 'duration', 'loudness', 'mode',
                                    'mode_confidence', 'tempo', 'tempo_confidence'])

for e in var_dicts:
    section_var.loc[list(e.keys())[0]] = list(e.values())[0]

#### How Many Entries did not have Summary Stats?

In [211]:
len(section_mean), len(section_var)

(23125, 23125)

In [281]:
section_var[section_var['confidence'] == 'Unable to calculate variance of section features']

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
21VDF2xzLl8P1vDVr0nuQY,Unable to calculate variance of section features,Unable to calculate variance of section features,Unable to calculate variance of section features,Unable to calculate variance of section features,Unable to calculate variance of section features,Unable to calculate variance of section features,Unable to calculate variance of section features


In [283]:
section_mean[section_mean['confidence'] == 'Unable to calculate mean of section features']

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
21VDF2xzLl8P1vDVr0nuQY,Unable to calculate mean of section features,Unable to calculate mean of section features,Unable to calculate mean of section features,Unable to calculate mean of section features,Unable to calculate mean of section features,Unable to calculate mean of section features,Unable to calculate mean of section features


Only for one title was I unable to calculate the mean and/or variance of section features. I'll go ahead and drop this record.

In [278]:
section_var.iloc[9272]

confidence          Unable to calculate variance of section features
duration            Unable to calculate variance of section features
loudness            Unable to calculate variance of section features
mode                Unable to calculate variance of section features
mode_confidence     Unable to calculate variance of section features
tempo               Unable to calculate variance of section features
tempo_confidence    Unable to calculate variance of section features
Name: 21VDF2xzLl8P1vDVr0nuQY, dtype: object

In [285]:
section_mean.drop(labels='21VDF2xzLl8P1vDVr0nuQY', inplace=True)
section_var.drop(labels='21VDF2xzLl8P1vDVr0nuQY', inplace=True)

#### Checking for Duplicate Values

In [267]:
section_mean[section_mean.duplicated(keep=False)]

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
6wLj4AQJiBuJl5uiY0hSe8,0.760429,34.6233,-16.9286,0.285714,0.553429,110.056,0.601714
2memjAKXTXCK1WsUsWGHe7,0.760429,34.6233,-16.9286,0.285714,0.553429,110.056,0.601714


In [266]:
section_var[section_var.duplicated(keep=False)]

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
6wLj4AQJiBuJl5uiY0hSe8,0.05692,336.221,184.478,0.238095,0.065341,0.00296895,0.00792824
2GJxRwFe8oLcbXgTw9P5of,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2ULmjTNKicNAC0HAyYa47y,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2memjAKXTXCK1WsUsWGHe7,0.05692,336.221,184.478,0.238095,0.065341,0.00296895,0.00792824
5Asz9rHr2rViBdl6pkXpoq,0.0,0.0,0.0,0.0,0.0,0.0,0.0


It appears as though the duplicate results are just the same observation statistics for two different song id's. Rather than drop them, I will keep the values and decide what to do after I've combined my datasets for EDA.

#### Checking for `null's` 

In [230]:
section_var.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23125 entries, 6k9L7kTBzjXY0GfazHYqCg to 6k7e2cjr10EbQW5QnblOtY
Data columns (total 7 columns):
confidence          23122 non-null object
duration            23122 non-null object
loudness            23122 non-null object
mode                23122 non-null object
mode_confidence     23122 non-null object
tempo               23122 non-null object
tempo_confidence    23122 non-null object
dtypes: object(7)
memory usage: 1.4+ MB


In [232]:
section_mean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23125 entries, 6k9L7kTBzjXY0GfazHYqCg to 6k7e2cjr10EbQW5QnblOtY
Data columns (total 7 columns):
confidence          23125 non-null object
duration            23125 non-null object
loudness            23125 non-null object
mode                23125 non-null object
mode_confidence     23125 non-null object
tempo               23125 non-null object
tempo_confidence    23125 non-null object
dtypes: object(7)
memory usage: 2.0+ MB


Very strange that I have null values in the variance df, but not the means df. I'll check the dicts that I converted to see if those values are null there as well.

In [240]:
for i in var_dicts:
    for k in i.keys():
        if k == '2ULmjTNKicNAC0HAyYa47y':
            print(i)

{'2ULmjTNKicNAC0HAyYa47y': {'confidence': nan, 'duration': nan, 'loudness': nan, 'mode': nan, 'mode_confidence': nan, 'tempo': nan, 'tempo_confidence': nan}}


It's null there as well. I will go back, retreive the original json's, and compute the variance of each value.

In [221]:
section_var.shape

(23125, 7)

In [254]:
section_var[section_var.isnull().any(1)].head()

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
2GJxRwFe8oLcbXgTw9P5of,,,,,,,
2ULmjTNKicNAC0HAyYa47y,,,,,,,
5Asz9rHr2rViBdl6pkXpoq,,,,,,,


#### Dealing with Null Values

In [256]:
with open('../data/audio_analysis/2GJxRwFe8oLcbXgTw9P5of.json', 'r') as f:
    _2GJxRwFe8oLcbXgTw9P5of = json.load(f)
with open('../data/audio_analysis/2ULmjTNKicNAC0HAyYa47y.json', 'r') as f:
    _2ULmjTNKicNAC0HAyYa47y = json.load(f)
with open('../data/audio_analysis/5Asz9rHr2rViBdl6pkXpoq.json', 'r') as f:
    _5Asz9rHr2rViBdl6pkXpoq = json.load(f)

In [260]:
_5Asz9rHr2rViBdl6pkXpoq['sections']

[{'start': 0.0,
  'duration': 59.93333,
  'confidence': 1.0,
  'loudness': -17.937,
  'tempo': 80.584,
  'tempo_confidence': 0.115,
  'key': 2,
  'key_confidence': 0.285,
  'mode': 0,
  'mode_confidence': 0.487,
  'time_signature': 3,
  'time_signature_confidence': 0.128}]

Turns out that for each one of these titles, there is only 1 section, therefore...impossible to get a variance value. I just changed the observation to all 0's.

In [263]:
section_var.loc['2GJxRwFe8oLcbXgTw9P5of'] = 0
section_var.loc['2ULmjTNKicNAC0HAyYa47y'] = 0
section_var.loc['5Asz9rHr2rViBdl6pkXpoq'] = 0

#### Output to `csv`

In [286]:
section_mean.to_csv('../data/spotify_section_means.csv')

In [287]:
section_var.to_csv('../data/spotify_section_var.csv')

### Segments

I need to separate `pitches` and `timbre` into separate matrices, considering I need to get the mean and variance of each element within each of those arrays. Beyond that, I don't think I'll need any additional values from this listing. Reasons being, that I'm not overly concered with the length of each segment, nor am I concerned with the loundess levels, since they're encapsulated within the `Sections` and audio features.

In [69]:
example['segments'][0]

{'start': 0.0,
 'duration': 0.43651,
 'confidence': 1.0,
 'loudness_start': -60.0,
 'loudness_max_time': 0.31593,
 'loudness_max': -15.215,
 'pitches': [0.972,
  0.851,
  0.534,
  0.398,
  0.574,
  0.824,
  0.931,
  0.95,
  0.95,
  0.895,
  0.936,
  1.0],
 'timbre': [5.394,
  132.335,
  40.451,
  -169.179,
  36.889,
  -87.156,
  48.281,
  -81.004,
  -28.624,
  11.046,
  76.899,
  -15.937]}

In [33]:
empty = np.array(None)
empty

array(None, dtype=object)

#### Testing Grabbing Pitches and Timbre from each Segment

In [64]:
pitches = np.array(example['segments'][0]['pitches'])
timbre = np.array(example['segments'][0]['timbre'])

for record in example['segments'][1:]:
    new_pitch = np.array(record['pitches'])
    new_timbre = np.array(record['timbre'])
    pitches = np.vstack((pitches, new_pitch))
    timbre = np.vstack((timbre, new_timbre))

pitch_means = np.mean(pitches, axis = 0)
timbre_means = np.mean(timbre, axis = 0)
pitch_var = np.var(pitches, axis = 0)
timbre_var = np.var(timbre, axis = 0)

In [65]:
timbre

array([[   5.394,  132.335,   40.451, ...,   11.046,   76.899,  -15.937],
       [  25.492,  -29.872,  -43.306, ...,  -11.308,   -0.346,   -8.5  ],
       [  30.574,   -4.817,  -20.123, ...,   19.59 ,   10.574,   -3.419],
       ..., 
       [  46.967,   61.795,  -93.704, ...,   -9.035,   10.121,    5.212],
       [  42.442,   81.447, -104.876, ...,   -2.364,   -1.035,    1.474],
       [  31.01 ,   83.512, -200.859, ...,  -28.773, -103.174,   17.048]])

In [66]:
pitches

array([[ 0.972,  0.851,  0.534, ...,  0.895,  0.936,  1.   ],
       [ 1.   ,  0.734,  0.05 , ...,  0.024,  0.007,  0.01 ],
       [ 0.101,  0.211,  1.   , ...,  0.288,  0.146,  0.027],
       ..., 
       [ 0.199,  0.019,  0.007, ...,  0.025,  0.147,  1.   ],
       [ 0.106,  0.103,  0.083, ...,  0.514,  1.   ,  0.399],
       [ 0.004,  0.004,  0.004, ...,  0.004,  0.004,  0.029]])

In [70]:
pitch_var

array([ 0.0816765 ,  0.10712059,  0.08919524,  0.03346195,  0.12406955,
        0.0444795 ,  0.09366547,  0.11206538,  0.08183722,  0.13351742,
        0.06099433,  0.09083536])

In [71]:
timbre_var

array([   44.79019062,  2508.65836209,  2092.29554919,  2116.87128229,
         818.87857916,   714.00372464,   591.76958189,   551.21101119,
         254.27142387,   258.69811317,   293.24686903,   222.48651461])

In [67]:
pitch_means

array([ 0.29137624,  0.37816535,  0.31408911,  0.19801782,  0.41874059,
        0.21457426,  0.3793198 ,  0.35321584,  0.32189307,  0.46579505,
        0.24158812,  0.28446238])

In [68]:
timbre_means

array([ 47.48974554,   4.37188713, -19.12441782, -13.57252772,
        32.99183267, -31.4813703 ,  -7.32949406,  -6.3684198 ,
       -10.72296535,  -0.08051782,  -8.91219307,  -3.10303465])

#### Sanity Check of Pitch / Timbre Means

In [63]:
sc_list_pitch = []
sc_list_timbre = []

for record in example['segments']:
    sc_list_timbre.append(record['timbre'][0])
    sc_list_pitch.append(record['pitches'][0])

print(np.mean(sc_list_pitch), np.mean(sc_list_timbre))

0.291376237624 47.4897455446


Looks correct, though I'm surprised that grabbing the mean / variance across the 0 access applies to columns.

#### Creating Functions to Grab Summary Pitch and Timbre Statistics for Each Song

In [96]:
def pt_grabber_sgl(song):
    pitches = np.array(song['segments'][0]['pitches'])
    timbre = np.array(song['segments'][0]['timbre'])
    
    for record in song['segments']:
        new_pitch = np.array(record['pitches'])
        new_timbre = np.array(record['timbre'])
        pitches = np.vstack((pitches, new_pitch))
        timbre = np.vstack((timbre, new_timbre))

    pitch_means = np.mean(pitches, axis = 0)
    timbre_means = np.mean(timbre, axis = 0)
    pitch_var = np.var(pitches, axis = 0)
    timbre_var = np.var(timbre, axis = 0)

    return pitch_means, timbre_means, pitch_var, timbre_var

In [111]:
def pt_grabber(lst):
    timbre_means = []
    timbre_var = []
    pitch_means = []
    pitch_var = []
    count = 0
    for record in lst:
        with open('../data/audio_analysis/{}'.format(record), 'r') as f:
            analysis = json.load(f)
            if isinstance(analysis, dict):
                if 'segments' in analysis:
                    try:
                        pm, tm, pv, tv = pt_grabber_sgl(analysis)
                    except:
                        response = "unable to gather summary stats for song {} pitch & timbre".format(record.replace(".json", ''))
        try:
            timbre_means.append(dict({record.replace(".json", ""):tm}))
            timbre_var.append(dict({record.replace(".json", ""):tv}))
            pitch_means.append(dict({record.replace(".json", ""):pm}))
            pitch_var.append(dict({record.replace(".json", ""):pv}))
        except:
            [lists.append(response) for lists in [timbre_means, timbre_var, 
                                                  pitch_means, pitch_var]]
        count += 1
        if count % 1000 == 0:
            print("grabbing {}".format(count + 1))
    return timbre_means, timbre_var, pitch_means, pitch_var

#### Grabbing Pitch / Timbre Summary Stats

In [112]:
timbre_means, timbre_var, pitch_means, pitch_var = pt_grabber(analysis_list)

grabbing 1001
grabbing 2001
grabbing 3001
grabbing 4001
grabbing 5001
grabbing 6001
grabbing 7001
grabbing 8001
grabbing 9001
grabbing 10001
grabbing 11001
grabbing 12001
grabbing 13001
grabbing 14001
grabbing 15001
grabbing 16001
grabbing 17001
grabbing 18001
grabbing 19001
grabbing 20001
grabbing 21001
grabbing 22001
grabbing 23001


In [114]:
len(timbre_means), len(timbre_var), len(pitch_means), len(pitch_var)

(23129, 23129, 23129, 23129)

#### Checking Lists for Lack of Summary Stats

In [119]:
count = 0
for e in pitch_var:
    if isinstance(e, str):
        count += 1
print(count)

0


#### Tossing Lists into DataFrames + `csv`

In [139]:
list(timbre_means[0].keys())[0]

'000xQL6tZNLJzIrtIgxqSl'

In [135]:
list(timbre_means[0].values())[0]

49.242242014742054

In [150]:
tm_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in timbre_means:
    tm_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [157]:
tv_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in timbre_var:
    tv_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [159]:
pm_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in pitch_means:
    pm_df.loc[list(e.keys())[0]] = list(e.values())[0]

In [161]:
pv_df = pd.DataFrame(columns=["dim_" + str(i) for i in range(1, 13)])

for e in pitch_means:
    pv_df.loc[list(e.keys())[0]] = list(e.values())[0]

#### Removing Duplicate Records

In [167]:
tm_df.shape, tv_df.shape, pm_df.shape, pv_df.shape

((23129, 12), (23129, 12), (23129, 12), (23129, 12))

In [168]:
tm_df.drop_duplicates(inplace = True)
tv_df.drop_duplicates(inplace = True)
pm_df.drop_duplicates(inplace = True)
pv_df.drop_duplicates(inplace = True)

In [169]:
tm_df.shape, tv_df.shape, pm_df.shape, pv_df.shape

((23124, 12), (23124, 12), (23124, 12), (23124, 12))

#### Checking for `null's`

In [273]:
pv_df[tm_df.isnull().any(1)]

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,dim_11,dim_12


#### Outputting to csv

In [173]:
tm_df.to_csv('../data/timbre_means.csv')
tv_df.to_csv('../data/timbre_var.csv')
pm_df.to_csv('../data/pitch_means.csv')
pv_df.to_csv('../data/pitch_var.csv')