In [9]:
"""
Tutorial for the Million Song Dataset

by Thierry Bertin-Mahieux (2011) Columbia University
   tb2332@columbia.edu
   Copyright 2011 T. Bertin-Mahieux, All Rights Reserved

This tutorial will walk you through a quick experiment
using the Million Song Dataset (MSD). We will actually be working
on the 10K songs subset for speed issues, but the code should
transpose seamlessly.

In this tutorial, we do simple metadata analysis. We look at
which artist has the most songs by iterating over the whole
dataset and using an SQLite database.

You need to have the MSD code downloaded from GITHUB.
See the MSD website for details:
http://labrosa.ee.columbia.edu/millionsong/

If you have any questions regarding the dataset or this tutorial,
please first take a look at the website. Send us an email
if you haven't found the answer.

Note: this tutorial is developed using Python 2.6
      on an Ubuntu machine. PDF created using 'pyreport'.
"""

# usual imports
import pickle
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np # get it at: http://numpy.scipy.org/
# path to the Million Song Dataset subset (uncompressed)
# CHANGE IT TO YOUR LOCAL CONFIGURATION
# Root of the (uncompressed) Million Song Dataset subset. Set the
# MSD_SUBSET_PATH environment variable to point somewhere else without
# editing this cell; the previous hard-coded value remains the default.
msd_subset_path=os.environ.get('MSD_SUBSET_PATH','/mnt/snap')
msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
assert os.path.isdir(msd_subset_path),'wrong path' # sanity check
# path to the Million Song Dataset code
# (override with the MSD_CODE_PATH environment variable)
msd_code_path=os.environ.get('MSD_CODE_PATH','./')
assert os.path.isdir(msd_code_path),'wrong path' # sanity check
# we add the MSD python sources to the import path; the membership test
# keeps sys.path from growing every time this cell is re-run
msd_python_src=os.path.join(msd_code_path,'PythonSrc')
if msd_python_src not in sys.path:
    sys.path.append(msd_python_src)

# imports specific to the MSD
# import hdf5_getters as GETTERS
from PythonSrc import hdf5_getters as GETTERS

import pandas as pd

import json


In [10]:
import tables

In [3]:
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    """
    Format the lag between two timestamps as a readable string.
    INPUT
       starttime  - start of the interval, in seconds
       stoptime   - end of the interval, in seconds
    RETURN
       str() of the corresponding datetime.timedelta, e.g. '0:01:30'
    """
    elapsed = datetime.timedelta(seconds=stoptime-starttime)
    return str(elapsed)

# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    Walk 'basedir' recursively and call 'func' on the path of every
    file whose name matches '*' + ext.
    With the default 'func' nothing happens except counting.
    INPUT
       basedir  - base directory of the dataset
       func     - callable taking one filename
       ext      - extension to look for, .h5 by default
    RETURN
       number of matching files
    """
    pattern = '*' + ext
    total = 0
    # os.walk only supplies the directories; matching is left to glob
    for dirpath, _subdirs, _names in os.walk(basedir):
        matches = glob.glob(os.path.join(dirpath, pattern))
        total += len(matches)
        for path in matches:
            func(path)
    return total

# we can now easily count the number of files in the dataset
n_song_files = apply_to_all_files(msd_subset_data_path)
print('number of song files:',n_song_files)

# artist names are gathered in a set(), so a name that is added
# several times is stored only once.
all_artist_names = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    Record the artist name of one song file.

    Opens the song file, reads its artist name with
    GETTERS.get_artist_name and adds it to the module-level set
    'all_artist_names'. The file is closed even if reading fails.
    INPUT
       filename  - path to a song file
    RETURN
       nothing, 'all_artist_names' is updated in place
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add(GETTERS.get_artist_name(h5))
    finally:
        # always release the file handle, also when the getter raises
        h5.close()
    
# let's apply the previous function to all files
# we'll also measure how long it takes
t1 = time.time()
apply_to_all_files(msd_subset_data_path,func=func_to_get_artist_name)
t2 = time.time()
print('all artist names extracted in:',strtimedelta(t1,t2))

# let's see some of the content of 'all_artist_names'
print('found',len(all_artist_names),'unique artist names')
# build the list once and slice it: no list rebuild per iteration and
# no IndexError when fewer than 5 names were found
for artist_name in list(all_artist_names)[:5]:
    print(artist_name)

# this is too long, and the work of listing artist names has already
# been done. Let's redo the same task using an SQLite database.
# We connect to the provided database: track_metadata.db
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    # we build the SQL query
    q = "SELECT DISTINCT artist_name FROM songs"
    # we query the database (only the query itself is timed)
    t1 = time.time()
    res = conn.execute(q)
    all_artist_names_sqlite = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, also if the query fails
    conn.close()
print('all artist names extracted (SQLite) in:',strtimedelta(t1,t2))
# let's see some of the content; each row is a 1-tuple, and slicing
# avoids an IndexError when fewer than 5 rows come back
for row in all_artist_names_sqlite[:5]:
    print(row[0])

# now, let's find the artist that has the most songs in the dataset
# what we want to work with is artist ID, not artist names. Some artists
# have many names, usually because the song is "featuring someone else"
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    q = "SELECT DISTINCT artist_id FROM songs"
    res = conn.execute(q)
    # a real list rather than map(): it is indexed and iterated again
    # below, which a lazy Python 3 map object does not support
    all_artist_ids = [row[0] for row in res.fetchall()]
finally:
    conn.close()

# The Echo Nest artist id look like:
for aid in all_artist_ids[:4]:
    print(aid)

# let's count the songs from each of these artists.
# We will do it first by iterating over the dataset.
# we prepare a dictionary to count files, every artist starting at 0
files_per_artist = dict.fromkeys(all_artist_ids, 0)

# we prepare the function to check artist id in each file
def func_to_count_artist_id(filename):
    """
    Count one song file towards its artist.

    Opens the song file, reads its artist ID with
    GETTERS.get_artist_id and increments that artist's entry in the
    module-level dictionary 'files_per_artist'. The file is closed
    even if reading fails.
    INPUT
       filename  - path to a song file
    RETURN
       nothing, 'files_per_artist' is updated in place
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_id = GETTERS.get_artist_id(h5)
    finally:
        h5.close()
    # .get() keeps the count going for an artist id that was not
    # pre-registered from the database instead of raising KeyError
    files_per_artist[artist_id] = files_per_artist.get(artist_id, 0) + 1

# we apply this function to all files
apply_to_all_files(msd_subset_data_path,func_to_count_artist_id)

# rank the artist ids by their number of files, largest first;
# the head of that ranking is the artist with the most songs
ranked_aids = sorted(files_per_artist,
                     key=lambda artist: files_per_artist[artist],
                     reverse=True)
most_pop_aid = ranked_aids[0]
print(most_pop_aid,'has',files_per_artist[most_pop_aid],'songs.')

# of course, it is more fun to have the name(s) of this artist
# let's get it using SQLite
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
# the artist id is bound through a '?' placeholder instead of being
# pasted into the SQL text, so quoting is handled by sqlite3
q = "SELECT DISTINCT artist_name FROM songs"
q += " WHERE artist_id=?"
try:
    res = conn.execute(q, (most_pop_aid,))
    # list comprehension rather than map() so the names print as a list
    pop_artist_names = [row[0] for row in res.fetchall()]
finally:
    conn.close()
print('SQL query:',q,'with artist_id =',most_pop_aid)
print('name(s) of the most popular artist:',pop_artist_names)

# let's redo all this work in SQLite in a few seconds
t1 = time.time()
db_file = os.path.join(msd_subset_addf_path,'subset_track_metadata.db')
conn = sqlite3.connect(db_file)
# one row per artist id: (artist_id, artist_name, number of tracks)
q = ("SELECT DISTINCT artist_id,artist_name,Count(track_id) FROM songs"
     " GROUP BY artist_id")
res = conn.execute(q)
pop_artists = res.fetchall()
conn.close()
t2 = time.time()
print('found most popular artist in',strtimedelta(t1,t2))
# order the rows by their track count, biggest first, and show the winner
ranked_artists = sorted(pop_artists,key=lambda row:row[2],reverse=True)
print(ranked_artists[0])



('number of song files:', 10000)
('all artist names extracted in:', '0:01:10.435715')
('found', 4412, 'unique artist names')
Pale Forest
The Real Kids
JennyAnyKind
Little Willie John
Barry Goldberg
('all artist names extracted (SQLite) in:', '0:00:00.020937')
!!!
(hed) p.e.
089 Clique feat. Minnesota Snipe & Skinny Cueball
089 Clique feat. Prophet
1. Futurologischer Congress
AR009211187B989185
AR00A6H1187FB5402A
AR00LNI1187FB444A5
AR00MBZ1187B9B5DB1
(u'AROIHOI122988FEB8E', 'has', 13, 'songs.')
('SQL query:', u"SELECT DISTINCT artist_name FROM songs WHERE artist_id='AROIHOI122988FEB8E'")
('name(s) of the most popular artist:', [u'Mario Rosenstock'])
('found most popular artist in', '0:00:00.078373')
(u'AROIHOI122988FEB8E', u'Mario Rosenstock', 13)


In [11]:
# Inspect a single song file: print the shape of its timbre matrix and
# its artist name, then export the timbre matrix as CSV.
filename = os.path.join(msd_subset_data_path,
                        'A', 'A', 'A', 'TRAAAPK128E0786D96.h5')
print(filename)
f = GETTERS.open_h5_file_read(filename)
try:
    # read the timbre matrix once and reuse it for printing and saving
    a = GETTERS.get_segments_timbre(f)
    print(a.shape, GETTERS.get_artist_name(f))
    np.savetxt("mfcc.csv", a, delimiter=",")
finally:
    # close the song file even if reading or saving fails
    f.close()


./data/MSD/MillionSongSubset/data/A/A/A/TRAAAPK128E0786D96.h5
((588, 12), 'Tweeterfriendly Music')


In [12]:
def func_to_get_song_id(filename):
    """
    Write the track ID of one song file, followed by a newline, to the
    module-level file object 'f'. The song file is closed even if
    reading or writing fails.
    INPUT
       filename  - path to a song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        f.write(GETTERS.get_track_id(h5)+"\n")
    finally:
        h5.close()

# "w" instead of "a": re-running this cell used to append a second copy
# of every track id; the other cells writing this file also use "w".
# The with-block closes the file even if the traversal fails.
with open("track_ids.txt", "w") as f:
    apply_to_all_files(msd_subset_data_path, func=func_to_get_song_id)

In [43]:
N_TIMBRE_SEGMENTS = 300  # segments kept per track; shorter tracks are skipped
timbre_blocks = []       # one (300, n_features) array per kept track
track_ids = []
def func_to_get_timbre(filename):
    """
    Collect the first 300 timbre segments of one song file.

    Tracks with more than 300 segments contribute a copy of their first
    300 rows to 'timbre_blocks' and their track ID to 'track_ids';
    other tracks are ignored. The song file is always closed.
    INPUT
       filename  - path to a song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        a = GETTERS.get_segments_timbre(h5)
        track_id = GETTERS.get_track_id(h5)
    finally:
        h5.close()
    if a.shape[0] > N_TIMBRE_SEGMENTS:
        # appending to a list is O(1); the previous np.concatenate per
        # file re-copied the whole accumulated array every time
        timbre_blocks.append(np.array(a[:N_TIMBRE_SEGMENTS]))
        track_ids.append(track_id)

apply_to_all_files(msd_subset_data_path, func=func_to_get_timbre)
# stack once at the end: shape (n_tracks, 300, n_features)
l = np.array(timbre_blocks)
print(l.shape)
print(len(track_ids))

KeyboardInterrupt: 

In [45]:
# shape of the stacked feature array, then the number of kept track ids
for summary in (l.shape, len(track_ids)):
    print(summary)

(7457, 300, 12)
7457


In [50]:
# np.save writes binary data: hand it the filename so numpy opens the
# file in binary mode and closes it (the old text-mode handle was never
# closed and fails on Python 3)
np.save("mfcc.npy", l)

In [7]:
# .npy files are binary: open with "rb" and let the with-block close
# the file even if loading fails
with open("chroma.npy", "rb") as f:
    new_l = np.load(f)
print(new_l.shape)

(7457, 300, 12)


In [53]:
# one track id per line; "w" so that re-running the cell rewrites the
# file instead of appending a duplicate copy of every id, and the
# with-block closes the file even if a write fails
with open("track_ids_for_chroma.txt", "w") as f:
    for track in track_ids:
        f.write(track+"\n")

In [27]:
song_dict = {}   # song id  -> track id
track_dict = {}  # track id -> song id
def func_to_match_song_and_track(filename):
    """
    Record the song ID / track ID pair of one song file in both
    module-level lookup tables ('song_dict' and 'track_dict').
    The song file is closed even if reading fails.
    INPUT
       filename  - path to a song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        song_id = GETTERS.get_song_id(h5)
        track_id = GETTERS.get_track_id(h5)
    finally:
        h5.close()
    # the dictionaries are only mutated, so no 'global' statement is needed
    song_dict[song_id] = track_id
    track_dict[track_id] = song_id

In [28]:
# fill song_dict / track_dict from every song file; the returned file
# count is the cell's displayed value
apply_to_all_files(msd_subset_data_path, func_to_match_song_and_track)

10000

In [18]:
# two unnamed columns: 0 holds the song id, 1 the matching track id.
# list() materialises the items so the constructor receives a plain
# list of pairs on Python 3 as well (dict.items() is a view there).
df = pd.DataFrame(list(song_dict.items()))

In [19]:
# Show the two-column table built from song_dict
# (column 0: song id, column 1: track id).
df

Unnamed: 0,0,1
0,SOKHEQY12AC468B3F9,TRBGEHK12903CEEFC0
1,SONYSNO12AB0189762,TRADAFY12903CC7EAA
2,SOMWKPQ12A679D8AEA,TRBHGWP128E0793AD8
3,SOSKUNI12AB0187F12,TRAASQC128F93480F0
4,SOEDGSG12AB0184A57,TRADFBR128F9308456
5,SOEYVTH12A8C138E6E,TRAQKIM128F42618D0
6,SOXBEKP12A6D4F979E,TRAZOSO128F149739D
7,SOMRGMY12A6D4F93A8,TRAFZHK128F145E86B
8,SOBHMQL12A67ADE30A,TRALDWN128EF33FB6E
9,SONTKHG12A81C22D4C,TRBBANU128F422B87B


In [22]:
# persist the song id -> track id mapping as JSON
song_track_json = 'data/song-track-mapping.json'
with open(song_track_json, 'w') as out_file:
    json.dump(song_dict, out_file)

In [15]:
# JSON is text: read it in text mode ('r'), matching the 'w' mode used
# when the mapping was written
with open('data/song-track-mapping.json', 'r') as fp:
    new_song_dict = json.load(fp)


IOError: [Errno 2] No such file or directory: 'data/song-track-mapping.json'

In [29]:
# persist the reverse mapping (track id -> song id) as JSON
track_song_json = 'data/track-song-mapping.json'
with open(track_song_json, 'w') as out_file:
    json.dump(track_dict, out_file)

In [10]:
# lookup table filled from the genre annotation file
genre_dict = dict()
# every distinct genre label encountered while filling genre_dict
genre_set = set()

In [12]:
#Explore genre data:


with open('data/msd_tagtraum_cd2.cls', 'r') as fp:
    lines = fp.readlines()

# The first 7 lines are skipped (presumably a file header -- TODO confirm).
# NOTE(review): fields are split on any whitespace and only the second
# one is kept; the label 'New' that ends up in genre_set looks like a
# truncated multi-word genre -- confirm the file's delimiter before
# changing this, since the 'genres' index table relies on these labels.
for line in lines[7:]:
    split = line.strip().split()
    if len(split) < 2:
        # blank or incomplete line: nothing to record
        continue
    genre_set.add(split[1])
    genre_dict[split[0]] = split[1]

In [13]:
# spot-check one entry, then persist the whole mapping as JSON
print(genre_dict['TRAACPE128F421C1B9'])
track_genre_json = 'data/track-genre-mapping.json'
with open(track_genre_json, 'w') as out_file:
    json.dump(genre_dict, out_file)

RnB


In [16]:
# all distinct genre labels, then the number of entries in genre_dict
genre_labels_seen = list(genre_set)
print(genre_labels_seen)
print(len(genre_dict))

['Reggae', 'Rock', 'Pop_Rock', 'Metal', 'World', 'International', 'RnB', 'Jazz', 'Punk', 'Vocal', 'Latin', 'Pop', 'Rap', 'Country', 'New', 'Blues', 'Electronic', 'Folk']
286024


In [6]:

N_CHROMA_SEGMENTS = 300  # segments kept per track; shorter tracks are skipped
chroma_blocks = []       # one (300, n_features) array per kept track
track_ids = []
def func_to_get_chroma(filename):
    """
    Collect the first 300 pitch (chroma) segments of one song file.

    Tracks with more than 300 segments contribute a copy of their first
    300 rows to 'chroma_blocks' and their track ID to 'track_ids';
    other tracks are ignored. The song file is always closed.
    INPUT
       filename  - path to a song file
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        a = GETTERS.get_segments_pitches(h5)
        track_id = GETTERS.get_track_id(h5)
    finally:
        h5.close()
    if a.shape[0] > N_CHROMA_SEGMENTS:
        # appending to a list is O(1); the previous np.concatenate per
        # file re-copied the whole accumulated array every time
        chroma_blocks.append(np.array(a[:N_CHROMA_SEGMENTS]))
        track_ids.append(track_id)

apply_to_all_files(msd_subset_data_path, func=func_to_get_chroma)
# stack once at the end: shape (n_tracks, 300, n_features)
l = np.array(chroma_blocks)
print(l.shape)
print(len(track_ids))

NameError: name 'apply_to_all_files' is not defined

In [6]:
# pass the filename so numpy opens the file in binary mode and closes it
# (the old text-mode handle was never closed and fails on Python 3)
np.save("data/chroma.npy", l)

In [17]:
# reload the track-genre mapping saved as JSON
track_genre_json = 'data/track-genre-mapping.json'
with open(track_genre_json, 'r') as in_file:
    genre_dict = json.load(in_file)

In [18]:
# NOTE: unpickling can execute arbitrary code; only load this file when
# it comes from a trusted source
song_track_artist_pkl = 'data/song_track_artist.pkl'
with open(song_track_artist_pkl, "rb") as pkl_file:
    u = pickle.load(pkl_file)

In [19]:
# Pull the "track_id" entry out of the unpickled object `u`.
# NOTE(review): later cells call .map() and .values on it, so it is
# presumably a pandas Series -- confirm.
track_ids = u["track_id"]
# prints the whole object, not just a preview
print(track_ids)

18        TRMMMTK128F424EF7C
32        TRMMMQN128F4238509
33        TRMMMKQ128F92EBCB5
47        TRMMMFG128F425087B
53        TRMMMQA128F14A454A
61        TRMMWJS12903CBB7F5
84        TRMMWQU12903CF447F
94        TRMMWCX128F92EF4C8
115       TRMMGCB128E079651D
117       TRMMGPD128F42AA230
123       TRMMGTX128F92FB4D9
139       TRMMGRZ12903CE1C82
145       TRMMGDP128F933E59A
152       TRMMGSH128F4243869
169       TRMMHGT128F42B91AD
172       TRMMHBF12903CF6E59
175       TRMMHDI128F42442F4
191       TRMMHKG12903CDB1B5
211       TRMMHDT128F422F115
226       TRMMHWD128F14AE65F
239       TRMMCRE128F42481FB
240       TRMMCBU128E0791823
245       TRMMCDR128F423AB03
259       TRMMCCU128F92E1A71
266       TRMMCPH128F429F862
285       TRMMRZW128EF34DF59
287       TRMMRNI128EF34C4A7
290       TRMMRJJ128F9319DE9
293       TRMMRIO128F423DD33
296       TRMMRPV128F93110FE
                 ...        
999710    TRYYDQP128F42281C2
999728    TRYYDTF128F9344595
999729    TRYYDUX128F4278732
999730    TRYY

In [20]:
# genre label -> integer class index; the index of a label is its
# position in the list below
genres = dict(
    (label, index)
    for index, label in enumerate([
        'Reggae', 'Rock', 'Pop_Rock', 'Metal', 'World', 'International',
        'RnB', 'Jazz', 'Punk', 'Vocal', 'Latin', 'Pop',
        'Rap', 'Country', 'New', 'Blues', 'Electronic', 'Folk',
    ])
)

In [21]:
import time
N_SEGMENTS = 300         # segments kept per track; tracks need more than this
relevant_track_ids = []  # ids of the tracks that were kept
chroma_features = []     # per kept track: first N_SEGMENTS chroma rows
mfcc_features = []       # per kept track: first N_SEGMENTS timbre rows
genre_labels = []        # per kept track: one-hot genre vector
prev_time = time.time()

def foo(track_id):
    """
    Collect chroma, timbre and genre label for one track.

    The song file is looked up under 'msd_subset_data_path', in the
    nested directories named after characters 2, 3 and 4 of the track
    ID. A track is kept only if it has more than N_SEGMENTS chroma
    segments, more than N_SEGMENTS timbre segments and a non-empty
    entry in 'genre_dict'. Kept tracks append to 'chroma_features',
    'mfcc_features', 'genre_labels' and 'relevant_track_ids'.
    INPUT
       track_id  - track ID string, also the basename of the .h5 file
    RETURN
       nothing, the module-level lists are updated in place
    """
    # same location the hard-coded '/mnt/snap/data/' prefix pointed to,
    # but following the notebook's path configuration
    filename = os.path.join(msd_subset_data_path,
                            track_id[2], track_id[3], track_id[4],
                            track_id + '.h5')
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        chroma = GETTERS.get_segments_pitches(h5)
        mfcc = GETTERS.get_segments_timbre(h5)
    finally:
        # close the song file even if one of the getters raises
        h5.close()
    genre = genre_dict.get(track_id)

    if chroma.shape[0] > N_SEGMENTS and mfcc.shape[0] > N_SEGMENTS and genre:
        # one-hot encode the genre over all known genre classes
        onehot = np.zeros(len(genres))
        onehot[genres[genre]] = 1.0
        # np.array() copies the slice so the full-length arrays can be freed
        chroma_features.append(np.array(chroma[:N_SEGMENTS]))
        mfcc_features.append(np.array(mfcc[:N_SEGMENTS]))
        genre_labels.append(onehot)
        relevant_track_ids.append(track_id)

In [22]:
# run foo over every track id and report the wall-clock time in seconds;
# the values returned by .map() are not used
start_time = time.time()
track_ids.map(foo)
elapsed = time.time()-start_time
print(elapsed)

1484.14105201


In [24]:
# turn the three per-track collections into numpy arrays
chroma_features, mfcc_features, genre_labels = [
    np.array(collected)
    for collected in (chroma_features, mfcc_features, genre_labels)
]

In [25]:
# pass filenames so numpy opens each file in binary mode and closes it
# (the old text-mode handles were never closed and fail on Python 3)
np.save("data/chroma.npy", chroma_features)
np.save("data/mfcc.npy", mfcc_features)
np.save("data/genre_onehot.npy", genre_labels)
# one kept track id per line; the with-block closes the file even if a
# write fails
with open("track_ids.txt", "w") as f:
    for track in relevant_track_ids:
        f.write(track+"\n")

In [23]:
# number of kept tracks, number of distinct input track ids, raw ids
print(len(relevant_track_ids))
unique_track_ids = set(track_ids.values)
print(len(unique_track_ids))
print(track_ids.values)

59134
100600
[u'TRMMMTK128F424EF7C' u'TRMMMQN128F4238509' u'TRMMMKQ128F92EBCB5' ...
 u'TRYYYJZ128F424BFA1' u'TRYYYWR128F4267C95' u'TRYYYWX128F92E11D1']


In [26]:
# shapes of the timbre features and of the one-hot genre labels
for feature_array in (mfcc_features, genre_labels):
    print(feature_array.shape)

(59134, 300, 12)
(59134, 18)


In [96]:
# keep only the first 10000 kept track ids, one per line; the
# with-block closes the file even if a write fails
with open("track_ids.txt", "w") as f:
    for track in relevant_track_ids[:10000]:
        f.write(track+"\n")

In [11]:
# Hand-entered table of 100 track IDs, each mapped to an integer.
# A later cell uses the keys as the list of tracks whose segment start
# times are read.
# NOTE(review): the meaning of the integer values is not visible here;
# presumably row indices into another array -- confirm.
track_id_dict = {'TRMHHRK128F932A818': 149,
 'TRMNYZQ128F1459E10': 651,
 'TRMEBYQ128F932FEFF': 840,
 'TRMEQQX12903CCD9D5': 844,
 'TRWRXSJ128F9314B40': 1548,
 'TRWIKPM128F931F1D2': 1776,
 'TRWVTDX128F931ECEF': 2256,
 'TRGMZNT128F92DE267': 2513,
 'TRGWVYH12903D019BA': 2582,
 'TRGWXUG128F148469F': 2592,
 'TRGCHLH12903CB7352': 2695,
 'TRGXQES128F42BA5EB': 3600,
 'TRHHHKS128F92F98D5': 3879,
 'TRCWHIO128F1488FB7': 5093,
 'TRCRCBT128F4260DD1': 5287,
 'TRCPXID128F92D5D3C': 5717,
 'TRCUEPI128F4278E10': 5815,
 'TRCODNR128F92F6B16': 6167,
 'TRCOOYB128E078ED95': 6169,
 'TRRWJLU128F92F9912': 6399,
 'TRRHVIC128F92F9908': 6509,
 'TRRNFHH128F92D262D': 6894,
 'TRRUVLO128F92DE6F7': 7083,
 'TRRVGMP128F92FD5DE': 7284,
 'TRRKXNQ128F9339002': 7520,
 'TRBZGSM128E078EDB4': 8028,
 'TRBVNWT128F93173BA': 8581,
 'TRBOKUK12903CAE13F': 8701,
 'TRBKFKL128E078ED76': 8770,
 'TRFWGOJ128E0780C8B': 8905,
 'TRFHCVW128E078EB45': 9015,
 'TRFBNZN128F9340B0E': 9160,
 'TRFQKSM128F9338683': 9276,
 'TRFITXC128F1469028': 9361,
 'TRFNGJS128F92F9EEE': 9435,
 'TRFTUIW128E0784B9F': 9548,
 'TRFDCPI128F93234B7': 9849,
 'TRFOVTO128F4228CC3': 9929,
 'TRFXWSD128F93173BF': 9940,
 'TRQFXKD128E0780CAE': 10478,
 'TRQLRYB128E079506A': 10906,
 'TRQDKYR12903D010F9': 11188,
 'TRZJNOD128F4264131': 12194,
 'TRIMGRR12903CEFCEA': 12567,
 'TRIUBAR128F146BD74': 13326,
 'TRIXAZF128F421EE64': 13730,
 'TRAGACS128E078E74D': 13982,
 'TRARUTP128E0797FC7': 14130,
 'TRAFUNV128F92CFEB2': 14239,
 'TRAZDPO128E078ECE6': 14350,
 'TRAALAH128E078234A': 14450,
 'TRANKTK128E07921D9': 14515,
 'TRAEHHJ12903CF492F': 14726,
 'TRNWJFS128F93377B9': 15253,
 'TRNPKRK128F429831C': 15866,
 'TRNKWEN128F933577B': 16342,
 'TRPFYYL128F92F7144': 16814,
 'TRPYQRJ128F42B81C1': 17660,
 'TRTWFKE128EF35F93B': 17755,
 'TRTHGPK128F147DF90': 17850,
 'TRTFVVP12903D052D6': 18067,
 'TRTIRFG128F428D2F8': 18181,
 'TRTNDNE128F1486812': 18305,
 'TRTVTFT128F9327735': 18627,
 'TRTXHBC128F4259DB7': 18731,
 'TRTXJBO128F4294115': 18764,
 'TRUFTBY128F93450B8': 19223,
 'TRLVQME128F931BAF3': 21033,
 'TREWPDP128E07896EE': 21376,
 'TREBUCN128F92D2CAC': 21628,
 'TRENTGL128E0780C8E': 21949,
 'TRJHIKI128F42539D8': 22760,
 'TRJCWTZ128E0797FCA': 22788,
 'TRJRECT12903CBADA3': 22869,
 'TRJRDDL128F147CBE8': 22874,
 'TRJQXHD128F428E701': 23033,
 'TRJZLIF12903CAA7A4': 23070,
 'TRJPXVB128F9316916': 23276,
 'TRJXFFX128E078234B': 23725,
 'TRSUSWW128F93463BF': 24620,
 'TRSDWDN128F4274C03': 24869,
 'TRVUGOX128E0784629': 25818,
 'TRVYICQ128F4252493': 26307,
 'TRDFNLR12903CB0FE4': 26717,
 'TRDTWWZ12903CC36D8': 27037,
 'TRDKDVV128F42733CD': 27575,
 'TROGOSE128F9317118': 27776,
 'TRORLTF128F146DE1B': 27918,
 'TROAQBZ128F9326213': 28202,
 'TROTIUH128E0782538': 28353,
 'TRXWWWB128F9305CAA': 28972,
 'TRXWAZC128F9314B3E': 28991,
 'TRXNEWL12903CE9600': 29507,
 'TRKRHYM128F42934A9': 30373,
 'TRKRRUV128F92F20F1': 30374,
 'TRKXRDF12903CF3F3E': 31272,
 'TRKYAIX128E0785FFB': 31373,
 'TRYCKII128F9338134': 31644,
 'TRYRZFF128F933BBBB': 31662,
 'TRYNYSX128E07897B3': 32051}

In [14]:
top100trackIds = list(track_id_dict)  # the keys, as a real list
segments_start_dict = {}              # track id -> segment start times

def foo2(track_id):
    """
    Store the segment start times of one track in the module-level
    dictionary 'segments_start_dict', keyed by track ID.

    The song file is looked up under 'msd_subset_data_path', in the
    nested directories named after characters 2, 3 and 4 of the track
    ID, and is closed again once the start times are read.
    INPUT
       track_id  - track ID string, also the basename of the .h5 file
    """
    # same location the hard-coded '/mnt/snap/data/' prefix pointed to,
    # but following the notebook's path configuration
    filename = os.path.join(msd_subset_data_path,
                            track_id[2], track_id[3], track_id[4],
                            track_id + '.h5')
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        segments_start_dict[track_id] = GETTERS.get_segments_start(h5)
    finally:
        # the file was previously left open after every call
        h5.close()

In [19]:
# foo2 is called for its side effect only. An explicit loop is used
# because a bare map() is lazy on Python 3 and would never call foo2.
for track_id in top100trackIds:
    foo2(track_id)


In [20]:
# Display the collected mapping: track id -> value returned by
# GETTERS.get_segments_start for that track.
segments_start_dict

{'TRAALAH128E078234A': array([  0.     ,   0.93751,   1.35556, ..., 355.09306, 355.41293,
        356.38912]),
 'TRAEHHJ12903CF492F': array([  0.     ,   0.26971,   0.54227,   0.92009,   1.11193,   1.41937,
          1.98825,   2.20866,   2.35415,   2.7258 ,   2.94608,   3.39823,
          3.58435,   4.25211,   4.61805,   4.82617,   5.22735,   5.48181,
          6.09787,   6.72549,   7.33565,   7.76426,   7.97828,   8.37338,
          8.58304,   9.23329,   9.58717,   9.86594,  10.25388,  10.48009,
         10.97451,  11.39224,  11.51451,  11.72272,  12.12454,  12.33315,
         12.7566 ,  12.98299,  13.61542,  14.22571,  14.85175,  15.5127 ,
         16.05501,  16.31977,  16.50036,  16.77329,  17.37728,  17.69075,
         17.9981 ,  18.64413,  19.25769,  19.47669,  19.81506,  20.01102,
         20.1981 ,  20.51188,  21.13338,  21.75297,  22.37506,  22.7254 ,
         23.02599,  23.52454,  23.64617,  23.98971,  24.918  ,  25.26707,
         25.4059 ,  25.51569,  25.95102,  26.19066,  

In [22]:
# pickle data is binary: the file must be opened with 'wb'
# (text mode raises TypeError on Python 3)
with open('data/track-segments_start.pkl', 'wb') as fp:
    pickle.dump(segments_start_dict, fp)