In [3]:
import os
import glob

import numpy as np
import pandas as pd

import soundfile as sf

In [4]:
# Interrupting the Jupyter kernel delivers SIGINT to this process, so we install our own handler that just sets a flag the processing loop can poll
import signal, sys
# Module-level stop flag: flipped to True by the SIGINT handler and polled by
# the processing loop so that it can stop cleanly between files.
terminate = False


def signal_handling(signum, frame):
    """Signal handler that records a stop request instead of raising.

    Takes the standard ``(signum, frame)`` handler arguments; neither is used.
    The only effect is to set the module-level ``terminate`` flag to True.
    """
    global terminate
    terminate = True
    

In [None]:
# Clear any stop request left over from a previous run; otherwise a stale flag
# would abort this run on its very first file.
terminate = False

# The handler has to be registered in this cell for the SIGINT to reach it
# (presumably the kernel re-installs its own handler around each cell
# execution -- TODO confirm).
signal.signal(signal.SIGINT, signal_handling)

# This cell takes the flac files, converts them to numpy arrays, and collects
# per-file audio metadata (sampling rate and number of channels) in `metadata`
# so that it can later be joined with metadata.csv on song id. Each signal is
# saved as a csv file named by song id.
#
# The loop polls the `terminate` flag, so interrupt the kernel (ctrl+c) to stop
# the script; it then prints the current index into the flac file list.
#
# NOTE: the file list is sorted explicitly, because glob.glob() itself returns
# paths in arbitrary order. The resume index is still only meaningful while the
# set of files is unchanged: if new files are added to the directory the
# current index will be out of date, so do not run this until ALL songs have
# been downloaded. Of course this isn't relevant if you can run the script all
# the way through the thousands of songs, but that will take a while.

FLAC_GLOB = "../youtubescraper/spotify_yt_data/*.flac"
SIGNAL_DIR = "./signal_data"
START_INDEX = 5    # index into the sorted file list to resume from
STOP_INDEX = 500   # stop before processing the file at this index

songs = []
metadata = []

curr_count = START_INDEX
# gather all the files we have, in a reproducible order
fnames = sorted(glob.glob(FLAC_GLOB))

# make sure the output directory exists before the first write
os.makedirs(SIGNAL_DIR, exist_ok=True)

for fname in fnames[curr_count:]:
    if terminate:
        terminate = False
        print("current index: ", curr_count)
        print("metadata length: ", len(metadata))
        break
    # `>=` rather than `==` so that a START_INDEX beyond STOP_INDEX still stops
    if curr_count >= STOP_INDEX:
        break

    # file name without directory or extension is the song id
    song_id = os.path.splitext(os.path.basename(fname))[0]

    data, sr = sf.read(fname)

    # soundfile returns a 1-D array for mono files and a (frames, channels)
    # array otherwise, so the channel count is the second dimension and not
    # the number of dimensions.
    if data.ndim == 1:
        channel = 1
    else:
        channel = data.shape[1]
        # average all channels of the signal down to 1-dim
        data = data.mean(axis=1)

    # store metadata for updating the csv later
    metadata.append({
        'id': song_id,
        'sr': sr,
        'channel': channel
    })

    with open(os.path.join(SIGNAL_DIR, '%s.csv' % song_id), 'w') as f:
        np.savetxt(f, data)

    curr_count += 1

In [None]:
# This cell takes whatever is currently in the `metadata` list and updates
# metadata.csv with it (sampling rate and number of channels per song id).

column_order = ['id', 'artist', 'title', 'duration', 'sr', 'channel']

# Load the old metadata; ids are read as strings so they compare equal to the
# ids taken from the file names.
meta_df = pd.read_csv('./metadata.csv', dtype={'id': str})

# DataFrame.update only touches columns that already exist, so make sure the
# audio columns are present before updating / writing.
for col in ('sr', 'channel'):
    if col not in meta_df.columns:
        meta_df[col] = np.nan

# convert new meta to df (an empty list gives an empty frame with no 'id' column)
new_meta = pd.DataFrame(metadata)

if not new_meta.empty:
    # one row per song id, indexed by id
    new_meta = new_meta.drop_duplicates('id', keep='last').set_index('id')

    # DataFrame.update aligns on the index, so BOTH frames must be indexed by
    # song id; with the default RangeIndex on meta_df nothing would match.
    meta_df = meta_df.set_index('id')

    # only ids already present in metadata.csv can be updated
    known_ids = new_meta.index.intersection(meta_df.index)
    if len(known_ids) < len(new_meta):
        print("ids not found in metadata.csv: ", len(new_meta) - len(known_ids))
    if len(known_ids) > 0:
        meta_df.update(new_meta.loc[known_ids])

    # back to a plain 'id' column for writing
    meta_df = meta_df.reset_index()

# setting column order just in case, then overwriting entire metadata file
meta_df.to_csv('./metadata.csv', columns=column_order, encoding='utf-8', index=False)