In [1]:
import os
import glob

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.style as ms
ms.use('seaborn-muted')
# %matplotlib inline

import librosa
import librosa.display as ld
import skimage.io

import soundfile as sf

In [2]:
# jupyter kernel interrupt sends SIGINT so need to implement this manually for some reason
import signal, sys
terminate = False

def signal_handling(signum, frame):
    global terminate
    terminate = True
    

In [None]:
# constant q transform

# num=1 restricts number of figures to 1, and clear=True will clear existing figure(prevent constant reallocation)
fig, ax = plt.subplots(num=1, clear=True)

# sr=song's sr(mostly 48k) vs default 22.05k also for higher image fidelity
C = np.abs(librosa.cqt(y=data, sr=sr, hop_length=128, res_type='fft'))
img = ld.specshow(librosa.amplitude_to_db(C, ref=np.max), sr=sr, ax=ax)

plt.axis('off')
extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig('./spectrograms/cqt_hop=128_sr=sr_%s'%song_id, transparent=False, bbox_inches=extent, pad_inches=0)


In [3]:
'''
example code for spec image
https://stackoverflow.com/questions/56719138/how-can-i-save-a-librosa-spectrogram-plot-as-a-specific-sized-image/57204349

right now, if we don't specify the window on the data, we get extra long images

'''

def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def spectrogram_image(y, sr, out, hop_length, n_mels):
    # use log-melspectrogram
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels,
                                            n_fft=hop_length*2, hop_length=hop_length)
    mels = np.log(mels + 1e-9) # add small number to avoid log(0)
    
    # min-max scale to fit inside 8-bit range
    img = scale_minmax(mels, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255-img # invert to grayscale

    # save as PNG
    skimage.io.imsave(out, img)
    

In [6]:
# signal needs to be in this cell for the SIGINT to flow through (SIX YEAR OLD BUG??)
signal.signal(signal.SIGINT, signal_handling)

'''
generate images from audio files
'''

songs = {}
metadata = []
meta_df = pd.read_csv('./metadata.csv')

curr_count = 237
cont = curr_count

bad_duration_songs = []
zero_len_arr_songs = []

# first we just gather all the files that we have, set their id in the same place in the songs list
fnames = list(glob.glob("../youtubescraper/spotify_yt_data/*.flac"))

for fname in fnames[cont:]:
    try:
        if terminate:
            terminate=False
            print("current index: ", cont)
            print("metadata length: ", len(metadata))
            print("num of songs removed for duration: ", len(bad_duration_songs))
            print("zero length np arrays returned: ", len(zero_len_arr_songs))
            break
        if cont == 2000:
            print("current index: ", cont)
            print("metadata length: ", len(metadata))
            print("num of songs removed for duration: ", len(bad_duration_songs))
            print("zero length np arrays returned: ", len(zero_len_arr_songs))
            break
        song_id = fname.split('/')[-1].split('.')[0]

        # setting duration culling here so we only generate images within acceptable duration
        # max = 8 min, min = 2 min
        duration = meta_df['duration'].loc[meta_df['id'] == song_id].iloc[0]
        if duration > 480 or duration < 120:
            bad_duration_songs.append(song_id)
            continue

        # standardize the sampling rate
        sr = 22050
        # offset from beginning by number of seconds
        data, sr = librosa.load(fname, offset=3.0, sr=sr)
        channel = data.ndim

        # some files generate zero length array, so culling those here as well
        if len(data) == 0 or data is None:
            zero_len_arr_songs.append(song_id)
            continue

        # if stereo, we average the two channels of the signal to 1-dim
        if channel == 2:
            data = (data[:,0] + data[:,1]) / 2

        # set spectrogram hyperparameters

        hop_length = 512 # window size
        n_mels = 128 # height (frequency axis)
        time_steps = 255 # width of image

        start_sample = 0
        length_samples = time_steps * hop_length
        window = data[start_sample:start_sample+length_samples]

        savefile = './spectrograms/%s.png'%song_id

        spectrogram_image(window, sr=sr, out=savefile, hop_length=hop_length, n_mels=n_mels)

        # store metadata for updating the csv later
        meta = {
            'id': song_id,
            'sr': sr,
            'channel': channel
        }
        metadata.append(meta)

        cont += 1
    except:
        print('fail on: ', fname)
        continue




ValueError: Input signal length=0 is too small to resample from 48000->22050

{'id': 'l_7TWVX4EOk', 'sr': 22050, 'channel': 1}

In [10]:
'''
update each song info with sr and channel obtained from audio file

10/29 note: metadata needs to include songs [0, 10] (accidentally reset the metadata list)
'''

# convert new meta to df, and load old metadata
new_meta = pd.DataFrame(metadata)
meta_df = pd.read_csv('./metadata.csv')

# align on song id, and then update with sampling rate and number of channels
new_meta.set_index('id', inplace=True)
meta_df.update(new_meta)

# setting column order just in case, then overwriting entire metadata file
column_order = ['id', 'artist', 'title', 'duration', 'sr', 'channel']
meta_df.to_csv('./metadata.csv', columns=column_order, encoding='utf-8', index=False)

KeyError: "['sr', 'channel'] not in index"