In [None]:
import librosa
import librosa.display

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm

import numpy as np
import scipy

In [None]:
import os

In [None]:
def generate_mel_spectrogram(wav, sampling_rate, win_length, hop_length, fmax=None, n_mels=40):
    """Compute a log-scaled (dB) mel spectrogram for one file in ``emodb/wav/``.

    Parameters
    ----------
    wav : str
        File name, appended to ``'emodb/wav/'``.
    sampling_rate : int
        Rate the audio is resampled to on load; also used to build the mel
        filterbank.
    win_length, hop_length : int
        STFT window length and hop size, forwarded to ``librosa.core.stft``.
    fmax : float or None, optional
        Upper edge of the mel filterbank. When omitted, the notebook-level
        variable ``freq`` is used if it exists (this is what earlier versions
        of this function read implicitly); otherwise librosa's own default
        applies.
    n_mels : int, optional
        Number of mel bands (default 40, the value that used to be hard-coded).

    Returns
    -------
    np.ndarray
        Mel spectrogram in dB relative to its maximum.
    """
    if fmax is None:
        # Backward compatibility: the function used to depend on a global
        # `freq` set by the rendering loop. Prefer passing `fmax` explicitly.
        fmax = globals().get('freq')

    y, _ = librosa.load('emodb/wav/' + wav, sr=sampling_rate)

    # Pre-emphasis: first-order high-pass with coefficient 0.97.
    y = np.append(y[0], y[1:] - 0.97 * y[:-1])

    # Window is given by name, as in generate_spectrogram. The previous
    # `scipy.signal.hamming` callable needs `scipy.signal` to be imported and
    # is not available in recent SciPy releases.
    stft = librosa.core.stft(y, win_length=win_length, hop_length=hop_length,
                             window='hamming', center=True)
    power = np.abs(stft) ** 2

    # Apply the mel filterbank to the *power* spectrogram and convert to dB
    # afterwards. The earlier code converted to dB first and then filtered,
    # which averages log values instead of energies.
    mel_power = librosa.feature.melspectrogram(S=power, n_mels=n_mels, sr=sampling_rate,
                                               norm=1, fmax=fmax)

    # power_to_db replaces librosa.logamplitude, which was removed from librosa.
    return librosa.power_to_db(mel_power, ref=np.max)

In [None]:
# Create a non-mel (linear-frequency) log spectrogram; used by the rendering loop.
def generate_spectrogram(wav, sampling_rate, win_length, hop_length, use_scipy=False):
    """Compute a log-scaled spectrogram for one file in ``emodb/wav/``.

    Parameters
    ----------
    wav : str
        File name, appended to ``'emodb/wav/'``.
    sampling_rate : int
        Rate the audio is resampled to in the librosa branch. The scipy branch
        reads the file at its native rate and does not resample.
    win_length, hop_length : int
        Window length and hop size in samples.
    use_scipy : bool, optional
        If True, use ``scipy.signal.spectrogram``; otherwise use librosa.

    Returns
    -------
    np.ndarray
        * librosa branch: power spectrogram in dB relative to its maximum, in
          the orientation returned by ``librosa.core.stft``.
        * scipy branch: natural log of the spectrogram, **transposed**
          (``spec.T``), i.e. the two branches do not share an orientation.
    """
    if use_scipy:
        from scipy.io import wavfile
        from scipy import signal

        eps = 1e-10  # keeps log() finite for all-zero bins
        rate, data = wavfile.read('emodb/wav/' + wav)
        if data.ndim > 1:  # keep only the first channel
            data = data[:, 0]

        # `noverlap` is the number of samples shared by consecutive windows,
        # not the hop size; the two only coincide when hop == win / 2.
        noverlap = win_length - hop_length
        # The samples are at the file's own rate, so that is what `fs` must be.
        _, _, spec = signal.spectrogram(data, fs=rate, window='hamming',
                                        nperseg=win_length, noverlap=noverlap)
        return np.log(spec.T.astype(np.float32) + eps)

    y, _ = librosa.load('emodb/wav/' + wav, sr=sampling_rate)
    stft = librosa.core.stft(y, n_fft=512, hop_length=hop_length,
                             win_length=win_length, window='hamming')
    # power_to_db expects a real power spectrogram; the complex STFT was
    # previously passed straight in, so |S| was treated as if it were |S|**2.
    return librosa.power_to_db(np.abs(stft) ** 2, ref=np.max)

In [None]:
def make_librosa_image(spec, frame_start, frame_end, sampling_rate, hop_length,
               xAxis, yAxis, img_name, freq, sub_spec_idx, img_width, img_height,
               save_fig=False, dpi=None):
    """Render columns ``frame_start:frame_end`` of ``spec`` as an axis-free image.

    The slice is drawn with ``librosa.display.specshow`` (jet colormap, y axis
    in Hz), cropped to ``[0, freq]`` on the y axis and, if ``save_fig`` is
    true, written to
    ``emodb/specgrams/<img_name>_<freq>_<sub_spec_idx>.png`` and then resized
    to exactly ``img_width`` x ``img_height`` pixels.

    Parameters
    ----------
    spec : np.ndarray
        2-D spectrogram; columns are selected with ``frame_start:frame_end``.
    sampling_rate, hop_length : int
        Forwarded to ``specshow`` so it can compute axis coordinates.
    xAxis : str or None
        Forwarded as ``x_axis``.
    yAxis : str or None
        Accepted for signature compatibility but not used: the y axis is
        always ``'hz'`` because the ``[0, freq]`` crop below is expressed in Hz.
    freq : float
        Upper y-axis limit in Hz; also part of the output file name.
    dpi : float or None, optional
        Resolution used both to size the figure and to save it. Defaults to
        the figure's own dpi so that the two always agree.

    The figure is left open; the caller is responsible for closing it.
    """
    fig = plt.figure()
    if dpi is None:
        dpi = fig.get_dpi()

    mesh = librosa.display.specshow(spec[:, frame_start:frame_end], cmap=cm.jet,
                                    sr=sampling_rate, hop_length=hop_length,
                                    x_axis=xAxis, y_axis='hz')
    # specshow returns the drawn artist in current librosa (older releases
    # returned the Axes); `.axes` resolves to the Axes in both cases.
    ax = mesh.axes
    ax.set_ylim(0, freq)
    plt.tight_layout()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    fig.set_size_inches(img_width / float(dpi), img_height / float(dpi))

    if save_fig:
        from PIL import Image

        fname = 'emodb/specgrams/' + img_name + '_' + str(freq) + '_' + str(sub_spec_idx) + '.png'
        fig.savefig(fname, bbox_inches='tight', pad_inches=-0.01, dpi=dpi, transparent=True)

        # The tight bounding box changes the pixel size, so force the final
        # dimensions. Close the source file before overwriting it.
        with Image.open(fname) as rendered:
            resized = rendered.resize((img_width, img_height))
        resized.save(fname)

In [None]:
# Alternative image writer based on plt.imshow / plt.imsave (currently not used).
def make_image(spec, frame_start, frame_end, sampling_rate, hop_length,
               xAxis, yAxis, img_name, freq, sub_spec_idx, img_width, img_height,
               save_fig=False):
    """Draw columns ``frame_start:frame_end`` of ``spec`` with ``plt.imshow``.

    The slice is rotated by 90 degrees (``np.rot90``) and drawn on the current
    pyplot figure with the jet colormap and the axes switched off. If
    ``save_fig`` is true, the rotated array is also written with
    ``plt.imsave`` to ``emodb/specgrams/<img_name>_<freq>_<sub_spec_idx>.png``
    and the file is then resized to ``img_width`` x ``img_height`` pixels.

    ``sampling_rate``, ``hop_length``, ``xAxis`` and ``yAxis`` are not used by
    this function; they are kept so the signature matches
    ``make_librosa_image``. The save step reads the notebook-level ``dpi``.
    """
    spec_rot = np.rot90(spec[:, frame_start:frame_end])

    plt.imshow(spec_rot, cmap=cm.jet, extent=[0, img_width, 0, img_height],
               interpolation='nearest', aspect='equal')
    plt.axis('off')

    if save_fig:
        from PIL import Image

        fname = 'emodb/specgrams/' + img_name + '_' + str(freq) + '_' + str(sub_spec_idx) + '.png'
        plt.imsave(arr=spec_rot, fname=fname, dpi=dpi, cmap=cm.jet)

        # Close the source file before overwriting it with the resized copy.
        with Image.open(fname) as rendered:
            resized = rendered.resize((img_width, img_height))
        resized.save(fname)
    

In [None]:
# Configuration shared by the cells below.
sampling_rate = 16000  # Hz; audio is resampled to this rate on load
dpi = 72  # output resolution (dots per inch) for saved images
# Figure dpi used to convert the pixel size below into inches.
# NOTE(review): plt.gcf() creates a new figure when none is open, and that
# figure is not closed here.
DPI = plt.gcf().get_dpi()
print(DPI)

# Final size of every saved image, in pixels.
img_width = 256
img_height = 256

# STFT window and hop, in milliseconds (converted to samples just below).
win = 20
hop = 10

win_length = int(np.round((sampling_rate / 1e3) * win))
hop_length = int(np.round((sampling_rate / 1e3) * hop))

print(win_length, hop_length)

# Length of one image segment: in seconds, and as a number of STFT frames.
min_sec = 1.5
min_frames = int(min_sec*sampling_rate/hop_length)
print('min_sec: {}'.format(min_sec))
print('min_frames: {}'.format(min_frames))

# Maps 1..5 to 2..6. The rendering loop matches a clip's length against the
# values (multiples of min_frames) and uses the looked-up value as the number
# of segments to cut.
mult_dict = dict(zip(np.subtract(range(2,7,1),1), range(2,7,1)))
print(mult_dict)

# Upper frequency limits (Hz); one full set of images is rendered per value.
frequencies = [7000, 7500, 8000]

enable_splitting=True  # cut long clips into min_frames-long segments
save_img = True  # write the PNG files to disk

plot_axis = 'off'  # not referenced by any cell visible in this notebook section
xAxis=None  # e.g. 'time'
yAxis=None  # e.g. 'hz'

In [None]:
# Render one image per clip (or one per fixed-length segment of a clip) for
# every frequency limit, and record the duration in seconds of each rendered
# segment in `tracker`.
tracker = []

# List the input files once, in a deterministic order.
wav_files = sorted(w for w in os.listdir('emodb/wav/') if w.endswith('.wav'))
segment_multiples = [float(count) for count in mult_dict.values()]

# Saving fails on the very first image if the output directory is missing.
if save_img and not os.path.isdir('emodb/specgrams/'):
    os.makedirs('emodb/specgrams/')

for freq in frequencies:
    print('Frequency Max: {}'.format(freq))
    for wav in wav_files:
        # NOTE: the spectrogram does not depend on `freq`, so it is recomputed
        # for every frequency; kept this way to avoid holding all
        # spectrograms in memory at once.
        spec = generate_spectrogram(wav, sampling_rate, win_length, hop_length)
        n_frames = spec.shape[1]

        img_name = os.path.splitext(wav)[0]

        # Apply a 'fuzzy' threshold to the cut-off interval: is the clip
        # roughly 2, 3, ... 6 times as long as one segment?
        multiples_of = np.isclose(n_frames / float(min_frames), segment_multiples, rtol=0.35)
        if enable_splitting and np.any(multiples_of):
            # First matching multiple wins; mult_dict turns it into a segment count.
            idx = np.argmax(multiples_of) + 1

            for sub_spec_idx in range(mult_dict[idx]):
                frame_start = sub_spec_idx * min_frames
                frame_end = frame_start + min_frames
                # Let the segment run to the end of the clip when the clip
                # ends close to (or before) the nominal segment boundary.
                if np.isclose(n_frames, frame_end, rtol=0.1) or frame_end > n_frames:
                    frame_end = n_frames

                tracker.append((frame_end - frame_start) / sampling_rate * hop_length)
                make_librosa_image(spec, frame_start, frame_end, sampling_rate, hop_length, xAxis, yAxis,
                                   img_name, freq, sub_spec_idx, img_width, img_height, save_img)
                plt.close()

        else:
            # Clip is rendered whole as a single image.
            tracker.append(n_frames / sampling_rate * hop_length)
            make_librosa_image(spec, 0, n_frames, sampling_rate, hop_length, xAxis, yAxis,
                               img_name, freq, 0, img_width, img_height, save_img)
            plt.close()

In [None]:
# Longest rendered segment, in seconds.
np.max(tracker)

In [None]:
# Mean duration of the rendered segments, in seconds.
np.mean(tracker)

In [None]:
# Standard deviation of the rendered segment durations, in seconds.
np.std(tracker)

In [None]:
# Shortest rendered segment, in seconds.
np.min(tracker)

In [None]:
# Quick-look histogram of segment durations (seconds) with default binning.
plt.hist(tracker)

In [None]:
# Display the current pyplot figure(s).
plt.show()

In [None]:
# Coarse (8-bin) histogram of rendered segment durations. Every segment is
# counted once per frequency limit, because one image is rendered for each.
fig = plt.figure(figsize=(7,7))
plt.hist(tracker, bins=8, rwidth=0.9)
plt.xlabel('Segment duration (s)')
plt.ylabel('Number of rendered images')
plt.title('Durations of rendered segments (8 bins)')
plt.grid()
plt.show()
len(tracker)

In [None]:
# Number of recorded segments: one entry per rendered image, across all frequency limits.
print(len(tracker))

In [None]:
# Histogram of rendered segment durations in 0.2 s bins over 0-3 s.
# Durations above the last bin edge are not shown.
fig = plt.figure(figsize=(7,7))

bins = np.arange(0,3.1,0.2)
plt.hist(tracker, bins=bins, rwidth=0.90)
plt.xticks(bins)
plt.yticks(np.arange(0,900, 50))
plt.xlabel('Segment duration (s)')
plt.ylabel('Number of rendered images')
plt.title('Durations of rendered segments (0.2 s bins)')
plt.grid()
plt.show()

In [None]:
# Distribution of whole-clip lengths (before any splitting), counted once per
# frequency limit so the totals line up with the number of rendered image sets.
# DON'T DELETE THIS!
#######################

# Load every clip once; its length does not depend on the frequency limit.
clip_durations = []
for wav in [w for w in os.listdir('emodb/wav/') if w.endswith('.wav')]:
    y, sr = librosa.load('emodb/wav/' + wav, sr=sampling_rate)
    clip_durations.append(y.shape[0]/sampling_rate)

# Same contents and order as appending each duration inside a loop over `frequencies`.
baz = clip_durations * len(frequencies)

# Quarter-second bins from 0.25 s up to just past the longest clip.
bin_edges = np.arange(0.25, np.max(baz)+.1, 0.25)

fig = plt.figure(figsize=(20,6))
plt.hist(baz, bins=bin_edges, rwidth=0.90)
plt.xticks(bin_edges)
plt.xlabel('Clip duration (s)')
plt.ylabel('Number of clips (x number of frequency limits)')
plt.title('Durations of the source audio clips')
# plt.xlim([0.5,5])  # uncomment to leave outliers (5 s and longer) out
plt.grid()
plt.show()