##### <b> 1. 개요  </b>
음성 관련 실험을 보다 편리하게 수행할 수 있도록, 다양한 음성 시각화·분석 기법을 전문적으로, 그리고 사용자가 원하는 방식대로 적용할 수 있게 돕는 프로그램을 제작하는 프로젝트

In [57]:
import os
import numpy as np
import matplotlib
from  matplotlib import pyplot as plt
%matplotlib qt
import scipy
import librosa
import librosa.display
import FVA.FVA as fva
from FVA.lpc import *
from FVA.detect import *
saveresult = False

In [3]:
def print_2dplot(x, y, xlim, ylim, xlabel, ylabel, position, xscale = 'linear', yscale = 'linear', color = 'b', lw = 1):
    """Draw a single line plot into one cell of the current figure's subplot grid.

    Args:
        x, y: Data forwarded to ``plot``.
        xlim, ylim: Axis limits, forwarded unchanged to the axes.
        xlabel, ylabel: Axis label texts.
        position: Indexable whose first three items are
            ``(nrows, ncols, index)`` of the target subplot.
        xscale, yscale: Axis scale names such as ``'linear'`` or ``'log'``.
        color: Line colour.
        lw: Line width.
    """
    # plt.subplot both selects the cell and returns its Axes, so everything
    # below can be addressed to that Axes directly.
    axes = plt.subplot(position[0], position[1], position[2])
    axes.plot(x, y, color=color, lw=lw)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)

#### <b> 2. Spectrum</b>
target_sounds의 첫 번째 파일을 시각화

In [4]:
# Build paths with os.path.join so the cell also runs outside Windows
# (hard-coded '\\' separators are only valid there).
SOUND_DIR = os.path.join('FVA', 'target_sounds')
FILE_LIST = os.listdir(SOUND_DIR)
# First directory entry; note that os.listdir returns entries in arbitrary order.
FILE_PATH = os.path.join(SOUND_DIR, FILE_LIST[0])

# Load the sound through the project's analyser.
SSA = fva.SingleSoundAnalyser(FILE_PATH = FILE_PATH, duration = 2)

# Two stacked panels: the result of get_source() on top, the result of
# get_spectrum() on log-log axes below. Both return values are unpacked
# straight into print_2dplot's leading positional parameters.
print_2dplot(*SSA.get_source(),[2,1,1],color='slateblue')
print_2dplot(*SSA.get_spectrum(xlim=[1,5000],ylim=[1e-7,None]),[2,1,2],xscale='log',yscale='log',color='mediumslateblue',lw=0.5)

mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()
if saveresult: plt.savefig(os.path.join('FVA', 'results', 'smalldrum_spectrum'))
plt.show()

#### <b>3. Spectrogram</b>
target_sounds의 첫 번째 파일을 시각화

In [5]:
# Run the analyser's get_fp_stft on the loaded sound. It returns three values;
# the plotting cell that follows uses spec_out as a 2-D array and fout_index
# as a per-row table of indices into spec_out's columns.
# NOTE(review): the positional arguments are opaque from this notebook —
# presumably STFT window/hop sizes, LPC order, number of frequency points and
# number of formants; confirm against SingleSoundAnalyser.get_fp_stft.
spec_out, fout_index, pout= SSA.get_fp_stft(True, 2560, 1280, 32, 8096, 3)
print(spec_out)

formants : [f1 : 2097.1467391304345/f2 : 3104.8666007905135/f3 : 4602.828557312253]
pitch : 484.6153846153846
formants : [f1 : 4224.252717391304/f2 : 6008.189229249011/f3 : 8072.653162055335]
pitch : 0.0
formants : [f1 : 2197.9187252964425/f2 : 4485.715167984189/f3 : 7726.760128458498]
pitch : 0.0
formants : [f1 : 2914.2168972332015/f2 : 4583.763586956521/f3 : 5566.971343873517]
pitch : 479.34782608695645
formants : [f1 : 1677.7173913043478/f2 : 3423.5239624505925/f3 : 4589.210721343873]
pitch : 0.0
formants : [f1 : 2072.6346343873515/f2 : 5970.059288537549/f3 : 7677.735918972332]
pitch : 0.0
formants : [f1 : 1552.4333003952568/f2 : 3257.3863636363635/f3 : 4777.13685770751]
pitch : 0.0
formants : [f1 : 1792.107213438735/f2 : 3153.8908102766795/f3 : 4382.219614624506]
pitch : 474.1935483870968
formants : [f1 : 1773.0422430830038/f2 : 3167.508646245059/f3 : 4561.975049407114]
pitch : 0.0
formants : [f1 : 2971.411808300395/f2 : 5874.734436758893/f3 : 7418.997035573122]
pitch : 0.0
formant

In [13]:
# Axis vectors for spec_out: columns are scaled by SSA.df0, rows by SSA.dt0.
x = np.arange(0,  spec_out.shape[1] , 1) * SSA.df0
y = np.arange(0,  spec_out.shape[0] , 1) * SSA.dt0
X, Y = np.meshgrid(x, y)

# Collect marker coordinates for detected formants only. A negative entry in
# fout_index is skipped; previously such slots stayed at (0, 0, 0) in
# zero-initialised arrays and were drawn as spurious markers at the origin.
# Ordering is unchanged: all rows for formant 0, then formant 1, and so on.
n_frames = spec_out.shape[0]
formant_freq = []
formant_time = []
formant_level = []
for j in range(SSA.max_num_formants):
    for i in range(n_frames):
        if fout_index[i][j] >= 0:
            formant_freq.append(fout_index[i][j] * SSA.df0)
            formant_time.append(SSA.dt0 * i)
            formant_level.append(spec_out[i][int(fout_index[i][j])])
x0 = np.array(formant_freq, dtype=float)
y0 = np.array(formant_time, dtype=float)
z0 = np.array(formant_level, dtype=float)

# 3-D view: spec_out as a translucent surface with the formant markers on top.
fig = plt.figure()
ax1 = fig.add_subplot(111,projection='3d')
ax1.view_init(elev=70,azim=310,roll=0)
ax1.set_xlabel('frequency[Hz]')
ax1.set_ylabel('time[sec]')
ax1.set_zlabel('level')
#ax1.plot_wireframe(X,Y,spec_out,alpha=0.7)
ax1.plot(x0,y0,z0,'o',color='r',ms=1,mew=5)
ax1.plot_surface(X,Y,spec_out,cmap=matplotlib.cm.coolwarm,lw=0,antialiased=False,alpha=0.4)
if saveresult: plt.savefig(os.path.join('FVA', 'results', 'smalldrum_3dformants'))

# 2-D view: time on the horizontal axis, frequency (log scale) on the vertical
# axis, hence the transposes. X and Y from the meshgrid above are reused.
fig = plt.figure()
ax2 = fig.add_subplot(111)
ax2.set_ylabel('frequency[Hz]')
ax2.set_xlabel('time[sec]')
ax2.pcolor(np.transpose(Y),np.transpose(X),np.transpose(spec_out))
ax2.set_yscale('log')
ax2.plot(y0, x0 , "o", color='r', ms=2.5, mew=2.5)
print(y0)
if saveresult: plt.savefig(os.path.join('FVA', 'results', 'smalldrum_spectrogram'))

mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()

plt.show()

[0.00000000e+00 2.26757370e-05 4.53514739e-05 6.80272109e-05
 9.07029478e-05 1.13378685e-04 1.36054422e-04 1.58730159e-04
 1.81405896e-04 2.04081633e-04 2.26757370e-04 2.49433107e-04
 2.72108844e-04 2.94784580e-04 3.17460317e-04 3.40136054e-04
 3.62811791e-04 3.85487528e-04 4.08163265e-04 4.30839002e-04
 4.53514739e-04 4.76190476e-04 4.98866213e-04 5.21541950e-04
 5.44217687e-04 5.66893424e-04 5.89569161e-04 6.12244898e-04
 6.34920635e-04 6.57596372e-04 6.80272109e-04 7.02947846e-04
 7.25623583e-04 7.48299320e-04 7.70975057e-04 7.93650794e-04
 8.16326531e-04 8.39002268e-04 8.61678005e-04 8.84353741e-04
 9.07029478e-04 9.29705215e-04 9.52380952e-04 9.75056689e-04
 9.97732426e-04 1.02040816e-03 1.04308390e-03 1.06575964e-03
 1.08843537e-03 1.11111111e-03 1.13378685e-03 1.15646259e-03
 1.17913832e-03 1.20181406e-03 1.22448980e-03 1.24716553e-03
 1.26984127e-03 1.29251701e-03 1.31519274e-03 1.33786848e-03
 1.36054422e-03 1.38321995e-03 1.40589569e-03 1.42857143e-03
 1.45124717e-03 1.473922

#### <b>4. LPC(Linear Predictive Coding)</b>
target_sounds의 첫 번째 파일을 시각화

In [None]:
def to_db(x):
    """Return ``10 * log10(x)``; works element-wise on NumPy arrays."""
    decibels = np.log10(x) * 10
    return decibels
# Overlay an LPC-derived spectrum and its detected peaks on the FFT spectrum
# of the loaded sound.

# First two values returned by get_spectrum(), with y converted to dB.
x,y = SSA.get_spectrum()[:2]
y = to_db(y)
plt.figure(figsize=(12,8))
#plt.subplot(2,1,1)
# Only every 10th point of the FFT curve is drawn.
plt.plot(x[::10],y[::10],c='lightskyblue',lw=1, alpha=1, label='FFT')
plt.xscale('log', base=10)

# Number of frequency points at which the LPC filter response is evaluated.
FreqPoints=8096
# Frequency step per point: half the sample rate divided by FreqPoints.
df0 = (SSA.sr / 2.) / FreqPoints
# Work on a copy so SSA.fdata itself is not modified.
Y = SSA.fdata.copy()
# Subtract 0.8 times the previous sample from each sample; the first sample
# is paired with itself.
Y -= np.hstack((Y[0],Y[:-1]))*0.8
# Apply a Hamming window over the whole signal.
windowed = np.hamming(Y.shape[0]) * Y
# lpc is pulled in by the star imports at the top of the notebook; presumably
# it returns the filter denominator `a` and a prediction error `e` — TODO confirm.
a, e = lpc(windowed,lpcOrder=80)
# Frequency response of the filter sqrt(e) / a at FreqPoints points.
w, h = scipy.signal.freqz(np.sqrt(e), a, FreqPoints)
lpcspec = np.abs(h)
# Clamp magnitudes below 1 so their dB value is never negative.
lpcspec[lpcspec < 1.] = 1.
#lpcspec *= (1./float(1 << ((8 * SSA.samplewidth)-1)))

loglpcspec = to_db(lpcspec)

# Vertical offset that makes the mean of the LPC curve match the mean of the
# FFT curve, so both can be compared on the same axes.
bias = y.mean() - loglpcspec.mean()

#plt.subplot(2,1,2)
plt.plot(np.linspace(0,SSA.sr/2.,len(loglpcspec)), loglpcspec+bias, label='LPC', color='green')

# formant_detect comes from the star imports; its two results are used below
# as x positions (f_result) and as indices into loglpcspec (i_result).
f_result, i_result = formant_detect(lpcspec,df0,1)
plt.plot(f_result, loglpcspec[i_result] + bias, "o", c='r', label='formants', mew=4, ms=4)
plt.legend()

mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()
if saveresult: plt.savefig('FVA\\results\\smalldrum_lpcspectrum')

plt.show()
print(f_result)

[168.86116600790513, 452.1121541501976, 1165.6867588932805, 2097.1467391304345, 2797.103507905138, 3314.581274703557, 4790.754693675889, 6392.212203557312, 7819.361413043478, 9053.137351778656, 10512.969367588932, 11531.583498023714, 12168.898221343872, 12727.22949604743, 13419.015563241106, 14061.777420948616, 15205.67564229249, 15685.023468379446, 16485.752223320156, 17082.213438735176, 17667.780385375492, 18449.444169960472, 19522.529644268772, 20345.046936758892, 21012.320899209484]


#### <b>5. MFCC </b>

<img src="gitar//MFCC3.png" width="50%" height="50%"></img>

_Mel 단위_

<img src="gitar//MFCC4.png" width="50%" height="50%"></img>

<img src="gitar//MFCC.png" width="50%" height="50%"></img>

_MFCC filter bank_

<img src="gitar//MFCC2.png" width="50%" height="50%"></img>

In [105]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.preprocessing` has been loaded.
import sklearn.preprocessing

path = 'FVA//target_sounds//2023711작은탐.wav'
sample_rate=44100
# Hop between analysis frames, shared by the MFCC computation and the display
# so the plotted time axis matches the data.
hop_length = 160

wav = librosa.load(path=path, sr=sample_rate)[0] # load the file, resampled to sample_rate

# The signal was loaded at `sample_rate`, so the same rate has to be passed
# here; the previous hard-coded sr=16000 did not match the loaded audio.
mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate,
                            n_mfcc=100, n_fft=400, hop_length=hop_length)

# Standardise along axis 1 (the frame axis of the mfcc array).
mfcc = sklearn.preprocessing.scale(mfcc, axis=1)

print(mfcc.shape)


def pad2d(a, i):
    """Return `a` with exactly `i` columns: truncated, or zero-padded on the right."""
    if a.shape[1] > i:
        return a[:, :i]
    return np.hstack((a, np.zeros((a.shape[0], i - a.shape[1]))))


padded_mfcc = pad2d(mfcc, 1000)

# hop_length is passed so the 'time' axis uses the same frame spacing as the
# MFCC computation instead of specshow's default.
librosa.display.specshow(padded_mfcc, sr=sample_rate, hop_length=hop_length, x_axis='time')
plt.yscale('log')
plt.show()
print(padded_mfcc.shape)

# One row per frame, one column per coefficient.
padded_mfcc_time = np.swapaxes(padded_mfcc,1,0)
plt.figure()
plt.plot(padded_mfcc_time[0])
plt.show()

# NOTE(review): df0 is not defined in this cell; it is whatever an earlier
# cell (the LPC section) left in the namespace — confirm that step size is
# really the one intended for MFCC frames.
mfcc_formants = [formant_detect(frame, df0, f_min=0) for frame in padded_mfcc_time]

# Flatten to scatter data: up to the first five detected values per frame,
# paired with the frame index.
mfcc_formants_list = []
x_list = []
for i, elem in enumerate(mfcc_formants):
    for elem2 in elem[0][:5]:
        mfcc_formants_list.append(elem2)
        x_list.append(i)

plt.figure()
plt.ylim([0,5000])
plt.plot(x_list,mfcc_formants_list,'o',c='r',ms=1)
plt.show()

(100, 3010)
(100, 1000)




In [106]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np


class PauseAnimation:
    """Animate successive rows of ``padded_mfcc_time``; a mouse click toggles pause/resume.

    Reads the module-level ``padded_mfcc_time`` array; each animation frame
    ``i`` replaces the line's y-data with row ``i``.
    """

    def __init__(self):
        self.fig, self.ax = plt.subplots()
        self.ax.set_title('Click to pause/resume the animation')

        self.ax.set_ylim(-6,6)
        # Start from the first row. The original indexed with `i`, which is
        # not defined here and only worked through a global left over from an
        # earlier loop.
        self.p, = self.ax.plot(padded_mfcc_time[0])

        self.animation = animation.FuncAnimation(
            self.fig, self.update, frames=200, interval=50, blit=True)
        # The animation starts out running, so the flag must start as False;
        # with True the first click called resume() and did nothing visible.
        self.paused = False

        self.fig.canvas.mpl_connect('button_press_event', self.toggle_pause)

    def toggle_pause(self, *args, **kwargs):
        """Flip between paused and running; accepts and ignores the event arguments."""
        if self.paused:
            self.animation.resume()
        else:
            self.animation.pause()
        self.paused = not self.paused

    def update(self, i):
        """Show row ``i`` and return the changed artists, as blitting requires."""
        self.p.set_ydata(padded_mfcc_time[i])
        return (self.p,)


# Keep the instance bound to a name: it holds the FuncAnimation object, which
# has to stay referenced for as long as the animation should run.
pa = PauseAnimation()
plt.show()

In [55]:


import matplotlib.pyplot as plt
import librosa.display
import librosa
import numpy as np

path = 'FVA//target_sounds//2023711작은탐.wav'
sample_rate=44100

nMEL=128
nMFCC = 128
# Frequency step assigned to each coefficient index when it is handed to
# formant_detect below.
df0 = (sample_rate//2)/nMFCC

# Mel spectrogram -> dB -> MFCC, then the second-order delta of the MFCCs.
audio = librosa.load(path=path,sr=sample_rate)[0]
S = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=nMEL)
log_S = librosa.power_to_db(S, ref=np.max)
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=nMFCC)

print(mfcc.shape)

delta2_mfcc = librosa.feature.delta(mfcc, order=2)


# One row per frame so each frame can be passed to formant_detect on its own.
delta2_mfcc2 = np.swapaxes(delta2_mfcc,1,0)
temp_formants = [formant_detect(frame, df0, f_min=0) for frame in delta2_mfcc2]

# Flatten to scatter data: up to the first five detected values per frame,
# paired with the frame index.
formants_list = []
x_list = []
for i, elem in enumerate(temp_formants):
    for elem2 in elem[0][:5]:
        formants_list.append(elem2)
        x_list.append(i)

print(formants_list)
print(x_list)

plt.figure()
plt.plot(x_list,formants_list,"o",c='r',mew=1,ms=1)
plt.show()

# Horizontal axis for one frame, spaced by df0. The previous
# np.linspace(0, sample_rate//2, nMFCC) used a step of (sample_rate//2)/(nMFCC-1),
# while the detected positions are multiples of df0 (see the printed values),
# so the markers drifted away from the curve towards the high end.
x = np.arange(nMFCC) * df0

plt.figure()
plt.plot(x,delta2_mfcc2[0])
plt.plot(temp_formants[0][0],delta2_mfcc2[0][temp_formants[0][1]],"o", c='r', label='formants', mew=4, ms=4)
plt.show()
print(len(delta2_mfcc2[0]))


plt.figure()
librosa.display.specshow(delta2_mfcc)
plt.ylabel('MFCC coeffs')
plt.xlabel('Time')
plt.title('MFCC')
plt.colorbar()
plt.tight_layout()

(128, 941)
[172.265625, 2583.984375, 4306.640625, 7235.15625, 8613.28125, 172.265625, 2583.984375, 4306.640625, 7235.15625, 8613.28125, 172.265625, 2583.984375, 4306.640625, 7235.15625, 8613.28125, 172.265625, 2583.984375, 4306.640625, 7235.15625, 8613.28125, 172.265625, 2583.984375, 4306.640625, 7235.15625, 8613.28125, 172.265625, 2756.25, 4306.640625, 7062.890625, 8785.546875, 172.265625, 2928.515625, 5167.96875, 6546.09375, 9130.078125, 172.265625, 5167.96875, 6718.359375, 9646.875, 11714.0625, 172.265625, 5167.96875, 7062.890625, 9819.140625, 11714.0625, 172.265625, 1205.859375, 5340.234375, 7062.890625, 9819.140625, 172.265625, 1205.859375, 5512.5, 7235.15625, 9819.140625, 172.265625, 1033.59375, 4134.375, 5512.5, 7407.421875, 172.265625, 1205.859375, 2411.71875, 4134.375, 6546.09375, 172.265625, 1205.859375, 2411.71875, 4306.640625, 6373.828125, 172.265625, 2583.984375, 4134.375, 6201.5625, 7579.6875, 689.0625, 2067.1875, 3617.578125, 6201.5625, 7924.21875, 689.0625, 1894.921875,

In [107]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np


class PauseAnimation:
    """Animate the rows of ``delta2_mfcc2`` with their detected peaks; click to pause/resume.

    Reads the module-level ``x``, ``delta2_mfcc2`` and ``temp_formants``. For
    frame ``i`` the curve shows ``delta2_mfcc2[i]`` and the markers are placed
    at ``temp_formants[i][0]`` (x) using ``temp_formants[i][1]`` as indices
    into the row (y).
    """

    def __init__(self):
        self.fig, self.ax = plt.subplots()
        self.ax.set_title('Click to pause/resume the animation')

        self.ax.set_ylim(-5,5)
        self.p, = self.ax.plot(x,delta2_mfcc2[0])
        self.f, = self.ax.plot(temp_formants[0][0],delta2_mfcc2[0][temp_formants[0][1]],"o")

        self.animation = animation.FuncAnimation(
            self.fig, self.update, frames=200, interval=50, blit=True)
        # The animation starts out running, so the flag must start as False;
        # with True the first click called resume() and did nothing visible.
        self.paused = False

        self.fig.canvas.mpl_connect('button_press_event', self.toggle_pause)

    def toggle_pause(self, *args, **kwargs):
        """Flip between paused and running; accepts and ignores the event arguments."""
        if self.paused:
            self.animation.resume()
        else:
            self.animation.pause()
        self.paused = not self.paused

    def update(self, i):
        """Show frame ``i`` and return the changed artists, as blitting requires."""
        self.p.set_ydata(delta2_mfcc2[i])
        self.f.set_ydata(delta2_mfcc2[i][temp_formants[i][1]])
        self.f.set_xdata(temp_formants[i][0])
        return (self.p,self.f)


# Keep the instance bound to a name: it holds the FuncAnimation object, which
# has to stay referenced for as long as the animation should run.
pa = PauseAnimation()
plt.show()

# 여러 파일 자동 시각화
target_sounds 내의 .wav 파일을 전부 시각화하여 그 결과를 results에 이미지 형태로 저장

#### 1. '.wav' 파일 긁어오기

In [None]:
# List the sound directory and keep only entries whose extension is '.wav'.
# The path is built with os.path.join so it is valid outside Windows too.
FILE_LIST = os.listdir(os.path.join('FVA', 'target_sounds'))
WAV_LIST = [name for name in FILE_LIST if os.path.splitext(name)[1] == '.wav']
print(WAV_LIST)

#### 2. SingleSoundAnalyser로 '.wav' 파일 불러오기

In [None]:
# Passed positionally as the analyser's second and third arguments; None is
# forwarded as-is for both.
DURATION = None
SAMPLE_RATE = None
# One analyser per .wav file. Paths are built with os.path.join rather than
# string concatenation with '\\', which only works on Windows.
SSA_LIST = [
    fva.SingleSoundAnalyser(os.path.join('FVA', 'target_sounds', name), DURATION, SAMPLE_RATE)
    for name in WAV_LIST
]

In [None]:
# Plot the get_source() result of the first loaded analyser in the left cell
# of a 1x2 subplot grid.
print_2dplot(*SSA_LIST[0].get_source(),[1,2,1])