#### <b>1. 개요</b>
음성 관련 실험을 보다 편리하게 수행할 수 있도록, 여러 가지 음성 시각화·분석 기법을 보다 전문적으로, 그리고 사용자가 원하는 방식대로 적용할 수 있게 돕는 프로그램을 제작하는 프로젝트입니다.

In [12]:
import os
import numpy as np
import matplotlib
from  matplotlib import pyplot as plt
%matplotlib qt
import scipy
import librosa
import librosa.display
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import FVA.FVA as fva
from FVA.lpc import *
from FVA.detect import *
from FVA.mfcc import *
from FVA.animation import *
# When True, the plotting cells also write their figures to disk with
# plt.savefig (each cell checks this flag right before saving).
saveresult = False

In [3]:
def print_2dplot(x, y, xlim, ylim, xlabel, ylabel, position,
                 xscale='linear', yscale='linear', color='b', lw=1):
    """Draw a single line plot into one cell of the current figure's grid.

    Parameters
    ----------
    x, y :
        Data forwarded to the line plot.
    xlim, ylim :
        Axis limits, handed to the axes unchanged.
    xlabel, ylabel :
        Axis label texts.
    position :
        Indexable whose first three items are the (rows, columns, index)
        arguments of ``plt.subplot``.
    xscale, yscale :
        Axis scale names; both default to ``'linear'``.
    color, lw :
        Line colour and line width.
    """
    # plt.subplot selects (or creates) the cell and returns its axes, so all
    # further calls can go through that axes object.
    axes = plt.subplot(position[0], position[1], position[2])
    axes.plot(x, y, color=color, lw=lw)

    # Limits and labels first, scales last.
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)

#### <b> 2. Spectrum</b>
target_sounds의 첫 번째 파일을 시각화

In [4]:
# Directory holding the input recordings. os.path.join keeps the path valid on
# every OS; on Windows it produces the same backslash-separated string as before.
SOUND_DIR = os.path.join('FVA', 'target_sounds')

# os.listdir returns entries in arbitrary order, so sort them to make
# "the first file" the same file on every run.
FILE_LIST = sorted(os.listdir(SOUND_DIR))
FILE_PATH = os.path.join(SOUND_DIR, FILE_LIST[0])

# NOTE(review): duration = 2 is presumably in seconds — confirm against
# SingleSoundAnalyser.
SSA = fva.SingleSoundAnalyser(FILE_PATH = FILE_PATH, duration = 2)

# Top cell: output of get_source(); bottom cell: output of get_spectrum() on
# log-log axes.
print_2dplot(*SSA.get_source(),[2,1,1],color='slateblue')
print_2dplot(*SSA.get_spectrum(xlim=[1,5000],ylim=[1e-7,None]),[2,1,2],xscale='log',yscale='log',color='mediumslateblue',lw=0.5)

# The figure manager is fetched so the window can optionally be made full
# screen by uncommenting the next line.
mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()
if saveresult:
    plt.savefig(os.path.join('FVA', 'results', 'smalldrum_spectrum'))
plt.show()

#### <b>3. Spectrogram</b>
target_sounds의 첫 번째 파일을 시각화

In [9]:
# Run SSA.get_fp_stft and keep its three results; spec_out and fout_index are
# the inputs of the 3D / spectrogram plotting code.
# NOTE(review): the meaning of the positional arguments
# (True, 256, 128, 70, 8096, 3) is not visible in this file — confirm against
# the get_fp_stft signature before changing any of them.
spec_out, fout_index, pout= SSA.get_fp_stft(True, 256, 128, 70, 8096, 3)
print(spec_out)

formants : [f1 : 78.98344861660078/f2 : 0.0/f3 : 0.0]
pitch : 0.0
formants : [f1 : 144.34906126482213/f2 : 0.0/f3 : 0.0]
pitch : 0.0
formants : [f1 : 144.34906126482213/f2 : 0.0/f3 : 0.0]
pitch : 0.0
formants : [f1 : 5433.516551383399/f2 : 5831.157361660079/f3 : 6220.627470355731]
pitch : 0.0
formants : [f1 : 5675.914031620553/f2 : 6002.74209486166/f3 : 8546.55385375494]
pitch : 383.4782608695652
formants : [f1 : 40.853507905138336/f2 : 8249.685029644268/f3 : 8824.35770750988]
pitch : 474.1935483870968
formants : [f1 : 1073.0854743083003/f2 : 2364.056324110672/f3 : 4548.357213438735]
pitch : 0.0
formants : [f1 : 87.15415019762845/f2 : 1832.9607213438735/f3 : 2519.2996541501975]
pitch : 0.0
formants : [f1 : 2617.3480731225295/f2 : 3434.418231225296/f3 : 4058.115118577075]
pitch : 0.0
formants : [f1 : 1724.0180335968378/f2 : 3679.5392786561265/f3 : 4063.5622529644265]
pitch : 0.0
formants : [f1 : 1048.5733695652173/f2 : 2282.349308300395/f3 : 2734.4614624505925]
pitch : 0.0
formants : [f

In [10]:
# Axis grids for spec_out: its columns are mapped to frequency (step SSA.df0)
# and its rows to time (step SSA.dt0).
x = np.arange(0, spec_out.shape[1], 1) * SSA.df0
y = np.arange(0, spec_out.shape[0], 1) * SSA.dt0
X, Y = np.meshgrid(x, y)

# Collect one marker per detected formant. Entries of fout_index that are
# negative are skipped entirely; pre-filling fixed-size arrays with zeros
# (as before) drew a spurious marker at (0, 0, 0) for every skipped entry.
n_frames = spec_out.shape[0]
formant_freq, formant_time, formant_level = [], [], []
for j in range(SSA.max_num_formants):
    for i in range(n_frames):
        bin_index = fout_index[i][j]
        if bin_index >= 0:
            formant_freq.append(bin_index * SSA.df0)
            formant_time.append(SSA.dt0 * i)
            formant_level.append(spec_out[i][int(bin_index)])
x0 = np.array(formant_freq, dtype=float)
y0 = np.array(formant_time, dtype=float)
z0 = np.array(formant_level, dtype=float)

# Figure 1: 3D surface of spec_out with the formant markers on top.
fig = plt.figure()
ax1 = fig.add_subplot(111,projection='3d')
ax1.view_init(elev=70,azim=310,roll=0)
ax1.set_xlabel('frequency[Hz]')
ax1.set_ylabel('time[sec]')
ax1.set_zlabel('level')
#ax1.plot_wireframe(X,Y,spec_out,alpha=0.7)
ax1.plot(x0,y0,z0,'o',color='r',ms=1,mew=5)
ax1.plot_surface(X,Y,spec_out,cmap=matplotlib.cm.coolwarm,lw=0,antialiased=False,alpha=0.4)
if saveresult:
    plt.savefig(os.path.join('FVA', 'results', 'smalldrum_3dformants'))

# Figure 2: flat spectrogram. Everything is transposed so that time runs
# along the horizontal axis and frequency along the vertical one.
fig = plt.figure()
ax2 = fig.add_subplot(111)
ax2.set_ylabel('frequency[Hz]')
ax2.set_xlabel('time[sec]')
ax2.pcolor(np.transpose(Y),np.transpose(X),np.transpose(spec_out))
#ax2.set_yscale('log')
ax2.plot(y0, x0 , "o", color='r', ms=2, mew=2)
print(y0)
if saveresult:
    plt.savefig(os.path.join('FVA', 'results', 'smalldrum_spectrogram'))

# The figure manager is fetched so the window can optionally be made full
# screen by uncommenting the next line.
mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()

plt.show()

[0.00000000e+00 2.26757370e-05 4.53514739e-05 ... 1.55328798e-02
 1.55555556e-02 1.55782313e-02]


#### <b>4. LPC (Linear Predictive Coding)</b>
target_sounds의 첫 번째 파일을 시각화

In [15]:
def to_db(x):
    """Return ten times the base-10 logarithm of *x*.

    Works element-wise on arrays and on plain scalars, exactly as
    ``np.log10`` does.
    """
    log_magnitude = np.log10(x)
    return 10 * log_magnitude
# FFT spectrum of the recording, converted with to_db; only every 10th point
# is drawn.
x,y = SSA.get_spectrum()[:2]
y = to_db(y)
plt.figure(figsize=(12,8))
#plt.subplot(2,1,1)
plt.plot(x[::10],y[::10],c='lightskyblue',lw=1, alpha=1, label='FFT')
#plt.xscale('log', base=10)
plt.xlim([0,8000])

# Number of frequency points requested from freqz, and the width of one bin
# when 0 .. sr/2 is split into that many bins.
FreqPoints=8096
df0 = (SSA.sr / 2.) / FreqPoints

# Pre-emphasis: subtract 0.8 times the previous sample from every sample (the
# first sample is paired with itself), then apply a Hamming window.
Y = SSA.fdata.copy()
Y -= np.hstack((Y[0],Y[:-1]))*0.8
windowed = np.hamming(Y.shape[0]) * Y

# LPC fit of order 120, evaluated as the frequency response of the
# all-pole filter sqrt(e) / a.
a, e = lpc(windowed,lpcOrder=120)
w, h = scipy.signal.freqz(np.sqrt(e), a, FreqPoints)
lpcspec = np.abs(h)
# Floor the magnitude at 1 so that its logarithm never drops below 0.
lpcspec[lpcspec < 1.] = 1.
#lpcspec *= (1./float(1 << ((8 * SSA.samplewidth)-1)))

loglpcspec = to_db(lpcspec)

# Vertical offset that gives the LPC curve the same mean level as the FFT curve.
bias = y.mean() - loglpcspec.mean()

# freqz samples the half-open interval [0, pi), i.e. bin k lies at k * df0.
# The previous linspace(0, sr/2, N) axis included the endpoint and therefore
# used a slightly larger step than df0, shifting the curve relative to the
# formant markers below (formant_detect receives the same df0).
lpc_freqs = np.arange(len(loglpcspec)) * df0

#plt.subplot(2,1,2)
plt.plot(lpc_freqs, loglpcspec+bias, label='LPC', color='green')

# Mark the detected formants on the LPC curve.
f_result, i_result = formant_detect(lpcspec,df0,1)
plt.plot(f_result, loglpcspec[i_result] + bias, "o", c='r', label='formants', mew=4, ms=4)
plt.legend()

# The figure manager is fetched so the window can optionally be made full
# screen by uncommenting the next line.
mng = plt.get_current_fig_manager()
#mng.full_screen_toggle()
if saveresult:
    plt.savefig(os.path.join('FVA', 'results', 'smalldrum_lpcspectrum'))

plt.show()
print(f_result)

[166.13759881422925, 441.21788537549406, 1152.068922924901, 2241.495800395257, 2851.574851778656, 3341.816946640316, 3878.359683794466, 4365.87821146245, 4736.283349802371, 5716.767539525691, 6168.879693675889, 6473.919219367589, 6868.836462450592, 7247.412302371541, 7832.979249011857, 9012.283843873518, 9622.362895256916, 10690.001235177864, 11212.926136363636, 11605.119812252964, 12158.00395256917, 12719.058794466402, 13209.300889328062, 13517.063982213438, 13996.411808300394, 14339.581274703556, 14799.864130434782, 15205.67564229249, 15622.3814229249, 16003.680830039524, 16510.26432806324, 17163.920454545452, 17673.227519762844, 18539.321887351776, 19301.920701581028, 19645.090167984188, 20276.957756916996, 20660.980731225296, 21009.59733201581, 21355.490365612648, 21793.984683794464]


#### <b>5. MFCC </b>

<img src="gitar//MFCC3.png" width="50%" height="50%"></img>

_Mel 단위_

<img src="gitar//MFCC4.png" width="50%" height="50%"></img>

<img src="gitar//MFCC.png" width="50%" height="50%"></img>

_MFCC filter bank_

<img src="gitar//MFCC2.png" width="50%" height="50%"></img>

In [13]:
# Framing parameters handed to frame_audio below.
hop_size = 15 # hop between frames, in ms
FFT_size = 2048

# Read the recording and print its sample rate and its length in seconds.
sr, audio = wavfile.read(FILE_PATH)
print(sr, len(audio)/sr)

# Split the signal into frames (project helper frame_audio).
audio_framed = frame_audio(audio,FFT_size=FFT_size,hop_size=hop_size,sample_rate=sr)

# Multiply every frame by a Hann window of FFT_size samples.
window = get_window("hann",FFT_size,fftbins=True)
audio_win = audio_framed * window

# NOTE(review): the recorded run of this cell ended with
# "ValueError: The truth value of an array with more than one element is
# ambiguous" — presumably raised inside PauseAnimation when it tests the array
# it receives; confirm against the PauseAnimation implementation in scope.
pa = PauseAnimation(np.array(audio_win))
plt.show()

44100 10.92


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [20]:


import matplotlib.pyplot as plt
import librosa.display
import librosa
import numpy as np

path = 'FVA//target_sounds//2023711작은탐.wav'
sample_rate=44100

nMEL=128
nMFCC = 128
# Step assigned to one coefficient index when the coefficient axis is spread
# over 0 .. sample_rate/2; formant_detect receives this value.
df0 = (sample_rate//2)/nMFCC

# Waveform -> mel spectrogram -> dB -> MFCC -> second-order delta.
# The waveform gets its own name so that `x` only ever means the plot axis.
waveform = librosa.load(path=path,sr=sample_rate)[0]
S = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=nMEL)
log_S = librosa.power_to_db(S, ref=np.max)
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=nMFCC)

print(mfcc.shape)

delta2_mfcc = librosa.feature.delta(mfcc, order=2)

# One row per time frame, so formant_detect can be run frame by frame.
delta2_mfcc2 = np.swapaxes(delta2_mfcc,1,0)
temp_formants = [formant_detect(i,df0,f_min=0) for i in delta2_mfcc2]

# Flatten to scatter data: at most the first five detected values per frame,
# paired with the frame index.
formants_list = []
x_list = []
for frame_index, detected in enumerate(temp_formants):
    for value in detected[0][:5]:
        formants_list.append(value)
        x_list.append(frame_index)

print(formants_list)
print(x_list)

plt.figure()
plt.plot(x_list,formants_list,"o",c='r',mew=1,ms=1)
plt.show()

# Axis for one frame: coefficient k sits at k * df0. The previous
# linspace(0, sample_rate//2, nMFCC) axis included the endpoint and so used a
# larger step than df0, which put the curve and the markers on different grids.
x = np.arange(nMFCC) * df0

plt.figure()
plt.plot(x,delta2_mfcc2[0])
plt.plot(temp_formants[0][0],delta2_mfcc2[0][temp_formants[0][1]],"o", c='r', label='formants', mew=4, ms=4)
plt.show()
print(len(delta2_mfcc2[0]))


# NOTE(review): this figure shows delta2_mfcc although its title reads
# 'MFCC' — confirm the intended label.
plt.figure()
librosa.display.specshow(delta2_mfcc)
plt.ylabel('MFCC coeffs')
plt.xlabel('Time')
plt.title('MFCC')
plt.colorbar()
plt.tight_layout()
# Show the last figure explicitly, like every other plotting cell does.
plt.show()

(128, 941)
[172.265625, 2928.515625, 4651.171875, 7407.421875, 11369.53125, 172.265625, 2928.515625, 4651.171875, 7407.421875, 11369.53125, 172.265625, 2928.515625, 4651.171875, 7407.421875, 11369.53125, 172.265625, 2928.515625, 4651.171875, 7407.421875, 11369.53125, 172.265625, 2928.515625, 4651.171875, 7407.421875, 11369.53125, 861.328125, 3445.3125, 6373.828125, 8096.484375, 9991.40625, 2239.453125, 3445.3125, 5340.234375, 8096.484375, 9819.140625, 2067.1875, 3617.578125, 5167.96875, 7924.21875, 9819.140625, 1378.125, 5340.234375, 8441.015625, 9819.140625, 12747.65625, 1378.125, 4306.640625, 5684.765625, 7407.421875, 8613.28125, 172.265625, 1205.859375, 2756.25, 4306.640625, 7407.421875, 172.265625, 2756.25, 4306.640625, 6373.828125, 7407.421875, 516.796875, 3789.84375, 6373.828125, 7751.953125, 8957.8125, 689.0625, 2239.453125, 3617.578125, 4995.703125, 6373.828125, 172.265625, 1894.921875, 3445.3125, 5167.96875, 8096.484375, 689.0625, 1550.390625, 3445.3125, 5684.765625, 6890.625,

In [19]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np


class PauseAnimation:
    """Animate the rows of ``delta2_mfcc2``; a mouse click pauses or resumes.

    The class reads three notebook globals that must exist before it is
    instantiated: ``x`` (horizontal axis), ``delta2_mfcc2`` (one row per
    animation frame) and ``temp_formants`` (per frame, a pair whose first item
    is used as marker x positions and whose second item indexes the row).

    Parameters
    ----------
    frames : int, optional
        Maximum number of frames to play (default 200). It is clamped to the
        number of available rows so ``update`` never indexes past the data.
    interval : int, optional
        Delay between frames in milliseconds (default 50), forwarded to
        ``FuncAnimation``.
    """

    def __init__(self, frames=200, interval=50):
        self.fig, self.ax = plt.subplots()
        self.ax.set_title('Click to pause/resume the animation')

        self.ax.set_ylim(-5,5)
        # Line for the current row and markers for its detected peaks.
        self.p, = self.ax.plot(x,delta2_mfcc2[0])
        self.f, = self.ax.plot(temp_formants[0][0],delta2_mfcc2[0][temp_formants[0][1]],"o")

        # Never request more frames than there is data for.
        n_frames = min(frames, len(delta2_mfcc2), len(temp_formants))
        self.animation = animation.FuncAnimation(
            self.fig, self.update, frames=n_frames, interval=interval, blit=True)
        # The animation starts running on its own, so the initial state is
        # "not paused"; starting with True made the first click a no-op.
        self.paused = False

        self.fig.canvas.mpl_connect('button_press_event', self.toggle_pause)

    def toggle_pause(self, *args, **kwargs):
        """Flip between paused and running (mouse-click callback)."""
        if self.paused:
            self.animation.resume()
        else:
            self.animation.pause()
        self.paused = not self.paused

    def update(self, i):
        """Show frame *i* and return the artists that changed (for blitting)."""
        row = delta2_mfcc2[i]
        self.p.set_ydata(row)
        self.f.set_ydata(row[temp_formants[i][1]])
        self.f.set_xdata(temp_formants[i][0])
        return (self.p,self.f)


# Build the animation and open its window. The instance is bound to a name
# because matplotlib animations need a live reference to keep running.
pa = PauseAnimation()
plt.show()

# 여러 파일 자동 시각화
target_sounds 내의 .wav 파일을 전부 시각화하여 그 결과를 results에 이미지 형태로 저장

#### 1. '.wav' 파일 긁어오기

In [None]:
# Collect the names of all .wav files in the target_sounds directory.
# os.path.join keeps the path valid on every OS; on Windows it yields the same
# backslash-separated string as before.
SOUND_DIR = os.path.join('FVA', 'target_sounds')

# os.listdir returns entries in arbitrary order; sort for a reproducible list.
FILE_LIST = sorted(os.listdir(SOUND_DIR))

# Compare the extension case-insensitively so that e.g. 'X.WAV' is kept too.
WAV_LIST = [name for name in FILE_LIST
            if os.path.splitext(name)[1].lower() == '.wav']
print(WAV_LIST)

#### 2. SingleSoundAnalyser로 '.wav' 파일 불러오기

In [None]:
# Both values are passed to SingleSoundAnalyser unchanged.
# NOTE(review): None presumably means "whole file" / "native sample rate" —
# confirm against SingleSoundAnalyser.
DURATION = None
SAMPLE_RATE = None

# One analyser per .wav file. os.path.join keeps the path valid on every OS;
# on Windows it yields the same backslash-separated string as before.
SSA_LIST = [
    fva.SingleSoundAnalyser(os.path.join('FVA', 'target_sounds', name), DURATION, SAMPLE_RATE)
    for name in WAV_LIST
]

In [None]:
# Plot the get_source() output of the first analyser into cell 1 of a
# 1-row, 2-column subplot grid.
print_2dplot(*SSA_LIST[0].get_source(),[1,2,1])