<a href="https://colab.research.google.com/github/hakseong1231/DACD-AudioSignalAnalysis/blob/main/DACD_MFCC_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 음성 파일별로 MFCC 평균을 구해서 분석 + 랜덤포레스트 결과 개선

In [None]:
# Mount the user's Google Drive at /content/drive; the data paths configured
# later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the praat-parselmouth package into the Colab runtime.
# NOTE(review): no cell visible in this notebook imports parselmouth - confirm
# this install is still needed. The version is also not pinned (the recorded
# output shows 0.3.3 was installed).
!pip install praat-parselmouth

Collecting praat-parselmouth
[?25l  Downloading https://files.pythonhosted.org/packages/09/7b/9fa1172a63b6277603d27bb5613559b5a8888f58e68c1698017b87b0061d/praat_parselmouth-0.3.3-cp36-cp36m-manylinux1_x86_64.whl (9.0MB)
[K     |████████████████████████████████| 9.0MB 9.1MB/s 
Installing collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.3.3


In [None]:
import os
import time
import warnings

import librosa
import numpy as np
import pandas as pd
import scipy.io.wavfile as wavf
from sklearn import linear_model  # logistic regression module
from sklearn.ensemble import RandomForestClassifier  # random forest module
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # SVM module

# Used to save/load models. `sklearn.externals.joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23, so prefer the standalone package and
# only fall back to the vendored copy on old installations.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib  # used to save models

# Silence all warnings
warnings.filterwarnings(action='ignore')

# Sampling rate (samples per second) passed to librosa.load and librosa.feature.mfcc.
sr = 16000
# Sub-directory names that get processed; they also serve as the class labels.
categories = ['20F', '20M', '30M', '40F', '50F', '50M']
# Root of the raw recordings (one sub-directory per category).
wav_unprocessed = "/content/drive/My Drive/DACD/DACD_unprocessed/"
# Root under which the per-category "processed_*" directories are created.
wav_processed = "/content/drive/My Drive/DACD/DACD_processed_2/"
# Directory holding the pickled features, labels, models and predictions.
# NOTE(review): this path spells the Drive folder "MyDrive" while the two paths
# above use "My Drive" - confirm both resolve to the same location.
wav_pickle = "/content/drive/MyDrive/DACD/DACD_pickle_2/"



# Part 1. Generate Machine Learning Models

1. Preprocessing

In [None]:
def remove_silence(wav_dir):
  """Load an audio file and drop its near-silent samples.

  The file is loaded at the module-level sampling rate `sr`. Every sample whose
  absolute amplitude is not greater than 1/300 of the signal's maximum sample
  value is removed; the surviving samples keep their original order.

  Args:
    wav_dir: Path of the audio file to load.

  Returns:
    1-D NumPy array containing only the samples above the threshold.

  Raises:
    ValueError: If the loaded signal contains no samples.
  """
  # signal: amplitude per sample. The returned rate is ignored because the
  # load was requested at `sr`.
  signal, _ = librosa.load(wav_dir, sr=sr)

  # Amplitude threshold below which a sample is treated as silence.
  # ndarray.max() is used instead of the builtin max(), which walks the array
  # element by element in Python and is slow on long recordings.
  # NOTE(review): the threshold is derived from the signed maximum, not from
  # the peak of |signal| - confirm this is intended.
  cut = signal.max() / 300

  # Boolean-mask indexing yields the 1-D result directly, so no DataFrame
  # round-trip is needed.
  return signal[np.abs(signal) > cut]

In [None]:
# Run remove_silence() on every .wav file below `wav_unprocessed`, convert each
# file to its per-coefficient mean MFCC vector, and collect the vectors
# (X_mfcc) together with their category labels (y_label).

run = input("Run? [Y/N] ")
if run in ['Y', 'y']:
  mfcc_rows = []  # one 1-D array of mean MFCC coefficients per audio file
  labels = []     # category label of the file at the same position
  skipped = []    # (path, exception) for every file that could not be processed

  # sorted(): os.listdir() returns entries in arbitrary order, which would make
  # the row order (and therefore the later seeded train/test split) differ
  # between runs and file systems.
  for category in sorted(os.listdir(wav_unprocessed)):
    if category not in categories:
      continue

    print("[Now Processing]", category)

    # makedirs(exist_ok=True) also works when the parent directory is missing.
    save_dir = wav_processed + "processed_" + category + "/"
    os.makedirs(save_dir, exist_ok=True)

    category_dir = wav_unprocessed + category + "/"
    for folder in sorted(os.listdir(category_dir)):
      folder_path = category_dir + folder
      if not os.path.isdir(folder_path):
        # A stray file next to the speaker folders would make listdir() raise.
        continue

      for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".wav"):
          continue

        wav = folder_path + "/" + file  # location of the audio file
        try:
          y = remove_silence(wav)
          mfcc = librosa.feature.mfcc(y=y, sr=sr)
        except Exception as exc:
          # Keep going on unreadable/empty audio, but remember what was dropped
          # instead of hiding it (a bare `except` would also swallow
          # KeyboardInterrupt).
          skipped.append((wav, exc))
          continue

        # Mean of every coefficient over time; [1:] drops coefficient 0, which
        # carries almost no variation.
        mfcc_rows.append(mfcc.mean(axis=1)[1:])
        labels.append(category[-3:])

  # Build both tables once; concatenating inside the loop copies the whole
  # table for every file (quadratic cost).
  X_mfcc = pd.DataFrame(mfcc_rows)
  X_mfcc.columns = range(1, X_mfcc.shape[1] + 1)  # labels 1..k: coefficient 0 was dropped
  y_label = pd.DataFrame({"y_label": labels})

  if skipped:
    print("[Skipped] {} file(s), first: {} ({!r})".format(len(skipped), skipped[0][0], skipped[0][1]))
  print("[Finished]")

Run? [Y/N] Y
[Now Processing] 20F
[Now Processing] 20M
[Now Processing] 30M
[Now Processing] 50F
[Now Processing] 40F
[Now Processing] 50M
[Finished]


In [None]:
# Persist the per-file mean MFCC features and their labels.
# The feature table built by the preprocessing cell is named `X_mfcc`; the
# previous code referenced an undefined name (`result`) and raised NameError.
X_mfcc.to_pickle(wav_pickle + "X_mfcc.pkl")
y_label.to_pickle(wav_pickle + "y_label.pkl")

2. Load MFCC

In [None]:
# Restore the cached feature table and label table from the pickle directory.
# NOTE(review): unpickling executes arbitrary code - only load files you created.
X_mfcc, y_label = (
  pd.read_pickle(wav_pickle + pickle_name)
  for pickle_name in ("X_mfcc.pkl", "y_label.pkl")
)

In [None]:
# Hold out 30% of the files for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
  X_mfcc,
  y_label,
  test_size=0.3,
  random_state=1,
)

3-1. Generate & Save Models

In [None]:
"""1. Logistic Regression"""

# scikit-learn expects a 1-D target. y_train is a single-column table, so it is
# flattened explicitly instead of relying on the implicit conversion (whose
# warning is hidden by the global warnings filter).
logreg = linear_model.LogisticRegression()
logreg.fit(X_train, np.ravel(y_train))

# Save the model (an existing file is left untouched).
# NOTE(review): because existing files are never overwritten, a re-run on
# changed data keeps the old model and predictions on disk - delete them first
# if a fresh result is wanted.
if not os.path.isfile(wav_pickle + "model_LogisticRegression.pkl"):
  joblib.dump(logreg, wav_pickle + "model_LogisticRegression.pkl")

# Save the predictions for the test split
y_pred = pd.DataFrame(logreg.predict(X_test), columns=["y_label"])
if not os.path.isfile(wav_pickle + "y_pred_LR.pkl"):
  y_pred.to_pickle(wav_pickle + "y_pred_LR.pkl")

In [None]:
"""2. Random Forest"""

# random_state makes the fitted forest (and the saved predictions) reproducible;
# without it every run grows different trees.
# The target is flattened because scikit-learn expects a 1-D array and y_train
# is a single-column table.
forest = RandomForestClassifier(n_estimators=50, max_depth=15, random_state=1)
forest.fit(X_train, np.ravel(y_train))

# Save the model (an existing file is left untouched).
# NOTE(review): because existing files are never overwritten, a re-run on
# changed data keeps the old model and predictions on disk - delete them first
# if a fresh result is wanted.
if not os.path.isfile(wav_pickle + "model_RandomForest.pkl"):
  joblib.dump(forest, wav_pickle + "model_RandomForest.pkl")

# Save the predictions for the test split
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])
if not os.path.isfile(wav_pickle + "y_pred_RF.pkl"):
  y_pred.to_pickle(wav_pickle + "y_pred_RF.pkl")

In [None]:
"""3. Support Vector Machine"""

# scikit-learn expects a 1-D target. y_train is a single-column table, so it is
# flattened explicitly instead of relying on the implicit conversion (whose
# warning is hidden by the global warnings filter).
svm = SVC()
svm.fit(X_train, np.ravel(y_train))

# Save the model (an existing file is left untouched).
# NOTE(review): because existing files are never overwritten, a re-run on
# changed data keeps the old model and predictions on disk - delete them first
# if a fresh result is wanted.
if not os.path.isfile(wav_pickle + "model_SVM.pkl"):
  joblib.dump(svm, wav_pickle + "model_SVM.pkl")

# Save the predictions for the test split
y_pred = pd.DataFrame(svm.predict(X_test), columns=["y_label"])
if not os.path.isfile(wav_pickle + "y_pred_SVM.pkl"):
  y_pred.to_pickle(wav_pickle + "y_pred_SVM.pkl")

3-2. Validate Classification Models

In [None]:
"""1. Logistic Regression"""

# Predictions stored by the training cell
y_pred = pd.read_pickle(wav_pickle + "y_pred_LR.pkl")

# Per-class hit/miss counts, followed by precision / recall / F1
print(confusion_matrix(y_true=y_test, y_pred=y_pred), "\n")
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

[[5002   55   14  406  124   28]
 [  78 3970  630   27   18  247]
 [  10  577 4180    4   11  417]
 [ 478   61   31 4456  447   73]
 [ 306   18   79 1087  758   70]
 [  53  417  455  121   46 1360]] 

              precision    recall  f1-score   support

         20F     0.8439    0.8886    0.8657      5629
         20M     0.7787    0.7988    0.7886      4970
         30M     0.7757    0.8040    0.7896      5199
         40F     0.7304    0.8035    0.7652      5546
         50F     0.5399    0.3270    0.4073      2318
         50M     0.6196    0.5546    0.5853      2452

    accuracy                         0.7554     26114
   macro avg     0.7147    0.6961    0.7003     26114
weighted avg     0.7458    0.7554    0.7475     26114



In [None]:
"""2. Random Forest"""

# Predictions stored by the training cell
y_pred = pd.read_pickle(wav_pickle + "y_pred_RF.pkl")

# Per-class hit/miss counts, followed by precision / recall / F1
print(confusion_matrix(y_true=y_test, y_pred=y_pred), "\n")
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

[[5497   34    8   77   10    3]
 [  11 4866   58    5    2   28]
 [   1   67 5104    2    1   24]
 [ 136   17    9 5356   23    5]
 [ 118   19    3  229 1944    5]
 [  16   58   70    8   11 2289]] 

              precision    recall  f1-score   support

         20F     0.9512    0.9766    0.9637      5629
         20M     0.9615    0.9791    0.9702      4970
         30M     0.9718    0.9817    0.9767      5199
         40F     0.9435    0.9657    0.9545      5546
         50F     0.9764    0.8387    0.9023      2318
         50M     0.9724    0.9335    0.9526      2452

    accuracy                         0.9595     26114
   macro avg     0.9628    0.9459    0.9533     26114
weighted avg     0.9598    0.9595    0.9591     26114



In [None]:
"""3. Support Vector Machine"""

# Predictions stored by the training cell
y_pred = pd.read_pickle(wav_pickle + "y_pred_SVM.pkl")

# Per-class hit/miss counts, followed by precision / recall / F1
print(confusion_matrix(y_true=y_test, y_pred=y_pred), "\n")
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

[[5327   14    3  215   64    6]
 [  18 4711  155    2    6   78]
 [   1  183 4935    4    2   74]
 [ 326   33    9 5094   77    7]
 [ 294   18    4  728 1256   18]
 [  34  127  243   42   27 1979]] 

              precision    recall  f1-score   support

         20F     0.8878    0.9463    0.9162      5629
         20M     0.9263    0.9479    0.9370      4970
         30M     0.9226    0.9492    0.9357      5199
         40F     0.8371    0.9185    0.8759      5546
         50F     0.8771    0.5418    0.6699      2318
         50M     0.9154    0.8071    0.8578      2452

    accuracy                         0.8923     26114
   macro avg     0.8944    0.8518    0.8654     26114
weighted avg     0.8929    0.8923    0.8881     26114



# Part 2. Optimize Random Forest Model

In [None]:
# Reload the three persisted classifiers from the pickle directory.
# NOTE(review): joblib.load unpickles arbitrary code - only load files you created.
logreg, rand, svm = (
  joblib.load(wav_pickle + model_file)
  for model_file in (
    "model_LogisticRegression.pkl",
    "model_RandomForest.pkl",
    "model_SVM.pkl",
  )
)

In [None]:
# Fit the random forest n times per (T, D) pair and print the mean test accuracy.
#   T: number of trees (n_estimators), D: maximum tree depth (max_depth)
n = 50

# scikit-learn expects 1-D targets; flatten once instead of on every fit.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

print("[RandomForest {}회 평균 Accuracy]".format(n))
for T in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
  print("T =", T)
  for D in [2, 3, 5, 7, 10, 13, 16, 20, 25, 30]:
    # Not named `sum`: that would shadow the builtin for the rest of the
    # kernel session.
    accuracy_total = 0
    for i in range(n):
      # random_state=i: each repetition grows a different forest, yet the whole
      # sweep is reproducible. n_jobs=-1 builds the trees on all cores and does
      # not change the result.
      forest = RandomForestClassifier(n_estimators=T, max_depth=D, random_state=i, n_jobs=-1)
      forest.fit(X_train, y_train_flat)
      accuracy_total += forest.score(X_test, y_test_flat)
    print("- D = {:<2} : {:.4f}".format(D, accuracy_total/n))
  print("")

[RandomForest 50회 평균 Accuracy]
T = 10
- D = 2  : 0.5681
- D = 3  : 0.6256
- D = 5  : 0.7316
- D = 7  : 0.8257
- D = 10 : 0.9037
- D = 13 : 0.9342
- D = 16 : 0.9446
- D = 20 : 0.9472
- D = 25 : 0.9473
- D = 30 : 0.9472

T = 20
- D = 2  : 0.5812
- D = 3  : 0.6374
- D = 5  : 0.7428
- D = 7  : 0.8394
- D = 10 : 0.9140
- D = 13 : 0.9438
- D = 16 : 0.9554
- D = 20 : 0.9589
- D = 25 : 0.9595
- D = 30 : 0.9592

T = 30
- D = 2  : 0.5873
- D = 3  : 0.6443
- D = 5  : 0.7468
- D = 7  : 0.8439
- D = 10 : 0.9169
- D = 13 : 0.9470
- D = 16 : 0.9585
- D = 20 : 0.9627
- D = 25 : 0.9633
- D = 30 : 0.9630

T = 40
- D = 2  : 0.5884
- D = 3  : 0.6464
- D = 5  : 0.7513
- D = 7  : 0.8472
- D = 10 : 0.9187
- D = 13 : 0.9489
- D = 16 : 0.9603
- D = 20 : 0.9645
- D = 25 : 0.9653
- D = 30 : 0.9651

T = 50
- D = 2  : 0.5911
- D = 3  : 0.6460
- D = 5  : 0.7519
- D = 7  : 0.8481
- D = 10 : 0.9200
- D = 13 : 0.9496
- D = 16 : 0.9613
- D = 20 : 0.9656
- D = 25 : 0.9666
- D = 30 : 0.9664

T = 60
- D = 2  : 0.5919
- D 

- D = 25 : 0.9682
- D = 30 : 0.9681

T = 90
- D = 2  : 0.5962
- D = 3  : 0.6527
- D = 5  : 0.7535
- D = 7  : 0.8509
- D = 10 : 0.9216
- D = 13 : 0.9514
- D = 16 : 0.9631
- D = 20 : 0.9674
- D = 25 : 0.9684
- D = 30 : 0.9685

T = 100
- D = 2  : 0.5963
- D = 3  : 0.6528
- D = 5  : 0.7544
- D = 7  : 0.8513
- D = 10 : 0.9218
- D = 13 : 0.9516
- D = 16 : 0.9632
- D = 20 : 0.9676
- D = 25 : 0.9687
- D = 30 : 0.9686



-> T: 증가할수록 Accuracy가 상승하지만, 상승 폭은 점점 줄어듦(D=25 기준 T=10: 0.9473 → T=100: 0.9687)  
-> D: D=20 이상에서는 Accuracy가 거의 변하지 않으며, 대체로 D=25에서 최대  
-> 연산 시간과 분류 성능을 함께 고려해 T, D 선정

In [None]:
# Time one fit + predict of a 100-tree forest (depth 25) and report its quality.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# random_state makes the reported numbers reproducible. The target is flattened
# because scikit-learn expects a 1-D array and y_train is a single-column table.
forest = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=1)
forest.fit(X_train, np.ravel(y_train))
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])
print("[Time Consumed] {:.2f}sec".format(time.perf_counter() - start))
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[Time Consumed] 42.03sec
[[5540   20    6   51    8    4]
 [  13 4866   56    4    3   28]
 [   2   48 5128    1    1   19]
 [ 116   13   10 5376   27    4]
 [  76   18    3  160 2059    2]
 [  17   43   53    6   11 2322]] 

              precision    recall  f1-score   support

         20F     0.9611    0.9842    0.9725      5629
         20M     0.9716    0.9791    0.9753      4970
         30M     0.9756    0.9863    0.9810      5199
         40F     0.9603    0.9693    0.9648      5546
         50F     0.9763    0.8883    0.9302      2318
         50M     0.9760    0.9470    0.9613      2452

    accuracy                         0.9685     26114
   macro avg     0.9702    0.9590    0.9642     26114
weighted avg     0.9686    0.9685    0.9683     26114



In [None]:
# Time one fit + predict of a 200-tree forest (depth 25) and report its quality.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
# random_state makes the reported numbers reproducible. The target is flattened
# because scikit-learn expects a 1-D array and y_train is a single-column table.
forest = RandomForestClassifier(n_estimators=200, max_depth=25, random_state=1)
forest.fit(X_train, np.ravel(y_train))
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])

print("[Time Consumed] {:.2f}sec".format(time.perf_counter() - start))
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[Time Consumed] 84.36sec
[[5538   21    9   51    8    2]
 [  10 4872   53    3    6   26]
 [   0   41 5137    3    0   18]
 [ 104   11    9 5394   26    2]
 [  78   16    4  149 2070    1]
 [  16   42   57    7    6 2324]] 

              precision    recall  f1-score   support

         20F     0.9638    0.9838    0.9737      5629
         20M     0.9738    0.9803    0.9770      4970
         30M     0.9749    0.9881    0.9815      5199
         40F     0.9620    0.9726    0.9673      5546
         50F     0.9783    0.8930    0.9337      2318
         50M     0.9794    0.9478    0.9633      2452

    accuracy                         0.9702     26114
   macro avg     0.9720    0.9609    0.9661     26114
weighted avg     0.9703    0.9702    0.9700     26114



In [None]:
# Time one fit + predict of a 1000-tree forest (depth 25) and report its quality.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
# random_state makes the reported numbers reproducible. The target is flattened
# because scikit-learn expects a 1-D array and y_train is a single-column table.
forest = RandomForestClassifier(n_estimators=1000, max_depth=25, random_state=1)
forest.fit(X_train, np.ravel(y_train))
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])

print("[Time Consumed] {:.2f}sec".format(time.perf_counter() - start))
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[Time Consumed] 420.63sec
[[5541   24    7   46    9    2]
 [  11 4872   55    3    4   25]
 [   0   39 5137    2    0   21]
 [ 105   12   10 5396   21    2]
 [  79   16    3  143 2074    3]
 [  14   38   53    8    7 2332]] 

              precision    recall  f1-score   support

         20F     0.9637    0.9844    0.9739      5629
         20M     0.9742    0.9803    0.9772      4970
         30M     0.9757    0.9881    0.9818      5199
         40F     0.9639    0.9730    0.9684      5546
         50F     0.9806    0.8947    0.9357      2318
         50M     0.9778    0.9511    0.9642      2452

    accuracy                         0.9708     26114
   macro avg     0.9726    0.9619    0.9669     26114
weighted avg     0.9709    0.9708    0.9707     26114



-> 연산 시간은 T에 비례(트리 100개당 약 42초)
- T=1000, D=25: 연산 시간 420.63초, Accuracy: 0.9708  
- T=30,   D=25: 연산 시간 10.93초, Accuracy: 0.9623(50회 평균은 0.9633, T=1000과의 차이 1%p 이내)

In [None]:
# Time one fit + predict of a 30-tree forest (depth 25) and report its quality.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
# random_state makes the reported numbers reproducible. The target is flattened
# because scikit-learn expects a 1-D array and y_train is a single-column table.
forest = RandomForestClassifier(n_estimators=30, max_depth=25, random_state=1)
forest.fit(X_train, np.ravel(y_train))
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])

print("[Time Consumed] {:.2f}sec".format(time.perf_counter() - start))
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[Time Consumed] 10.93sec
[[5511   26    9   59   21    3]
 [  15 4844   67    5    4   35]
 [   1   54 5114    3    1   26]
 [ 123   16   14 5348   41    4]
 [ 103   22    2  176 2012    3]
 [  16   47   69    8   11 2301]] 

              precision    recall  f1-score   support

         20F     0.9553    0.9790    0.9670      5629
         20M     0.9671    0.9746    0.9708      4970
         30M     0.9695    0.9837    0.9765      5199
         40F     0.9552    0.9643    0.9597      5546
         50F     0.9627    0.8680    0.9129      2318
         50M     0.9701    0.9384    0.9540      2452

    accuracy                         0.9623     26114
   macro avg     0.9633    0.9513    0.9568     26114
weighted avg     0.9624    0.9623    0.9621     26114



결론: D=25에서 T가 클수록 RandomForest의 Accuracy가 높아지지만(T=Inf에서 최대),
연산 시간까지 고려하면 T=30으로 충분함