<a href="https://colab.research.google.com/github/jhkang0516/dialect_classification/blob/main/RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
from IPython.display import Audio
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, LSTM, Bidirectional, GRU, BatchNormalization, LeakyReLU
from keras.utils import to_categorical
import os
import math
import json
import random

In [None]:
# 데이터 로드
dataDir = "/content/drive/MyDrive/Colab Notebooks/TeamProject/"
X_mfcc_ALL = np.load(dataDir + "X_mfcc_ALL.npy")
Y_label_ALL = np.load(dataDir + "Y_label_ALL.npy")

In [None]:
# 3차원 -> 2차원
X_mfcc_conv = []
for item in X_mfcc_ALL:
    series = pd.Series(np.hstack((np.mean(item, axis=1), np.std(item, axis=1))))
    X_mfcc_conv.append(series)

X_mfcc_conv = np.array(X_mfcc_conv)

In [None]:
print(X_mfcc_ALL.shape)
print(X_mfcc_conv.shape)

(60000, 39, 259)
(60000, 78)


In [None]:
# Y_label_ALL 라벨인코딩-원핫인코딩

DF_Y = pd.DataFrame(Y_label_ALL)
DF_Y = pd.get_dummies(DF_Y)

In [None]:
# 인코딩 결과 확인
DF_Y

Unnamed: 0,0_강원,0_경상,0_전라,0_제주,0_충청,0_표준
0,0,1,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,1,0,0,0,0
...,...,...,...,...,...,...
59995,0,0,0,0,0,1
59996,0,0,0,0,0,1
59997,0,0,0,0,0,1
59998,0,0,0,0,0,1


# 데이터 스플릿

In [None]:
# 데이터 스플릿

X_train, X_test, Y_train, Y_test = train_test_split(X_mfcc_conv, DF_Y, test_size=0.3, stratify=DF_Y)

In [None]:
# 스플릿 stratify 결과 확인

print(pd.DataFrame(Y_train)["0_제주"].value_counts())
print(pd.DataFrame(Y_test)["0_제주"].value_counts())

0    35000
1     7000
Name: 0_제주, dtype: int64
0    15000
1     3000
Name: 0_제주, dtype: int64


In [None]:
print(X_mfcc_ALL.shape)
print(X_mfcc_conv.shape)

(60000, 39, 259)
(60000, 78)


# RF 모델 구축

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],  # 트리의 개수
    'max_depth': [None, 5, 10],  # 트리의 최대 깊이
    'min_samples_split': [2, 5, 10],  # 노드를 분할하기 위한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4]  # 리프 노드에 필요한 최소 샘플 수
}

grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

grid_search.fit(X_train, Y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best score:  0.35683333333333334


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50)

rfc.fit(X_train, Y_train)
predict = rfc.predict(X_test)
print(accuracy_score(Y_test, predict))

0.36038888888888887


In [None]:
from sklearn.metrics import classification_report

# Precision, Recall, F1-Score
report = classification_report(Y_test, predict);
print(report);

              precision    recall  f1-score   support

           0       0.95      0.10      0.17      3000
           1       0.84      0.10      0.18      3000
           2       0.85      0.23      0.36      3000
           3       0.91      0.56      0.69      3000
           4       0.96      0.45      0.62      3000
           5       0.99      0.73      0.84      3000

   micro avg       0.93      0.36      0.52     18000
   macro avg       0.91      0.36      0.48     18000
weighted avg       0.91      0.36      0.48     18000
 samples avg       0.36      0.36      0.36     18000



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
importance = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis = 0)

max = -1
idx = 0
for f in range(X_train.shape[1]):
  print(f, importance[f])
  if importance[f] > max:
    max = importance[f]
    idx = f

print(max, idx)

0 0.033522606017905
1 0.046936233324344635
2 0.02163553452118628
3 0.02048729386258227
4 0.0231155328301907
5 0.02387673828769269
6 0.014133741881275027
7 0.013534911791001575
8 0.017210330851506545
9 0.011230507726352854
10 0.010470659623700933
11 0.011071662524575891
12 0.012521337376461428
13 0.015011039475388848
14 0.011028153086896256
15 0.018418895402999746
16 0.013111335823375807
17 0.020723115049906173
18 0.013941451186104497
19 0.010874610262057051
20 0.013589137325773723
21 0.009463995614798338
22 0.012782795030420928
23 0.01191168790607246
24 0.011943589337371453
25 0.013805641821070951
26 0.009765242963019904
27 0.01100566055008124
28 0.009593273816940374
29 0.007916851600192171
30 0.010234102248982715
31 0.008826140767584164
32 0.010748233483649028
33 0.010408846560795567
34 0.016466551800937444
35 0.012075238424930522
36 0.01781372858316227
37 0.011673891262520858
38 0.015228754507276484
39 0.009327013818532013
40 0.01677116588445114
41 0.013389249543699019
42 0.011284633