In [1]:
import os
import numpy as np
import pandas as pd
import librosa

#声優の声データからMFCC（声の特徴量）を抽出し、表にまとめる。

X_data = []  # 特徴行列
y_data = []  # クラスラベルデータ
speakers_number = 7 # 声優の人数
files_number = 100 # 一人あたりのサンプルの数

for speaker_num in range(1, speakers_number + 1):  # 声優フォルダの数だけ繰り返し処理
    # ボイスサンプルがあるフォルダ名
    dir_name = f'VoiceSamples/{str(speaker_num).zfill(3)}'
    file_list = os.listdir(dir_name)
    for i in range(files_number):
        file_path = os.path.join(dir_name, file_list[i])  # 音声ファイルへのパス
        y, sr = librosa.load(file_path)  # 音声ファイルを読み込む
        mfcc = librosa.feature.mfcc(y, sr)  # MFCC
        mfcc = np.average(mfcc, axis=1)  # 時間平均を取る
        mfcc = mfcc.flatten()
        mfcc = mfcc.tolist()
        mfcc = mfcc[1:13]  # 低次の係数を取り出す（12次まで取り出すことが多い）
        X_data.append(mfcc)
        y_data.append(speaker_num)

X = pd.DataFrame(X_data, columns=[f'mfcc_{n}' for n in range(1, 13)]) # mfcc_1,mfcc_2,..のように行のタイトルをつける
y = pd.DataFrame({'target': y_data}) # targetタイトルを追加

df = pd.concat([X, y], axis=1) # 合体！！！！！！
df.to_csv('mfcc.csv', index=False)  # csvで保存
df.head() # 見せてほしいな

 -5.8130396e-11  0.0000000e+00], sr=22050 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  0.        ], sr=22050 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  0.        ], sr=22050 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  0.        ], sr=22050 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC
  1.4708222e-04  0.0000000e+00], sr=22050 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(y, sr)  # MFCC

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,target
0,24.233126,-22.866783,8.171872,-3.450461,-14.244987,-1.487553,-1.970962,-1.05463,3.212611,-11.381695,-4.241589,-9.45889,1
1,50.019978,-21.768835,7.129757,-5.470294,-12.972341,3.84152,-7.164298,-5.958019,2.533655,-14.286058,-0.805103,-11.237039,1
2,61.615715,-17.051783,8.200333,2.496114,-14.825385,1.977,-3.839584,1.793461,2.420642,-12.529157,-1.196324,-18.409231,1
3,64.062408,-29.199818,13.17995,3.607777,-22.680412,-1.233991,-5.228384,2.5636,1.059541,-16.468769,1.742816,-17.165136,1
4,75.140808,-4.875223,15.521883,-9.189409,-20.956553,7.712096,-8.545377,-5.856166,5.216793,-17.020967,-0.42029,-14.604029,1


In [2]:

import pandas as pd
from pycaret.regression import *

# 機械学習前の下準備を行う。以下、Pycaretの関数を使う場合このセルを実行してから使うこと

mfcc_data = pd.read_csv("mfcc.csv") # MFCCのデータを読み込む
# データの前準備(以下、Pycaretの関数を使う場合setupを呼び出してから使うこと)
reg = setup(data=mfcc_data, target='target', data_split_shuffle=True, use_gpu=True, silent=True, fold=5, n_jobs=-1)


Unnamed: 0,Description,Value
0,session_id,4927
1,Target,target
2,Original Data,"(700, 13)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(489, 12)"


In [21]:
# 機械学習のモデルを複数比較し、精度が高かったモデルを表示する。

compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.1239,2.0067,1.4155,0.4991,0.3309,0.4589,1.44
knn,K Neighbors Regressor,1.0837,2.2621,1.4995,0.4362,0.3413,0.4169,0.074
lightgbm,Light Gradient Boosting Machine,1.1427,2.2791,1.5065,0.4329,0.3514,0.4632,0.274
rf,Random Forest Regressor,1.1774,2.273,1.5058,0.4312,0.3509,0.4789,1.41
gbr,Gradient Boosting Regressor,1.241,2.4577,1.5666,0.3862,0.3677,0.5143,0.394
ada,AdaBoost Regressor,1.4138,2.5972,1.6111,0.3517,0.3763,0.5774,0.31
br,Bayesian Ridge,1.5227,3.2387,1.7977,0.192,0.4199,0.6491,0.06
ridge,Ridge Regression,1.5081,3.2387,1.7976,0.191,0.4204,0.641,0.058
lar,Least Angle Regression,1.5081,3.2387,1.7976,0.191,0.4204,0.6409,0.06
lr,Linear Regression,1.5081,3.2387,1.7976,0.191,0.4204,0.6409,0.062


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=7886, verbose=0, warm_start=False)

In [15]:
from pycaret.regression import *
# 最も精度が高かったモデルを生成

model = create_model('et') # モデルを生成（）
#tuned_model = tune_model(model) # モデルを調整...すると精度落ちるの何...しないほうがいいのか？
predict_data = predict_model(model) # 既存のデータで予測
predict_data.to_csv('predict_result.csv', index=False)  # csvで保存


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.1473,1.9948,1.4124,0.5042,0.3792,0.6227
1,1.2014,2.3295,1.5263,0.4368,0.3457,0.4675
2,1.0217,1.7889,1.3375,0.5353,0.278,0.3223
3,1.1822,2.3691,1.5392,0.3496,0.3393,0.3989
4,1.0508,1.8213,1.3496,0.5464,0.3229,0.4456
Mean,1.1207,2.0607,1.433,0.4745,0.333,0.4514
Std,0.0717,0.2461,0.0854,0.0732,0.0331,0.0991


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,1.0134,1.6936,1.3014,0.5689,0.3122,0.4253


In [16]:
final_model = finalize_model(model)
save_model(final_model,'Final Model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='target',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy...
                  ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                      criterion='mse', max_depth=None,
                                      max_features='auto', max_leaf_nodes=None,
                                      max_samples=None,
                                    

In [6]:
from pycaret.regression import *

final_model = load_model('Final Model') # モデルの読み込み

Transformation Pipeline and Model Successfully Loaded


In [20]:
import sounddevice as sd
import wave
import numpy as np

#音声を録音

FILE_NAME = 'my_recording.wav'  # 保存するファイル名
wave_length = 5  # 録音する長さ（秒）
sample_rate = 16_000  # サンプリング周波数

print("録音開始")
# 録音開始（wave_length秒間録音。wait で録音し終わるまで待つ）
data = sd.rec(int(wave_length * sample_rate), sample_rate, channels=1)
sd.wait()
print("録音終了")

# ノーマライズ。量子化ビット16bitで録音するので int16 の範囲で最大化する
data = data / data.max() * np.iinfo(np.int16).max

# float -> int
data = data.astype(np.int16)

# ファイル保存
with wave.open(FILE_NAME, mode='wb') as wb:
    wb.setnchannels(1)  # モノラル
    wb.setsampwidth(2)  # 16bit=2byte
    wb.setframerate(sample_rate)
    wb.writeframes(data.tobytes())  # バイト列に変換


録音開始
録音終了


In [4]:
import pandas as pd
import librosa
import numpy as np

###MFCCを抽出###

data = []
TEST_FILE_NAME = "test.wav"

y, sr = librosa.load(TEST_FILE_NAME)  # 音声ファイルを読み込む
mfcc = librosa.feature.mfcc(y, sr)  # MFCC
mfcc = np.average(mfcc, axis=1)  # 時間平均を取る
mfcc = mfcc.flatten()
mfcc = mfcc.tolist()
mfcc = mfcc[1:13]  # 低次の係数を取り出す（12次まで取り出すことが多い）
data.append(mfcc)

df = pd.DataFrame(data, columns=[f'mfcc_{n}' for n in range(1, 13)])

df.to_csv('mfcc_my_recording.csv', index=False)  # csvで保存
df.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12
0,61.791355,-18.600464,18.558226,-1.541007,-2.50288,0.355464,-0.349908,-7.095498,1.180879,-10.101668,4.389037,-3.882792


In [7]:
from decimal import Decimal, ROUND_HALF_UP, ROUND_HALF_EVEN
from pycaret.regression import *

#録音した音声データからどの声優に似ているかを特定

actors = ["下野紘", "花江夏樹", "梶裕貴", "沢城みゆき", "鬼頭明里", "水瀬いのり", "悠木碧"]

predict_my_recording = predict_model(final_model, data = pd.read_csv("mfcc_my_recording.csv")) # 未知データを予測
predict_label = predict_my_recording.at[0, "Label"] # 予測データを取得（小数点数）
voice_actor_number = Decimal(str(predict_label)).quantize(Decimal('0'), rounding=ROUND_HALF_UP) # 整数に四捨五入
print("あなたの声は" + actors[int(voice_actor_number-1)] + "に似ています。")
print("予測値：" + str(predict_label))

あなたの声は鬼頭明里に似ています。
予測値：2.66
