<a href="https://colab.research.google.com/github/ituki0426/How_to_improve_detecting_AI_voice_changer/blob/main/notebook/HuBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 準備

In [None]:
!pip install datasets



In [None]:
# Mount Google Drive at /content/drive; the audio directories used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 音声ファイルのテスト

In [None]:
import librosa
from IPython.display import Audio

# Load one .wav file as a quick sanity check that Drive is mounted and audio is readable.
# sr=None: keep the file's own sampling rate instead of resampling (per the librosa.load docs).
y, sr = librosa.load("/content/drive/MyDrive/customBERT/kanata/001.wav.wav", sr=None)

# Render an inline audio player for the loaded signal
Audio(y, rate=sr)


# データセットの読み込み

In [None]:
import os
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import AutoFeatureExtractor
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import DatasetDict
from transformers import DataCollatorWithPadding
import torch.nn.functional as F  # torch.nn.functional をインポート

def _list_wav_files(directory):
    """Return the names of the ``.wav`` files in ``directory`` in sorted order.

    ``os.listdir`` returns entries in arbitrary order, so sorting makes the
    selection of files reproducible between runs and machines.
    """
    return sorted(name for name in os.listdir(directory) if name.endswith('.wav'))


def _load_fixed_length_waveform(filepath, max_length, sampling_rate):
    """Load one audio file as a mono tensor of shape ``(1, max_length)``.

    The signal is downmixed to mono if needed, resampled to ``sampling_rate``
    when the file's rate differs, then zero-padded on the right or truncated
    so that it is exactly ``max_length`` samples long.
    """
    waveform, sr = torchaudio.load(filepath)

    # squeeze() only yields a 1-D signal for single-channel audio, so average
    # the channels of multi-channel files first.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file's rate differs from the model's rate.
    if sr != sampling_rate:
        resampler = torchaudio.transforms.Resample(sr, sampling_rate)
        waveform = resampler(waveform)

    # Unify the length: pad short clips with zeros, cut long ones.
    num_samples = waveform.size(1)
    if num_samples < max_length:
        waveform = F.pad(waveform, (0, max_length - num_samples))
    else:
        waveform = waveform[:, :max_length]
    return waveform


def load_audio_data(feature_extractor, expanded_dir, kanata_dir, max_length, sampling_rate):
    """Build the list of labelled examples used for training and evaluation.

    Files are taken from the sorted list of ``.wav`` files of each directory:

    * ``expanded_dir``: files 0..39   -> ``"label": 0``
    * ``kanata_dir``:   files 40..79  -> ``"label": 1``

    The previous condition for ``kanata_dir`` (``40 >= idx and idx < 80``)
    reduced to ``idx <= 40`` and therefore selected files 0..40 instead of the
    intended 40..79 window; the progress counter was also never incremented.

    Args:
        feature_extractor: Callable invoked as
            ``feature_extractor(array, sampling_rate=..., return_attention_mask=True)``
            and expected to return a mapping with ``"input_values"`` and
            ``"attention_mask"`` sequences.
        expanded_dir: Directory with the audio files labelled 0.
        kanata_dir: Directory with the audio files labelled 1.
        max_length: Number of samples every waveform is padded/truncated to.
        sampling_rate: Sampling rate (Hz) the audio is resampled to.

    Returns:
        A list of dicts with the keys ``"label"``, ``"input_values"`` and
        ``"attention_mask"``.
    """
    # (directory, label, window over the sorted .wav files)
    sources = (
        (expanded_dir, 0, slice(0, 40)),
        (kanata_dir, 1, slice(40, 80)),
    )

    data = []
    for directory, label, window in sources:
        for file in _list_wav_files(directory)[window]:
            filepath = os.path.join(directory, file)
            waveform = _load_fixed_length_waveform(filepath, max_length, sampling_rate)

            # NOTE(review): the waveform is already padded to max_length here, so the
            # extractor cannot tell padding from signal; the attention mask presumably
            # covers the zero padding as well -- confirm this is intended.
            inputs = feature_extractor(
                waveform.squeeze(0).numpy(),
                sampling_rate=sampling_rate,
                return_attention_mask=True,
            )
            data.append({
                "label": label,
                "input_values": inputs["input_values"][0],
                "attention_mask": inputs["attention_mask"][0]
            })
            # Running count of processed files
            print(f"now : {len(data)}")

    return data

# データをシャッフルして学習用・テスト用に分割する

In [None]:

def prepare_dataset(feature_extractor, expanded_dir, kanata_dir, max_length, sampling_rate, train_split=0.8, seed=None):
    """Load the labelled audio examples and split them into train/test sets.

    Args:
        feature_extractor: Passed through to ``load_audio_data``.
        expanded_dir: Passed through to ``load_audio_data``.
        kanata_dir: Passed through to ``load_audio_data``.
        max_length: Passed through to ``load_audio_data``.
        sampling_rate: Passed through to ``load_audio_data``.
        train_split: Fraction of the shuffled examples placed in the train
            split; the remainder becomes the test split.
        seed: Optional seed for the shuffle. With a seed the split is
            reproducible; with ``None`` (the default) the module-level
            ``random`` state is used, as before.

    Returns:
        A ``DatasetDict`` with the keys ``"train"`` and ``"test"``, each a
        ``Dataset`` with the columns ``label``, ``input_values`` and
        ``attention_mask``.
    """
    import random

    examples = load_audio_data(feature_extractor, expanded_dir, kanata_dir, max_length, sampling_rate)

    # Use a private generator when a seed is given so the global random state is untouched.
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(examples)

    # Split point: the first `train_size` shuffled examples form the train set.
    train_size = int(len(examples) * train_split)
    splits = {
        "train": examples[:train_size],
        "test": examples[train_size:],
    }

    def to_columns(rows):
        """Turn a list of per-example dicts into a dict of per-column lists."""
        return {
            column: [row[column] for row in rows]
            for column in ("label", "input_values", "attention_mask")
        }

    return DatasetDict({
        name: Dataset.from_dict(to_columns(rows))
        for name, rows in splits.items()
    })

# データセットの作成

In [None]:

# Directories holding the two classes of audio
# (load_audio_data labels files from expanded_dir as 0 and files from kanata_dir as 1)
expanded_dir = "/content/drive/MyDrive/customBERT/expanded"
kanata_dir = "/content/drive/MyDrive/customBERT/kanata"

# Load the feature extractor that matches the pretrained checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained('rinna/japanese-hubert-base')

# Sampling rate expected by the extractor, and the fixed clip length in samples
sampling_rate = feature_extractor.sampling_rate
max_length = int(sampling_rate * 30)  # 30 seconds

# Build the train/test DatasetDict
print("Preparing dataset...")
dataset = prepare_dataset(feature_extractor, expanded_dir, kanata_dir, max_length, sampling_rate)
print("Done.")
# Show the resulting splits and their sizes
print(dataset)


preprocessor_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Preparing dataset...
Done.
DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 64
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 17
    })
})


In [None]:

# Load the feature extractor and the pretrained model with a 2-class classification head
feature_extractor = AutoFeatureExtractor.from_pretrained('rinna/japanese-hubert-base')

model = AutoModelForAudioClassification.from_pretrained(
    'rinna/japanese-hubert-base',
    num_labels=2,  # number of labels (two classes; class ids 0 and 1 map to "label_1" / "label_2")
    id2label={0: "label_1", 1: "label_2"},
    label2id={"label_1": 0, "label_2": 1}
)

from sklearn.metrics import accuracy_score
import numpy as np

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Return the accuracy of a prediction batch as ``{"accuracy": value}``.

    ``pred.predictions`` holds one row of class scores per example; the
    predicted class is the index of the highest score in each row, which is
    compared against ``pred.label_ids``.
    """
    predicted_ids = np.argmax(pred.predictions, axis=1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}


# Training configuration.
# NOTE(review): warmup_steps=500 is larger than the total number of optimizer steps
# reported by this run (global_step=160 in the logged TrainOutput), so the warm-up
# presumably never completes and the learning rate never reaches learning_rate -- confirm
# whether a smaller warm-up (or warmup_ratio) was intended.
# NOTE(review): report_to="wandb" assumes Weights & Biases is installed and logged in.
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints are written
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    learning_rate=1e-3,
    per_device_train_batch_size=4,  # batch size
    per_device_eval_batch_size=4,
    num_train_epochs=10,  # number of epochs
    warmup_steps=500,  # warm-up steps
    weight_decay=0.01,
    logging_dir="./logs",  # where logs are written
    logging_steps=10,
    save_total_limit=2,  # maximum number of checkpoints kept
    report_to="wandb",  # log to W&B
    run_name="audio-classification"
)

# Collator that batches examples using the feature extractor
data_collator = DataCollatorWithPadding(tokenizer=feature_extractor)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],  # training split
    eval_dataset=dataset["test"],   # test split (used for per-epoch evaluation)
    tokenizer=feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # custom metric function (accuracy)
)

# Start training
trainer.train()

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at rinna/japanese-hubert-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6861,0.626983,0.823529
2,0.5994,0.41653,1.0
3,0.4568,0.192961,1.0
4,0.1422,0.042072,1.0
5,0.0124,0.308655,0.941176
6,0.005,0.001936,1.0
7,0.0016,0.000978,1.0
8,0.0012,0.00065,1.0
9,0.0007,0.00048,1.0
10,0.0005,0.000367,1.0


TrainOutput(global_step=160, training_loss=0.18414021044154652, metrics={'train_runtime': 428.3773, 'train_samples_per_second': 1.494, 'train_steps_per_second': 0.374, 'total_flos': 1.74309746688e+17, 'train_loss': 0.18414021044154652, 'epoch': 10.0})

In [None]:
# 推論に必要なライブラリ
import os
import torchaudio
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoFeatureExtractor


In [None]:
# Pick the device: GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device
model = model.to(device)

# Audio files to run inference on.
# NOTE(review): these come from the same two directories the training data was read from,
# so some of them may have been seen during training -- verify before treating the
# predictions as a held-out evaluation.
paths = [
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1408_1.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1408.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1407.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1396_1.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1389_1.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1386.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1356_2.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1323.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1274.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1214.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1133.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_1001.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_0932.wav.wav",
    "/content/drive/MyDrive/customBERT/kanata/BASIC5000_0844.wav.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1408_1.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1406.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1399.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1387.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1345.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1315.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1275.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1231_1.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1172.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1124.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1098_1.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1071.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_1009.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_0979_1.wav",
    "/content/drive/MyDrive/customBERT/expanded/BASIC5000_0937.wav"



    # add further audio file paths here
]

# Predictions collected so far: one {"audio_path", "predicted_class"} dict per processed file
results = []

# Switch to evaluation mode so training-only behaviour (e.g. dropout) is disabled
# and repeated runs on the same file give the same prediction.
model.eval()

# Loop-invariant preprocessing settings: the extractor's sampling rate and a 30 second clip length
target_sr = feature_extractor.sampling_rate
max_length = int(target_sr * 30)  # 30 seconds

# Run inference on every audio file
for audio_path in paths:
    try:
        # Load the audio
        waveform, sr = torchaudio.load(audio_path)

        # Downmix multi-channel audio so the extractor receives a single 1-D signal
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample when the file's rate differs from the model's rate
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            waveform = resampler(waveform)

        # Zero-pad or truncate to the fixed length used for training
        if waveform.size(1) < max_length:
            waveform = F.pad(waveform, (0, max_length - waveform.size(1)))
        else:
            waveform = waveform[:, :max_length]

        # Feature extraction; return_tensors="pt" yields batched torch tensors directly
        inputs = feature_extractor(
            waveform.squeeze(0).numpy(),
            sampling_rate=target_sr,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # Move the model inputs to the model's device
        input_values = inputs["input_values"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = model(input_values=input_values, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit
        logits = outputs["logits"]
        predicted_class = torch.argmax(logits, dim=1).item()

        # Store the result
        results.append({
            "audio_path": audio_path,
            "predicted_class": predicted_class
        })

        print(f"Processed: {audio_path}, Predicted class: {predicted_class}")

    except Exception as e:
        # Best effort: report the failing file and continue with the remaining ones
        print(f"Error processing {audio_path}: {e}")

# Print a summary of all predictions
print("\nPrediction Results:")
for result in results:
    print(f"Audio: {result['audio_path']}, Predicted Class: {result['predicted_class']}")


Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1408_1.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1408.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1407.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1396_1.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1389_1.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1386.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1356_2.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1323.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1274.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBERT/kanata/BASIC5000_1214.wav.wav, Predicted class: 1
Processed: /content/drive/MyDrive/customBE