<a href="https://colab.research.google.com/github/kth0522/AI_news/blob/main/voice_emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**드라이브 마운트**

In [1]:
# Mount Google Drive at /content/drive so the paths used by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Base folder on the mounted Drive; the model-weight, input and output paths below are built from it.
ROOT_DIR = "/content/drive/MyDrive/인공지능융합원/"

**라이브러리 설치**

In [3]:
!pip install transformers
!pip install datasets
!pip install pathlib
!pip install sklearn



**코드**

In [4]:
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

from transformers import AutoConfig, Wav2Vec2Processor

from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

from transformers import (
    Trainer,
    is_apex_available,
)

from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor

from typing import Any, Dict, Union
from transformers import AutoConfig, Wav2Vec2Processor
import torch
from packaging import version
from torch import nn

import librosa
from sklearn.metrics import classification_report

In [5]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Container for the values returned by the speech classifier's forward pass.

    Every field defaults to ``None``.
    """

    # Loss value; left as None when no loss was computed.
    loss: Optional[torch.FloatTensor] = None
    # Raw class scores produced by the classification head.
    logits: Optional[torch.FloatTensor] = None
    # Hidden states forwarded from the underlying encoder output, if any.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Attention weights forwarded from the underlying encoder output, if any.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [6]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head placed on top of pooled wav2vec features.

    The input goes through dropout, a hidden-size linear layer followed by
    tanh, dropout again, and a final linear projection to
    ``config.num_labels`` outputs.
    """

    def __init__(self, config):
        super().__init__()
        hidden_dim = config.hidden_size
        # Attribute names and creation order are kept as they are: they define
        # the state-dict keys and the order in which parameters are initialised.
        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(hidden_dim, config.num_labels)

    def forward(self, features, **kwargs):
        """Return the raw class scores for ``features``; extra keyword arguments are ignored."""
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder followed by pooling over dim 1 and a classification head.

    The encoder's first output is reduced along dim 1 with the strategy named
    by ``config.pooling_mode`` ('mean', 'sum' or 'max') and the pooled vector
    is passed to ``Wav2Vec2ClassificationHead``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Reduce ``hidden_states`` along dim 1.

        Args:
            hidden_states: Tensor to pool.
                NOTE(review): presumably (batch, time, hidden) - confirm.
            mode: One of 'mean', 'sum' or 'max'.

        Returns:
            The tensor with dim 1 reduced away.

        Raises:
            ValueError: If ``mode`` is not one of the supported names.
        """
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max along a dim returns (values, indices); keep the values.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            # ValueError is a subclass of Exception, so callers that caught the
            # previous bare Exception keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode the input, pool it, classify it and optionally compute a loss.

        Args:
            input_values: Input passed to the Wav2Vec2 encoder.
            attention_mask: Optional mask forwarded to the encoder. It is not
                used by the pooling step.
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.
            return_dict: Whether to return a ``SpeechClassifierOutput``; falls
                back to ``config.use_return_dict`` when None.
            labels: Optional targets. When given, a loss is computed according
                to ``config.problem_type``, which is inferred (and stored on
                the config) on first use if it is None.

        Returns:
            A ``SpeechClassifierOutput`` when ``return_dict`` is true,
            otherwise a tuple ``(loss?, logits, *outputs[2:])`` where the loss
            is present only if it was computed.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                # Bring labels to the same (N, num_labels) shape as the logits.
                # Without this, labels of shape (N,) against logits of shape
                # (N, 1) broadcast to (N, N) and produce a wrong loss.
                loss = loss_fct(
                    logits.view(-1, self.num_labels),
                    labels.reshape(-1, self.num_labels),
                )
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # NOTE(review): outputs[2:] is presumably the encoder's optional
            # hidden states / attentions - confirm against the encoder's tuple layout.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [7]:
def load_wav_files(folder_path):
    """Return the paths of the WAV files located directly inside ``folder_path``.

    The extension is matched case-insensitively (``.wav``, ``.WAV``, ...),
    entries that are not regular files are skipped, and the result is sorted
    by file name so the order does not depend on the file system.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``folder_path``-joined file paths.
    """
    wav_files = []
    for entry in sorted(os.listdir(folder_path)):
        candidate = os.path.join(folder_path, entry)
        # A directory named "something.wav" cannot be loaded as audio, so only files qualify.
        if entry.lower().endswith('.wav') and os.path.isfile(candidate):
            wav_files.append(candidate)
    return wav_files

def process_and_predict(wav_file, processor, model, device):
    """Predict the class id for a single audio file.

    The file is loaded with torchaudio, averaged down to one channel when it
    has several, resampled to the processor's sampling rate, and passed
    through ``model`` without gradient tracking.

    Args:
        wav_file: Path of the audio file to classify.
        processor: Processor used to turn the waveform into model inputs; its
            ``feature_extractor.sampling_rate`` is the resampling target.
        model: Model whose output exposes ``.logits``.
        device: Device the input tensors are moved to.

    Returns:
        The index of the highest logit as a Python int.
    """
    speech_array, sampling_rate = torchaudio.load(wav_file)
    if speech_array.shape[0] > 1:  # more than one channel (e.g. stereo): average to mono
        speech_array = torch.mean(speech_array, dim=0, keepdim=True)
    speech_array = speech_array.squeeze().numpy()

    target_rate = processor.feature_extractor.sampling_rate
    resampled_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate,
                                       target_sr=target_rate)
    features = processor(resampled_array, sampling_rate=target_rate,
                         return_tensors="pt", padding=True)
    input_values = features.input_values.to(device)

    # Some feature extractors are configured not to return an attention mask;
    # attribute access would raise in that case, so fall back to no mask.
    attention_mask = features.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
    pred_id = torch.argmax(logits, dim=-1).item()
    return pred_id

def file_number(file_name):
    """Sort key that orders files by the integer encoded in their base name.

    The directory part and the final extension are stripped; when what is
    left parses as an integer that integer is returned, otherwise positive
    infinity is returned so such files sort after all numbered ones.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_name))
    try:
        sort_key = int(stem)
    except ValueError:
        sort_key = float('inf')
    return sort_key

def classify_emotions_in_subfolders(input_dir, output_dir, model, processor, device, config):
    """Classify every WAV file in each sub-folder of ``input_dir`` and write one CSV per sub-folder.

    For each directory directly under ``input_dir`` the WAV files are ordered
    with ``file_number``, each is classified with ``process_and_predict``, and
    the results are written to ``<output_dir>/<folder_name>_emotions.csv`` with
    the columns ``wav_file`` (file name without extension) and
    ``predicted_emotion``.

    Args:
        input_dir: Directory whose sub-folders contain the audio files.
        output_dir: Directory for the CSV files; created if it does not exist.
        model: Model passed through to ``process_and_predict``.
        processor: Processor passed through to ``process_and_predict``.
        device: Device passed through to ``process_and_predict``.
        config: Object whose ``id2label`` maps a predicted id to its label.
    """
    # exist_ok avoids the check-then-create race of testing for the directory first.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that processing (and log) order does not depend on the file system.
    for folder_name in sorted(os.listdir(input_dir)):
        folder_path = os.path.join(input_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        wav_files = load_wav_files(folder_path)
        wav_files.sort(key=file_number)
        predictions = []
        for wav_file in wav_files:
            pred_id = process_and_predict(wav_file, processor, model, device)
            pred_label = config.id2label[pred_id]
            file_name_without_extension = os.path.splitext(os.path.basename(wav_file))[0]
            predictions.append((file_name_without_extension, pred_label))

        df = pd.DataFrame(predictions, columns=['wav_file', 'predicted_emotion'])
        output_file = os.path.join(output_dir, f"{folder_name}_emotions.csv")
        df.to_csv(output_file, index=False)
        print(f"Results for folder '{folder_name}' saved to {output_file}")

**실행**

**emotion labels** \
AI Hub 데이터셋\
['Angry', 'Disgust', 'Fear', 'Happiness', 'Neutral', 'Sadness', 'Surprise']

In [8]:
# Emotion class names; a label's position in this list is the id used when the
# label2id / id2label mappings are built for the model config.
label_list = ['Angry', 'Disgust', 'Fear', 'Happiness', 'Neutral', 'Sadness', 'Surprise']
num_labels = len(label_list)
print(label_list)
print(num_labels)

['Angry', 'Disgust', 'Fear', 'Happiness', 'Neutral', 'Sadness', 'Surprise']
7


In [9]:
# Set model_path to the voice_emotion_model_weight folder uploaded to Google Drive.
model_path = ROOT_DIR +'voice_emotion_model_weight/'
# Pooling strategy stored on the config and read by the classification model.
pooling_mode = 'mean'
# Checkpoint name the Wav2Vec2Processor is loaded from.
config_name = "lighteternal/wav2vec2-large-xlsr-53-greek"

In [10]:
# Load the configuration stored next to the fine-tuned weights and attach the
# emotion label mappings (list position <-> label name) to it.
config = AutoConfig.from_pretrained(
    model_path,
    num_labels=num_labels,
    label2id={name: idx for idx, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# The classification model reads config.pooling_mode in its constructor.
config.pooling_mode = pooling_mode

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The processor is loaded from `config_name`; the model weights come from `model_path`.
processor = Wav2Vec2Processor.from_pretrained(config_name)
sampling_rate = processor.feature_extractor.sampling_rate
# NOTE(review): the output recorded for this cell reports that the checkpoint's
# `wav2vec2.encoder.pos_conv_embed.conv.weight_v` / `weight_g` were not used and that
# `wav2vec2.encoder.pos_conv_em...` weights were newly initialized. If that still happens,
# those weights are not the fine-tuned ones and predictions may be affected - presumably a
# library-version mismatch with the environment that saved the checkpoint; confirm.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_path, config=config,
                                                            ignore_mismatched_sizes=False).to(device)

Some weights of the model checkpoint at /content/drive/MyDrive/인공지능융합원/voice_emotion_model_weight/ were not used when initializing Wav2Vec2ForSpeechClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at /content/drive/MyDrive/인공지능융합원/voice_emotion_model_weight/ and are newly initialized: ['wav2vec2.encoder.pos_conv_em

In [13]:
# Folder whose sub-folders hold the audio segments to classify.
INPUT_DIR = ROOT_DIR + 'voice_segments'
# Folder that receives one "<sub-folder>_emotions.csv" file per input sub-folder.
OUTPUT_DIR = ROOT_DIR + 'voice_emotions'

In [14]:
# Classify every sub-folder of INPUT_DIR and write the per-folder CSV files to OUTPUT_DIR.
classify_emotions_in_subfolders(INPUT_DIR, OUTPUT_DIR, model, processor, device, config)

Results for folder 'mbn_ai_7' saved to /content/drive/MyDrive/인공지능융합원/voice_emotions/mbn_ai_7_emotions.csv
Results for folder 'mbn_ai_6' saved to /content/drive/MyDrive/인공지능융합원/voice_emotions/mbn_ai_6_emotions.csv
