Слегка модифицированный [ноутбук](https://github.com/salute-developers/golos/blob/master/dusha/demo/recognise_emotions_online.ipynb)

In [1]:
# # установка недостающих библиотек
# 
# !pip install lazycon
# !pip install torch==1.8.1
# !pip install onnx
# !pip install -U librosa
# !pip install torchaudio==0.8.1
# !pip install scikit-learn==0.24.0
# !pip install coremltools 
# !pip install sounddevice
# !pip install -U numba

In [2]:
import torch
import onnx
import toml
import librosa
import lazycon
import time
import os, sys

# import coremltools as ct
import numpy as np
import soundfile as sf
import sounddevice as sd

import pandas as pd

from glob import glob
from pathlib import Path
from tqdm.notebook import tqdm

__import__('warnings').filterwarnings("ignore")

In [3]:
sys.path.append('./experiments')
import core

In [4]:
def create_features(
    data: np.array,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.array:
    """
    As an input all models use standard speech features:
    64 Mel-filterbank calculated from 20ms windows with a 10ms overlap.
    """

    hop_length = int(sample_rate * hop_length_coef)
    win_length = int(sample_rate * win_length_coef)
    if len(data) != 0:
        spec = librosa.feature.melspectrogram(
                    y=data,
                    sr=sample_rate,
                    hop_length=hop_length,
                    n_fft=win_length,
                    n_mels=n_mels,
                )
    else:
        raise AttributeError
    mel_spec = librosa.power_to_db(spec, ref=np.max)

    return mel_spec

In [5]:
def create_features_for_audio(
    wav_name: str,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.array:
    """
    As an input all models use standard speech features:
    64 Mel-filterbank calculated from 20ms windows with a 10ms overlap.
    """

    hop_length = int(sample_rate * hop_length_coef)
    win_length = int(sample_rate * win_length_coef)
    data, rate = librosa.load(wav_name, sr=sample_rate)
#     print(data)
    if len(data) != 0:
        spec = librosa.feature.melspectrogram(
                    y=data,
                    sr=rate,
                    hop_length=hop_length,
                    n_fft=win_length,
                    n_mels=n_mels,
                )
    else:
        raise AttributeError
    mel_spec = librosa.power_to_db(spec, ref=np.max)
    return mel_spec

In [46]:
def index2name(
    index: int
) -> str:
    class_dict = {0:"angry", 1:"sad", 2:"neutral", 3:"positive"}
    
    if index > len(class_dict) or index < 0:
        raise AttributeError
    
    return class_dict[index]

name2index = {emotion: num for num, emotion in {0:"angry", 1:"sad", 2:"neutral", 3:"positive"}.items()}

In [7]:
dir_path = './model/'
model_name = 'podcasts_finetune_old_w_lr_1e-3_try1'
device = 'cpu'

In [8]:
config_path = os.path.join(dir_path, "train.config")
assert os.path.exists(config_path), f"No train.config in {dir_path}"

model_path = os.path.join(dir_path, model_name)
# check the model
if not os.path.exists(model_path):
    print(f"There is no saved model {model_path}. Nothing to inference")
#     return None

# load the model
cfg = lazycon.load(config_path)
model = cfg.model
    
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model = model.double()
model.eval()

ConvSelfAttentionMobileNet(
  (features): Sequential(
    (0): ConvBNActivation(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=4, bias=False)
          (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(32, eps=1e-0

In [9]:
sd.query_devices()

   0 Переназначение звуковых устр. - Input, MME (2 in, 0 out)
>  1 Микрофон (Realtek High Definiti, MME (2 in, 0 out)
   2 Переназначение звуковых устр. - Output, MME (0 in, 2 out)
<  3 Динамики (Realtek High Definiti, MME (0 in, 2 out)
   4 Realtek Digital Output (Realtek, MME (0 in, 2 out)
   5 Realtek Digital Output(Optical), MME (0 in, 2 out)
   6 AMD  DP  Output (AMD High Defin, MME (0 in, 2 out)
   7 Первичный драйвер записи звука, Windows DirectSound (2 in, 0 out)
   8 Микрофон (Realtek High Definition Audio), Windows DirectSound (2 in, 0 out)
   9 Первичный звуковой драйвер, Windows DirectSound (0 in, 2 out)
  10 Динамики (Realtek High Definition Audio), Windows DirectSound (0 in, 2 out)
  11 Realtek Digital Output (Realtek High Definition Audio), Windows DirectSound (0 in, 2 out)
  12 Realtek Digital Output(Optical) (Realtek High Definition Audio), Windows DirectSound (0 in, 2 out)
  13 AMD  DP  Output (AMD High Definition Audio Device), Windows DirectSound (0 in, 2 out)
  14 

In [10]:
# # запись с микрофона
# fs=16000 
# duration = 5 # seconds 
# myrecording = sd.rec(duration * fs, samplerate=fs, channels=1, dtype='float64') 
# print ("Recording Audio") 
# sd.wait() 
# print ("Audio recording complete , Play Audio") 
# sd.play(myrecording, fs) 

In [11]:
# feat = create_features(np.transpose(myrecording)[0])

In [12]:
wavs_path = r'D:\python-datasets\dusha\podcast_train\wavs'
feat = create_features_for_audio(os.path.join(wavs_path, "009bff3c48271cd56f085ad4217cbcb2.wav"))

In [13]:
feat = feat.astype(np.float64)

In [14]:
print(f"Calculating predicts")
inputs = torch.from_numpy(feat).to(device).unsqueeze(0).unsqueeze(0)

with torch.no_grad():
    probs = model(inputs)

Calculating predicts


In [15]:
pred_class = np.argmax(probs.cpu().numpy(), axis=1)
index2name(pred_class[0])

'neutral'

In [16]:
probs

tensor([[-6.5000, -1.0716,  1.3197, -1.9417]], dtype=torch.float64)

In [17]:
DATASET_PATH = r'D:\python-datasets\dusha'
DATASETS = ['podcast_train', 'podcast_test', 'crowd_test']

crowd_test = os.path.join(DATASET_PATH, DATASETS[2])
crowd_info = os.path.join(crowd_test,'raw_crowd_test.tsv')

df = pd.read_csv(crowd_info, sep='\t')
print('Всего строк:', len(df))

Всего строк: 79088


In [18]:
# check_column = 'annotator_emo'
check_column = 'speaker_emo'

if check_column != 'speaker_emo':
    df["speaker_emo"].fillna(df["annotator_emo"], inplace=True)
else:
    df.dropna(subset=['speaker_emo'], inplace=True)

df.drop_duplicates(subset=['audio_path', check_column], inplace=True)
print('Без дубликатов:', len(df))
tqdm.pandas()
# # Проверка и фильтрация датафрейма
# df = df[df['audio_path'].progress_apply(lambda x: os.path.isfile(os.path.join(crowd_test, x)))]
# print('Итого осталось файлов:', len(df))

Без дубликатов: 17217


In [19]:
%%time
tmp = df.iloc[:]
preds = []
predicted_probs = []
for index, row in tqdm(tmp.iterrows(), total=len(tmp)):
    file_path = os.path.join(crowd_test, row['audio_path'])
    feat = create_features_for_audio(file_path)
    feat = feat.astype(np.float64)
    inputs = torch.from_numpy(feat).to(device).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        probs = model(inputs)
    pred_class = np.argmax(probs.cpu().numpy(), axis=1)
    pred = index2name(pred_class[0])
    preds.append(pred)
#     print('pred:', pred, 'true:', row['annotator_emo'])

    # Преобразование меток классов в вероятности (softmax)
    softmax = torch.nn.Softmax(dim=1)
    probabilities = softmax(probs)
    predicted_probs.append(probabilities.detach().cpu().numpy())
    
tmp['pred'] = preds
tmp['probs'] = predicted_probs

tmp.to_csv('preds.csv', index=False)

  0%|          | 0/17217 [00:00<?, ?it/s]

Wall time: 1h 6min 28s


In [20]:
tmp.head(7)

Unnamed: 0,hash_id,audio_path,duration,annotator_emo,golden_emo,annotator_id,speaker_text,speaker_emo,source_id,pred,probs
0,9e9961c53ca6eeb440b217e539fbf46c,wavs/9e9961c53ca6eeb440b217e539fbf46c.wav,5.82,neutral,,858305a5450b7bd1288ba0053b1cd1c1,я слушаю,neutral,4282ddc30d71ef420e202e0c60391e9f,sad,"[[0.0007966209990913893, 0.8409130616683064, 0..."
1,0166f65a30354db8282682b1a280e64c,wavs/0166f65a30354db8282682b1a280e64c.wav,3.7,sad,,858305a5450b7bd1288ba0053b1cd1c1,каким стал сбер,neutral,d70dc98ed56e9362eaefefb7b2827c8f,neutral,"[[0.002126912173107892, 0.2119796569617634, 0...."
2,d49a6b560155831725a7bdc7d0a96099,wavs/d49a6b560155831725a7bdc7d0a96099.wav,4.38,neutral,,858305a5450b7bd1288ba0053b1cd1c1,где родился шерлок холмс,neutral,0ee35d2abecf4272ecc8e1539b0839d8,sad,"[[0.001504161939584158, 0.8243252080716712, 0...."
3,c6852b0925797612d7b6724da8cbe7b4,wavs/c6852b0925797612d7b6724da8cbe7b4.wav,8.58,neutral,,858305a5450b7bd1288ba0053b1cd1c1,открой в браузере ennio morricone,neutral,0855e363c1787df1592f58f7a27ebe13,neutral,"[[0.02129252478404198, 0.2611207498575717, 0.5..."
12,64a7aa17132c3e4b7be1aaed5fc88090,wavs/64a7aa17132c3e4b7be1aaed5fc88090.wav,5.06,positive,,32bd471407fe168dacd5f8252f9949b7,а там и ева проснулись с утра,positive,82f97728c4767b283d249afc96d23caf,neutral,"[[0.13181136546781647, 0.12499334244340565, 0...."
13,33152583dcbf1fe40d142954a2a7ca23,wavs/33152583dcbf1fe40d142954a2a7ca23.wav,7.6,neutral,,32bd471407fe168dacd5f8252f9949b7,я хочу чтобы ты меня поздравила с днем моим ро...,positive,7d4e40268690a9f210bb0816bff36317,sad,"[[0.002924788076908251, 0.7515794522042758, 0...."
18,0a1588516e461a6392eec8ae52cfddc6,wavs/0a1588516e461a6392eec8ae52cfddc6.wav,5.247312,sad,,858305a5450b7bd1288ba0053b1cd1c1,позови мне кого нибудь другого лезть,angry,aa38ee6e523e1d206e0d4decfacbe661,sad,"[[0.006748335265136725, 0.5567768192485748, 0...."


In [37]:
prs = tmp
tmp.columns

Index(['hash_id', 'audio_path', 'duration', 'annotator_emo', 'golden_emo',
       'annotator_id', 'speaker_text', 'speaker_emo', 'source_id', 'pred',
       'probs'],
      dtype='object')

In [47]:
from sklearn.metrics import f1_score as F1, classification_report, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

tst = prs.copy()

remap = {'other': 'neutral'}

for col in ('annotator_emo', 'speaker_emo', 'pred'):
    prs[col] = prs[col].map(lambda x: remap.get(x, x))

remap = {'sad': 'neutral', 'other': 'neutral'}

for col in ('annotator_emo', 'speaker_emo', 'pred'):
    tst[col] = tst[col].map(lambda x: remap.get(x, x))

emos = {'4 emo': prs, '3 emo': tst}

for emo, df in emos.items():
    for name_col in ('annotator_emo', 'speaker_emo'):
        # Пример истинных меток и предсказанных вероятностей для многоклассовой классификации
        # Истинные метки классов
        y_true = df[name_col].map(name2index)  
        # Предсказанные вероятности для каждого класса
        y_scores = np.concatenate(df['probs'].values)  
        # Преобразуем истинные метки классов в бинарный формат
        lb = LabelBinarizer()
        y_true_bin = lb.fit_transform(y_true)
        n_emos = len(lb.classes_)
        roc_auc = roc_auc_score(y_true_bin, y_scores[:, :n_emos], average="micro") 
        
        print(f'f1_score ({emo}) по колонке {name_col:<13} =',
              F1(df[name_col], df['pred'], average='weighted', zero_division=0).round(4),
             'roc_auc_score = ', roc_auc.round(4))

f1_score (4 emo) по колонке annotator_emo = 0.5524 roc_auc_score =  0.8233
f1_score (4 emo) по колонке speaker_emo   = 0.4493 roc_auc_score =  0.7638
f1_score (3 emo) по колонке annotator_emo = 0.8006 roc_auc_score =  0.7277
f1_score (3 emo) по колонке speaker_emo   = 0.7032 roc_auc_score =  0.7116


In [39]:
for emo, df in emos.items():
    for name_col in ('annotator_emo', 'speaker_emo'):
        print(f'{emo} по колонке {name_col:<13}:')
        print(classification_report(df[name_col], df['pred'], zero_division=0))

4 emo по колонке annotator_emo:
              precision    recall  f1-score   support

       angry       0.82      0.22      0.35      1911
     neutral       0.72      0.55      0.62     10495
    positive       0.57      0.41      0.48      1730
         sad       0.34      0.81      0.48      3081

    accuracy                           0.55     17217
   macro avg       0.61      0.50      0.48     17217
weighted avg       0.65      0.55      0.55     17217

4 emo по колонке speaker_emo  :
              precision    recall  f1-score   support

       angry       0.86      0.15      0.26      2853
     neutral       0.48      0.52      0.50      7462
    positive       0.62      0.34      0.44      2279
         sad       0.40      0.63      0.49      4623

    accuracy                           0.47     17217
   macro avg       0.59      0.41      0.42     17217
weighted avg       0.54      0.47      0.45     17217

3 emo по колонке annotator_emo:
              precision    recall 

In [40]:
tmp.speaker_emo.value_counts()

neutral     7462
sad         4623
angry       2853
positive    2279
Name: speaker_emo, dtype: int64

In [41]:
tmp.speaker_emo.unique()

array(['neutral', 'positive', 'angry', 'sad'], dtype=object)

In [42]:
tmp.annotator_emo.value_counts()

neutral     10495
sad          3081
angry        1911
positive     1730
Name: annotator_emo, dtype: int64

In [43]:
tmp.annotator_emo.unique()

array(['neutral', 'sad', 'positive', 'angry'], dtype=object)