# Классификация звуковых файлов

В данной задаче нужно по звуковым файлам в формате wav предсказать пол человека, речь которого записана на каждом из файлов (0 – мужчина, 1 – женщина).

Для того, чтобы получить OK по этой задаче, нужно получить точность более 98 процентов на тестовом наборе данных.

In [None]:
!pip install librosa -q 
!pip install tensorflow -q

In [None]:
import numpy as np
import pandas as pd
import os
import random
import soundfile as sf
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join
from scipy.io import wavfile as wav
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten, MaxPooling2D, Conv2D, Input, BatchNormalization
from sklearn import metrics
from tensorflow.keras import models, layers
#from keras.layers import Activation, Dense, Dropout, Input
#from keras.models import Sequential

from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam, schedules
from keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

import librosa
from librosa import get_samplerate

## Подготовка данных

 Загрузим таблицу с названиями файлов и их лейблами:

In [None]:
# Directories holding the training and test wav files (absolute, machine-specific paths).
audio_dataset_path='D:/Dropbox/Projects/Leo/train/'
test_audio_dataset_path ='D:/Dropbox/Projects/Leo/test/'
# NOTE(review): the .tsv file is read with read_csv defaults (comma separator, first
# line used as header). The displayed result therefore has a single column whose name
# is the first "<file id>\t<label>" record, i.e. that record is not part of the rows —
# confirm this is intended.
metadata=pd.read_csv('D:/Dropbox/Projects/Leo/targets.tsv')        
metadata.head()

Unnamed: 0,5d1f7e43366513a1d0a6ec5640c3dc24\t1
0,9a701a4536a05b6610a590a9fe702ed8\t1
1,cad0b8547008d1524c1a0e5fd51f9908\t1
2,4bbe607e7dc95460e2cc1a6ee5f4dfa6\t0
3,30fb32cba90b34af26f3f14f5d636805\t0
4,fa33445afe71a6dc18e4881c053da5be\t0


In [None]:
# Print the column names, non-null counts and dtypes of the raw table.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13935 entries, 0 to 13934
Data columns (total 1 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   5d1f7e43366513a1d0a6ec5640c3dc24	1  13935 non-null  object
dtypes: object(1)
memory usage: 109.0+ KB


In [None]:
# The raw table holds one "<file id>\t<label>" string per row in a single column
# (named after the first record of the file). Split on the last tab instead of
# slicing by character position, and store the label as an integer so that numeric
# consumers such as np.bincount can use it directly.
raw_column = '5d1f7e43366513a1d0a6ec5640c3dc24\t1'
parts = metadata[raw_column].str.rsplit('\t', n=1, expand=True)
metadata['target'] = parts[1].astype(int)
metadata['name'] = parts[0]
metadata.head()

Unnamed: 0,5d1f7e43366513a1d0a6ec5640c3dc24\t1,target,name
0,9a701a4536a05b6610a590a9fe702ed8\t1,1,9a701a4536a05b6610a590a9fe702ed8
1,cad0b8547008d1524c1a0e5fd51f9908\t1,1,cad0b8547008d1524c1a0e5fd51f9908
2,4bbe607e7dc95460e2cc1a6ee5f4dfa6\t0,0,4bbe607e7dc95460e2cc1a6ee5f4dfa6
3,30fb32cba90b34af26f3f14f5d636805\t0,0,30fb32cba90b34af26f3f14f5d636805
4,fa33445afe71a6dc18e4881c053da5be\t0,0,fa33445afe71a6dc18e4881c053da5be


Создадим функции для извлечения признаков из файлов, а именно мел-спектрограмм:

In [None]:
def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300, top_db=80):
    """Load an audio file and return the mel spectrogram of a ~5 second clip.

    The file is read with ``librosa.load``. Signals shorter than five seconds
    are reflect-padded by the same amount on both sides; longer signals are
    cut to their first five seconds.

    Note: despite the ``_db`` suffix, no decibel conversion is applied here,
    and ``top_db`` is accepted but not used.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate forwarded to ``librosa.load``.
        n_fft, hop_length, n_mels, fmin, fmax: Forwarded unchanged to
            ``librosa.feature.melspectrogram``.
        top_db: Unused.

    Returns:
        The array produced by ``librosa.feature.melspectrogram``.
    """
    samples, sr = librosa.load(file_path, sr=sr)
    clip_len = 5 * sr
    n_samples = samples.shape[0]

    if n_samples >= clip_len:
        samples = samples[:clip_len]
    else:
        # Half of the missing samples (rounded up) is mirrored onto each end.
        half_gap = int(np.ceil((clip_len - n_samples) / 2))
        samples = np.pad(samples, half_gap, mode='reflect')

    return librosa.feature.melspectrogram(
        y=samples,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )

In [None]:
def get_melspectrogram_db_aug(file, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300, top_db=80):
    """Return the mel spectrogram of a ~5 second clip of an in-memory signal.

    Counterpart of the file-based extractor for signals that are already
    loaded (e.g. augmented copies). Signals shorter than five seconds are
    reflect-padded by the same amount on both sides; longer signals are cut
    to their first five seconds.

    Note: despite the ``_db`` suffix, no decibel conversion is applied here,
    and ``top_db`` is accepted but not used.

    Args:
        file: 1-D array of audio samples (not a path, despite the name).
        sr: Sampling rate of ``file``. Required: an in-memory signal carries
            no sampling rate of its own.
        n_fft, hop_length, n_mels, fmin, fmax: Forwarded unchanged to
            ``librosa.feature.melspectrogram``.
        top_db: Unused.

    Returns:
        The array produced by ``librosa.feature.melspectrogram``.

    Raises:
        TypeError: If ``sr`` is left as ``None``.
    """
    if sr is None:
        # Previously this surfaced as an opaque "int * NoneType" TypeError below.
        raise TypeError(
            "get_melspectrogram_db_aug() needs the sampling rate: pass sr=<rate of the signal>"
        )

    samples = file
    clip_len = 5 * sr
    if samples.shape[0] < clip_len:
        # Half of the missing samples (rounded up) is mirrored onto each end.
        half_gap = int(np.ceil((clip_len - samples.shape[0]) / 2))
        samples = np.pad(samples, half_gap, mode='reflect')
    else:
        samples = samples[:clip_len]

    return librosa.feature.melspectrogram(
        y=samples, sr=sr, n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels, fmin=fmin, fmax=fmax,
    )

Добавим функции аугментации звуковых файлов (добавление шума, изменение скорости файла, громкости):

In [None]:
def add_white_noise(signal, noise_percentage_factor):
    """Return ``signal`` with scaled Gaussian noise added.

    The noise is drawn from a zero-mean normal distribution whose standard
    deviation equals the standard deviation of ``signal``, one value per
    element of ``signal``, and is multiplied by ``noise_percentage_factor``
    before being added.
    """
    gaussian = np.random.normal(loc=0, scale=signal.std(), size=signal.size)
    return signal + noise_percentage_factor * gaussian


def time_stretch(signal, time_stretch_rate):
    """Time-stretch ``signal`` with librosa.

    Thin wrapper around ``librosa.effects.time_stretch``; ``time_stretch_rate``
    is passed as its ``rate`` argument (per librosa's documentation, a rate
    above 1 speeds the signal up and a rate below 1 slows it down).

    https://librosa.org/doc/main/generated/librosa.effects.time_stretch.html
    """
    stretched = librosa.effects.time_stretch(signal, rate=time_stretch_rate)
    return stretched


def pitch_scale(signal, sr, num_semitones):
    """Shift the pitch of ``signal`` by ``num_semitones`` with librosa.

    Thin wrapper around ``librosa.effects.pitch_shift``.

    https://librosa.org/doc/main/generated/librosa.effects.pitch_shift.html

    Args:
        signal: Audio samples.
        sr: Sampling rate of ``signal``.
        num_semitones: Number of semitones to shift by (librosa's ``n_steps``).

    Returns:
        The pitch-shifted signal returned by librosa.
    """
    # sr and n_steps are keyword-only in current librosa releases; passing them
    # by name works with both old and new signatures.
    return librosa.effects.pitch_shift(signal, sr=sr, n_steps=num_semitones)


def random_gain(signal, min_factor=0.1, max_factor=0.12):
    """Scale ``signal`` by a gain drawn uniformly from [min_factor, max_factor].

    The gain comes from ``random.uniform``, so results are reproducible only
    if the ``random`` module has been seeded.
    """
    return signal * random.uniform(min_factor, max_factor)


def invert_polarity(signal):
    """Return ``signal`` multiplied by -1 (every sample flips its sign)."""
    return -1 * signal

Чтобы признаки разных файлов не отличались по размеру, приведём их к единому размеру с помощью следующей функции:

In [None]:
def padding(array, xx, yy):
    """Zero-pad a 2-D array so that it is at least ``xx`` by ``yy``.

    The array is centred in the result; when the missing amount along an axis
    is odd, the extra row/column goes at the end. An axis that is already at
    least as long as requested is left untouched (nothing is cropped), so the
    result can be larger than ``(xx, yy)``.

    :param array: numpy array with two dimensions
    :param xx: desired height
    :param yy: desired width
    :return: padded array
    """
    def _margins(target, current):
        # Leading margin is half the gap (floored); trailing margin takes the rest.
        before = max((target - current) // 2, 0)
        after = max(target - before - current, 0)
        return before, after

    height, width = array.shape[0], array.shape[1]
    pad_width = (_margins(xx, height), _margins(yy, width))
    return np.pad(array, pad_width=pad_width, mode='constant')

Добавим список применяемой аугментации:

In [None]:
# Augmentations applied to every training signal. Each entry is a callable that
# takes the raw signal and returns the augmented one; the augmentation parameters
# are bound here so the extraction loop can simply call f(signal).
# (Calling the functions directly in the list, without a signal, raises TypeError.)
wav_operations = [
    invert_polarity,
    lambda signal: add_white_noise(signal, 0.3),
    lambda signal: add_white_noise(signal, 0.5),
    lambda signal: add_white_noise(signal, 0.1),
    lambda signal: time_stretch(signal, 1.1),
    lambda signal: time_stretch(signal, 0.9),
    lambda signal: time_stretch(signal, 0.8),
    random_gain,
]

Извлекаем признаки из тренировочных файлов:

In [None]:
spec_data = []
labels = []
max_size = 350  # target feature length (width the spectrograms are padded to)

extracted_features = []
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    file_name = os.path.join(os.path.abspath(audio_dataset_path), str(row["name"] + '.wav'))
    label = row["target"]

    # Spectrogram of the unmodified recording.
    spec = get_melspectrogram_db(file_name)
    spec_data.append(padding(spec, 128, max_size))
    labels.append(label)

    # Load the raw signal for augmentation at its native sampling rate (sr=None),
    # the same rate get_melspectrogram_db uses, so original and augmented
    # spectrograms share one time resolution.
    signal, sr = librosa.load(file_name, sr=None)

    # One extra sample per augmentation, all carrying the label of the source file.
    for f in wav_operations:
        augmented_signal = f(signal)
        # None of the augmentations changes the sampling rate, so the rate returned
        # by librosa.load is reused (librosa.get_samplerate expects a file path,
        # not an array).
        spec = get_melspectrogram_db_aug(augmented_signal, sr=sr)
        spec_data.append(padding(spec, 128, max_size))
        labels.append(label)

13935it [05:28, 42.41it/s]


Подгружаем названия всех файлов в тестовой папке в лист:

In [None]:
# Names of the regular files (sub-directories excluded) found in the test folder.
only_test_files = list(
    filter(
        lambda entry: isfile(join(test_audio_dataset_path, entry)),
        listdir(test_audio_dataset_path),
    )
)

Подготовим признаки для тестовой выборки:

In [None]:
test_data = []
# NOTE: `labels` is deliberately NOT reset here — it holds the training labels
# collected during training-feature extraction and is still needed to build y.
max_size = 350  # must match the width used for the training features

extracted_features = []
for file in tqdm(only_test_files):
    file_name = os.path.join(os.path.abspath(test_audio_dataset_path), file)
    spec = get_melspectrogram_db(file_name)
    test_data.append(padding(spec, 128, max_size))

100%|█████████████████████████████████████████████████████████████████████████████| 3413/3413 [00:34<00:00, 100.32it/s]


Понижаем битность фич:

In [None]:
# Stack the per-sample spectrograms into one array stored as 32-bit floats.
spec_data = np.asarray(spec_data, dtype=np.float32)

In [None]:
# Stack the test spectrograms into one array stored as 32-bit floats.
test_data = np.asarray(test_data, dtype=np.float32)

Разделяем признаки на тренировочные и валидационные для модели:

In [None]:
# Hold out the last 20% of the feature array for validation. shuffle=False keeps
# the original order, so the same call applied to the labels yields matching rows.
X_train, X_val = train_test_split(spec_data, shuffle=False, test_size=0.2)

Целевой признак аналогично:

In [None]:
# Encode the labels as integer classes, then one-hot encode them for the softmax head.
labelencoder = LabelEncoder()
y = to_categorical(labelencoder.fit_transform(np.array(labels)))

# Same unshuffled 80/20 split as for the features, so rows stay aligned.
y_train, y_val = train_test_split(y, shuffle=False, test_size=0.2)

___________________________________________________________

## Создание  модели

In [None]:
# Number of output classes = width of the one-hot encoded label matrix.
num_labels=y_val.shape[1]

In [None]:
dropout_ratio = 0.4
l2_ratio = 0.0007
# Spectrograms are treated as single-channel images: (height, width, 1).
input_shape = (X_train.shape[1], X_train.shape[2], 1)

# Three 3x3 convolutions (16, 16, 32 filters) with max-pooling and dropout after
# the first two, followed by two dense layers and a softmax over the classes.
# Every convolution and hidden dense layer gets its own L2 kernel regularizer.
CNNmodel = models.Sequential([
    layers.Conv2D(
        16, (3, 3),
        activation='relu',
        input_shape=input_shape,
        kernel_regularizer=tf.keras.regularizers.L2(l2_ratio),
    ),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(dropout_ratio),
    layers.Conv2D(
        16, (3, 3),
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(l2_ratio),
    ),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(dropout_ratio),
    layers.Conv2D(
        32, (3, 3),
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(l2_ratio),
    ),
    layers.Flatten(),
    layers.Dense(
        32,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(l2_ratio),
    ),
    layers.Dropout(dropout_ratio),
    layers.Dense(
        16,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(l2_ratio),
    ),
    layers.Dense(num_labels, activation='softmax'),
])

In [None]:
# `model` is an alias for the same object as CNNmodel (no copy is made).
model = CNNmodel

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 126, 348, 16)      160       
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 63, 174, 16)      0         
 2D)                                                             
                                                                 
 dropout_3 (Dropout)         (None, 63, 174, 16)       0         
                                                                 
 conv2d_4 (Conv2D)           (None, 61, 172, 16)       2320      
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 30, 86, 16)       0         
 2D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 30, 86, 16)       

Сформируем веса классов:

In [None]:
# Balanced class weights: each class gets total / (2 * class_count), so the rarer
# class weighs more in the loss.
# The labels are cast to int first: np.bincount only accepts non-negative integers
# and the 'target' column may hold the labels as strings.
neg, pos = np.bincount(metadata['target'].astype(int))
total = neg + pos
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weights = {0: weight_for_0, 1: weight_for_1}
class_weights

{0: 0.9143700787401575, 1: 1.1033254156769596}

Загружаем веса:

In [None]:
# Best-effort restore of previously saved weights. A failure (missing file,
# architecture mismatch, ...) is not fatal, but it is reported instead of being
# silently swallowed, so it is always clear which weights training starts from.
weights_path = './my_model_weights_wav_yandex_cnn_gfhogf4j.h5'
try:
    model.load_weights(weights_path)
except Exception as err:
    print(f'Could not load weights from {weights_path}: {err!r}; continuing without them.')
else:
    print('success')

success


In [None]:
def get_lr_metric(optimizer):
    """Build a Keras-style metric function that reports the optimizer's learning rate.

    The returned function ignores its ``y_true`` / ``y_pred`` arguments and
    returns ``optimizer._decayed_lr(tf.float32)``.

    NOTE(review): ``_decayed_lr`` is a private optimizer method (leading
    underscore) — verify it exists in the installed Keras/TensorFlow version.
    """
    def lr(y_true, y_pred):
        # Arguments are required by the metric signature but not used.
        return optimizer._decayed_lr(tf.float32) 
    return lr

# Adam with a very small initial learning rate; the rate itself is tracked as a metric.
optimizer = keras.optimizers.Adam(learning_rate=3.1250e-06)
lr_metric = get_lr_metric(optimizer)

model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy', lr_metric],
)

# Stop once val_loss has not improved for 6 epochs and roll back to the best weights.
EarlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=6,
    restore_best_weights=True,
)
# Halve the learning rate when the training loss stalls for 2 epochs.
callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=2,
    min_lr=0,
)

Обучаем модель:

In [None]:
start = datetime.now()

# The epoch count is only an upper bound: the early-stopping callback ends training.
hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3000,
    batch_size=128,
    class_weight=class_weights,
    callbacks=[callback, EarlyStopping],
    verbose=1,
)

duration = datetime.now() - start
print('Время обучения:', duration)

In [None]:
# Persist the trained weights to the same file the notebook tries to load them from.
model.save_weights('./my_model_weights_wav_yandex_cnn_gfhogf4j.h5')

Проверяем нашу модель на валидационной выборке:

In [None]:
# Evaluate on the validation split (despite the variable name, this is not the test set).
# NOTE(review): index 1 is presumably the accuracy, i.e. the first compiled metric
# after the loss — confirm against model.metrics_names.
test_accuracy = model.evaluate(X_val,y_val,verbose=0)
print(test_accuracy[1])

Делаем предсказания на тестовых данных от Яндекса:

In [None]:
# Model outputs for the test spectrograms (one row of class scores per file).
predicts_test = model.predict(test_data)



In [None]:
# Turn the score of class 1 into a hard 0/1 label using a 0.5 threshold.
predicts_test = (predicts_test[:, 1] > 0.5).astype(int)

Формируем словарь предсказаний для дальнейшей записи его в файл:

In [None]:
# File ids = file names with their last four characters (the ".wav" suffix) removed.
test_ids = [file_name[:-4] for file_name in only_test_files]

In [None]:
# Map every test file id to the prediction stored at the same position.
d = {file_id: predicts_test[position] for position, file_id in enumerate(test_ids)}

Формируем tsv файл:

In [None]:
import csv

# Write one "<file id><TAB><predicted label>" line per test file. The fields are
# tab-separated so the content matches the .tsv extension, and the loop variables
# are named so that they do not overwrite the global label array `y`.
with open("records.tsv", "w") as record_file:
    for file_id, predicted_label in d.items():
        record_file.write(f"{file_id}\t{predicted_label}\n")