## Run once

In [1]:
import contextlib
import ipynb.fs
import librosa
import math
import numpy as np
import torch
import torch.nn as nn
import wave

from .defs.extract_formant import extract_formant
from joblib import load
from pydub import AudioSegment
from pydub.silence import detect_leading_silence
from scipy.io import wavfile
from skimage.transform import resize

# The Regression model's definition
class CNNRegressor(nn.Module):
    def __init__(self):
        super(CNNRegressor, self).__init__()
        self.cnn_layer1 = nn.Sequential(nn.Conv2d(1, 16, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.1), nn.BatchNorm2d(16), nn.MaxPool2d(kernel_size=2))
        self.cnn_layer2 = nn.Sequential(nn.Conv2d(16, 32, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.2), nn.BatchNorm2d(32), nn.MaxPool2d(kernel_size=2))
        self.linear_layer1 = nn.Linear(32*30*6 + 8, 64)
        self.dropout1 = nn.Dropout(0.5)
        self.activ1 = nn.ReLU()
        self.linear_layer_p = nn.Linear(64, 32)
        self.dropout_p = nn.Dropout(0.5)
        self.activ_p = nn.ReLU()
        self.linear_layer2 = nn.Linear(32, 2)
        self.activ2 = nn.Sigmoid()
        
    def forward(self, images, features):
        cnn2 = self.cnn_layer2(self.cnn_layer1(images.unsqueeze(1)))
        cnn_vec = cnn2.reshape(cnn2.shape[0], -1)
        out = self.dropout1(self.activ1(self.linear_layer1(torch.cat((cnn_vec, features), dim=1))))
        return self.activ2(self.linear_layer2(self.activ_p(self.dropout_p(self.linear_layer_p(out)))))

# Function for rescaling the melspectrogram
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

# Function for generating the melspectrogram
def spectrogram(y, sr):
    # use log-melspectrogram
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=512, hop_length=512)
    mels = np.log(mels + 1e-9) # add small number to avoid log(0)
    img = scale_minmax(mels, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255-img
    return img
    
clf = load('../models/rule_based.joblib') # The classifier
scaler = load('../models/scaler.joblib') # The scaler, transforms formants so that they have a mean of 0 and a variance of 1
regressor = torch.load('../models/neural_regressor.pt') # The vowel detection model

trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x) :]
trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))

## Run once per file (config)

In [2]:
input_file = '../../allwavs/allvowl/corrected/extracted/o__f__0.63636__1.00000__m_o__mot__hzc1_mono.wav' # Put your file here
target_vowel = 'o' # Possible values: 'a', 'e', 'E', 'i', 'o', 'O', 'u', 'y', '2', '9'
speaker_gender = 'f' # Possible values: 'f' or 'm'
previous_phoneme = 'm' # Possible values: 'l', 'm', 'p', 's', 't' or 't1' (last one shouldn't be used)
word_ends_with_r = False # True if there is an /R/ after the vowel

## Run once per file (inference)

In [20]:
idx2key = ['2', '9', 'a', 'a~', 'e', 'E', 'i', 'O', 'o', 'o~', 'u', 'U~+', 'y'] # All possible vowels
valid = [0, 1, 2, 4, 5, 6, 7, 8, 10, 12] # Vowels we consider here (depends on the classifier)
all_phonemes = ['l', 'm', 'p', 's', 't', 't1'] # Phonemes that can be before the vowel

tmp_wav = 'tmp_process.wav'
tmp_wav_2 = 'tmp_process_trimmed.wav'
max_w = 31 # Image width to resize to

# Remove leading and trailing silences
sound = AudioSegment.from_file(input_file)
trimmed = strip_silence(sound)
trimmed.export(tmp_wav, format='wav', bitrate='768k')
    
# Generate log-melspectrogram
y, sr = librosa.load(tmp_wav)
melspec = spectrogram(y=y, sr=sr)

# Feed log-melspectrogram to regression model to predict start and and of the vowel
melspec = resize(melspec, (melspec.shape[0], max_w), anti_aliasing=False)
input_tensor = torch.tensor(melspec).float().cuda()
input_features = torch.tensor([speaker_gender == 'f', not word_ends_with_r, *[previous_phoneme == x for x in all_phonemes]]).cuda()
pred = regressor(input_tensor.unsqueeze(0), input_features.unsqueeze(0))[0]
vowel_start = pred[0].item()
vowel_end = pred[1].item()
if vowel_start >= vowel_end:
    print('The model predicted that the vowel has negative duration!')
    raise ValueError()

# Trim file at start and end to only have the vowel
sample_rate, wave_data = wavfile.read(tmp_wav)
duration = len(wave_data) / sample_rate
start_sample = int(duration * vowel_start * sample_rate)
end_sample = int(duration * vowel_end * sample_rate)
wavfile.write(tmp_wav_2, sample_rate, wave_data[start_sample:end_sample])
duration = len(wave_data[start_sample:end_sample]) / sample_rate

# Extract formants
try:
    formants = extract_formant(tmp_wav_2, f0min=math.ceil(3/duration + 0.000001), n_formants=4)
except ZeroDivisionError:
    print('The file is too short to analyze!')
    raise

# Add additional features (gender, previous phoneme)
input_features = torch.cat([input_features[0:1], input_features[2:]]).cpu()
features = torch.cat((torch.tensor(formants), input_features)).numpy()

# Rescale formants
features[:4] = scaler.transform(np.array(features[:4]).reshape(1, -1))[0]

# Prediction with probabilities
pred = clf.predict_proba([features]) # Probabilities
final_vowel = np.argmax(pred)
final_confidence = pred[0][final_vowel] # Best score
final_vowel = idx2key[valid[final_vowel]] # Actual prediction

print('Vowel ', 'Confidence')
print('-'*25)
for i in range(len(valid)):
    vowel = idx2key[valid[i]]
    print(f'{vowel:<6} {pred[0][i]:.3f}', '='*int(pred[0][i]*100))

print(f'Prediction: /{final_vowel}/, confidence: {final_confidence:.3f}')

Vowel  Confidence
-------------------------
2      0.004 
9      0.000 
a      0.000 
e      0.008 
E      0.000 
i      0.012 =
O      0.000 
y      0.014 =
Prediction: /o/, confidence: 0.572
