In [None]:
import ikrlib as ilib
import numpy as np
import os
import librosa
import soundfile as sf
import noisereduce as nr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

In [None]:
# --- Audio preprocessing stages (each one writes new wav files to disk) ---
audio_adjust_enabled = True        # strip silent stretches from every recording
reduce_noise_enabled = True        # run noise reduction on every recording
data_augmentation_enabled = True   # create augmented copies of the training recordings
data_pre_emphasis = False          # load features through the pre-emphasis path instead of wav16khz2mfcc

# --- Feature post-processing stages (applied to the extracted coefficients) ---
coefficients_normalization = False         # min-max scale the coefficients
delta_coefficients_enabled = False         # append delta and delta-delta coefficients
cepstral_mean_subtraction_enabled = False  # subtract the per-coefficient mean

In [None]:
def audio_adjust(dir):
    """Strip silent stretches from every wav file in ``dir``.

    For each ``*.wav`` file the non-silent intervals are detected with pydub,
    concatenated in order, and exported under the same file name into the
    ``rs/`` sub-directory of ``ilib.get_last_two_dirs(dir)``.

    Args:
        dir: Directory whose wav recordings should be processed.
    """
    new_dir = ilib.get_last_two_dirs(dir) + "/rs/"
    min_silence_len = 1000  # minimum length of a silent stretch (ms)
    silence_thresh = -44    # silence threshold (dB)

    print(f"Removing silence from records in directory {dir}")
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(new_dir, exist_ok=True)
    for f in os.listdir(dir):
        if not f.endswith(".wav"):
            continue

        # Read from the directory that was actually listed, so the file is
        # guaranteed to exist (same convention as the other preprocessing steps).
        input_file = os.path.join(dir, f)
        audio = AudioSegment.from_wav(input_file)
        nonsilent_intervals = detect_nonsilent(audio, min_silence_len, silence_thresh)

        # Concatenate the intervals that are not silent.
        non_silent_audio = AudioSegment.empty()
        for start, end in nonsilent_intervals:
            non_silent_audio += audio[start:end]

        # Save the trimmed audio.
        output_file = new_dir + f
        non_silent_audio.export(output_file, format="wav")

# Remove silence from the train and dev recordings of all 31 speakers.
if audio_adjust_enabled:
    for i in range(1, 32):
        audio_adjust(ilib.get_directory(f"train/{i}"))
        audio_adjust(ilib.get_directory(f"dev/{i}"))
    # Report success only when the step actually ran.
    print("Silence was successfully removed")

In [None]:
def reduce_noise(dir):
    """Denoise every wav file in ``dir`` and store the results in an ``rn/`` folder.

    The first half second of each recording is used as the noise sample that
    is handed to ``noisereduce``. Output files keep their original names and
    sample rate.

    Args:
        dir: Directory whose wav recordings should be denoised.
    """
    print(f"Removing noise from records in directory {dir}")
    target_dir = ilib.get_last_two_dirs(dir) + "/rn/"
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    for name in os.listdir(dir):
        if not name.endswith("wav"):
            continue

        # sr=None keeps the file's native sample rate.
        samples, rate = librosa.load(dir + "/" + name, sr=None)

        # Leading 0.5 s of the recording serves as the noise sample.
        noise_profile = samples[:int(rate * 0.5)]
        cleaned = nr.reduce_noise(y=samples, sr=rate, y_noise=noise_profile)

        sf.write(target_dir + name, cleaned, rate)

# Denoise the train and dev recordings of all 31 speakers; the input directory
# is resolved by ilib.get_directory from the silence-removal flag.
if reduce_noise_enabled:
    for i in range(1, 32):
        for subset in ("train", "dev"):
            reduce_noise(ilib.get_directory(f"{subset}/{i}", audio_adjust_enabled))

    print("Noise was successfully removed")

In [None]:
def data_augumentation(dir):
    """Write three augmented variants of every wav file in ``dir``.

    Each recording is time-stretched, pitch-shifted by 2 semitones and
    time-shifted by 500 ms through the ``ilib`` helpers. The results are
    exported into the ``da/`` sub-directory of ``ilib.get_last_two_dirs(dir)``
    with the suffixes ``_stretched_aug``, ``_pitch_shifted_aug`` and
    ``_time_shifted_aug``.

    Args:
        dir: Directory whose wav recordings should be augmented.
    """
    new_dir = ilib.get_last_two_dirs(dir) + "/da/"
    # The message previously claimed noise removal (copy-paste slip).
    print(f"Augmenting records in directory {dir}")
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(new_dir, exist_ok=True)
    for f in os.listdir(dir):
        # Require the full ".wav" extension: the stem below is cut with f[:-4].
        if not f.endswith(".wav"):
            continue

        input_file = dir + "/" + f
        print("Data augumentation of file: " + input_file)

        time_stretched_audio = ilib.apply_time_stretching(input_file)
        pitch_shifted_audio = ilib.apply_pitch_shifting(input_file, semitones=2)
        time_shifted_audio = ilib.apply_time_shifting(input_file, shift_ms=500)

        # Insert the augmentation suffix between the file stem and ".wav".
        stem = f[:-4]
        stretched_file = new_dir + stem + "_stretched_aug.wav"
        pitch_shifted_file = new_dir + stem + "_pitch_shifted_aug.wav"
        time_shifted_file = new_dir + stem + "_time_shifted_aug.wav"

        time_stretched_audio.export(stretched_file, format="wav")
        pitch_shifted_audio.export(pitch_shifted_file, format="wav")
        time_shifted_audio.export(time_shifted_file, format="wav")

# Augment only the training recordings; dev data is left untouched.
if data_augmentation_enabled:
    for i in range(1, 32):
        augmentation_source = ilib.get_directory(
            f"train/{i}", audio_adjust_enabled, reduce_noise_enabled
        )
        data_augumentation(augmentation_source)
    print("Data augumentation was done")

In [None]:
def pre_emphasis(dir):
    """Extract MFCC features from pre-emphasised versions of the wav files in ``dir``.

    Args:
        dir: Directory containing the wav recordings.

    Returns:
        A list with one ``ilib.extract_mfcc`` result per wav file, in
        ``os.listdir`` order. Empty when the directory holds no wav files.

    Raises:
        AssertionError: If a recording is not sampled at 16 kHz.
    """
    features = []
    for name in os.listdir(dir):
        if not name.endswith("wav"):
            continue

        wav_path = dir + "/" + name
        print("Proccessing input file: " + wav_path)

        rate, samples = ilib.read_wav_file(wav_path)
        boosted = ilib.apply_pre_emphasis(samples)

        # The rest of the pipeline expects 16 kHz recordings.
        assert rate == 16000
        features.append(ilib.extract_mfcc(boosted, rate))
    return features

# Feature loading through the pre-emphasis path: training features of a speaker
# are stacked into one matrix, dev features stay as one entry per recording.
if data_pre_emphasis:
    train, dev = {}, {}
    for i in range(1, 32):
        train_dir = ilib.get_directory(
            f'train/{i}', audio_adjust_enabled, reduce_noise_enabled, data_augmentation_enabled
        )
        train[i] = np.vstack(pre_emphasis(train_dir))

        dev_dir = ilib.get_directory(f'dev/{i}', audio_adjust_enabled, reduce_noise_enabled)
        dev[i] = list(pre_emphasis(dev_dir))
    print("Pre emphasis was successfull")

In [None]:
# Default feature loading (no pre-emphasis): training features of a speaker are
# stacked into one matrix, dev features stay as one entry per recording.
if not data_pre_emphasis:
    train = {}
    dev = {}
    for i in range(1, 32):
        train_dir = ilib.get_directory(
            f'train/{i}', audio_adjust_enabled, reduce_noise_enabled, data_augmentation_enabled
        )
        train[i] = np.vstack(list(ilib.wav16khz2mfcc(train_dir).values()))

        # Dev features must come from the dev split; reading them from
        # 'train/{i}' would evaluate the models on their own training data.
        dev_dir = ilib.get_directory(f'dev/{i}', audio_adjust_enabled, reduce_noise_enabled)
        dev[i] = list(ilib.wav16khz2mfcc(dev_dir).values())
    print("Loading data was successful")

In [None]:
def min_max_normalize(data):
    """Scale every column of ``data`` to the range [0, 1].

    Args:
        data: 2-D array-like; the minimum and maximum are taken along axis 0.

    Returns:
        Array of the same shape with each column mapped linearly so that its
        minimum becomes 0 and its maximum becomes 1. A column whose values are
        all equal is mapped to 0 instead of producing NaN/inf.
    """
    min_vals = np.min(data, axis=0)
    value_range = np.max(data, axis=0) - min_vals
    # Constant columns have zero range; divide by 1 there so they come out as 0.
    safe_range = np.where(value_range == 0, 1.0, value_range)
    return (data - min_vals) / safe_range

# Min-max scale the stacked training features of every speaker.
if coefficients_normalization:
    for i in range(1, 32):
        scaled_features = min_max_normalize(train[i])
        train[i] = scaled_features

In [None]:
def compute_deltas(cepstral_coeffs, window_size=2):
    """Compute regression-based delta coefficients along the frame axis.

    For frame ``t`` the delta is
    ``sum_{n=1..N} n * (c[t+n] - c[t-n]) / (2 * sum_{n=1..N} n**2)`` with
    ``N = window_size``. Frames outside the signal are replaced by the nearest
    edge frame, so interior frames get the same value as a plain symmetric
    window while edge frames no longer pick up a bias (a constant signal now
    yields all-zero deltas, and a single-frame input yields zeros, not NaN).

    Args:
        cepstral_coeffs: 2-D array of shape (num_frames, num_coeffs).
        window_size: Number of frames taken on each side of the current frame.

    Returns:
        Float array of shape (num_frames, num_coeffs) with the deltas. All
        zeros when there are no frames or ``window_size`` is below 1.
    """
    cepstral_coeffs = np.asarray(cepstral_coeffs, dtype=float)
    num_frames, num_coeffs = cepstral_coeffs.shape
    deltas = np.zeros((num_frames, num_coeffs))

    # Nothing to differentiate (and np.pad cannot edge-extend an empty axis).
    if num_frames == 0 or window_size < 1:
        return deltas

    # Replicate the first/last frame so every frame has a full symmetric window.
    padded = np.pad(cepstral_coeffs, ((window_size, window_size), (0, 0)), mode="edge")
    for n in range(1, window_size + 1):
        ahead = padded[window_size + n: window_size + n + num_frames]
        behind = padded[window_size - n: window_size - n + num_frames]
        deltas += n * (ahead - behind)

    denominator = 2 * sum(n * n for n in range(1, window_size + 1))
    return deltas / denominator

# Append delta and delta-delta coefficients to the training features.
# NOTE(review): train[i] stacks the frames of all recordings, so the deltas
# computed here also span recording boundaries — confirm that is acceptable.
if delta_coefficients_enabled:
    for i in range(1, 32):
        train_delta_coeffs = compute_deltas(train[i], window_size=2)
        # Delta-deltas are the deltas of the deltas (matches the dev-side
        # computation); deriving them from the static coefficients would just
        # duplicate the delta block.
        train_derivative_delta_coeffs = compute_deltas(train_delta_coeffs, window_size=2)
        train[i] = np.concatenate((train[i], train_delta_coeffs, train_derivative_delta_coeffs), axis=1)

In [None]:
def cepstral_mean_subtraction(cepstral_coeffs):
    """Remove the per-coefficient mean from a matrix of cepstral coefficients.

    Args:
        cepstral_coeffs: Array whose axis 0 runs over frames.

    Returns:
        The input with the mean over axis 0 subtracted from every row, so each
        column of the result averages to zero.
    """
    per_coeff_mean = np.mean(cepstral_coeffs, axis=0)
    return cepstral_coeffs - per_coeff_mean

# Subtract the per-coefficient mean from every speaker's stacked training features.
if cepstral_mean_subtraction_enabled:
    for i in range(1, 32):
        centered_features = cepstral_mean_subtraction(train[i])
        train[i] = centered_features

In [None]:
M = 3  # Number of Gaussian components per speaker model
MUs = {}
COVs = {}
Ws = {}
# Fixed seed so that a fresh "Restart & Run All" reproduces the same models.
init_rng = np.random.default_rng(42)
for i in range(1, 32):
    num_frames = len(train[i])

    # Initial means: M frames drawn from the whole training matrix (frame 0
    # included). Draw without replacement whenever there are enough frames, so
    # two components never start from the same point.
    init_idx = init_rng.choice(num_frames, size=M, replace=num_frames < M)
    MUs[i] = train[i][init_idx]

    # Initial covariance: diagonal of the global covariance, computed once and
    # copied so that every component owns an independent matrix.
    diag_cov = np.diag(np.diag(np.cov(train[i].T)))
    COVs[i] = [diag_cov.copy() for _ in range(M)]

    # Start from uniform component weights.
    Ws[i] = np.ones(M) / M

In [None]:
# Run 30 training passes; each pass performs one ilib.train_gmm update for
# every speaker model and logs the value it reports as TTL.
for jj in range(30):
    for i in range(1, 32):
        updated_model = ilib.train_gmm(train[i], Ws[i], MUs[i], COVs[i])
        Ws[i], MUs[i], COVs[i], TTL = updated_model
        print(f'Iteration: {jj} Total log likelihood: {TTL} for person {i}')

In [None]:
# Evaluate the speaker models on the dev recordings: every recording is scored
# against all 31 GMMs and assigned to the model with the highest summed score.

# P_t, P_n and score are not used below; kept in case later cells rely on them.
P_t=0.5
P_n=1.0-P_t

score = []
correct = 0
total = 0

for true_class in range(1, 32):
    for dev_p_i in dev[true_class]:
        # Work on a copy so the stored dev features stay untouched.
        dev_p_i_cpy = dev_p_i.copy()

        # Apply the same optional feature post-processing as for the training data.
        # NOTE(review): the training features are normalized over the stacked
        # training matrix, while each dev recording is normalized with its own
        # min/max here — confirm this mismatch is intended.
        if coefficients_normalization:
            dev_p_i_cpy = min_max_normalize(dev_p_i_cpy)

        if delta_coefficients_enabled:
            test_t_delta_coeffs = compute_deltas(dev_p_i_cpy, window_size=2)
            test_t_derivative_delta_coeffs = compute_deltas(test_t_delta_coeffs, window_size=2)

            dev_p_i_cpy = np.concatenate((dev_p_i_cpy, test_t_delta_coeffs, test_t_derivative_delta_coeffs), axis=1)

        if cepstral_mean_subtraction_enabled:
            dev_p_i_cpy = cepstral_mean_subtraction(dev_p_i_cpy)

        # Sum the per-frame ilib.logpdf_gmm values for every speaker model.
        likelihoods = np.array([ilib.logpdf_gmm(dev_p_i_cpy, Ws[i], MUs[i], COVs[i]).sum() for i in range(1, 32)])

        # argmax is 0-based while speaker ids start at 1.
        predicted_class = np.argmax(likelihoods) + 1

        # Compare the predicted class with the true class.
        if predicted_class == true_class:
            correct += 1
        total += 1

# Guard against an empty dev set instead of raising ZeroDivisionError.
if total == 0:
    accuracy = float("nan")
    print("No dev recordings were found; accuracy is undefined")
else:
    accuracy = correct / total
    print(f"Fraction of correctly recognized targets: {accuracy * 100}%")