In [3]:
import ikrlib as ilib
import numpy as np
import os
from pydub import AudioSegment
from pydub.silence import detect_nonsilent



In [52]:
def audio_adjust(dir):
    min_silence_len = 1000  # Minimálna dĺžka ticha (ms)
    silence_thresh = -44    # prah ticha (dB)

    print(f"Removing silence from records in directory {dir}")
    if not os.path.isdir(dir + "/rs"):
        os.mkdir(dir + "/rs")
    for f in os.listdir(dir):
        if f[-3:] == "wav":
            input_file = dir + "/" + f
            audio = AudioSegment.from_wav(input_file)
            nonsilent_intervals = detect_nonsilent(audio, min_silence_len, silence_thresh)

            # Zkonkatenuj invervaly kde nie je ticho
            non_silent_audio = AudioSegment.empty()
            for start, end in nonsilent_intervals:
                non_silent_audio += audio[start:end]

            # Ulož audio
            output_file = dir + "/rs/" + f
            non_silent_audio.export(output_file, format="wav")

audio_adjust("target_train")
audio_adjust("non_target_train")
audio_adjust("target_dev")
audio_adjust("non_target_dev")

Removing silence from records in directory target_train
Removing silence from records in directory non_target_train
Removing silence from records in directory target_dev
Removing silence from records in directory non_target_dev


In [4]:
train_t = ilib.wav16khz2mfcc('target_train/rs').values()
train_n = ilib.wav16khz2mfcc('non_target_train/rs').values()

test_t = ilib.wav16khz2mfcc('target_dev/rs').values()
test_n = ilib.wav16khz2mfcc('non_target_dev/rs').values()

# Koeficienty sa konkatenuju do jedneho poľa
train_t = np.vstack(list(train_t))
train_n = np.vstack(list(train_n))

Processing file:  target_train/rs/m421_03_r09_i0_0.wav
Processing file:  target_train/rs/m421_03_f18_i0_0.wav
Processing file:  target_train/rs/m421_02_r09_i0_0.wav
Processing file:  target_train/rs/m421_02_f18_i0_0.wav
Processing file:  target_train/rs/m421_01_f18_i0_0.wav
Processing file:  target_train/rs/m421_01_r09_i0_0.wav
Processing file:  non_target_train/rs/m424_02_r09_i0_0.wav
Processing file:  non_target_train/rs/m429_01_r09_i0_0.wav
Processing file:  non_target_train/rs/m429_01_f18_i0_0.wav
Processing file:  non_target_train/rs/f402_03_r09_i0_0.wav
Processing file:  non_target_train/rs/f410_03_r09_i0_0.wav
Processing file:  non_target_train/rs/m414_02_r09_i0_0.wav
Processing file:  non_target_train/rs/m424_02_f18_i0_0.wav
Processing file:  non_target_train/rs/m420_03_r09_i0_0.wav
Processing file:  non_target_train/rs/f413_01_f18_i0_0.wav
Processing file:  non_target_train/rs/m420_02_r09_i0_0.wav
Processing file:  non_target_train/rs/m427_03_r09_i0_0.wav
Processing file:  non

In [None]:
def compute_deltas(cepstral_coeffs, window_size=2):
    num_frames, num_coeffs = cepstral_coeffs.shape
    deltas = np.zeros((num_frames, num_coeffs))

    for t in range(num_frames):
        window_start = max(0, t - window_size)
        window_end = min(num_frames, t + window_size + 1)
        window_indices = np.arange(window_start, window_end)
        window_weights = window_indices - t

        weighted_sum = np.sum(window_weights[:, np.newaxis] * cepstral_coeffs[window_indices, :], axis=0)
        weight_sum_squared = np.sum(window_weights ** 2)

        deltas[t] = weighted_sum / weight_sum_squared

    return deltas

train_t_delta_coefficients = compute_deltas(train_t, window_size=2)
train_n_delta_coefficients = compute_deltas(train_n, window_size=2)

train_t = np.concatenate((train_t, train_t_delta_coefficients), axis=1)
train_n = np.concatenate((train_n, train_n_delta_coefficients), axis=1)

train_t_ddelta_coefficients = compute_deltas(train_t, window_size=2)
train_n_delta_coefficients = compute_deltas(train_n, window_size=2)

In [37]:
M_t = 2  # Počet gaussovských komponent
MUs_t = train_t[np.random.randint(1, len(train_t), M_t)]  # Počiatočna stredná hodnota
COVs_t = [np.cov(train_t.T)] * M_t  # Počiatočna kovariančná matica
Ws_t = np.ones(M_t) / M_t

M_n = 20
MUs_n = train_n[np.random.randint(1, len(train_n), M_n)]
COVs_n = [np.cov(train_n.T)] * M_t
Ws_n = np.ones(M_n) / M_n

In [38]:
for jj in range(30):
    # TTL_t je doveryhodnosť
    Ws_t, MUs_t, COVs_t, TTL_t = ilib.train_gmm(train_t, Ws_t, MUs_t, COVs_t)
    Ws_n, MUs_n, COVs_n, TTL_n = ilib.train_gmm(train_n, Ws_n, MUs_n, COVs_n)
    print(f'Iteration: {jj} Total log likelihood: {TTL_t} for target {TTL_n} for non target')

Iteration: 0 Total log likelihood: -69899.22421735441 for target -2809996.614740586 for non target
Iteration: 1 Total log likelihood: -47173.85311228724 for target -2141285.4458649354 for non target
Iteration: 2 Total log likelihood: -46358.052248430555 for target -2125745.4249553285 for non target
Iteration: 3 Total log likelihood: -45341.82687765901 for target -2089567.733789713 for non target
Iteration: 4 Total log likelihood: -44445.945229413424 for target -2038481.0926930718 for non target
Iteration: 5 Total log likelihood: -43676.72141317656 for target -2013296.3435948638 for non target
Iteration: 6 Total log likelihood: -43217.78982812778 for target -1995916.5772487177 for non target
Iteration: 7 Total log likelihood: -42893.92767066146 for target -1972782.9733279606 for non target
Iteration: 8 Total log likelihood: -42693.51191911126 for target -1952630.7032347973 for non target
Iteration: 9 Total log likelihood: -42567.83116718388 for target -1947868.2015927827 for non target


In [39]:
P_t=0.5
P_n=1.0-P_t

score=[]
for tst in test_t:
    ll_t = ilib.logpdf_gmm(tst, Ws_t, MUs_t, COVs_t)
    ll_n = ilib.logpdf_gmm(tst, Ws_n, MUs_n, COVs_n)
    score.append((sum(ll_t) + np.log(P_t)) - (sum(ll_n) + np.log(P_n)))
print(score)
print(f"Fraction of correctly recognized targets: {np.mean(np.array(score) > 0)}%")

[291.53382512676217, 383.80723060477703]
Fraction of correctly recognized targets: 1.0%


In [40]:
score=[]
for tst in test_n:
    ll_t = ilib.logpdf_gmm(tst, Ws_t, MUs_t, COVs_t)
    ll_n = ilib.logpdf_gmm(tst, Ws_n, MUs_n, COVs_n)
    score.append((sum(ll_t) + np.log(P_t)) - (sum(ll_n) + np.log(P_n)))
print(score)
print(f"Fraction of correctly recognized non targets: {np.mean(np.array(score) < 0)}%")

[-3088.929192276106, -1046.5391712627184, -6409.610502855019, -647.9476851044346, -3346.123855851036, -1331.2549925696994, -626.7319546130002, -8614.863078811526, -13839.374035568966, -5423.689217477597, -2884.259432632658, -5535.629855856305, -3415.66008820327, -5026.607481575655, -859.7149962096946, -974.1352302232881, -164.84266303773074, -3675.035478711785, -12005.59662769195, -9512.878829575333, -172.6270733389582, -555.8067468168074, 99.8593365918241, -2646.0955005004216, -15064.132311447895, -30.18424313430114, -3381.5068378224387, -4835.633980504546, -268.8164436943225, 429.6315968384861, -521.887058491202, -1143.0456949085765, -783.5328438370179, -1940.5954176061787, -4214.595938427374, -3317.60083552834, -3239.9885209959975, -9396.185592370164, -3879.6282931645546, -488.7489446907266, -14819.159324774286, -755.7348323628494, -12147.45092552347, -3250.564071423346, 1.6818265394254013, -3050.490648057241, -4176.797730671988, -5256.12252158366, -925.7287031046762, -2255.25316217