In [None]:
import os
import torch
import librosa
import numpy as np
import torch.nn as nn
import soundfile as sf
import noisereduce as nr
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

from PIL import Image
from glob import glob
from pydub import AudioSegment
from torch.utils.data import DataLoader
from pydub.silence import detect_nonsilent

# if not os.path.isfile("SUR_projekt2022-2023.zip"):
#     !wget https://www.fit.vutbr.cz/study/courses/SUR/public/projekt_2022-2023/SUR_projekt2022-2023.zip
#     !unzip SUR_projekt2022-2023.zip


## Image

In [None]:
from image import CustomDataset, augment_images, SmallCNNMultiClass, fit, eval, predict_image

# Run on the first CUDA device when one is visible, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

# Number of target classes; the class directories are numbered 1..CLASSES.
CLASSES = 31
# Switch for the offline image augmentation step below.
data_augmentation_enabled = False

# note that augment_images will fail if the augmentations were already present
if data_augmentation_enabled:
    augmentation_jobs = (
        (('train', 'train/da'), {}),
        (('dev', 'dev/da'), {'num_augmentations': int(1e2)}),
    )
    for job_args, job_kwargs in augmentation_jobs:
        augment_images(*job_args, **job_kwargs)

def png_load(dir_name):
    """
    Load every ``*.png`` file located directly inside ``dir_name``.

    Parameters
    ----------
    dir_name : str
        Directory to scan; sub-directories are not visited.

    Returns
    -------
    dict
        Maps each matched file path to its pixel data as a ``float64`` NumPy
        array. The dict is empty when nothing matches (including when the
        directory does not exist). Entries are inserted in sorted path order.
    """
    features = {}
    # Sorted so that the dict order - and every array or prediction list
    # derived from it - does not depend on the filesystem's listing order.
    for path in sorted(glob(dir_name + '/*.png')):
        # Image.open is lazy and keeps the file handle open; the context
        # manager releases it as soon as the pixels were copied into the array.
        with Image.open(path) as image:
            features[path] = np.array(image, dtype=np.float64)
    return features

# Expected (height, width, channels) of every input image.
IMAGE_SHAPE = (80, 80, 3)

# NOTE(review): the dev images are appended to the training set as well, so
# every metric computed on the dev set is measured on data the model has seen.
# Keep True only when training a final model on all labelled data; set it to
# False to obtain an honest validation score.
train_on_dev_images = True


def _load_class_images(split, class_id):
    """Return all images of ``<split>/<class_id>`` stacked into one array.

    An empty directory yields an array of shape ``(0, *IMAGE_SHAPE)`` so that it
    can still be concatenated with the other classes.
    """
    images = list(png_load(os.path.join(split, str(class_id))).values())
    if not images:
        return np.empty((0,) + IMAGE_SHAPE)
    return np.array(images)


# Collect the per-class pieces and concatenate once at the end instead of
# re-copying the growing arrays on every iteration. The seed elements fix the
# resulting dtype/shape even when no image is found at all.
train_chunks = [np.empty((0,) + IMAGE_SHAPE)]
train_label_chunks = [np.empty((0), dtype=int)]
test_chunks = [np.empty((0,) + IMAGE_SHAPE)]
test_label_chunks = [np.empty((0), dtype=int)]

for class_id in range(1, CLASSES + 1):
    label = class_id - 1  # directories are 1-based, labels are 0-based

    train_images = _load_class_images("train", class_id)
    # Each dev directory is read from disk only once and shared by both sets.
    dev_images = _load_class_images("dev", class_id)

    train_chunks.append(train_images)
    train_label_chunks.append(np.full(len(train_images), label))
    if train_on_dev_images:
        train_chunks.append(dev_images)
        train_label_chunks.append(np.full(len(dev_images), label))

    test_chunks.append(dev_images)
    test_label_chunks.append(np.full(len(dev_images), label))

train_x = np.concatenate(train_chunks, axis=0)
train_y = np.concatenate(train_label_chunks, axis=0)

test_x = np.concatenate(test_chunks, axis=0)
test_y = np.concatenate(test_label_chunks, axis=0)

print("Images were successfully loaded")

# Reorder the axes from channels-last (N, 80, 80, 3) to channels-first (N, 3, 80, 80).
channels_first = (0, 3, 1, 2)
train_x = np.array(train_x).transpose(channels_first)
test_x = np.array(test_x).transpose(channels_first)

# torch.Tensor(...) copies the NumPy data into tensors of torch's default dtype.
train_tensors, test_tensors = torch.Tensor(train_x), torch.Tensor(test_x)

# Pair the image tensors with their integer labels.
train_dataset = CustomDataset(train_tensors, train_y)
dev_dataset = CustomDataset(test_tensors, test_y)
print("Dataset was successfully created")

# Both loaders use the same batch size and reshuffle on every pass.
batch_size = 16
loader_options = dict(batch_size=batch_size, shuffle=True)
train_loader = DataLoader(train_dataset, **loader_options)
dev_loader = DataLoader(dev_dataset, **loader_options)

# Checkpoint to reuse when it is present in the working directory.
model_name = 'model-0.9516.pt'

model = SmallCNNMultiClass()
if os.path.isfile(model_name):
    # Reuse the stored weights, remapped onto the selected device.
    saved_state = torch.load(model_name, map_location=torch.device(dev))
    model.load_state_dict(saved_state)
    model.to(dev)
else:
    # No checkpoint on disk: train a fresh model.
    # NOTE(review): the result is written to 'model.pt', not to model_name, so
    # the next run trains again unless the file is renamed by hand - confirm
    # that this is intended.
    model.to(dev)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    criterion = F.cross_entropy
    # First argument is presumably the number of epochs - see image.fit.
    fit(200, model, optimizer, criterion, train_loader, dev_loader)

    torch.save(model.state_dict(), 'model.pt')



In [None]:
from torch.utils.data import TensorDataset

# Load the unlabeled evaluation images; eval_files keeps the file path that
# belongs to each row of test_x (and therefore to each entry of pred).
eval_images = png_load('eval')
eval_files = list(eval_images.keys())
test_x = np.array(list(eval_images.values()))
test_x = np.transpose(test_x, (0, 3, 1, 2))  # channels-last -> channels-first

test_dataset = TensorDataset(torch.Tensor(test_x))
# NOTE(review): 736 is presumably the size of the eval set - confirm. The loop
# below no longer relies on everything fitting into a single batch.
test_loader = DataLoader(test_dataset, batch_size=736)

# Take the device from the model itself so that the inputs always end up where
# the weights are, regardless of what other cells bound to the name `dev`.
model_device = next(model.parameters()).device

model.eval()  # inference mode for layers such as dropout / batch norm
batch_predictions = []
with torch.no_grad():  # no autograd graph is needed for prediction
    for (batch,) in test_loader:
        logits = model(batch.to(model_device))
        batch_predictions.append(torch.argmax(logits, dim=1).cpu())

# Concatenate the predictions of every batch (previously only the last batch
# survived the loop) and shift the 0-based labels back to 1-based class ids.
pred = torch.cat(batch_predictions) + 1
print(pred)

## Audio

In [None]:
import ikrlib as ilib

from audio import audio_adjust, reduce_noise, data_augumentation, pre_emphasis, min_max_normalize, compute_deltas, cepstral_mean_subtraction

# --- Feature-level switches (applied to the MFCC matrices further below) ---
cepstral_mean_subtraction_enabled = False
delta_coefficients_enabled = False
coefficients_normalization = False

# --- Waveform-level switches (each step processes whole directories) ---
audio_adjust_enabled = True
reduce_noise_enabled = True
# NOTE: this name is also used by the image section, where it is set to False;
# from here on it refers to the audio augmentation step.
data_augmentation_enabled = True
data_pre_emphasis = False

# The flags handed to ilib.get_directory presumably select the directory that
# holds the output of the already executed steps - confirm in ikrlib.

if audio_adjust_enabled:
    for i in range(1, 32):
        audio_adjust(ilib.get_directory(f"train/{i}"))
        audio_adjust(ilib.get_directory(f"dev/{i}"))

    audio_adjust(ilib.get_directory("eval"))
    # Report success only when the step actually ran.
    print("Silence was successfully removed")


if reduce_noise_enabled:
    for i in range(1, 32):
        reduce_noise(ilib.get_directory(f"train/{i}", audio_adjust_enabled))
        reduce_noise(ilib.get_directory(f"dev/{i}", audio_adjust_enabled))
    reduce_noise(ilib.get_directory("eval", audio_adjust_enabled))
    print("Noise was successfully removed")

# Only the training recordings are augmented.
if data_augmentation_enabled:
    for i in range(1, 32):
        data_augumentation(ilib.get_directory(f"train/{i}", audio_adjust_enabled, reduce_noise_enabled))
    print("Data augumentation was done")

# Per-speaker features, keyed by the speaker id 1..31:
#   train[i] - all training frames of speaker i stacked into one matrix
#   dev[i]   - list with one feature matrix per dev recording of speaker i
# NOTE: `dev` is rebound here; in the image section the same name holds the
# torch device string.
train = {}
dev = {}
for i in range(1, 32):
    train_dir = ilib.get_directory(f'train/{i}', audio_adjust_enabled, reduce_noise_enabled, data_augmentation_enabled)
    if data_pre_emphasis:
        train[i] = np.vstack(pre_emphasis(train_dir))
    else:
        train[i] = np.vstack(list(ilib.wav16khz2mfcc(train_dir).values()))

    # The dev features must come from the dev recordings; reading them from
    # train/<i> would evaluate the models on their own training data.
    dev_dir = ilib.get_directory(f'dev/{i}', audio_adjust_enabled, reduce_noise_enabled)
    if data_pre_emphasis:
        dev[i] = list(pre_emphasis(dev_dir))
    else:
        dev[i] = list(ilib.wav16khz2mfcc(dev_dir).values())

if data_pre_emphasis:
    print("Pre emphasis was successfull")
else:
    print("Loading data was successful")

# Optional post-processing of the stacked training features of every speaker.
# The steps run in the same order as for the dev recordings during evaluation:
# normalization -> delta / delta-delta -> cepstral mean subtraction.
if coefficients_normalization:
    for i in range(1, 32):
        train[i] = min_max_normalize(train[i])

if delta_coefficients_enabled:
    for i in range(1, 32):
        train_delta_coeffs = compute_deltas(train[i], window_size=2)
        # Second-order (delta-delta) coefficients are the deltas of the
        # first-order deltas, exactly as computed for the dev features.
        train_derivative_delta_coeffs = compute_deltas(train_delta_coeffs, window_size=2)
        # Append both coefficient sets as additional feature columns.
        train[i] = np.concatenate((train[i], train_delta_coeffs, train_derivative_delta_coeffs), axis=1)

if cepstral_mean_subtraction_enabled:
    for i in range(1, 32):
        train[i] = cepstral_mean_subtraction(train[i])

M = 3  # number of Gaussian components per model
MUs, COVs, Ws = {}, {}, {}
for speaker in range(1, 32):
    frames = train[speaker]
    # Initial means: M randomly drawn training frames (the lower bound of 1
    # means that the very first frame is never drawn).
    MUs[speaker] = frames[np.random.randint(1, len(frames), M)]
    # Initial covariances: the diagonal of the sample covariance, one separate
    # copy per component. (A full-covariance start would be
    # [np.cov(frames.T)] * M instead.)
    diagonal_cov = np.diag(np.diag(np.cov(frames.T)))
    COVs[speaker] = [diagonal_cov.copy() for _ in range(M)]
    # Initial weights: uniform over the components.
    Ws[speaker] = np.full(M, 1.0 / M)

# 30 rounds; in every round each model gets one ilib.train_gmm update.
for em_round in range(30):
    for speaker in range(1, 32):
        # TTL is the total log likelihood reported by train_gmm for this update.
        updated = ilib.train_gmm(train[speaker], Ws[speaker], MUs[speaker], COVs[speaker])
        Ws[speaker], MUs[speaker], COVs[speaker], TTL = updated
        print(f'Iteration: {em_round} Total log likelihood: {TTL} for person {speaker}')

# Not referenced anywhere else in this cell; kept for code outside of it.
P_t = 0.5
P_n = 1.0 - P_t

score = []
correct = 0
total = 0


def _prepare_dev_features(recording):
    """Apply the enabled feature transforms to a copy of one dev recording."""
    feats = recording.copy()

    if coefficients_normalization:
        feats = min_max_normalize(feats)

    if delta_coefficients_enabled:
        deltas = compute_deltas(feats, window_size=2)
        delta_deltas = compute_deltas(deltas, window_size=2)
        feats = np.concatenate((feats, deltas, delta_deltas), axis=1)

    if cepstral_mean_subtraction_enabled:
        feats = cepstral_mean_subtraction(feats)

    return feats


speaker_ids = range(1, 32)
for true_class in speaker_ids:
    for dev_p_i in dev[true_class]:
        feats = _prepare_dev_features(dev_p_i)

        # One score per model: the frame log-densities summed over the recording.
        likelihoods = np.array([
            ilib.logpdf_gmm(feats, Ws[s], MUs[s], COVs[s]).sum()
            for s in speaker_ids
        ])

        # Position 0 belongs to speaker 1, hence the +1.
        predicted_class = np.argmax(likelihoods) + 1

        total += 1
        if predicted_class == true_class:
            correct += 1

accuracy = correct / total
print(f"Fraction of correctly recognized targets: {accuracy * 100}%")