In [2]:
import sys
sys.path.append('..')

from utils.performanceMetrics import cer
from utils.performanceMetrics import wer
from utils.misc import IterMeter
from utils.misc import GreedyDecoder

import os
import pickle
import random
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
from pydub import AudioSegment, effects
import matplotlib.pyplot as plt
from datetime import datetime


import utils.transformations.textTransform as text_trans #????? 

import utils.networkConfigurations.SpeechRecognitionModel_0_0 as network

In [3]:
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram()

text_transform = text_trans.TextTransform()



In [4]:
def data_processing(data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, utterance) in data:
        if data_type == 'train':
            spec = train_audio_transforms(torch.from_numpy(waveform).float()).squeeze(0).transpose(0, 1)
        elif data_type == 'valid':
            spec = valid_audio_transforms(torch.from_numpy(waveform).float()).squeeze(0).transpose(0, 1)
        else:
            raise Exception('data_type should be train or valid')

        spectrograms.append(spec)
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

In [5]:
def avg_wer(wer_scores, combined_ref_len):
    return float(sum(wer_scores)) / float(combined_ref_len)

In [8]:
kwargs = {'num_workers': 1, 'pin_memory': True} 

hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": 5e-4,
        "batch_size": 30,
        "epochs": 100
    }



use_cuda = torch.cuda.is_available()

torch.manual_seed(7)

device = torch.device("cuda")





model = network.SpeechRecognitionModel(
            hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
            hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
            ).to(device)


print(model)



optimizer = torch.optim.Adam(model.parameters(), lr=hparams['learning_rate'])
    

criterion = nn.CTCLoss(blank=28).to(device)



scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                                steps_per_epoch=2000,
                                                epochs=hparams['epochs'],
                                                anneal_strategy='linear')

epochs = hparams['epochs']
iter_meter = IterMeter()
print(hparams)

SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,),

In [9]:
database_location = '../data/final/data_clean_batch_1.pickle'

print("---------------------------------------------------------------------")
print('Loading data from:'+database_location)
print("---------------------------------------------------------------------")
with open(database_location, 'rb') as handle:
        b = pickle.load(handle)


train_set = b[0:2000]
test_set = b[2000:2300]


---------------------------------------------------------------------
Loading data from:../data/final/data_clean_batch_1.pickle
---------------------------------------------------------------------


In [43]:
b[2000+3]

(array([[-0.00276384, -0.00298204, -0.00305477, ..., -0.0004364 ,
         -0.00138192, -0.00065459]]),
 'ME KINN ARIVE')

In [106]:
checkpoint = torch.load("../models/"+"model_20_06_2023_10_30_52_100.pt")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

In [107]:
test_1 = []
test_1.append(b[2000+3])
test_1.append(b[2000+14])
test_1.append(b[2000+45])
test_1.append(b[2000+58])
test_1.append(b[2000+73])
test_1.append(b[2000+83])
test_1.append(b[2000+98])
test_1.append(b[2000+117])
test_1.append(b[2000+173])
test_1.append(b[2000+209])
test_1.append(b[2000+241])
test_1.append(b[2000+285])
test_1.append(b[2000+297])

valid_loader = data.DataLoader(dataset=test_1,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'valid'),
                                **kwargs)

for i, _data in enumerate(valid_loader):
    print(i)
    spectrograms, labels, input_lengths, label_lengths = _data
    spectrograms, labels = spectrograms.to(device), labels.to(device)
    output = model(spectrograms)  # (batch, time, n_class)
    output = F.log_softmax(output, dim=2)
    output = output.transpose(0, 1) # (time, batch, n_class)
    decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
    print(decoded_preds)
    print(decoded_targets)
    
print("-------------------------------------------------------------------------------------")    
    
train_1 = []
train_1.append(b[19])
train_1.append(b[24])
train_1.append(b[25])
train_1.append(b[26])
train_1.append(b[30])
train_1.append(b[41])
train_1.append(b[50])
train_1.append(b[54])
train_1.append(b[63])
train_1.append(b[66])
train_1.append(b[72])
train_1.append(b[80])
train_1.append(b[92])


valid_loader = data.DataLoader(dataset=train_1,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'valid'),
                                **kwargs)

for i, _data in enumerate(valid_loader):
    print(i)
    spectrograms, labels, input_lengths, label_lengths = _data
    spectrograms, labels = spectrograms.to(device), labels.to(device)
    output = model(spectrograms)  # (batch, time, n_class)
    output = F.log_softmax(output, dim=2)
    output = output.transpose(0, 1) # (time, batch, n_class)
    decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
    print(decoded_preds)
    print(decoded_targets)

0
['me ki arive']
['me kinn arive']
1
[' bann a dolesan ex ekxppri zot zot zot mem par sa tip dey de viio lan s la']
['bann adolesan exprim exprim zot zot zot mem par sa tip violans la']
2
['kir i perdir par skann']
['ki ou pe rod dir par sote']
3
['omo pa mo pak kone tro kii imi kin li ki tpe ro dir mee boukou do fero stonnh zordi dan moris ki la poliszz zeskameler me pi demidiet ti ena bann kas la bro dan la renioni ti pe role ek don']
['bon mo pa mo pa kone tro ki ki manier li ki li ti pe rod dir me boukou de personn setonne zordi dan moris ki lapolis ziska ler depi ti ena bann case ladrog dan la renion ki ti pe roule ek donk']
4
['be dot pou bazin kapataa axsie']
['be zot pou bizin kapav travay lor la']
5
['bon swar']
['bon swar']
6
['ki enn zen donn lesa lnna t li pe devlopl pe degouver kouma fer sa']
['ki enn zen a adolesan ena ki li pe devlope pe dekouver kouma fer sa']
7
['no malerral man se komn sa ']
['ek malerezman li koumsa']
8
[' soubash gobin  ki konsultan ki enn zournali

In [None]:
prepocessed_data=b[2000:2300]