# INSTALLATION

In [217]:
%pip install evaluate

Note: you may need to restart the kernel to use updated packages.


# IMPORT

In [218]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# neural network
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim

# data download (hugging face)
from datasets import load_dataset, Audio
from huggingface_hub import login

# data preprocessing
from transformers import Wav2Vec2Processor
from torch.utils.data import DataLoader

# evaluation
import evaluate

In [None]:
# use o logging da conta do seu hugging face

#login('xxxxx')

# DEVICE

In [220]:
# hardware status
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

# NETWORK

In [221]:
# first version speed-text model network
class STN1(nn.Module):
    def __init__(self, vocab_size):
        super(STN1, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=(1, 10))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(1, 10))
        self.drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, vocab_size)
    def forward(self, x):
        # Ensure the input tensor has 4 dimensions
        if x.dim() == 3:
            x = x.unsqueeze(1)
        
        x = f.relu(self.conv1(x))
        x = f.relu(self.conv2(x))
        x = self.drop(x)

        # Dynamically calculate the flattened size
        batch_size = x.size(0)
        flattened_size = x.size(1) * x.size(2) * x.size(3)
        x = x.view(batch_size, -1)
        
        x = f.relu(self.fc1(x))
        return f.log_softmax(self.fc2(x), dim=1)

# DATA

## preprocessing

In [222]:
# processor for input (audio signal) and output (text) conversion
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')

processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/wav2vec2-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("<s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("</s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3: AddedToken("<unk>", rstrip=True, lstrip=Tru

In [223]:
# load mozilla foundation, commom voice v11, training dataset from hugging face
ds_train = load_dataset('mozilla-foundation/common_voice_11_0', 'en', split='train', streaming=True)

ds_train

IterableDataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_shards: 24
})

In [224]:
# method to process each row of dataset, audio array into dimensional features and text into numbers
def prepare_batch(batch):
    input_values = processor(batch['audio']['array'], sampling_rate=16_000).input_values[0]
    labels = processor.tokenizer(batch['sentence'], return_attention_mask=False).input_ids
    batch['input_values'] = input_values
    batch['labels'] = labels
    return batch

In [225]:
# apply method above and align audio sampling rate with the method
ds_train = ds_train.cast_column('audio', Audio(sampling_rate=16_000))
ds_train = ds_train.map(prepare_batch)

ds_train

IterableDataset({
    features: Unknown,
    num_shards: 24
})

## loader

In [226]:
"""# extraction of batch input and label values into dataloader streamline
def collate_fn(batch):
    # extract target cols
    input_values = [item['input_values'] for item in batch]
    labels = [item['labels'] for item in batch]

    # if tensor, convert to numpy first
    if isinstance(input_values[0], torch.Tensor):
        input_values = [x.tolist() for x in input_values]

    # calculate padding, to align all element into same length
    max_len = max((len(x) if isinstance(x, list) else x.shape[0]) for x in input_values)
    padded_inputs = [x + [0] * (max_len - len(x)) for x in input_values]
    input_tensor = torch.tensor(padded_inputs, dtype=torch.float32)

    # calculate padding, to align all element into same length
    max_len2 = max(len(label) for label in labels)
    padded_labels = [label + [processor.tokenizer.eos_token_id] * (max_len2 - len(label)) for label in labels]
    label_tensor = torch.tensor(padded_labels)
    
    return input_tensor, label_tensor"""

"# extraction of batch input and label values into dataloader streamline\ndef collate_fn(batch):\n    # extract target cols\n    input_values = [item['input_values'] for item in batch]\n    labels = [item['labels'] for item in batch]\n\n    # if tensor, convert to numpy first\n    if isinstance(input_values[0], torch.Tensor):\n        input_values = [x.tolist() for x in input_values]\n\n    # calculate padding, to align all element into same length\n    max_len = max((len(x) if isinstance(x, list) else x.shape[0]) for x in input_values)\n    padded_inputs = [x + [0] * (max_len - len(x)) for x in input_values]\n    input_tensor = torch.tensor(padded_inputs, dtype=torch.float32)\n\n    # calculate padding, to align all element into same length\n    max_len2 = max(len(label) for label in labels)\n    padded_labels = [label + [processor.tokenizer.eos_token_id] * (max_len2 - len(label)) for label in labels]\n    label_tensor = torch.tensor(padded_labels)\n    \n    return input_tensor, labe

In [227]:
import torch
import numpy as np

def collate_fn(batch):
    # Extract input_values and labels from the batch
    input_values = [item['input_values'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Convert tensors to lists if necessary
    if isinstance(input_values[0], torch.Tensor):
        input_values = [x.numpy() for x in input_values]  # Convert to NumPy for efficient padding

    # Pad input_values to the same length
    max_len = max(len(x) for x in input_values)
    padded_inputs = np.array([np.pad(x, (0, max_len - len(x)), mode='constant', constant_values=0) for x in input_values])
    input_tensor = torch.tensor(padded_inputs, dtype=torch.float32)

    # Pad labels to the same length
    max_len2 = max(len(label) for label in labels)
    padded_labels = np.array([np.pad(label, (0, max_len2 - len(label)), mode='constant', constant_values=processor.tokenizer.eos_token_id) for label in labels])
    label_tensor = torch.tensor(padded_labels, dtype=torch.long)

    return input_tensor, label_tensor

In [228]:
# invoke dataloader with parameters above
train_loader = DataLoader(ds_train, batch_size=4, collate_fn=collate_fn)

train_loader

<torch.utils.data.dataloader.DataLoader at 0x7d9b7140ee90>

# TRAIN

## setup

In [229]:
# invoke speech-text first version model
stn1 = STN1(vocab_size=len(processor.tokenizer)).to(device)

stn1

STN1(
  (conv1): Conv2d(1, 10, kernel_size=(1, 10), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(1, 10), stride=(1, 1))
  (drop): Dropout2d(p=0.5, inplace=False)
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=32, bias=True)
)

In [230]:
# model training elements
optimizer = optim.Adam(stn1.parameters(), lr=1e-4)
criterion = nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

optimizer, criterion

(Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.0001
     maximize: False
     weight_decay: 0
 ),
 CTCLoss())

In [231]:
# load checkpoint elements, if exists
checkpoint_pth = '/kaggle/working/checkpoint'
epoch_pth = '/kaggle/working/epoch'

start_epoch = 0
if os.path.exists(checkpoint_pth):
    stn1.load_state_dict(torch.load(checkpoint_pth))
    with open(epoch_pth, 'r') as f:
        start_epoch = int(f.read())

## training

In [232]:
# loop
final_epoch = 5 # train all dataset for 5 times
save_interval = 100 # save every 100 rows or batches

for epoch in range(start_epoch, final_epoch):
    stn1.train() # activate stn1 model training mode
    total_loss = 0.0 # new training loss counting
    
    for idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad() # clean the gradient
        
        outputs = stn1(inputs.unsqueeze(1)) # add channel dimension
        
        inputs_lengths = torch.full((outputs.size(0), ), outputs.size(2), dtype=torch.long) # new len
        target_lengths = torch.tensor([len(label) for label in labels], dytpe=torch.long) # new len

        loss = criterion(outputs.permute(2, 0, 1), labels, inputs_lengths, target_lengths) # loss
        
        loss.backward()
        optimizer.step() # update loss above
        total_loss += loss.item() # add loss above
        
        if idx % save_interval == 0: # every save interval, save the checkpoint and epoch
            print(f'Epoch {epoch+1} - Batch {idx+1}, Loss: {total_loss/save_interval:.4f}')
            total_loss = 0.0

            torch.save(stn1.state_dict(), checkpoint_pth)
            with open(epoch_pth, 'w') as f:
                f.write(str(epoch))
            print('STN1 model checkpoint saved!')
                        
    print(f'Epoch {epoch} completed!')

Reading metadata...: 948736it [00:14, 64822.52it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x2403480 and 320x50)

In [None]:
stn1.eval()

In [None]:
torch.save(stn1.state_dict(), 'kaggle/working/stn1_v1.pth')

In [None]:
print('STN1 training finished!')