# IMPORT

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

# STORAGE

In [3]:
!df -h /kaggle/working

Filesystem      Size  Used Avail Use% Mounted on
/dev/loop1       20G  156K   20G   1% /kaggle/working


# DATA

## checking remote dataset

In [4]:
from huggingface_hub import HfApi
from huggingface_hub import login

login("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")

# print out total memory size
def print_dataset_file_sizes(repo_id):
    api = HfApi()
    dataset_info = api.dataset_info(repo_id=repo_id, files_metadata=True)

    total_size_bytes = 0  
    print(f"File sizes for dataset '{repo_id}/en':")  
    for sibling in dataset_info.siblings:  
        #filename = sibling.rfilename  
        #print(filename.split('/'))
        #filename_array = filename.split('/')
        #if len(filename_array) > 1 and filename_array[1] == 'en':
        size_in_bytes = sibling.size or 0  
        total_size_bytes += size_in_bytes  
        #size_mb = size_in_bytes / (1024 * 1024)  
        #print(f"  {filename}: {size_mb:.2f} MiB")
        """else:
            size_in_bytes = sibling.size or 0  
            total_size_bytes = size_in_bytes"""

    total_size_gb = total_size_bytes / (1024 ** 3)  
    print(f"\nTotal size: {total_size_gb:.2f} GiB")

print_dataset_file_sizes('mozilla-foundation/common_voice_17_0')

File sizes for dataset 'mozilla-foundation/common_voice_17_0/en':

Total size: 967.10 GiB


## dataset loading

In [5]:
from datasets import load_dataset

ds_train = load_dataset(
    "mozilla-foundation/common_voice_17_0", 'en', split="train", 
    streaming=True, trust_remote_code=True)

README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

## preprocess dataset

In [6]:
import torchaudio.transforms as T
import librosa
from transformers import Wav2Vec2Processor

TARGET_SR = 16000

mel_transform = T.MelSpectrogram(sample_rate=TARGET_SR, n_mels=128, hop_length=160, n_fft=1024)
facebook_proc = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

def preprocess_ds(example):
    audio_array = example['audio']['array']
    sample_rate = example['audio']['sampling_rate']
    if sample_rate != TARGET_SR:
        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=TARGET_SR)
    audio_tensor = torch.tensor(audio_array).unsqueeze(0).float()
    audio_shape = audio_tensor.shape
    if len(audio_shape) != 2 or audio_shape[0] != 1:
        raise ValueError(
            f'Unexpected audio shape: {audio_shape} -> Expected audio shape: (1, num_samples)'
        )
    mel_spec = mel_transform(audio_tensor).squeeze(0)
    mel_dim_shape = mel_spec.shape[0]
    if mel_dim_shape != 128:
        raise ValueError(
            f'Unexpected mel shape: {mel_dim_shape} -> Expected mel shape: (128,)'
        )
    sent_tok = facebook_proc.tokenizer(example['sentence']).input_ids
    return {
        'mel_spec': mel_spec,
        'sent_tok': torch.tensor(sent_tok),
    }

ds_train_processed = ds_train.map(preprocess_ds)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

## check number of classes (vocabulary size)

In [7]:
vocab = facebook_proc.tokenizer.get_vocab()

print("Number of classes (vocabulary size): ", len(vocab))
print("Samples words (first 50 words): ", list(vocab.keys())[:50])

Number of classes (vocabulary size):  32
Samples words (first 50 words):  ['<pad>', '<s>', '</s>', '<unk>', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z']


## creating dataloader

In [8]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    mel_spec_batch = []
    sent_tok_batch = []
    for item in batch:
        mel_spec_batch.append((item['mel_spec']).permute(1, 0).clone().detach())
        sent_tok_batch.append((item['sent_tok']).clone().detach())
    mel_length = torch.tensor([v.shape[0] for v in mel_spec_batch])
    tok_length = torch.tensor([len(v) for v in sent_tok_batch])
    #for idx, mel in enumerate(mel_spec_batch):
    #    print(f'{idx}, mel_dim_shape: {mel.shape}')
    mel_spec_padded = pad_sequence(mel_spec_batch, batch_first=True, padding_value=0)
    sent_tok_batch = pad_sequence(sent_tok_batch, batch_first=True, padding_value=0)
    #print(f"mel_spec_padded type: {type(mel_spec_padded)}, shape: {mel_spec_padded.shape}")
    return mel_spec_padded.permute(0, 2, 1), sent_tok_batch, mel_length, tok_length

data_loader = DataLoader(ds_train_processed, batch_size=16, collate_fn=collate_fn)

## check a single batch from dataloader

In [9]:
"""# Iterate over one batch
for batch_idx, (mel_spec_padded, sent_tok_padded) in enumerate(data_loader):
    print(f"Batch {batch_idx + 1}:")
    
    # Inspect the shape of the mel spectrogram and tokenized sentences
    print(f"  mel_spec_padded shape: {mel_spec_padded.shape}")
    print(f"  sent_tok_padded shape: {sent_tok_padded.shape}")

    # Inspect the first sample (if batch_size > 1)
    if mel_spec_padded.shape[0] > 0:
        print(f"  Sample 0 mel_spec: {mel_spec_padded[0]}")
        print(f"  Sample 0 sent_tok: {sent_tok_padded[0]}")

    # Break after inspecting the first batch
    if batch_idx > 1:
        break"""

'# Iterate over one batch\nfor batch_idx, (mel_spec_padded, sent_tok_padded) in enumerate(data_loader):\n    print(f"Batch {batch_idx + 1}:")\n    \n    # Inspect the shape of the mel spectrogram and tokenized sentences\n    print(f"  mel_spec_padded shape: {mel_spec_padded.shape}")\n    print(f"  sent_tok_padded shape: {sent_tok_padded.shape}")\n\n    # Inspect the first sample (if batch_size > 1)\n    if mel_spec_padded.shape[0] > 0:\n        print(f"  Sample 0 mel_spec: {mel_spec_padded[0]}")\n        print(f"  Sample 0 sent_tok: {sent_tok_padded[0]}")\n\n    # Break after inspecting the first batch\n    if batch_idx > 1:\n        break'

 # NETWORK

## model 1

In [10]:
import torch.nn as nn
import math

class FeatureEncoder(nn.Module):
    """CNN"""
    def __init__(self, in_channels=128, out_channels=512):
        super().__init__()
        self.conv_layers = nn.Sequential(
            # block 1
            nn.Conv1d(in_channels, 128, kernel_size=3, stride=1, padding=1), 
            nn.BatchNorm1d(128), nn.ReLU(),
            # block 2
            nn.Conv1d(128, 128, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm1d(128), nn.ReLU(),
            nn.Conv1d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(128), nn.ReLU(),
            # block 3
            nn.Conv1d(128, 256, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm1d(256), nn.ReLU(),
            nn.Conv1d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(256), nn.ReLU(),
            # block 4
            nn.Conv1d(256, 512, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm1d(512), nn.ReLU(),
            nn.Conv1d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(512), nn.ReLU(),
        )
    def forward(self, x):
        return self.conv_layers(x)

class PositionalEncoder(nn.Module):
    """Sin/Cos Logic Postional"""
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp((torch.arange(0, d_model, 2)) * (-math.log(10000.0)/d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerEncoder(nn.Module):
    """Transformer"""
    def __init__(self, input_dim=512, num_head=8, ff_dim=2048, n_layers=12, dropout=0.1):
        super().__init__()
        self.pos_encoder = PositionalEncoder(d_model=input_dim, dropout=dropout)
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_head,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=False,
            activation=nn.GELU(),
        )
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=n_layers)
    def forward(self, x, src_key_padding_mask=None):
        x = self.pos_encoder(x)
        return self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

class SpeechTextNeural(nn.Module):
    """Speech to Text Neural"""
    def __init__(
        self, 
        vocab_size, 
        input_dim=512, 
        ff_dim=2048, 
        transformer_layers=12, 
        transformer_heads=8):
        super().__init__()
        self.encoder = FeatureEncoder()
        self.transformer = TransformerEncoder(
            input_dim, 
            transformer_heads, 
            ff_dim, 
            transformer_layers
        )
        self.fc1 = nn.Linear(input_dim, vocab_size)
    def forward(self, x, mel_length=None):
        #print(f'before encoder: {x.shape}')
        x = self.encoder(x)
        if mel_length is not None:
            reduced_length = mel_length // 8
            max_len = x.size(2)
            mask = torch.arange(max_len).expand(
                len(reduced_length), max_len) >= reduced_length.unsqueeze(1)
        else:
            mask = None    
        #print(f'after encoder: {x.shape}')
        x = x.permute(2, 0, 1)
        #print(f'before transformer: {x.shape}')
        x = self.transformer(x)
        #print(f'after transformer: {x.shape}')
        x = x.permute(1, 0, 2)
        #print(f'before full connector: {x.shape}')
        x = self.fc1(x)
        #print(f'after full connector: {x.shape}')
        return x

vocab_length = len(facebook_proc.tokenizer.get_vocab())
model1 = SpeechTextNeural(vocab_length)



In [11]:
"""# Test one batch
mel, tokens, mel_lengths, token_lengths = next(iter(data_loader))
print(f"Mel: {mel.shape}, Tokens: {tokens.shape}")
print(f"Mel lengths: {mel_lengths}, Token lengths: {token_lengths}")

logits = model1(mel, mel_lengths)
print(f"Logits shape: {logits.shape}")  # Should be [16, time//8, vocab_size]

# Verify length requirements
assert (mel_lengths // 8).ge(token_lengths).all()"""

'# Test one batch\nmel, tokens, mel_lengths, token_lengths = next(iter(data_loader))\nprint(f"Mel: {mel.shape}, Tokens: {tokens.shape}")\nprint(f"Mel lengths: {mel_lengths}, Token lengths: {token_lengths}")\n\nlogits = model1(mel, mel_lengths)\nprint(f"Logits shape: {logits.shape}")  # Should be [16, time//8, vocab_size]\n\n# Verify length requirements\nassert (mel_lengths // 8).ge(token_lengths).all()'

In [12]:
from torchinfo import summary

batch_size=16
mel_features = 128
samples = 16_000

summary(model1, input_size=(batch_size, mel_features, samples))

Layer (type:depth-idx)                                  Output Shape              Param #
SpeechTextNeural                                        [16, 2000, 32]            --
├─FeatureEncoder: 1-1                                   [16, 512, 2000]           --
│    └─Sequential: 2-1                                  [16, 512, 2000]           --
│    │    └─Conv1d: 3-1                                 [16, 128, 16000]          49,280
│    │    └─BatchNorm1d: 3-2                            [16, 128, 16000]          256
│    │    └─ReLU: 3-3                                   [16, 128, 16000]          --
│    │    └─Conv1d: 3-4                                 [16, 128, 8000]           49,280
│    │    └─BatchNorm1d: 3-5                            [16, 128, 8000]           256
│    │    └─ReLU: 3-6                                   [16, 128, 8000]           --
│    │    └─Conv1d: 3-7                                 [16, 128, 8000]           49,280
│    │    └─BatchNorm1d: 3-8                  

# TRAIN

## training

In [13]:
def save_model(epoch, model, optimizer, training_losses):
    
    path='model'
    os.makedirs(path, exist_ok=True)
    existing_files = [
        f for f in os.listdir(path) if f.endswith('.pth') and f.startswith('model')]
    versions = []
    
    for file in existing_files:
        filename = file.replace('.pth', '').split('_')
        if len(filename) > 1 and filename[-1].isdigit():
            versions.append(int(filename[-1]))
    new_version = max(versions)+1 if versions else 1

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'training_losses': training_losses,
    }
    
    new_path = os.path.join(path, f'model_50percent_{new_version}.pth')
    torch.save(checkpoint, new_path)
    print(f'Model saved at {new_path}!')

In [14]:
"""import shutil

shutil.rmtree('/kaggle/working/model')"""

"import shutil\n\nshutil.rmtree('/kaggle/working/model')"

In [15]:
import torch.optim as optim

optimizer = optim.Adam(model1.parameters(), lr=1e-3)
criterion = nn.CTCLoss(blank=0, zero_infinity=True)

In [16]:
import itertools

def train(model, dataloader, epochs=10):
    
    model.train().to(device)
    training_losses = []
    #batches_acc = []
    
    for epoch in range(epochs):
        
        total_loss = 0.
        total_accuracies = 0.
        total_pred_label = 0.
        
        # calculate the number of batches to process (50% of the dataset)
        num_batches = int(0.5 * len(dataloader.dataset)) if hasattr(dataloader.dataset, '__len__') else 1500
        
        for idx, (mel, tok, input_length, label_length) in enumerate(itertools.islice(dataloader, num_batches)):
            
            #print(f'BATCH {idx} -----------------')
            optimizer.zero_grad()
            
            outputs = model(mel.to(device))
            log_probs = outputs.log_softmax(2).permute(1, 0 ,2) # (N ,T, C) -> (T, N , C)
            
            #print(f"log_probs shape: {log_probs.shape}")  # Should be (T, N, C)
            #print(f"tok shape: {tok.shape}")  # Should be (sum(target_lengths),)
            #print(f'input_length')
            #print(f"input_length shape: {input_length.shape}")  # Should be (N,)
            #print(f"label_length shape: {label_length.shape}")  # Should be (N,)
            
            downsampling_factor = mel.shape[-1] / log_probs.shape[0] # mel divide by predict sentences
            input_length = (input_length / downsampling_factor).floor().long()
            loss = criterion(
                log_probs, tok.to(device), input_length.to(device), label_length.to(device)
            )
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            training_losses.append(loss.item())
            
            #prediction, prediction2 = log_probs.max(2)
            #print('THIS ', tok.shape)
            #print(prediction.shape)
            #print(prediction2.shape)
            #break
            #total_accuracies += (prediction == tok.to(device)).sum().item()
            #total_pred_label += tok.size(0) * tok.size(1)
            
            #if idx % 99 == 0:
            #    save_model(model)
                
        #acc = total_accuracies / total_pred_label * 100.0
        #batches_acc.append(acc)

        print(f'Epoch {epoch+1}, Loss: {total_loss:.4f}.')

        save_model(epoch, model, optimizer, training_losses)
        print(f'Model of Epoch {epoch} saved!')
        
    return training_losses

In [17]:
training_losses = train(model1, data_loader, 10)

Reading metadata...: 1101170it [00:18, 58267.48it/s]


Epoch 1, Loss: 377.6626.
Model saved at model/model_50percent_1.pth!
Model of Epoch 0 saved!


Reading metadata...: 1101170it [00:19, 57854.76it/s]


Epoch 2, Loss: 370.9526.
Model saved at model/model_50percent_2.pth!
Model of Epoch 1 saved!


Reading metadata...: 1101170it [00:17, 61588.27it/s]


Epoch 3, Loss: 370.1924.
Model saved at model/model_50percent_3.pth!
Model of Epoch 2 saved!


Reading metadata...: 1101170it [00:18, 59023.49it/s]


Epoch 4, Loss: 369.9383.
Model saved at model/model_50percent_4.pth!
Model of Epoch 3 saved!


Reading metadata...: 1101170it [00:18, 59688.41it/s]


Epoch 5, Loss: 369.8207.
Model saved at model/model_50percent_5.pth!
Model of Epoch 4 saved!


Reading metadata...: 1101170it [00:19, 57810.24it/s]


Epoch 6, Loss: 369.8134.
Model saved at model/model_50percent_6.pth!
Model of Epoch 5 saved!


Reading metadata...: 1101170it [00:16, 66441.32it/s]


Epoch 7, Loss: 369.6544.
Model saved at model/model_50percent_7.pth!
Model of Epoch 6 saved!


Reading metadata...: 1101170it [00:18, 58119.56it/s]


Epoch 8, Loss: 369.6468.
Model saved at model/model_50percent_8.pth!
Model of Epoch 7 saved!


Reading metadata...: 1101170it [00:17, 64153.59it/s]


Epoch 9, Loss: 369.5677.
Model saved at model/model_50percent_9.pth!
Model of Epoch 8 saved!


Reading metadata...: 1101170it [00:18, 60169.66it/s]


Epoch 10, Loss: 369.5561.
Model saved at model/model_50percent_10.pth!
Model of Epoch 9 saved!


## plotting

# EVALUATION

# TEST