In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# !pip install librosa
import librosa
import torchaudio

In [3]:
# Dataset locations. TXT_DIR and AUDIO_DIR each contain one sub-folder per speaker
# (p225, p226, ...); transcripts and recordings are looked up as
# <speaker_id>/<passage_id>.txt and <speaker_id>/<passage_id>.wav respectively.
BASE_DIR  = '/kaggle/input/english-multispeaker-corpus-for-voice-cloning/VCTK-Corpus/VCTK-Corpus'
TXT_DIR =  os.path.join(BASE_DIR, 'txt')
AUDIO_DIR = os.path.join(BASE_DIR, 'wav48')

# SAMPLING_RATE * MAX_DURATION // SR_DOWNSAMPLE is the sample-count cut-off used by
# collate_fn: clips with at least that many (downsampled) samples are dropped.
# NOTE(review): 22050 is the rate librosa.load reports in the exploration cell, but
# the dataset class loads audio with torchaudio.load and ignores the returned rate;
# the 'wav48' folder name suggests the files may be 48 kHz -- confirm, because that
# would make the effective duration cut-off shorter than MAX_DURATION.
SAMPLING_RATE = 22050
# Presumably seconds (it is multiplied by a rate in Hz) -- TODO confirm.
MAX_DURATION = 8
# Stride used to decimate the waveform (every SR_DOWNSAMPLE-th sample is kept).
SR_DOWNSAMPLE = 2
# When True, train() tries to restore model/optimizer/scaler state before training.
LOAD_CHECKPOINT = False

In [4]:
for file in sorted(os.listdir(TXT_DIR))[:10]:
    print(file)

p225
p226
p227
p228
p229
p230
p231
p232
p233
p234


In [5]:
for file in sorted(os.listdir(AUDIO_DIR))[:10]:
    print(file)

p225
p226
p227
p228
p229
p230
p231
p232
p233
p234


In [6]:
len(os.listdir(AUDIO_DIR)), len(os.listdir(TXT_DIR))

(109, 108)

- One speaker has audio but no transcripts (109 audio folders vs. 108 text folders). Good: that speaker can be held out for testing later.

In [7]:
speaker_ids = sorted(os.listdir(TXT_DIR))
print('number of speakers is', len(speaker_ids))

number of speakers is 108


## sample Audio and text exploration 

In [8]:
def get_speech(speaker_id = None, passage_id = None):
    """Load one utterance (waveform + transcript) for exploration.

    Args:
        speaker_id: Speaker folder name such as 'p225'. When None, a speaker is
            drawn at random from `speaker_ids` on *every call*. (The previous
            default, `np.random.choice(speaker_ids)`, was evaluated only once,
            when the function was defined, so the "random" speaker never changed.)
        passage_id: Passage name without extension, e.g. 'p225_001'. When falsy,
            a random transcript of the chosen speaker is picked.

    Returns:
        Tuple `(x, sr, text, speech_path)`: the samples and sampling rate returned
        by `librosa.load` (default settings), the raw transcript text, and the
        path of the audio file.
    """
    if speaker_id is None:
        speaker_id = np.random.choice(speaker_ids)

    if not passage_id:
        speaker_passage_path = os.path.join(TXT_DIR, speaker_id)
        random_transcript = np.random.choice(os.listdir(speaker_passage_path))
        # Strip the extension properly instead of assuming it is 4 characters long.
        passage_id = os.path.splitext(random_transcript)[0]

    text_path = os.path.join(TXT_DIR, speaker_id, passage_id + '.txt')
    speech_path = os.path.join(AUDIO_DIR, speaker_id, passage_id + '.wav')

    x, sr = librosa.load(speech_path)

    with open(text_path, 'r') as text_file:
        text = text_file.read()

    return x, sr, text, speech_path
        
        
x, sr, text, speech_path = get_speech()
print('sampling rate', sr, 'hz')
print('number of samples', len(x))
print('duration', round(len(x)/sr, 2), 'seconds')

sampling rate 22050 hz
number of samples 48988
duration 2.22 seconds


In [9]:
import IPython.display as ipd
print(text)
ipd.Audio(speech_path)

Everything happened so quickly.


## organizing dataset retrieval paths into a dataframe

In [10]:
import re

In [11]:
# For every speaker that has transcripts, list the audio files in numeric passage
# order (the last run of digits in the file name is the passage number) and keep
# the names without their 4-character '.wav' extension.
speakers_passages = []
for speaker in speaker_ids:
    speaker_passages_path = os.path.join(AUDIO_DIR, speaker)
    # Raw string: '\d' inside a normal string literal is an invalid escape sequence.
    speaker_passages = sorted(os.listdir(speaker_passages_path), key = lambda x : int(re.findall(r'[\d]+', x)[-1]))
    speakers_passages.append([passage[:-4] for passage in speaker_passages])

In [12]:
data = { speaker_id : passages for speaker_id, passages in zip(speaker_ids, speakers_passages)}

In [13]:
data_samples = []
for speaker_id, passages in data.items():
    for passage in passages:
        data_samples.append((speaker_id, passage))
print(data_samples[:10])

[('p225', 'p225_001'), ('p225', 'p225_002'), ('p225', 'p225_003'), ('p225', 'p225_004'), ('p225', 'p225_005'), ('p225', 'p225_006'), ('p225', 'p225_007'), ('p225', 'p225_008'), ('p225', 'p225_009'), ('p225', 'p225_010')]


In [14]:
df = pd.DataFrame(data_samples, columns = ['speaker_id', 'passage_id'])

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset

from tqdm import tqdm

## what characters appear in the data

In [16]:
# character_set = set()
# possibly_interesting = []

# for i in tqdm(range(len(df))):
#     speaker_id, passage_id = df.iloc[i].values
#     text_path = os.path.join(TXT_DIR, speaker_id, passage_id + '.txt')
    
#     with open(text_path, 'r') as text_file:
#         text = text_file.read()
#     if re.findall('[\:\!\(\)\-\"]', text):
#         possibly_interesting.append(text)
        
#     character_set = character_set.union(set([char.lower() for char in text]))

In [17]:
# character_set

In [18]:
# Symbols the recognizer is allowed to emit: lowercase letters, space,
# apostrophe and comma. Everything else is filtered out of the transcripts.
character_set = list('abcdefghijklmnopqrstuvwxyz') + [' ', '\'', ',']

# Symbol -> integer code. Codes start at 1 so that 0 stays free (it is used as the
# CTC blank further down); -1 is the padding marker.
character_encodings = {
    r'<\p>': -1,
    **{symbol: code for code, symbol in enumerate(character_set, start = 1)},
}

# Integer code -> symbol, the exact inverse of the table above.
inverse_encodings = {code: symbol for symbol, code in character_encodings.items()}

In [19]:
# print(possibly_interesting[np.random.choice(len(possibly_interesting))])

# dataset and dataloader

In [20]:
class VTCK(Dataset):
    """Map-style dataset of (waveform, encoded transcript) pairs.

    The index is built once in `__init__`: every `<speaker>/<passage>.wav` under
    `AUDIO_DIR` that has a matching `<speaker>/<passage>.txt` under `TXT_DIR`
    becomes one row of `self.df` (columns `speaker_id`, `passage_id`). Audio
    without a transcript is skipped silently.

    Args:
        TXT_DIR: Root folder of the per-speaker transcript folders.
        AUDIO_DIR: Root folder of the per-speaker audio folders.
        sr_downsample: Stride used to decimate the waveform in `__getitem__`
            (every `sr_downsample`-th sample of the first channel is kept).
            NOTE(review): plain striding applies no low-pass filter -- confirm
            that aliasing is acceptable for this model.
    """

    def __init__(self, TXT_DIR, AUDIO_DIR, sr_downsample = 2):
        self.txt_dir = TXT_DIR
        self.audio_dir = AUDIO_DIR
        self.sr_downsample = sr_downsample

        # os.listdir returns entries in arbitrary order; sorting makes the sample
        # index reproducible. Non-directory entries cannot be speakers.
        self.speakers = sorted(
            entry for entry in os.listdir(AUDIO_DIR)
            if os.path.isdir(os.path.join(AUDIO_DIR, entry))
        )

        data_samples = []
        for speaker_id in self.speakers:
            speaker_passages_path = os.path.join(self.audio_dir, speaker_id)
            # Only '.wav' files can be samples; ignoring the rest keeps a stray file
            # without digits in its name from breaking the numeric sort below.
            wav_files = [name for name in os.listdir(speaker_passages_path) if name.endswith('.wav')]
            # Order by passage number = last run of digits in the file name.
            wav_files.sort(key = lambda name : int(re.findall(r'\d+', name)[-1]))

            for wav_file in wav_files:
                passage = wav_file[:-len('.wav')]
                text_path = os.path.join(self.txt_dir, speaker_id, passage + '.txt')
                speech_path = os.path.join(self.audio_dir, speaker_id, wav_file)

                if os.path.isfile(text_path) and os.path.isfile(speech_path):
                    data_samples.append((speaker_id, passage))

        self.df = pd.DataFrame(data_samples, columns = ['speaker_id', 'passage_id'])

    def __len__(self):
        """Number of (audio, transcript) pairs found on disk."""
        return len(self.df)

    def __getitem__(self, index):
        """Return `(speech, text)` for row `index`.

        `speech` is the first channel returned by `torchaudio.load`, decimated by
        `self.sr_downsample`. `text` is a 1-D long tensor of character codes: the
        transcript is lowercased, characters outside the encoding table are
        dropped, and surrounding whitespace is stripped.
        """
        speaker_id, passage_id = self.df.iloc[index].values

        text_path = os.path.join(self.txt_dir, speaker_id, passage_id + '.txt')
        speech_path = os.path.join(self.audio_dir, speaker_id, passage_id + '.wav')

        # The sampling rate reported by torchaudio is not used.
        speech, _ = torchaudio.load(speech_path)

        # Undecodable bytes could only become characters that are filtered out
        # below anyway, so ignoring them is safe.
        with open(text_path, 'r', encoding = 'utf-8', errors = 'ignore') as text_file:
            text = text_file.read().lower()

        # Every single-character key of character_encodings is a member of
        # character_set, so this keeps exactly the characters of character_set.
        text = ''.join(char for char in text if char in character_encodings).strip()
        encoded_text = [character_encodings[char] for char in text]

        # Explicit dtype: an empty transcript would otherwise give a float tensor.
        return speech[0][::self.sr_downsample], torch.tensor(encoded_text, dtype = torch.long)

    
def collate_fn(batch, max_num_samples = None):
    """Pad a list of `(speech, text)` samples into batch tensors.

    Clips whose length is not strictly below `max_num_samples` are dropped from
    the batch; the remaining clips are right-padded with zeros and their
    transcripts right-padded with -1 (the padding code).

    Args:
        batch: List of `(speech, text)` pairs of 1-D tensors, as produced by the
            dataset's `__getitem__`.
        max_num_samples: Length cut-off in samples. When None it is computed as
            `(SAMPLING_RATE * MAX_DURATION) // SR_DOWNSAMPLE` from the notebook's
            configuration constants.

    Returns:
        `(out_speech, out_text, target_lengths)` with shapes `(B, 1, L_speech)`,
        `(B, L_text)` and `(B,)`, where `B` is the number of clips kept and
        `target_lengths` holds the unpadded transcript lengths. If every clip is
        too long, empty tensors (`B == 0`) are returned instead of crashing inside
        `torch.cat`; callers should skip such batches.
    """
    if max_num_samples is None:
        max_num_samples = ( SAMPLING_RATE * MAX_DURATION ) // (SR_DOWNSAMPLE)

    # Filter once, then derive the padding targets from the surviving clips only.
    kept = [(speech, text) for speech, text in batch if len(speech) < max_num_samples]

    if not kept:
        return (
            torch.zeros(0, 1, 0),
            torch.zeros(0, 0, dtype = torch.long),
            torch.zeros(0, dtype = torch.long),
        )

    max_len_speech = max(len(speech) for speech, _ in kept)
    max_len_text = max(len(text) for _, text in kept)

    out_speech = torch.cat(
        [
            F.pad(speech.reshape(1, 1, -1), pad = (0, max_len_speech - len(speech)), value = 0.0)
            for speech, _ in kept
        ],
        dim = 0,
    )
    out_text = torch.cat(
        [
            F.pad(text.reshape(1, -1), pad = (0, max_len_text - len(text)), value = -1)
            for _, text in kept
        ],
        dim = 0,
    )
    target_lengths = torch.tensor([len(text) for _, text in kept])

    return out_speech, out_text, target_lengths
        

In [21]:
# ds = VTCK(TXT_DIR, AUDIO_DIR)

In [22]:
# dl = DataLoader(ds, batch_size = 4, collate_fn =collate_fn, shuffle = True, num_workers = 1)

In [23]:
# sample = next(iter(dl))

## sample batch

In [24]:
# i = 0
# print("*"*40)
# print('audio first and last elements\n')
# print("*"*40)
# print('\t', sample[0][i][:10])
# print('\t', sample[0][i][-10:])
# print("*"*40)
# print('\nencodings\n')
# print("*"*40)
# print('\t', sample[1][i])
# print("*"*40)
# print('\ntext\n')
# print("*"*40)
# print('\t', "".join([inverse_encodings[index.item()] for index in sample[1][i]]))
# print("*"*40)
# print('\nof length', sample[2][i].item())
# print("*"*40)

## trying out the CTC loss

In [25]:
# loss = nn.CTCLoss(blank = 0)
# N = 4
# T = 70 # input sequence length
# vocab = len(character_encodings) - 1 # number of classes (excluding padding)

# x = torch.randn(T, N, vocab + 1).log_softmax(2) # output of the model (TODO)

# y = sample[1]

# input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) # same
# target_lengths = sample[2] # original lengths of text to take loss over, to ignore paddings

# loss(x, y, input_lengths, target_lengths)

## model

### CNN + RNN for simplicity

In [26]:
from torch import Tensor
from typing import List

class ConvNextStem(nn.Sequential):
    """Stem of the encoder: a kernel-4, stride-4 1-D convolution followed by
    batch normalization over the produced channels.
    """
    def __init__(self, in_features: int, out_features: int):
        # Non-overlapping windows of 4 samples -> one output frame each.
        patchify = nn.Conv1d(in_features, out_features, kernel_size = 4, stride = 4)
        normalize = nn.BatchNorm1d(out_features)
        super().__init__(patchify, normalize)
        
class ConvNormAct(nn.Sequential):
    """
    Convenience layer: 1-D convolution -> normalization -> activation.

    The convolution is padded with `kernel_size // 2`; any extra keyword
    arguments are forwarded to `nn.Conv1d`. `norm` is called with the number of
    output channels and `act` is called without arguments.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        kernel_size: int,
        norm = nn.BatchNorm1d,
        act = nn.ReLU,
        **kwargs
    ):
        half_kernel = kernel_size // 2
        convolution = nn.Conv1d(
            in_features, out_features, kernel_size=kernel_size, padding=half_kernel, **kwargs
        )
        normalization = norm(out_features)
        activation = act()
        super().__init__(convolution, normalization, activation)
        
from torchvision.ops import StochasticDepth

class LayerScaler(nn.Module):
    """Learnable per-channel scale.

    Holds one trainable factor per channel (all initialised to `init_value`) and
    multiplies the input by it, broadcasting over the first and last axes.
    """
    def __init__(self, init_value: float, dimensions: int):
        super().__init__()
        initial_scale = init_value * torch.ones(dimensions)
        self.gamma = nn.Parameter(initial_scale, requires_grad=True)

    def forward(self, x):
        # (dimensions,) -> (1, dimensions, 1) so it lines up with the channel axis.
        per_channel = self.gamma.unsqueeze(0).unsqueeze(-1)
        return per_channel * x

class BottleNeckBlock(nn.Module):
    """Residual block: depth-wise conv -> norm -> 1x1 expand -> GELU -> 1x1 project.

    The block output is scaled per channel by a `LayerScaler`, passed through
    `StochasticDepth(drop_p, mode="batch")` and added to the block input. The
    residual addition requires the block output to have the same shape as its
    input.

    Args:
        in_features: Channels of the input.
        out_features: Channels of the output.
        expansion: The hidden width is `out_features * expansion`.
        drop_p: Probability handed to `StochasticDepth`.
        layer_scaler_init_value: Initial value of every `LayerScaler` factor.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        expansion: int = 4,
        drop_p: float = .0,
        layer_scaler_init_value: float = 1e-6,
    ):
        super().__init__()
        hidden_features = out_features * expansion

        # narrow -> narrow: depth-wise convolution (one group per channel) with a larger kernel
        depthwise = nn.Conv1d(
            in_features, in_features, kernel_size=7, padding=3, bias=False, groups=in_features
        )
        # GroupNorm with a single group normalizes over all channels of a sample together
        channel_norm = nn.GroupNorm(num_groups=1, num_channels=in_features)
        # narrow -> wide: point-wise expansion
        expand = nn.Conv1d(in_features, hidden_features, kernel_size=1)
        activation = nn.GELU()
        # wide -> narrow: point-wise projection
        project = nn.Conv1d(hidden_features, out_features, kernel_size=1)

        self.block = nn.Sequential(depthwise, channel_norm, expand, activation, project)
        self.layer_scaler = LayerScaler(layer_scaler_init_value, out_features)
        self.drop_path = StochasticDepth(drop_p, mode="batch")

    def forward(self, x: Tensor) -> Tensor:
        shortcut = x
        transformed = self.layer_scaler(self.block(x))
        return self.drop_path(transformed) + shortcut

class ConvNexStage(nn.Sequential):
    """One encoder stage: a downsampler followed by `depth` bottleneck blocks.

    The downsampler is a single-group `GroupNorm` followed by a kernel-2, stride-2
    convolution mapping `in_features` to `out_features` channels. All extra
    keyword arguments are forwarded to every `BottleNeckBlock`.
    """
    def __init__(
        self, in_features: int, out_features: int, depth: int, **kwargs
    ):
        downsampler = nn.Sequential(
            nn.GroupNorm(num_groups=1, num_channels=in_features),
            nn.Conv1d(in_features, out_features, kernel_size=2, stride=2),
        )
        blocks = []
        for _ in range(depth):
            blocks.append(BottleNeckBlock(out_features, out_features, **kwargs))
        super().__init__(downsampler, *blocks)
        
class ConvNextEncoder(nn.Module):
    """Convolutional encoder: a stem followed by a stack of `ConvNexStage`s.

    Args:
        in_channels: Channels of the input signal.
        stem_features: Channels produced by the stem.
        depths: Number of bottleneck blocks in each stage.
        widths: Output channels of each stage; stage `k` maps `widths[k-1]`
            (or `stem_features` for the first stage) to `widths[k]`.
        drop_p: Largest stochastic-depth probability. Stage probabilities grow
            linearly from 0 (first stage) to `drop_p` (last stage).

    As before, the number of stages is the shorter of `depths` and `widths`.
    """
    def __init__(
        self,
        in_channels: int,
        stem_features: int,
        depths: List[int],
        widths: List[int],
        drop_p: float = .0,
    ):
        super().__init__()
        self.stem = ConvNextStem(in_channels, stem_features)

        # One drop-path probability per stage. The schedule used to be built with
        # sum(depths) points while only the first len(depths) of them were
        # consumed, so the last stage never reached `drop_p`.
        drop_probs = torch.linspace(0, drop_p, len(depths)).tolist()

        # Input width of every stage: the stem feeds the first one, each later
        # stage consumes the previous stage's output width.
        stage_inputs = [stem_features, *widths[:-1]]

        self.stages = nn.ModuleList(
            [
                ConvNexStage(in_features, out_features, depth, drop_p=stage_drop_p)
                for in_features, out_features, depth, stage_drop_p in zip(
                    stage_inputs, widths, depths, drop_probs
                )
            ]
        )

    def forward(self, x):
        """Apply the stem and then every stage in order."""
        x = self.stem(x)
        for stage in self.stages:
            x = stage(x)
        return x

In [27]:
# encoder = ConvNextEncoder(in_channels=1, stem_features=64, depths=[3,3,3,3,3], widths=[128, 128, 128, 128, 128])
# x = torch.zeros(10, 1, 50000)
# encoder(x).shape

In [28]:
class SpeechRecogonizer(nn.Module):
    """Convolutional encoder + GRU + linear classifier producing per-frame scores.

    Args:
        in_channels, stem_features, depths, widths, drop_p: Forwarded to
            `ConvNextEncoder`.
        RNN_features: Hidden size of the GRU.
        vocab_size: Number of output classes of the final linear layer.
    """
    def __init__(
        self,
        in_channels: int,
        stem_features: int,
        depths: List[int],
        widths: List[int],
        RNN_features : int,
        vocab_size : int,
        drop_p: float = .0
    ):
        super().__init__()

        self.conv_encoder = ConvNextEncoder(
            in_channels,
            stem_features,
            depths,
            widths,
            drop_p
        )
        self.rnn = nn.GRU(input_size=widths[-1], hidden_size = RNN_features)
        self.output = nn.Linear(in_features = RNN_features, out_features = vocab_size)

    def forward(self, x):
        """Return unnormalized class scores of shape (frames, batch, vocab_size)."""
        encoded = self.conv_encoder(x)
        # The GRU is not batch_first: move the last (frame) axis of the encoder
        # output to the front.
        rnn_out, _ = self.rnn(encoded.permute(2, 0, 1))
        return self.output(rnn_out)

    def decode_output(self, output, decoding_table = None): # T X N X C
        """Greedy CTC decoding of a `(T, N, C)` score tensor.

        For every sequence the best class per frame is taken, consecutive
        repetitions of the same class are merged, and classes that have no entry
        in the decoding table (the blank, among others) are dropped. A blank
        between two identical characters therefore keeps both of them.

        Args:
            output: Scores or (log-)probabilities of shape `(T, N, C)`.
            decoding_table: Mapping from class index to character. Defaults to
                the notebook-level `inverse_encodings`.

        Returns:
            List of `N` decoded strings.
        """
        if decoding_table is None:
            decoding_table = inverse_encodings

        # A single device->host transfer instead of one `.item()` call per frame.
        with torch.no_grad():
            best_paths = torch.argmax(output, dim = 2).T.tolist()

        predicted_texts = []
        for path in best_paths:
            characters = []
            previous_symbol = None
            for class_index in path:
                # None stands for "blank": every index missing from the table.
                symbol = decoding_table.get(class_index)
                if symbol is not None and symbol != previous_symbol:
                    characters.append(symbol)
                previous_symbol = symbol
            predicted_texts.append(''.join(characters))

        return predicted_texts
            

In [29]:
# model = SpeechRecogonizer(
#     in_channels=1,
#     stem_features=64,
#     depths=[3]*7,
#     widths=[256]*7,
#     RNN_features = 128,
#     vocab_size = len(character_encodings) + 1
# )

In [30]:
# model(x).shape # out sequence , Batch_size, Vocab_size + 1(null_token)

In [31]:
# 'model contains ' + str(round(sum([p.numel() for p in model.parameters()])/1_000_000, 2)) + 'M parameters'

## transformer-based (TODO)

## training loop

In [32]:
def train(
    data_loader,
    batch_size,
    model,
    optimizer,
    loss_fn,
    scaler,
    epochs,
    device
):
    """Train `model` with mixed precision and save a checkpoint after every epoch.

    Args:
        data_loader: Yields `(speech, text, target_lengths)` batches.
        batch_size: Kept for backward compatibility; the progress bar now takes
            the number of batches from `len(data_loader)`.
        model: Network returning scores of shape (frames, batch, classes).
        optimizer: Optimizer over the model parameters.
        loss_fn: Called as `loss_fn(log_probs, text, input_lengths, target_lengths)`.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        epochs: Number of passes over `data_loader`.
        device: Device the model and batches are moved to.

    Side effects:
        Writes `./model_weights.pth`, `./optimizer_state.pth` and
        `./scaler_state.pth` at the end of every epoch. When `LOAD_CHECKPOINT` is
        set and the model checkpoint exists, state is restored before training.
    """

    model.to(device)

    if LOAD_CHECKPOINT and os.path.isfile('../input/automatic-speech-recognition/model_weights.pth'):

        model_checkpoint = torch.load('../input/automatic-speech-recognition/model_weights.pth', map_location = 'cpu')
        model.load_state_dict(model_checkpoint)

        # Optimizer / scaler state is optional: restore it only when it is there
        # instead of crashing on a missing file.
        if os.path.isfile('../input/automatic-speech-recognition/optimizer_state.pth'):
            optimizer_checkpoint = torch.load('../input/automatic-speech-recognition/optimizer_state.pth', map_location = 'cpu')
            optimizer.load_state_dict(optimizer_checkpoint)

        if os.path.isfile('../input/automatic-speech-recognition/scaler_state.pth'):
            scaler_checkpoint = torch.load('../input/automatic-speech-recognition/scaler_state.pth', map_location = 'cpu')
            scaler.load_state_dict(scaler_checkpoint)

    # Taken from the loader that is actually iterated, not from a notebook global.
    num_batches = len(data_loader)

    for epoch in range(epochs):

        # Make sure BatchNorm / StochasticDepth are in training mode even if the
        # model was switched to eval mode earlier.
        model.train()

        loop = tqdm(enumerate(data_loader), total = num_batches)

        for i, (speech, text, target_lengths) in loop:

            # Nothing to learn from a batch in which every clip was filtered out.
            if speech.shape[0] == 0:
                continue

            with torch.cuda.amp.autocast():

                predictions = model(speech.to(device)).log_softmax(2)
                N = predictions.shape[1]
                T = len(predictions) # input sequence length

                # NOTE(review): every sequence is declared to span all T frames,
                # including the frames that come from zero padding -- confirm
                # this is intended.
                input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long, device=device)

                loss = loss_fn(predictions, text.to(device), input_lengths, target_lengths.to(device))

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            loop.set_description(f'Epoch {epoch+1}/{epochs} Batch {i+1}/{num_batches}')
            loop.set_postfix(loss = loss.detach().item())

        torch.save(model.state_dict(), './model_weights.pth')
        print('model parameters saved')
        torch.save(optimizer.state_dict(), './optimizer_state.pth')
        print('optimizer state saved')
        torch.save(scaler.state_dict(), './scaler_state.pth')
        print('scaler state saved')

## model, optimizer and dataloader instantiation

In [33]:
model = SpeechRecogonizer(
    in_channels=1,
    stem_features=64,
    depths=[3, 3, 5, 5, 3, 3],
    widths=[128, 128, 256, 256, 128, 128],
    RNN_features = 256,
    vocab_size = len(character_encodings) + 1
)
'model contains ' + str(round(sum([p.numel() for p in model.parameters()])/1_000_000, 2)) + 'M parameters'

'model contains 7.53M parameters'

In [34]:
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

In [35]:
train_dataset = VTCK(TXT_DIR, AUDIO_DIR, sr_downsample = SR_DOWNSAMPLE)

train_dataloader = DataLoader(train_dataset, batch_size = 32, collate_fn = collate_fn, shuffle = True, num_workers = 2, prefetch_factor = 2)

In [36]:
scaler = torch.cuda.amp.GradScaler()

## overfit a single batch

In [37]:
# speech, text, target_lengths = next(iter(train_dataloader))

In [38]:
# loop = range(2000)
# loss_fn = nn.CTCLoss(blank = 0)
# model.to(torch.device('cuda'))

# for i in loop:

#     predictions = model(speech.to(torch.device('cuda'))).log_softmax(2)
    
#     N = predictions.shape[1]
#     T = len(predictions) # input sequence length
#     vocab = len(character_encodings) - 1 # number of classes (excluding padding)
#     input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long, device = torch.device('cuda'))
    
#     loss = loss_fn(predictions.to(torch.device('cuda')), text.to(torch.device('cuda')), input_lengths.to(torch.device('cuda')), target_lengths.to(torch.device('cuda')))

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
#     if i % 50 == 0:
#         print('iteration:', i, ',loss:', round(loss.item(), 2))

In [39]:
# model.decode_output(predictions)[0]

## we can successfully overfit a single batch!

## actual training

- loading last checkpoint

In [46]:
train(
    train_dataloader,
    32,
    model,
    optimizer,
    nn.CTCLoss(blank = 0),
    scaler,
    10,
    torch.device('cuda')
)

Epoch 1/10 Batch 1378/1378: : 1378it [16:50,  1.36it/s, loss=0.836]


model parameters saved
optimizer state saved
scaler state saved


Epoch 2/10 Batch 1378/1378: : 1378it [16:50,  1.36it/s, loss=1.44] 


model parameters saved
optimizer state saved
scaler state saved


Epoch 3/10 Batch 1378/1378: : 1378it [16:51,  1.36it/s, loss=0.909]


model parameters saved
optimizer state saved
scaler state saved


Epoch 4/10 Batch 1378/1378: : 1378it [16:51,  1.36it/s, loss=1.19] 


model parameters saved
optimizer state saved
scaler state saved


Epoch 5/10 Batch 1378/1378: : 1378it [16:50,  1.36it/s, loss=0.952]


model parameters saved
optimizer state saved
scaler state saved


Epoch 6/10 Batch 1378/1378: : 1378it [16:51,  1.36it/s, loss=1.02] 


model parameters saved
optimizer state saved
scaler state saved


Epoch 7/10 Batch 1378/1378: : 1378it [16:51,  1.36it/s, loss=1.36] 


model parameters saved
optimizer state saved
scaler state saved


Epoch 8/10 Batch 1378/1378: : 1378it [16:50,  1.36it/s, loss=0.655]


model parameters saved
optimizer state saved
scaler state saved


Epoch 9/10 Batch 1378/1378: : 1378it [16:50,  1.36it/s, loss=1.1]  


model parameters saved
optimizer state saved
scaler state saved


Epoch 10/10 Batch 1378/1378: : 1378it [16:51,  1.36it/s, loss=0.826]


model parameters saved
optimizer state saved
scaler state saved


In [47]:
# Qualitative check: run the trained model on one batch on the CPU.
speech, text, target_lengths = next(iter(train_dataloader))
model.to(torch.device('cpu'))
# Switch to inference mode: the stem contains BatchNorm (and the blocks contain
# StochasticDepth), which otherwise keep behaving as during training.
model.eval()

with torch.no_grad():
    predictions = model(speech)

In [56]:
''.join([inverse_encodings.get(text[5][i].item(), '') for i in range(len(text.T))]).split('<\\p>')[0]

'they are so easy for youngsters to open'

In [57]:
model.decode_output(predictions)[5]

'they ar so esy fo homgrs to opren'