In [3]:
# !pip install speechbrain
# !pip install ruamel_yaml
# fsepteixeira 
# !pip install --upgrade ruamel.yaml --ignore-installed ruamel.yaml

# https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html 
# self.conv = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
# kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding),
# PixelShuffle(upscale_factor),
# nn.InstanceNorm2d(num_features=out_channels)) 

In [1]:
import speechbrain as sb
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from speechbrain.pretrained import EncoderDecoderASR
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", 
                                       savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",run_opts={"device":"cuda"},freeze_params=True)

In [3]:
encoder = model.hparams.encoder

In [4]:
val_dataset = LIBRISPEECH(".", url= 'dev-clean',download=True)
train_dataset = LIBRISPEECH(".",download=True)
# dataset1 = MNIST(".",download=True)

In [5]:
def collate(batch):
    """waveform, sample_rate, transcript, speaker_id"""
    waveforms = [b[0].permute([1,0]) for b in batch]
    waveforms = pad_sequence(waveforms,batch_first=True)
    waveforms = waveforms.squeeze()
    input_len = torch.FloatTensor([b[0].shape[1] for b in batch])
    input_len /= torch.max(input_len)
    sampling_rates = torch.FloatTensor([b[1] for b in batch])
    transcript = [b[2] for b in batch]
    speaker_id = torch.LongTensor([b[3] for b in batch])
    
    return waveforms,input_len,sampling_rates,transcript,speaker_id
    

In [10]:
train_dataLoader = DataLoader(train_dataset,batch_size=24,shuffle=True,collate_fn=collate)
val_dataLoader = DataLoader(val_dataset,batch_size=24,shuffle=True,collate_fn=collate)
# train_dataloader = DataLoader(dataset1,batch_size=5,shuffle=True,collate_fn=collate)

# highwat network
- adopted from https://github.com/kefirski/pytorch_Highway.git

In [7]:
class Highway(nn.Module):
    def __init__(self, size, num_layers, f):

        super(Highway, self).__init__()

        self.num_layers = num_layers

        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)])

        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)])

        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)])

        self.f = f

    def forward(self, x):
        """
            :param x: tensor with shape of [batch_size, size]
            :return: tensor with shape of [batch_size, size]
            applies σ(x) ⨀ (f(G(x))) + (1 - σ(x)) ⨀ (Q(x)) transformation | G and Q is affine transformation,
            f is non-linear transformation, σ(x) is affine transformation with sigmoid non-linearition
            and ⨀ is element-wise multiplication
            """

        for layer in range(self.num_layers):
            gate = torch.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x

In [8]:
class SpecsReconstruction(nn.Module):
    
    def __init__(self,input_features,num_blocks,hidden_size,out_feature=40):
        super().__init__()
        self.input_features = input_features
        self.num_blocks = num_blocks
        self.out_feature = out_feature
        
        self.linear1 = nn.Linear(input_features,hidden_size) 
        self.relu1 = nn.ReLU()
        self.highway = Highway(hidden_size,num_blocks,nn.ReLU())
        self.linear2 = nn.Linear(hidden_size,out_feature)
    def forward(self,x):
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.highway(x)
        x = self.linear2(x)
        return x
        
            

In [None]:
h0 = SpecsReconstruction(2560,5,100,40).cuda()


In [11]:
# criterion = nn.L1Loss()
# optimizer = torch.optim.Adam(h0.parameters(),lr=0.001)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=2)
# epochs = 30
path = "h1_states.pth"
def load_model_parameter(path):
    try:
        print("Loading states")
        state = torch.load(path)
        start_epoch = state["epoch"]
        train_losses = state["train_losses"]
        val_losses = state["val_losses"]
        model_dict = state["model_dict"]
        optimizer = state["optimizer"]
        scheduler = state["scheduler"]
        print("successifully loaded states")
        return start_epoch,train_losses,val_losses,model_dict,optimizer,scheduler
    except:
        print("failed to load states")
        return None

def save_model_parameters(path,state_dict):
    torch.save(state_dict,path)
    print("states at {} epoch saved".format(state_dict["epoch"]))
    
states = load_model_parameter(path)
if states is not None:
    start_epoch,train_losses,val_losses,model_dict,optimizer_dict,scheduler_dict = states
    h1.load_state_dict(model_dict)
    optimizer.load_state_dict(optimizer_dict)
    scheduler.load_state_dict(scheduler_dict)

Loading states
successifully loaded states


In [None]:
train_loss = []
val_loss = []
min_loss = np.Infinity

for i in range(epochs):
    t = []
    l = []
    h0.train()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(train_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        optimizer.zero_grad()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
        block_0 = block_0.reshape(block_0.shape[0],block_0.shape[1],-1)
        out = h0(block_0)
        loss = criterion(targets,out)
        loss.backward()
        optimizer.step()
        if (j+1) % 100 == 0:
            print("epoch:{}/{}".format(i+1,epochs,j))
        t.append(loss.item())
    av_t = sum(t)/len(t)
    print("epoch:{}/{},Train loss:{}".format(i+1,epochs,av_t))
    train_loss.append(av_t)
    del waveform
    del input_len
    # validation loop
    h0.eval()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(val_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
            block_0 = block_0.reshape(block_0.shape[0],block_0.shape[1],-1)
            out = h0(block_0)
            loss = criterion(targets,out)
            if (j+1) % 100 == 0:
                print("epoch:{}/{}".format(i+1,epochs,j))
            l.append(loss.item())
    av_l = sum(l)/len(l)
    print("epoch:{}/{},Val loss:{}".format(i+1,epochs,av_l))
    val_loss.append(av_l)
    if av_l < min_loss:
        min_loss = av_l
        torch.save(h0,"best_model_0")
        
    state_dict = {
    "epoch":i,
    "train_losses":train_loss,
    "val_losses":val_loss,
    "model_dict":h0.state_dict(),
    "optimizer":optimizer.state_dict(),
    "scheduler":scheduler.state_dict()
    }
    save_model_parameters("h0_states.pth",state_dict)
    scheduler.step(av_l)
    

In [10]:
# del h0
h1 = SpecsReconstruction(2560,5,100,40).cuda()
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(h1.parameters(),lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=2)
epochs = 30

In [12]:
train_loss = []
val_loss = []
min_loss = np.Infinity

for i in range(epochs):
    t = []
    l = []
    h1.train()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(train_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        optimizer.zero_grad()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
            block_1 = encoder.model.CNN.block_1( block_0)
            block_1 = block_1.reshape(block_1.shape[0],block_1.shape[1],-1)
        out = h1(block_1)
        loss = criterion(targets,out)
        loss.backward()
        optimizer.step()
        if (j+1) % 100 == 0:inputs
            print("epoch:{}/{}".format(i+1,epochs,j))
        t.append(loss.item())
    av_t = sum(t)/len(t)
    print("epoch:{}/{},Train loss:{}".format(i+1,epochs,av_t))
    train_loss.append(av_t)
    del waveform
    del input_len
    # validation loop
    h1.eval()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(val_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
            block_1 = encoder.model.CNN.block_1(block_0)
            block_1 = block_1.reshape(block_1.shape[0],block_1.shape[1],-1)
            out = h1(block_1)
            loss = criterion(targets,out)
            if (j+1) % 100 == 0:
                print("epoch:{}/{}".format(i+1,epochs,j))
            l.append(loss.item())
    av_l = sum(l)/len(l)
    print("epoch:{}/{},Val loss:{}".format(i+1,epochs,av_l))
    val_loss.append(av_l)
    if av_l < min_loss:
        min_loss = av_l
        torch.save(h1,"best_model_1")
        
    state_dict = {
    "epoch":i,
    "train_losses":train_loss,
    "val_losses":val_loss,
    "model_dict":h1.state_dict(),
    "optimizer":optimizer.state_dict(),
    "scheduler":scheduler.state_dict()
    }
    save_model_parameters("h1_states.pth",state_dict)
    scheduler.step(av_l)
    

epoch:1/30
epoch:1/30
epoch:1/30
epoch:1/30
epoch:1/30
epoch:1/30
epoch:1/30
epoch:1/30,Train loss:0.1295041365887198


RuntimeError: CUDA out of memory. Tried to allocate 1.99 GiB (GPU 0; 14.76 GiB total capacity; 6.71 GiB already allocated; 1.98 GiB free; 11.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
h1_1 = SpecsReconstruction(5120,5,100,40).cuda()
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(h1_1.parameters(),lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=2)
epochs = 30
stretch = torchaudio.transforms.TimeStretch()

path = "h1_1_states.pth"
def load_model_parameter(path):
    try:
        print("Loading states")
        state = torch.load(path)
        start_epoch = state["epoch"]
        train_losses = state["train_losses"]
        val_losses = state["val_losses"]
        model_dict = state["model_dict"]
        optimizer = state["optimizer"]
        scheduler = state["scheduler"]
        print("successifully loaded states")
        return start_epoch,train_losses,val_losses,model_dict,optimizer,scheduler
    except:
        print("failed to load states")
        return None

def save_model_parameters(path,state_dict):
    torch.save(state_dict,path)
    print("states at {} epoch saved".format(state_dict["epoch"]))
    
states = load_model_parameter(path)
if states is not None:
    start_epoch,train_losses,val_losses,model_dict,optimizer_dict,scheduler_dict = states
    h1_1.load_state_dict(model_dict)
    optimizer.load_state_dict(optimizer_dict)
    scheduler.load_state_dict(scheduler_dict)
else:
    start_epoch = 0

Loading states
successifully loaded states


In [12]:
train_loss = []
val_loss = []
min_loss = np.Infinity

for i in range(start_epoch,epochs):
    t = []
    l = []
    h1_1.train()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(train_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        optimizer.zero_grad()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
            conv_1 = encoder.model.CNN.block_1.conv_1(block_0)
            norm_1 = encoder.model.CNN.block_1.norm_1(conv_1)
            norm_1 =  norm_1.reshape( norm_1.shape[0], norm_1.shape[1],-1)
        out = h1_1(norm_1)
        loss = criterion(targets,out)
        loss.backward()
        optimizer.step()
        if (j+1) % 100 == 0:
            print("epoch:{}/{}".format(i+1,epochs,j))
        t.append(loss.item())
    av_t = sum(t)/len(t)
    print("epoch:{}/{},Train loss:{}".format(i+1,epochs,av_t))
    train_loss.append(av_t)
    del waveform
    del input_len
    # validation loop
    h1_1.eval()
    for j,(waveform, input_len ,sample_rate, transcript, speaker_id) in enumerate(val_dataLoader,start=1):
        waveform = waveform.cuda()
        input_len = input_len.cuda()
        with torch.no_grad():
            specs = encoder.compute_features(waveform)
            targets = encoder.normalize(specs,input_len)
            block_0 = encoder.model.CNN.block_0(targets)
            conv_1 = encoder.model.CNN.block_1.conv_1(block_0)
            norm_1 = encoder.model.CNN.block_1.norm_1(conv_1)
            norm_1 =  norm_1.reshape( norm_1.shape[0], norm_1.shape[1],-1)
            out = h1_1(norm_1)
            loss = criterion(targets,out)
            if (j+1) % 100 == 0:
                print("epoch:{}/{}".format(i+1,epochs,j))
            l.append(loss.item())
    av_l = sum(l)/len(l)
    print("epoch:{}/{},Val loss:{}".format(i+1,epochs,av_l))
    val_loss.append(av_l)
    if av_l < min_loss:
        min_loss = av_l
        torch.save(h1_1,"best_model_11")
        
    state_dict = {
    "epoch":i,
    "train_losses":train_loss,
    "val_losses":val_loss,
    "model_dict":h1_1.state_dict(),
    "optimizer":optimizer.state_dict(),
    "scheduler":scheduler.state_dict()
    }
    save_model_parameters("h1_1_states.pth",state_dict)
    scheduler.step(av_l)

epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30
epoch:4/30,Train loss:0.07184761008482521
epoch:4/30
epoch:4/30,Val loss:0.04556572872453031
states at 3 epoch saved
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30
epoch:5/30,Train loss:0.0682624492262091
epoch:5/30
epoch:5/30,Val loss:0.054650104151362865
states at 4 epoch saved
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30
epoch:6/30,Train loss:0.06298657956869662
epoch:6/30
epoch:6/30,Val loss:0.03953117112406587
states at 5 epoch saved
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30
epoch:7/30,Train loss:0.06011461657198036
epoch:7/30
epoch:7/30,Val loss:0.04646785352873591
states at 6 epoch saved
epoch:8/30
epoch:8/30
epoch:8/30
epoch:8/30
epoc

In [None]:
encoder

In [None]:
stretch = torchaudio.transforms.TimeStretch()
specs = torchaudio.transforms.Spectrogram()
a = specs(torch.randn((40,100)))
a

In [None]:
a = specs(a)
a.shape

In [None]:
stretch(a,1.5)