## **Speech Denoising using 2D CNN**

In [137]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchvision
from torch import nn, optim
import torch.nn.functional as F
import librosa
import torch.utils.data as utils

In [138]:
# Read the clean and noisy training recordings at their native sample rate (sr=None).
cv, sr = librosa.load('/content/drive/My Drive/Colab Notebooks/audio/train_clean_male.wav', sr=None)
dv, sr = librosa.load('/content/drive/My Drive/Colab Notebooks/audio/train_dirty_male.wav', sr=None)

# Complex spectrograms, laid out as (frequency bins, frames).
clean_v = librosa.stft(cv, n_fft=1024, hop_length=512)
dirty_v = librosa.stft(dv, n_fft=1024, hop_length=512)
print("clean: ", clean_v.shape)
print("dirty: ", dirty_v.shape)

# The network only sees magnitudes; the phase is discarded here.
clean_v_abs = torch.tensor(np.abs(clean_v))
dirty_v_abs = torch.tensor(np.abs(dirty_v))
print("\nclean (tensored): ", clean_v_abs.shape)
print("dirty (tensored): ", dirty_v_abs.shape)

# Transpose so that each row is one time frame.
clean_v_abs, dirty_v_abs = clean_v_abs.t(), dirty_v_abs.t()
print("\nclean (transpose, tensored): ", clean_v_abs.shape)
print("dirty (transpose, tensored): ", dirty_v_abs.shape)

clean:  (513, 2459)
dirty:  (513, 2459)

clean (tensored):  torch.Size([513, 2459])
dirty (tensored):  torch.Size([513, 2459])

clean (transpose, tensored):  torch.Size([2459, 513])
dirty (transpose, tensored):  torch.Size([2459, 513])


In [139]:
def img_tranform(tnsr, window=20, noise_std=0.000003):
  """Turn a (frames x features) spectrogram into one sliding-window 'image' per frame.

  The start of the input is padded with ``window - 1`` rows of low-amplitude
  Gaussian noise, so that window ``k`` covers frames ``k - window + 1 .. k`` and
  therefore *ends on frame k itself*. (Padding with ``window`` rows and slicing
  ``[i - window:i]`` would make every window stop one frame short of the frame
  it is paired with and would never use the last frame.)

  Args:
    tnsr: 2-D tensor of shape (num_frames, num_features).
    window: number of consecutive frames per image (default 20).
    noise_std: standard deviation of the noise used for the padding rows.

  Returns:
    Tensor of shape (num_frames, 1, window, num_features).

  Raises:
    ValueError: if ``tnsr`` is not 2-D or ``window`` is smaller than 1.
  """
  if tnsr.dim() != 2:
    raise ValueError("img_tranform expects a 2-D (frames x features) tensor")
  if window < 1:
    raise ValueError("window must be at least 1")

  n_frames, n_features = tnsr.shape

  # Near-silent padding so the earliest frames still get a full-height window.
  norm_dist = torch.distributions.Normal(0, noise_std)
  extra = norm_dist.sample((window - 1, n_features)).to(dtype=tnsr.dtype, device=tnsr.device)
  padded_input = torch.cat((extra, tnsr))

  if n_frames == 0:
    # Nothing to window; return an empty batch with the expected trailing dims.
    sound_img = tnsr.new_empty((0, 1, window, n_features))
  else:
    # padded_input[k : k + window] ends on original frame k.
    sound_img = torch.stack([padded_input[k:k + window] for k in range(n_frames)])
    sound_img = sound_img.unsqueeze(1)  # add the single channel axis

  print("Shape of transformed data: ", sound_img.shape)
  return sound_img

The above function takes the data and converts it into the required format. It pads the start of the data with rows of very small random values so that a full 20-frame window is available for every frame, then builds a dataset of 'images' of size 20 x 513, one per frame. After transformation, the output has shape **[num_frames x 1 x 20 x 513].** Batches of this tensor are the input to our 2D CNN.

In [140]:
# Only the noisy input is windowed; each target stays a single clean frame.
dirty_trans = img_tranform(dirty_v_abs)

# Pair every noisy window with its clean frame and batch them in recording order.
dataset = utils.TensorDataset(dirty_trans, clean_v_abs)
train_loader = utils.DataLoader(dataset=dataset, batch_size=128, shuffle=False)

Shape of transformed data:  torch.Size([2459, 1, 20, 513])


In [141]:
class Denoiser(nn.Module):
  """2D CNN mapping a 1-channel window of spectrogram frames to one 513-bin frame.

  Three convolutions (with one max-pool after the second) feed two fully
  connected layers. For a 1 x 20 x 513 input the convolutional stack produces
  8 x 2 x 125 = 2000 features, which is the input width of ``fc1``.
  """

  def __init__(self):
    super().__init__()

    # Convolutional feature extractor
    self.conv1 = nn.Conv2d(1, 16, kernel_size=4)
    self.conv2 = nn.Conv2d(16, 32, kernel_size=4)
    self.conv3 = nn.Conv2d(32, 8, kernel_size=4, stride=2)
    # Fully connected head
    self.fc1 = nn.Linear(in_features=2000, out_features=1024)
    self.out = nn.Linear(in_features=1024, out_features=513)
    self.pool = nn.MaxPool2d(kernel_size=2)

  def forward(self, x):
    """Return the non-negative prediction of shape (batch, 513) for input ``x``."""
    feats = torch.relu(self.conv1(x))
    feats = torch.relu(self.conv2(feats))
    feats = self.pool(feats)
    feats = torch.relu(self.conv3(feats))
    # Flatten everything except the batch axis
    flat = feats.view(feats.size(0), -1)
    hidden = torch.relu(self.fc1(flat))
    # Final ReLU keeps the predicted magnitudes >= 0
    return torch.relu(self.out(hidden))

def init_weights(layer):
  """Xavier-initialise fully connected layers; meant for ``module.apply``.

  ``nn.Linear`` layers get Xavier-uniform weights and zero biases. Every other
  layer type is left untouched and keeps its default initialisation.

  Args:
    layer: a module visited by ``nn.Module.apply``.
  """
  if isinstance(layer, nn.Linear):
    # nn.init works in-place under no_grad, so the parameters are passed directly.
    nn.init.xavier_uniform_(layer.weight)
    # Linear(bias=False) has bias set to None; skip it instead of crashing.
    if layer.bias is not None:
      nn.init.zeros_(layer.bias)

**Models tried**

self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(6,6), stride=2) \
self.conv2 = nn.Conv2d(in_channels=4, out_channels=16, kernel_size=(6, 6), stride=2) \
self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), stride=2)

-------------------------------------------------------------------------------

self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(6,6), stride=2) \
self.conv2 = nn.Conv2d(in_channels=4, out_channels=16, kernel_size=(6, 6), stride=2)

-------------------------------------------------------------------------------

self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(9, 9), stride=2) \
self.conv2 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=(6, 6), stride=2)

-------------------------------------------------------------------------------

self.conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(6, 6), stride=2) \
self.conv2 = nn.Conv2d(in_channels=2, out_channels=6, kernel_size=(4, 4), stride=2) \
self.conv3 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(2, 2), stride=2)

For this assignment I tried various architectures to get decent results. I also tried changing various hyper-parameters, such as the number of epochs and the batch size. However, I was not able to achieve any good results. Above are some of the architectures that I tried. The architecture currently used was the best I was able to get.

In [142]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Build the model, re-initialise its linear layers, then move it to the device.
net = Denoiser().apply(init_weights).to(device)

print(net)

# Smooth L1 loss, optimised with Adam at its default settings.
err_func = nn.SmoothL1Loss()
optimizer = optim.Adam(params=net.parameters())

Denoiser(
  (conv1): Conv2d(1, 16, kernel_size=(4, 4), stride=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(4, 4), stride=(1, 1))
  (conv3): Conv2d(32, 8, kernel_size=(4, 4), stride=(2, 2))
  (fc1): Linear(in_features=2000, out_features=1024, bias=True)
  (out): Linear(in_features=1024, out_features=513, bias=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)


In [143]:
# Training the network
epochs = 800

for e in range(1, epochs + 1):
  net.train()
  train_loss_epoch = 0

  for row, target in train_loader:
    # Move the batch to the same device as the model
    row, target = row.to(device), target.to(device)

    # PyTorch accumulates gradients, so clear them before each step
    optimizer.zero_grad()

    # Forward pass and loss against the clean frames
    outs = net(row)
    loss = err_func(outs, target)

    # Backward pass, then update the weights
    loss.backward()
    optimizer.step()

    train_loss_epoch = train_loss_epoch + loss.item()

  # Report the mean batch loss every 100th epoch
  if e % 100 == 0:
    print(f"Overall Training loss for epoch {e}: {round(train_loss_epoch/len(train_loader),4)}")

Overall Training loss for epoch 100: 0.0024
Overall Training loss for epoch 200: 0.0013
Overall Training loss for epoch 300: 0.001
Overall Training loss for epoch 400: 0.0009
Overall Training loss for epoch 500: 0.0009
Overall Training loss for epoch 600: 0.0008
Overall Training loss for epoch 700: 0.0007
Overall Training loss for epoch 800: 0.0005


In [144]:
# Read both noisy test recordings at their native sample rates.
t1, tr1 = librosa.load('/content/drive/My Drive/Colab Notebooks/audio/test_x_01.wav', sr=None)
t2, tr2 = librosa.load('/content/drive/My Drive/Colab Notebooks/audio/test_x_02.wav', sr=None)

# Complex STFTs; kept so the noisy phase can be reused at reconstruction time.
test_01_org = librosa.stft(t1, n_fft=1024, hop_length=512)
test_02_org = librosa.stft(t2, n_fft=1024, hop_length=512)

print("test_01: ", test_01_org.shape)
print("test_02: ", test_02_org.shape)

# Magnitude spectrograms as tensors
test_01, test_02 = torch.tensor(np.abs(test_01_org)), torch.tensor(np.abs(test_02_org))
print("test_01 (tensored): ", test_01.shape)
print("test_02 (tensored): ", test_02.shape)

# One row per time frame
test_01, test_02 = test_01.t(), test_02.t()
print("test_01 (tensored, transpose): ", test_01.shape)
print("test_02 (tensored, transpose): ", test_02.shape)

test_01:  (513, 142)
test_02:  (513, 380)
test_01 (tensored):  torch.Size([513, 142])
test_02 (tensored):  torch.Size([513, 380])
test_01 (tensored, transpose):  torch.Size([142, 513])
test_02 (tensored, transpose):  torch.Size([380, 513])


In [145]:
# Window the noisy test spectrograms the same way as the training input.
test_01_trans = img_tranform(test_01)
test_02_trans = img_tranform(test_02)

# Inference has no targets, so each dataset holds only the windowed inputs.
test_01_dataset = utils.TensorDataset(test_01_trans)
test_01_loader = utils.DataLoader(test_01_dataset, batch_size=32, shuffle=False)

test_02_dataset = utils.TensorDataset(test_02_trans)
test_02_loader = utils.DataLoader(test_02_dataset, batch_size=32, shuffle=False)


# Evaluation mode, no gradient tracking. The loops read nothing from the
# training loop, so this cell does not depend on leftover variables such as
# `target`.
net.eval()
test_01_list = []
with torch.no_grad():
  for (row,) in test_01_loader:
    row = row.to(device)
    outs = net(row)
    test_01_list.append(outs)
# Predicted magnitudes, one row per frame, in the original frame order.
test_01_output = torch.cat(test_01_list)

test_02_list = []
with torch.no_grad():
  for (row,) in test_02_loader:
    row = row.to(device)
    outs = net(row)
    test_02_list.append(outs)
test_02_output = torch.cat(test_02_list)

def signal_cleaner(x_org, output):
  """Combine the phase of a noisy STFT with predicted magnitudes.

  Args:
    x_org: complex STFT of the noisy signal.
    output: predicted magnitudes with the same shape as ``x_org``; either a
      torch tensor (any device, with or without grad) or a NumPy array.

  Returns:
    Complex NumPy array ``(x_org / |x_org|) * output``. Bins where ``x_org`` is
    exactly zero have no defined phase; they are given phase 0 (factor 1)
    instead of producing NaN from a 0/0 division.
  """
  x_org = np.asarray(x_org)
  magnitude = np.abs(x_org)
  # Unit-modulus phase factor; the division is skipped where the magnitude is 0
  # and the pre-filled value 1 is kept there.
  phase = np.divide(x_org, magnitude, out=np.ones_like(x_org), where=magnitude > 0)
  if torch.is_tensor(output):
    output = output.detach().cpu().numpy()
  return np.multiply(phase, np.asarray(output))

# Re-attach the noisy phase to the predicted magnitudes. The predictions are
# transposed back to the STFT layout after being moved to the CPU.
clean_test_01 = signal_cleaner(test_01_org, test_01_output.cpu().t())
clean_test_02 = signal_cleaner(test_02_org, test_02_output.cpu().t())

print("Shape of clean_test_01: ", clean_test_01.shape)
print("Shape of clean_test_02: ", clean_test_02.shape)

# Back to the time domain, using the same hop length as the forward STFT.
test_01_istft = librosa.core.istft(clean_test_01, hop_length=512)
test_02_istft = librosa.core.istft(clean_test_02, hop_length=512)

Shape of transformed data:  torch.Size([142, 1, 20, 513])
Shape of transformed data:  torch.Size([380, 1, 20, 513])
Shape of clean_test_01:  (513, 142)
Shape of clean_test_02:  (513, 380)


In [21]:
!pip install soundfile
import soundfile

Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1


In [146]:
# Save each denoised signal at the sample rate of its source recording.
soundfile.write("result_01_2.wav", data=test_01_istft, samplerate=tr1)
soundfile.write("result_02_2.wav", data=test_02_istft, samplerate=tr2)

In [147]:
# result for test_x_01.wav
# Embeds an audio player for the denoised file; the Audio object is the cell's
# last expression, so the notebook renders it as the cell output.
import IPython.display as ipd
ipd.Audio('result_01_2.wav')

In [136]:
# result for test_x_02.wav
# Embeds an audio player for the denoised file; the Audio object is the cell's
# last expression, so the notebook renders it as the cell output.
import IPython.display as ipd
ipd.Audio('result_02_2.wav')