# **1. Speech Denoising using 1D CNN**

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchvision
from torch import nn, optim
import torch.nn.functional as F
import librosa
import torch.utils.data as utils

In [6]:
# Load the clean and noisy training utterances at their native sampling rate
# (sr=None) and compute a complex STFT of each with identical settings.
stft_kwargs = dict(n_fft=1024, hop_length=512)

cv, sr = librosa.load('data/train_clean_male.wav', sr=None)
dv, sr = librosa.load('data/train_dirty_male.wav', sr=None)
clean_v = librosa.stft(cv, **stft_kwargs)
dirty_v = librosa.stft(dv, **stft_kwargs)

# The network works on magnitudes only, so the phase is dropped here.
clean_v_abs, dirty_v_abs = (
    torch.tensor(np.abs(spectrum)) for spectrum in (clean_v, dirty_v)
)
print(f"clean:  {clean_v.shape}")
print(f"dirty:  {dirty_v.shape}")
print(f"\nclean (tensored):  {clean_v_abs.shape}")
print(f"dirty (tensored):  {dirty_v_abs.shape}")

# Swap the two axes so that each row is one STFT frame (one training sample).
clean_v_abs = clean_v_abs.transpose(0, 1)
dirty_v_abs = dirty_v_abs.transpose(0, 1)
print(f"\nclean (transpose, tensored):  {clean_v_abs.shape}")
print(f"dirty (transpose, tensored):  {dirty_v_abs.shape}")

clean:  (513, 2459)
dirty:  (513, 2459)

clean (tensored):  torch.Size([513, 2459])
dirty (tensored):  torch.Size([513, 2459])

clean (transpose, tensored):  torch.Size([2459, 513])
dirty (transpose, tensored):  torch.Size([2459, 513])


In [7]:
# Pair every noisy magnitude frame (model input) with the clean frame at the
# same index (regression target) and serve the pairs in their original order.
dataset = utils.TensorDataset(dirty_v_abs, clean_v_abs)
train_loader = utils.DataLoader(dataset=dataset, batch_size=32, shuffle=False)

# Each batch is a pair of 2-D tensors (frames, frequency bins); the training
# loop inserts the single-channel axis that Conv1d expects before calling the
# network.

In [8]:
class Denoiser(nn.Module):
    """1-D CNN that maps a noisy magnitude frame to a denoised one.

    Two strided ``Conv1d`` layers are followed by two fully-connected layers.
    Every layer, including the output layer, is followed by ReLU, so the
    prediction is never negative.

    The flattened convolution output must hold 16 * 128 values per sample,
    which is what a ``(batch, 1, 513)`` input produces.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation is unchanged.
        self.conv1 = nn.Conv1d(1, 4, kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(4, 16, kernel_size=2, stride=2)
        self.fc = nn.Linear(16 * 128, 1024)
        self.out = nn.Linear(1024, 513)

    def forward(self, x):
        """Return the non-negative 513-value prediction for each sample in ``x``."""
        features = torch.relu(self.conv1(x))
        features = torch.relu(self.conv2(features))
        # Merge (channels, length) into one feature vector per sample.
        flat = features.view(-1, self.fc.in_features)
        hidden = torch.relu(self.fc(flat))
        return torch.relu(self.out(hidden))

def init_weights(layer):
    """Initialise a fully-connected layer; intended for ``nn.Module.apply``.

    ``nn.Linear`` layers get Xavier-uniform weights and, when they have a
    bias, an all-zero bias. Every other module type (including the
    convolutions) is left untouched and keeps its existing initialisation.

    Args:
        layer: Any ``nn.Module``; ``apply`` calls this once per sub-module.
    """
    if isinstance(layer, nn.Linear):
        # nn.init functions already run without gradient tracking, so the
        # parameters can be passed directly instead of through ``.data``.
        nn.init.xavier_uniform_(layer.weight)
        # ``bias`` is None for nn.Linear(..., bias=False); skip it instead of
        # failing on the missing attribute.
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

In [9]:
# Build the network and re-initialise its fully-connected layers.
net = Denoiser()
net.apply(init_weights)
print(net)

# torch.nn has no `L1` class, so `nn.L1()` raised AttributeError. The
# notebook's write-up states that SmoothL1Loss is the intended criterion.
err_func = nn.SmoothL1Loss()
optimizer = optim.Adam(net.parameters(), lr=1e-4)

Denoiser(
  (conv1): Conv1d(1, 4, kernel_size=(3,), stride=(2,))
  (conv2): Conv1d(4, 16, kernel_size=(2,), stride=(2,))
  (fc): Linear(in_features=2048, out_features=1024, bias=True)
  (out): Linear(in_features=1024, out_features=513, bias=True)
)


- I started with a basic network architecture with 2 convolutional layers and 1 fully-connected layer. While the output was good enough, I wanted to try out different architectures and compare the results. I therefore added one more fully-connected layer and found that the output was better than before.
- In the previous assignment I initialized my weights and biases with Xavier uniform and zeros, respectively. Since that worked well for me before, I didn't change it for this question.
- I am using ReLU activations, including on the output layer, since we want our output (a magnitude spectrum) to be non-negative.
- Since this is a regression problem, I am using SmoothL1Loss(), which strikes a balance between MSELoss() and L1Loss() in how it penalizes the model: it is quadratic for small errors and linear for large ones.
- For the optimizer, I used Adam with a learning rate of 0.0001, the same setting I used in assignment 1.

In [10]:
# Training the network
epochs = 50

for e in range(1, epochs + 1):
    train_loss_epoch = 0
    for noisy_batch, clean_batch in train_loader:
        # PyTorch accumulates gradients across backward() calls, so clear
        # the ones left over from the previous step.
        optimizer.zero_grad()

        # Insert the channel axis Conv1d expects:
        # (batch, bins) -> (batch, 1, bins).
        predicted = net(noisy_batch.unsqueeze(1))

        # Compare the prediction with the clean frames.
        loss = err_func(predicted, clean_batch)

        # Back-propagate to obtain gradients, then update the parameters.
        loss.backward()
        optimizer.step()

        train_loss_epoch += loss.item()

    # Report the mean per-batch loss once the epoch has finished.
    mean_loss = train_loss_epoch / len(train_loader)
    print(f"Overall Training loss for epoch {e}: {round(mean_loss, 4)}")

Overall Training loss for epoch 1: 0.0199
Overall Training loss for epoch 2: 0.0162
Overall Training loss for epoch 3: 0.0135
Overall Training loss for epoch 4: 0.0111
Overall Training loss for epoch 5: 0.0092
Overall Training loss for epoch 6: 0.0077
Overall Training loss for epoch 7: 0.0066
Overall Training loss for epoch 8: 0.0058
Overall Training loss for epoch 9: 0.0052
Overall Training loss for epoch 10: 0.0048
Overall Training loss for epoch 11: 0.0044
Overall Training loss for epoch 12: 0.0039
Overall Training loss for epoch 13: 0.0036
Overall Training loss for epoch 14: 0.0033
Overall Training loss for epoch 15: 0.0031
Overall Training loss for epoch 16: 0.003
Overall Training loss for epoch 17: 0.0028
Overall Training loss for epoch 18: 0.0027
Overall Training loss for epoch 19: 0.0026
Overall Training loss for epoch 20: 0.0025
Overall Training loss for epoch 21: 0.0025
Overall Training loss for epoch 22: 0.0024
Overall Training loss for epoch 23: 0.0023
Overall Training loss

In [11]:
# Load both noisy test clips at their native sampling rate and compute their
# complex STFTs with the same settings as the training data. The complex
# spectra are kept because their phase is reused when rebuilding the audio.
test_stft_kwargs = dict(n_fft=1024, hop_length=512)

t1, tr1 = librosa.load('data/test_x_01.wav', sr=None)
t2, tr2 = librosa.load('data/test_x_02.wav', sr=None)
test_01_org = librosa.stft(t1, **test_stft_kwargs)
test_02_org = librosa.stft(t2, **test_stft_kwargs)

print(f"test_01:  {test_01_org.shape}")
print(f"test_02:  {test_02_org.shape}")

# Magnitude spectra as tensors.
test_01, test_02 = (
    torch.tensor(np.abs(spectrum)) for spectrum in (test_01_org, test_02_org)
)
print(f"test_01 (tensored):  {test_01.shape}")
print(f"test_02 (tensored):  {test_02.shape}")

# Swap the axes so that each row is one STFT frame.
test_01 = test_01.transpose(0, 1)
test_02 = test_02.transpose(0, 1)
print(f"test_01 (tensored, transpose):  {test_01.shape}")
print(f"test_02 (tensored, transpose):  {test_02.shape}")

test_01:  (513, 142)
test_02:  (513, 380)
test_01 (tensored):  torch.Size([513, 142])
test_02 (tensored):  torch.Size([513, 380])
test_01 (tensored, transpose):  torch.Size([142, 513])
test_02 (tensored, transpose):  torch.Size([380, 513])


In [12]:
def _denoise_batches(loader):
    """Run ``net`` over every batch of ``loader`` and collect the outputs.

    Args:
        loader: DataLoader yielding ``(frames, _)`` pairs where ``frames`` is a
            2-D tensor with one magnitude frame per row. The second element
            is ignored.

    Returns:
        A list with one output tensor per batch, in loader order.
    """
    # Switch to inference mode. This changes nothing for the current layers,
    # but it keeps this cell correct if dropout or batch-norm is ever added.
    net.eval()
    denoised = []
    with torch.no_grad():
        for frames, _ in loader:
            # Insert the channel axis Conv1d expects:
            # (batch, bins) -> (batch, 1, bins).
            denoised.append(net(frames.unsqueeze(1)))
    return denoised


# TensorDataset needs at least one tensor; the input is passed twice so the
# loader yields the same (input, target)-shaped pairs as the training loader.
test_01_dataset = utils.TensorDataset(test_01, test_01)
test_01_loader = utils.DataLoader(test_01_dataset, batch_size=32, shuffle=False)

test_02_dataset = utils.TensorDataset(test_02, test_02)
test_02_loader = utils.DataLoader(test_02_dataset, batch_size=32, shuffle=False)

# Concatenate the per-batch predictions back into one (frames, bins) tensor.
test_01_list = _denoise_batches(test_01_loader)
test_01_output = torch.cat(test_01_list, dim=0)

test_02_list = _denoise_batches(test_02_loader)
test_02_output = torch.cat(test_02_list, dim=0)

def signal_cleaner(x_org, output):
    """Combine the noisy signal's phase with the predicted magnitudes.

    Args:
        x_org: Complex NumPy array holding the STFT of the noisy signal.
        output: ``torch.Tensor`` of predicted magnitudes, with the same shape
            as ``x_org`` (or broadcastable to it).

    Returns:
        Complex NumPy array ``phase(x_org) * output``.
    """
    # Take the unit phasor from the angle rather than from x / |x|. The
    # division is 0/0 = NaN for every zero-magnitude bin, and those NaNs
    # would spread through the inverse STFT. np.angle(0) is 0, so such bins
    # get a phase of 1 and keep the predicted magnitude.
    phase = np.exp(1j * np.angle(x_org))
    # detach()/cpu() make the conversion safe even if the tensor still tracks
    # gradients or lives on another device.
    return np.multiply(phase, output.detach().cpu().numpy())

# Model output is (frames, bins); swap the axes back to the (bins, frames)
# layout of the original STFT before re-attaching the phase.
predicted_01 = test_01_output.transpose(0, 1)
clean_test_01 = signal_cleaner(test_01_org, predicted_01)
print(f"Shape of clean_test_01:  {clean_test_01.shape}")
# Invert the STFT with the hop length used for analysis.
test_01_istft = librosa.core.istft(clean_test_01, hop_length=512)

predicted_02 = test_02_output.transpose(0, 1)
clean_test_02 = signal_cleaner(test_02_org, predicted_02)
print(f"Shape of clean_test_02:  {clean_test_02.shape}")
test_02_istft = librosa.core.istft(clean_test_02, hop_length=512)

Shape of clean_test_01:  (513, 142)
Shape of clean_test_02:  (513, 380)


In [13]:
!pip install soundfile
import soundfile



In [14]:
# Save each reconstructed waveform at the sampling rate of its source clip.
for wav_path, samples, rate in (
    ("result_01.wav", test_01_istft, tr1),
    ("result_02.wav", test_02_istft, tr2),
):
    soundfile.write(wav_path, samples, rate)

In [15]:
# Playback widget for the denoised version of test_x_01.wav
import IPython.display as ipd

result_01_player = ipd.Audio('result_01.wav')
result_01_player

In [16]:
# Playback widget for the denoised version of test_x_02.wav
import IPython.display as ipd

result_02_player = ipd.Audio('result_02.wav')
result_02_player