In [1]:
!pip install librosa



In [2]:
import librosa

# Load the clean / noisy training recordings at their native sampling rate
# (sr=None disables resampling) and analyse both with the same STFT settings.
s, sr = librosa.load("../input/denoise-data/train_clean_male.wav", sr=None)
S = librosa.stft(s, n_fft=1024, hop_length=512)
sn, sn_sr = librosa.load("../input/denoise-data/train_dirty_male.wav", sr=None)
X = librosa.stft(sn, n_fft=1024, hop_length=512)

# Training pairs frame i of X with frame i of S, so the two recordings must
# share a sampling rate and produce spectrograms of identical shape.
if sn_sr != sr:
    raise ValueError(
        "clean and noisy recordings have different sampling rates: "
        "{} vs {}".format(sr, sn_sr))
if S.shape != X.shape:
    raise ValueError(
        "clean and noisy spectrograms have different shapes: "
        "{} vs {}".format(S.shape, X.shape))

In [3]:
# Report the dimensions of both spectrograms.
clean_shape, noisy_shape = S.shape, X.shape
print("Input Clear voice data shape : ", clean_shape)
print("Input Noise voice data shape : ", noisy_shape)

Input Clear voice data shape :  (513, 2459)
Input Noise voice data shape :  (513, 2459)


In [4]:
import numpy as np
# Keep only the magnitude of every complex STFT bin.
S_abs, X_abs = np.absolute(S), np.absolute(X)

In [5]:
# Swap the two axes of the 2-D magnitude arrays (513, n) -> (n, 513), so that
# indexing along axis 0 yields one 513-bin spectrum at a time.
S_in = S_abs.T
X_in = X_abs.T

In [6]:
#Import Libraries
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [115]:
class Net(nn.Module):
    """Fully connected network applied independently to each spectrum frame.

    The input is reshaped to rows of ``wav_size`` (513) values and passed
    through ``fc1 -> fc2 -> out_layer``, with the selected activation after
    every layer including the output layer. ``fc3`` .. ``fc10`` are
    registered (so the parameter list and ``state_dict`` keys are unchanged)
    but are not used by ``forward``.

    Args:
        activation: ``'relu'`` (default) or ``'logistic_sigmoid'``.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """

    def __init__(self , activation='relu'):
        super(Net, self).__init__()
        # 'logistic_sigmoid' means sigma(x) = 1 / (1 + exp(-x)), i.e.
        # nn.Sigmoid. nn.LogSigmoid computes log(sigma(x)), which is always
        # negative and therefore a different function.
        activations = {
            'relu': nn.ReLU,
            'logistic_sigmoid': nn.Sigmoid,
        }
        # Fail here rather than with an AttributeError inside forward().
        if activation not in activations:
            raise ValueError(
                "unknown activation {!r}; expected one of {}".format(
                    activation, sorted(activations)))

        self.wav_size = 513
        hidden_size = 1024
        self.fc1 = nn.Linear(self.wav_size, hidden_size)
        # fc2 .. fc10, registered in the same order as before.
        for layer_idx in range(2, 11):
            setattr(self, 'fc{}'.format(layer_idx),
                    nn.Linear(hidden_size, hidden_size))
        self.out_layer = nn.Linear(hidden_size, self.wav_size)   # output layer
        self.activation_fn = activations[activation]()

    def forward(self, x):
        """Return the network output for ``x`` as a ``(-1, wav_size)`` tensor."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(-1, self.wav_size)
        x = self.activation_fn(self.fc1(x))
        x = self.activation_fn(self.fc2(x))
        # The activation is applied to the output layer as well.
        return self.activation_fn(self.out_layer(x))


#model weight initialization function 
def init_weights_normal(m):
    """Draw the weights of an ``nn.Linear`` from N(0, 0.01**2) and zero its bias.

    Meant to be passed to ``model.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(m) is nn.Linear:
        torch.nn.init.normal_(m.weight , mean=0 , std=0.01)
        # nn.Linear(..., bias=False) has bias set to None.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

def init_weights_xavier(m):
    """Xavier-normal initialise an ``nn.Linear`` (gain 0.8) and zero its bias.

    Meant to be passed to ``model.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(m) is nn.Linear:
        torch.nn.init.xavier_normal_(m.weight , gain=0.8)
        # nn.Linear(..., bias=False) has bias set to None.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)
def init_weights_kaiman(m):
    """Kaiming-normal initialise an ``nn.Linear`` and zero its bias.

    Uses the default arguments of ``torch.nn.init.kaiming_normal_``. Meant to
    be passed to ``model.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(m) is nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        # nn.Linear(..., bias=False) has bias set to None.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

In [116]:
from torch.utils.data import Dataset , DataLoader

class Wav_DataGenerator(Dataset):
    """Map-style dataset of index-aligned ``(noise_wav[i], clean_wav[i])`` pairs.

    Args:
        noise_wav: indexable, sized collection of network inputs.
        clean_wav: indexable, sized collection of targets, same length.
        seed: value passed to ``torch.manual_seed``.

    Raises:
        ValueError: if the two collections differ in length.
    """

    def __init__(self , noise_wav , clean_wav , seed):
        super(Wav_DataGenerator , self).__init__()
        # Catch a mismatch here instead of as an IndexError in the middle of
        # an epoch (or as silently ignored trailing targets).
        if len(noise_wav) != len(clean_wav):
            raise ValueError(
                "noise_wav and clean_wav must have the same length, "
                "got {} and {}".format(len(noise_wav), len(clean_wav)))
        self.noise_wav = noise_wav
        self.clean_wav = clean_wav
        # torch.manual_seed seeds torch's *global* RNG and returns the
        # default generator, which is what ends up stored in `self.seed`.
        self.seed = torch.manual_seed(seed)

    def __getitem__(self , index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.noise_wav[index] , self.clean_wav[index]

    def __len__(self ):
        """Return the number of pairs."""
        return len(self.noise_wav)
    

In [117]:
# Noisy frames are the inputs, clean frames the targets; batches of 32 are
# drawn in shuffled order.
train_data = Wav_DataGenerator(noise_wav=X_in , clean_wav=S_in , seed=1264)
train_dataloader = DataLoader(dataset=train_data , batch_size=32 , shuffle=True)

In [118]:
#define the model
# Use the first CUDA device when one is available, otherwise the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Module.apply returns the module itself, so construction and weight
# initialisation can be chained.
Denoise_Model = Net().apply(init_weights_normal)
# Last expression of the cell: moves the model and displays its layers.
Denoise_Model.to(device)

Net(
  (fc1): Linear(in_features=513, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=1024, bias=True)
  (fc6): Linear(in_features=1024, out_features=1024, bias=True)
  (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  (fc8): Linear(in_features=1024, out_features=1024, bias=True)
  (fc9): Linear(in_features=1024, out_features=1024, bias=True)
  (fc10): Linear(in_features=1024, out_features=1024, bias=True)
  (out_layer): Linear(in_features=1024, out_features=513, bias=True)
  (activation_fn): ReLU()
)

In [119]:
#define the model optimizer and loss
# Adam over all of the model's parameters with a step size of 1e-3.
optimizer = optim.Adam(params=Denoise_Model.parameters() , lr=1e-3)
#L2 (mean squared error) loss function
criterion = nn.MSELoss()

In [120]:
#training the model
epoch = 50
# Mean batch loss of every epoch, in order (one entry per epoch).
model_train_loss = []

Denoise_Model.train()
for i_epoch in range(epoch):
    epoch_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_dataloader):
        # Tensors take part in autograd directly; no Variable wrapper needed.
        data, target = data.to(device) , target.to(device)
        #This will zero out the gradients for this batch.
        optimizer.zero_grad()
        output = Denoise_Model(data)
        # Mean squared error between the model output and the target.
        loss = criterion(output, target)
        # Back-propagate, then take one optimizer step.
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        #Print out the loss periodically.
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                i_epoch, batch_idx * len(data), len(train_dataloader.dataset),
                100. * batch_idx / len(train_dataloader), batch_loss))
    # Record the epoch's average so the loss history is actually populated;
    # max(..., 1) keeps an empty loader from dividing by zero.
    model_train_loss.append(epoch_loss / max(len(train_dataloader), 1))



In [121]:
# Test recording at its native sampling rate, analysed with n_fft=1024 and
# hop_length=512. Note that this rebinds the names `sr` and `X`.
tn , sr = librosa.load(
    "../input/denoise-data/test_x_01.wav" ,
    sr=None ,
)
X = librosa.stft(y=tn , n_fft=1024 , hop_length=512)

In [122]:
# Magnitude of the test spectrogram, then its two axes swapped so that each
# row along axis 0 is one spectrum.
T_abs = np.absolute(X)
T_in = T_abs.T

In [123]:
# Copy the magnitudes into a float32 tensor for the model.
T_in_tensor = torch.tensor(
    T_in ,
    dtype=torch.float32 ,
)

In [124]:
#inference the model
# eval() puts the model in inference mode; no_grad() skips building the
# autograd graph, which is not needed for a forward pass only.
Denoise_Model.eval()
with torch.no_grad():
    T_out_tensor = Denoise_Model(T_in_tensor.to(device))

In [125]:
# Bring the prediction back to NumPy on the CPU and swap its two axes again.
T_out = T_out_tensor.detach().cpu().numpy().T

In [126]:
# Unit-magnitude phase factor of the test spectrogram. It is built from the
# angle rather than X / T_abs because a bin with zero magnitude would make
# that division 0/0 = NaN, and the NaN would carry through to the audio.
T_phase = np.exp(1j * np.angle(X))

In [127]:
# Display the shape of the phase matrix for a manual check.
T_phase.shape

(513, 142)

In [128]:
# Display the shape of the model output for a manual check.
T_out.shape

(513, 142)

In [129]:
#do Hadamard product
# Element-wise product of the phase factors and the predicted magnitudes.
S_hat = T_phase * T_out

In [130]:
import soundfile as sf

# Invert the STFT. length=len(tn) pads or trims the result to exactly the
# number of samples of the input recording, so input and output line up.
iStftMat = librosa.istft(S_hat, hop_length=512, length=len(tn))

# Write the result at the sampling rate of the test recording.
sf.write("testOut.wav", iStftMat , sr)