In [12]:
'''
Notebook with WSRN Audio Style Transfer
Code based on https://software.intel.com/en-us/articles/neural-style-transfer-on-audio-signals
'''

from __future__ import print_function
import copy
from sys import argv

import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from scipy.io import wavfile
from torch.autograd import Variable

In [0]:
# The Wide Shallow Random Network
class CNNModel(nn.Module):
	"""Wide, shallow network made of a single 1-D convolution.

	The layer maps 1025 input channels to 4096 output channels with a
	size-3 kernel, stride 1 and padding 1. Its weights are whatever
	``nn.Conv1d`` initialises them to; nothing in this class trains them.
	"""

	def __init__(self):
		super(CNNModel, self).__init__()
		self.cnn1 = nn.Conv1d(
			in_channels=1025,
			out_channels=4096,
			kernel_size=3,
			stride=1,
			padding=1
		)

	def forward(self, x):
		"""Apply the convolution and flatten each sample's feature map into one row."""
		features = self.cnn1(x)
		batch = features.size(0)
		return features.view(batch, -1)

In [0]:
# Module for Computing the Gram Matrix
class GramMatrix(nn.Module):
	"""Normalised Gram matrix of a 3-D feature tensor."""

	def forward(self, input):
		"""Return ``F @ F.T / (a * b * c)``.

		``input`` must be 3-D with size ``(a, b, c)``; ``F`` is ``input``
		viewed as an ``(a * b, c)`` matrix, so the result is ``(a*b, a*b)``.
		"""
		dim0, dim1, dim2 = input.size()
		rows = dim0 * dim1
		flat = input.view(rows, dim2)
		gram = flat.mm(flat.t())
		return gram.div(rows * dim2)

In [0]:
class StyleLoss(nn.Module):
	"""Pass-through layer that records a style loss as a side effect.

	On every forward pass the Gram matrix of the input is scaled by the style
	weight and compared (MSE) with the stored, equally scaled target Gram
	matrix. The input is returned as a clone, so the layer can sit inside an
	``nn.Sequential`` without changing the data flowing through it.
	"""

	def __init__(self, target, styleweight, contentweight):
		"""Store the detached, weighted target Gram matrix and the loss helpers.

		``contentweight`` is only stored on the instance; the loss computed by
		this class does not use it.
		"""
		super(StyleLoss, self).__init__()
		self.weight = styleweight
		self.content_weight = contentweight
		# Detach so that no gradient ever flows back into the style target.
		self.target = target.detach() * styleweight
		self.gram = GramMatrix()
		self.criterion = nn.MSELoss()
		self.criterion1 = nn.MSELoss()

	def forward(self, input):
		"""Update ``self.G`` and ``self.loss`` from ``input``; return a clone of ``input``."""
		self.output = input.clone()
		weighted_gram = self.gram(input)
		weighted_gram.mul_(self.weight)  # in-place scaling, mirrors the target's scaling
		self.G = weighted_gram
		self.loss = self.criterion(weighted_gram, self.target)
		return self.output

	def backward(self, retain_graph=True):
		"""Back-propagate the most recently computed loss and return it."""
		loss = self.loss
		loss.backward(retain_graph=retain_graph)
		return loss

In [0]:
class ContentLoss(nn.Module):
    """Pass-through layer that records the MSE between its input and a fixed target."""

    def __init__(self, target):
        """Keep a detached copy of ``target`` so it is treated as a constant."""
        super(ContentLoss, self).__init__()
        self.criterion = nn.MSELoss()
        self.target = target.detach()

    def forward(self, input):
        """Store the loss in ``self.loss`` and hand ``input`` back unchanged."""
        loss = self.criterion(input, self.target)
        self.loss = loss
        return input

In [0]:
	# Code for generating an STFT magnitude spectrogram from an audio file
	N_FFT=2048
	def read_audio_spectum(filename, duration=8, n_fft=N_FFT):
		"""Load an audio file and return its normalised log-magnitude spectrogram.

		Args:
			filename: path of the audio file, passed to ``librosa.load``.
			duration: maximum number of seconds to load (default 8, as before).
			n_fft: FFT size handed to ``librosa.stft`` (default ``N_FFT``).

		Returns:
			``(S, fs)`` where ``S`` is ``log1p(|STFT|)`` divided by its
			Frobenius norm and ``fs`` is the sampling rate reported by
			``librosa.load``. For an all-zero spectrogram the division is
			skipped so that zeros, not NaNs, are returned.
		"""
		x, fs = librosa.load(filename, duration=duration)
		# n_fft must be passed by keyword: newer librosa releases reject it positionally.
		S = librosa.stft(x, n_fft=n_fft)
		S = np.log1p(np.abs(S))
		norm = np.linalg.norm(S)
		if norm > 0:
			S = S / norm
		return S, fs

In [0]:
# Names of the layers after which a loss module may be attached
style_layers_default = ['conv_1']
content_layers_default = ['conv_1']

# Default loss weights
style_weight=2000000
content_weight = 5e-2

def get_style_model_and_losses(cnn, style_float, content_float, style_weight=style_weight, style_layers=style_layers_default, content_layers=content_layers_default): #STYLE WEIGHT
	"""Build the loss-instrumented network used for the style transfer.

	A deep copy of ``cnn`` supplies the ``cnn1`` layer, which is registered as
	``'conv_1'`` in a new ``nn.Sequential``. When ``'conv_1'`` is listed in
	``style_layers`` a ``StyleLoss`` module, whose target is the Gram matrix of
	the style features, is appended as ``'style_loss_1'``.

	``content_float`` and ``content_layers`` are accepted but not used here.
	The module-level ``content_weight`` is forwarded to ``StyleLoss``.

	Returns:
		``(model, style_losses)`` -- the sequential model and the list of
		``StyleLoss`` modules it contains (empty if no style layer matched).
	"""
	network = copy.deepcopy(cnn)
	use_cuda = torch.cuda.is_available()

	model = nn.Sequential()  # the new network, built up layer by layer
	gram = GramMatrix()  # used once, to compute the style target
	if use_cuda:
		model = model.cuda()
		gram = gram.cuda()

	style_losses = []
	layer_name = 'conv_1'
	model.add_module(layer_name, network.cnn1)
	if layer_name not in style_layers:
		return model, style_losses

	style_features = model(style_float).clone()
	style_loss = StyleLoss(gram(style_features), style_weight, content_weight)
	model.add_module("style_loss_1", style_loss)
	style_losses.append(style_loss)
	return model, style_losses

In [0]:
	# Optimizer setup
	learning_rate_initial = 3

	def get_input_param_optimizer(input_float):
		"""Wrap the input tensor as a trainable parameter and create its Adam optimizer.

		The optimizer updates the input itself, not any network weights.
		Returns ``(input_param, optimizer)``.
		"""
		input_param = nn.Parameter(input_float.data)
		adam_settings = dict(lr=learning_rate_initial, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
		return input_param, optim.Adam([input_param], **adam_settings)

In [0]:
# Style Transfer Code
num_steps= 2500

def run_style_transfer(cnn, style_float, content_float, input_float, num_steps=num_steps, style_weight=style_weight, content_weight=content_weight): #STYLE WEIGHT, NUM_STEPS
    """Optimise ``input_float`` so that its Gram statistics match the style's.

    Args:
        cnn: network providing the feature layer (see ``get_style_model_and_losses``).
        style_float: style spectrogram tensor used to build the style target.
        content_float: forwarded to ``get_style_model_and_losses``; no content
            loss is computed in this function.
        input_float: starting point of the optimisation.
        num_steps: number of optimiser steps to perform.
        style_weight: weight forwarded to the style loss.
        content_weight: accepted for interface compatibility; unused here.

    Returns:
        The optimised input as a tensor, clamped to ``[0, 1]``.
    """
    print('Building the style transfer model..')
    model, style_losses = get_style_model_and_losses(cnn, style_float, content_float, style_weight)
    input_param, optimizer = get_input_param_optimizer(input_float)
    print('Optimizing..')
    run = [0]  # one-element list so the closure can update the counter

    def closure():
        # Keep the spectrogram inside the valid range before every evaluation.
        input_param.data.clamp_(0, 1)

        optimizer.zero_grad()
        model(input_param)  # forward pass refreshes the loss stored in each StyleLoss
        style_score = 0
        for sl in style_losses:
            style_score += sl.backward()

        run[0] += 1
        if run[0] % 100 == 0:
            print("run {}:".format(run[0]))
            # float() handles both a scalar tensor and the plain 0 used when
            # there are no style losses.
            print('Style Loss : {:10f}'.format(float(style_score)))
            print()

        return style_score

    # The closure bumps run[0] once per step, so this performs exactly
    # num_steps optimiser steps.
    while run[0] < num_steps:
        optimizer.step(closure)

    input_param.data.clamp_(0, 1)
    return input_param.data

In [0]:
# Main part of the code: load both clips, run the style transfer,
# rebuild a waveform from the resulting spectrogram and save it.

content_audio_name = "weeknd_10sec_sample.wav"
style_audio_name = "JFK.wav"


style_audio, style_sr = read_audio_spectum(style_audio_name)
content_audio, content_sr = read_audio_spectum(content_audio_name)

if content_sr == style_sr:
    print('Sampling Rates are same')
else:
    print('Sampling rates are not same')
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops the run and shows what went wrong.
    raise ValueError('content sampling rate {} != style sampling rate {}'.format(content_sr, style_sr))

num_samples = style_audio.shape[1]

print(style_audio.shape)
print(content_audio.shape)

# Add a leading batch dimension. Each clip keeps its own frame count (-1):
# the style loss compares Gram matrices, whose size does not depend on the
# number of frames, so the two clips need not have the same length.
style_audio = style_audio.reshape([1, 1025, -1]) #1025
content_audio = content_audio.reshape([1, 1025, -1]) #1025

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
style_float = torch.from_numpy(style_audio).to(device)
content_float = torch.from_numpy(content_audio).to(device)

cnn = CNNModel().to(device)

input_float = content_float.clone() # Initialize with Content

output = run_style_transfer(cnn, style_float, content_float, input_float)
output = output.cpu().squeeze(0).numpy()

N_FFT=2048
# Undo the log1p applied when the spectrogram was computed.
a = np.expm1(output)

# Iterative phase reconstruction: start from random phase, then repeatedly
# invert to a waveform and re-estimate the phase from that waveform's STFT.
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(500):
    S = a * np.exp(1j*p)
    x = librosa.istft(S)
    # n_fft is passed by keyword: newer librosa releases reject it positionally.
    p = np.angle(librosa.stft(x, n_fft=N_FFT))

OUTPUT_FILENAME = 'output1D_ZEROD_4096_iter'+str(num_steps)+'_c'+content_audio_name+'_s'+style_audio_name+'_sw'+str(style_weight)+'_k3s1p1.wav'
# librosa.output.write_wav is not available in current librosa releases;
# write the float waveform with scipy instead.
wavfile.write(OUTPUT_FILENAME, style_sr, x.astype(np.float32))

print('DONE...')

In [196]:
# Code to listen to the output inside the notebook
from IPython.display import display as dis, Audio
# Play back at the rate the audio was loaded with instead of a hard-coded
# 22050, so the pitch stays correct if the loading rate ever changes.
dis(Audio(x, rate=style_sr))

In [0]:
# Plot the content, style and output spectrograms side by side
plt.figure(figsize=(15, 25))
panels = [
    ('Content', content_float.cpu().numpy().squeeze()),
    ('Style', style_float.cpu().numpy().squeeze()),
    ('Output', output),
]
for position, (title, image) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title)
    plt.axis('off')
    plt.imshow(image)
plt.show()

In [13]:
# Following Code Cells are for denoising
import scipy.signal as signal
import scipy.ndimage
import torchaudio
import torchaudio.transforms as transforms
from IPython.display import display as dis, Audio


In [14]:
# Filter requirements.
T = 10.0        # signal duration in seconds (only used to derive n below)
fs = 22050.0      # sample rate, Hz
low_cut = 1000     # default lower cutoff frequency of the band, Hz
high_cut = 8000    # default upper cutoff frequency of the band, Hz
nyq = 0.5 * fs  # Nyquist frequency for the default sample rate
order = 2       # Butterworth filter order
n = int(T * fs) # total number of samples

def butter_band_filter(data, low_cut, high_cut, fs, order):
    """Band-pass ``data`` with a zero-phase Butterworth filter.

    Args:
        data: array-like signal; filtering runs along the last axis
            (``scipy.signal.filtfilt`` default).
        low_cut: lower cutoff frequency in Hz.
        high_cut: upper cutoff frequency in Hz.
        fs: sample rate of ``data`` in Hz.
        order: order passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as returned by ``scipy.signal.filtfilt``.
    """
    # Normalise by the Nyquist frequency of the sample rate that was passed
    # in, not the module-level default, so other sample rates work correctly.
    nyquist = 0.5 * fs
    low_norm = low_cut / nyquist
    high_norm = high_cut / nyquist
    # Get the filter coefficients
    b, a = signal.butter(order, [low_norm, high_norm], btype='band', analog=False)
    y = signal.filtfilt(b, a, data)
    return y

In [15]:
# Load a generated clip, band-pass filter it to suppress noise, then save it.
# The version-specific `normalization=` keyword is omitted; loading relies on
# torchaudio's default behaviour instead.
wave, sampleR = torchaudio.load('./Good Outputs/RockywithJFKstyle.wav')

print(sampleR)
# Use the sample rate reported by the file everywhere instead of a
# hard-coded 22050, so playback, filtering and saving always agree.
dis(Audio(wave, rate=sampleR))
wave = butter_band_filter(wave.numpy(), 750, 4000, sampleR, order)
dis(Audio(wave, rate=sampleR))
# copy() hands torch a freshly allocated array; float32 is used because the
# filter output is double precision.
torchaudio.save('RockywithJFKstyle_denoised.wav', torch.tensor(wave.copy(), dtype=torch.float32), sampleR)

22050


  b = a[a_slice]
