In [1]:
import matplotlib

from torch.autograd import Variable
matplotlib.use('agg')
import matplotlib.pyplot as plt

from utils import *
from model import *
import time
import math
import argparse

In [2]:
cuda = True if torch.cuda.is_available() else False

cuda

True

In [3]:
learning_rate = 0.002
content = None
content_weight = 1e2
style = None
style_weight = 1
epochs = 20000
print_interval = 1000
plot_interval = 1000
output = 'outputR'

In [5]:
# WAV TO SPECTRUM BLOCK
CONTENT_FILENAME = "taycut.wav"
STYLE_FILENAME = "boy18.wav"

a_content, src = wav2spectrum(CONTENT_FILENAME)
a_style, srs = wav2spectrum(STYLE_FILENAME)

a_content_torch = torch.from_numpy(a_content)[None, None, :, :] 
a_style_torch = torch.from_numpy(a_style)[None, None, :, :]


print("a_content", a_content.shape)
print("a_content_torch", a_content_torch.shape)

print("sr",src )
print("------------")
print("a_style", a_content.shape)
print("a_style_torch", a_style_torch.shape)

print("sr",src )

# Output
# a_content (257, 244)
# a_content_torch torch.Size([1, 1, 257, 244])
# sr 22050
# ------------
# a_style (257, 244)
# a_style_torch torch.Size([1, 1, 257, 355])
# sr 22050


a_content (257, 1249)
a_content_torch torch.Size([1, 1, 257, 1249])
sr 22050
------------
a_style (257, 1249)
a_style_torch torch.Size([1, 1, 257, 244])
sr 22050


In [6]:
# Display the Content spectrum
plt.figure(figsize=(10, 6))
plt.imshow(a_content, aspect='auto', origin='lower', cmap='viridis')
plt.colorbar(label="Log Amplitude")
plt.title("Spectrogram of the Audio")
plt.xlabel("Time Frames")
plt.ylabel("Frequency Bins")
plt.show()
plt.savefig("Content_Spectrum.png")  # Save the plot as an image


  plt.show()


In [7]:
# Display the Style spectrum
plt.figure(figsize=(10, 6))
plt.imshow(a_style, aspect='auto', origin='lower', cmap='viridis')
plt.colorbar(label="Log Amplitude")
plt.title("Spectrogram of the Audio")
plt.xlabel("Time Frames")
plt.ylabel("Frequency Bins")
plt.show()
plt.savefig("Style_Spectrum.png")  # Save the plot as an image


  plt.show()


In [8]:
if cuda:
    a_content_torch = a_content_torch.cuda()
    print(a_content_torch.shape)
if cuda:
    a_style_torch = a_style_torch.cuda()
    print(a_style_torch.shape)

torch.Size([1, 1, 257, 1249])
torch.Size([1, 1, 257, 244])


In [9]:
a_content_torch[0][0][0]

tensor([0.0000, 0.0000, 0.0000,  ..., 0.0241, 0.0081, 0.0096], device='cuda:0')

In [10]:
model = RandomCNN()
model.eval()

RandomCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 1), stride=(1, 1))
  (LeakyReLU): LeakyReLU(negative_slope=0.2)
)

In [11]:
# a_C_var = Variable(a_content_torch, requires_grad=False).float()
# a_S_var = Variable(a_style_torch, requires_grad=False).float()

a_C_var = a_content_torch.float()
a_S_var = a_style_torch.float()

if cuda:
    model = model.cuda()
    a_C_var = a_C_var.cuda()
    a_S_var = a_S_var.cuda()


a_C = model(a_C_var) 
a_S = model(a_S_var) 


print("a_C_var:", a_C_var.shape)
print("a_S_var:", a_S_var.shape)
print("-------------")
print("a_C:", a_C.shape)
print("a_S:", a_S.shape)

a_C_var: torch.Size([1, 1, 257, 1249])
a_S_var: torch.Size([1, 1, 257, 244])
-------------
a_C: torch.Size([1, 32, 255, 1249])
a_S: torch.Size([1, 32, 255, 244])


In [12]:

a_C = model(a_C_var) 
a_S = model(a_S_var) 

print("a_content_torch type", type(a_content_torch))

a_G_var = torch.randn(a_content_torch.shape) * 1e-3


a_content_torch type <class 'torch.Tensor'>


In [13]:
a_C.shape

torch.Size([1, 32, 255, 1249])

In [14]:
a_G_var.shape

torch.Size([1, 1, 257, 1249])

In [15]:
# Optimizer
# a_G_var = Variable(torch.randn(a_content_torch.shape) * 1e-3)

a_G_var = torch.randn(a_content_torch.shape) * 1e-3

print("a_G_var:",a_G_var.shape)


if cuda:
    a_G_var = a_G_var.cuda()


a_G_var.requires_grad = True
optimizer = torch.optim.Adam([a_G_var])

print("optimizer:", optimizer)
# coefficient of content and style
style_param = style_weight
content_param = content_weight

num_epochs = epochs
print_every = print_interval
plot_every = plot_interval



a_G_var: torch.Size([1, 1, 257, 1249])
optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [16]:
# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [17]:
a_G = model(a_G_var)

type(a_G)
a_G.shape

torch.Size([1, 32, 255, 1249])

# Training the Model'

In [18]:
def trainModel(optimizer,current_loss):
    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()
        a_G = model(a_G_var)

        content_loss = content_param * compute_content_loss(a_C, a_G)
        style_loss = style_param * compute_layer_style_loss(a_S, a_G)
        loss = content_loss + style_loss
        loss.backward()
        optimizer.step()

        # print
        if epoch % print_every == 0:
            print("{} {}% {} content_loss:{:4f} style_loss:{:4f} total_loss:{:4f}".format(epoch,
                                                                                        epoch / num_epochs * 100,
                                                                                        timeSince(start),
                                                                                        content_loss.item(),
                                                                                        style_loss.item(), 
                                                                                        loss.item())
                                                                                        )
            current_loss += loss.item()

        # Add current loss avg to list of losses
        if epoch % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
    
    return 

start = time.time()
trainModel(optimizer,current_loss)

1000 5.0% 1m 5s content_loss:0.700147 style_loss:0.175027 total_loss:0.875173
2000 10.0% 2m 11s content_loss:0.546032 style_loss:0.051478 total_loss:0.597510
3000 15.0% 3m 17s content_loss:0.476206 style_loss:0.050646 total_loss:0.526852
4000 20.0% 4m 23s content_loss:0.449274 style_loss:0.052619 total_loss:0.501893
5000 25.0% 5m 29s content_loss:0.441016 style_loss:0.053609 total_loss:0.494625
6000 30.0% 6m 35s content_loss:0.439434 style_loss:0.053751 total_loss:0.493186
7000 35.0% 7m 41s content_loss:0.439307 style_loss:0.053767 total_loss:0.493074
8000 40.0% 8m 47s content_loss:0.439305 style_loss:0.053768 total_loss:0.493073
9000 45.0% 9m 53s content_loss:0.439305 style_loss:0.053769 total_loss:0.493074
10000 50.0% 10m 59s content_loss:0.439306 style_loss:0.053769 total_loss:0.493075
11000 55.00000000000001% 12m 5s content_loss:0.439306 style_loss:0.053769 total_loss:0.493075
12000 60.0% 13m 11s content_loss:0.439307 style_loss:0.053769 total_loss:0.493076
13000 65.0% 14m 18s cont

In [19]:
a_G_var.shape

torch.Size([1, 1, 257, 1249])

# Output 

In [20]:
def librosa_write(outfile, x, sr):
    if version.parse(librosa.__version__) < version.parse('0.8.0'):
        librosa.output.write_wav(outfile, x, sr)
    else:
        soundfile.write(outfile, x, sr)


def spectrum2wav(spectrum, sr, outfile):
    # Return the all-zero vector with the same shape of `a_content`
    a = np.exp(spectrum) - 1
    p = 2 * np.pi * np.random.random_sample(spectrum.shape) - np.pi
    
    for i in range(50):
        S = a * np.exp(1j * p)
        x = librosa.istft(S)
        p = np.angle(librosa.stft(x, n_fft=N_FFT))

    print("x:",x)
    print("x.shape:",x.shape)
    print("type(x)):",type(x))

    librosa_write(outfile, x, sr)

gen_spectrum = a_G_var.cpu().data.numpy().squeeze()
gen_audio_C = output + ".wav"
spectrum2wav(gen_spectrum, src, gen_audio_C)

x: [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 7.68906351e-05
 1.53408667e-04 1.17949826e-04]
x.shape: (159744,)
type(x)): <class 'numpy.ndarray'>


In [21]:
gen_spectrum.shape

(257, 1249)

# Output Graphs

In [22]:
plt.figure()
plt.plot(all_losses)
plt.savefig('loss_curve.png')

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("Content Spectrum")
plt.imsave('Content_SpectrumT.png', a_content[:400, :])

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("Style Spectrum")
plt.imsave('Style_SpectrumT.png', a_style[:400, :])

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("CNN Voice Transfer Result")
plt.imsave('RanGen_SpectrumT.png', gen_spectrum[:400, :])
