In [1]:
import matplotlib

from torch.autograd import Variable
matplotlib.use('agg')
import matplotlib.pyplot as plt

from utils import *
from model import *
import time
import math
import argparse

In [2]:
cuda = True if torch.cuda.is_available() else False

cuda

True

In [3]:
learning_rate = 0.002
content = None
content_weight = 1e2
style = None
style_weight = 1
epochs = 20000
print_interval = 1000
plot_interval = 1000
output = 'output'

In [4]:
# WAV TO SPECTRUM BLOCK
CONTENT_FILENAME = "boy18.wav"
STYLE_FILENAME = "girl52.wav"

a_content, src = wav2spectrum(CONTENT_FILENAME)
a_style, srs = wav2spectrum(STYLE_FILENAME)

a_content_torch = torch.from_numpy(a_content)[None, None, :, :] 
a_style_torch = torch.from_numpy(a_style)[None, None, :, :]

a_content_torch = a_content_torch.repeat(1, 3, 1, 1)  # [batch_size, 3, height, width]
a_style_torch = a_style_torch.repeat(1, 3, 1, 1)      # [batch_size, 3, height, width]

print("a_content", a_content.shape)
print("a_content_torch", a_content_torch.shape)

print("sr",src )
print("------------")
print("a_style", a_content.shape)
print("a_style_torch", a_style_torch.shape)

print("sr",src )

# Output
# a_content (257, 244)
# a_content_torch torch.Size([1, 1, 257, 244])
# sr 22050
# ------------
# a_style (257, 244)
# a_style_torch torch.Size([1, 1, 257, 355])
# sr 22050


a_content (257, 1249)
a_content_torch torch.Size([1, 3, 257, 1249])
sr 22050
------------
a_style (257, 1249)
a_style_torch torch.Size([1, 3, 257, 244])
sr 22050


In [5]:
# Display the Content spectrum
plt.figure(figsize=(10, 6))
plt.imshow(a_content, aspect='auto', origin='lower', cmap='viridis')
plt.colorbar(label="Log Amplitude")
plt.title("Spectrogram of the Audio")
plt.xlabel("Time Frames")
plt.ylabel("Frequency Bins")
plt.show()
plt.savefig("Content_Spectrum.png")  # Save the plot as an image


  plt.show()


In [6]:
# Display the Style spectrum
plt.figure(figsize=(10, 6))
plt.imshow(a_style, aspect='auto', origin='lower', cmap='viridis')
plt.colorbar(label="Log Amplitude")
plt.title("Spectrogram of the Audio")
plt.xlabel("Time Frames")
plt.ylabel("Frequency Bins")
plt.show()
plt.savefig("Style_Spectrum.png")  # Save the plot as an image


  plt.show()


In [7]:
if cuda:
    a_content_torch = a_content_torch.cuda()
    print(a_content_torch.shape)
if cuda:
    a_style_torch = a_style_torch.cuda()
    print(a_style_torch.shape)

torch.Size([1, 3, 257, 1249])
torch.Size([1, 3, 257, 244])


VGG IMPLEMENTATION

In [8]:
import torch
import torch.nn as nn
from torchvision import models

In [9]:

# Define the VGG-based feature extractor
class VGGFeatureExtractor(nn.Module):
    def __init__(self, selected_layers):
        super(VGGFeatureExtractor, self).__init__()
        
        # Load pretrained VGG19 model
        vgg = models.vgg19(pretrained=True).features
        
        # Select layers to extract features from
        self.selected_layers = selected_layers
        self.vgg_layers = nn.ModuleList([vgg[i] for i in range(max(selected_layers) + 1)])
        
        # Freeze VGG weights to prevent updates during training
        for param in self.vgg_layers.parameters():
            param.requires_grad = False

    def forward(self, x):
        features = []
        for i, layer in enumerate(self.vgg_layers):
            x = layer(x)
            if i in self.selected_layers:
                features.append(x)
        return features

In [10]:
# Define selected layers to extract features from
# Example: Use features from layers corresponding to relu1_2, relu2_2, relu3_4, etc.
selected_layers = [3, 8, 17, 26, 35]

# Initialize the feature extractor
model = VGGFeatureExtractor(selected_layers)

# Set to evaluation mode
model.eval()



VGGFeatureExtractor(
  (vgg_layers): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), s

In [15]:


# Example: Generated, content, and style tensors
a_C_var = a_content_torch.float()  # Content tensor
a_S_var = a_style_torch.float()    # Style tensor
a_G_var = torch.randn_like(a_C_var).float()  # Random initialized tensor for generated

if torch.cuda.is_available():
    model = model.cuda()
    a_C_var = a_C_var.cuda()
    a_S_var = a_S_var.cuda()
    a_G_var = a_G_var.cuda()

# Extract features
a_C = model(a_C_var)  # Content features
a_S = model(a_S_var)  # Style features
a_G = model(a_G_var)  # Generated features

# Print feature shapes
for i, (c, s, g) in enumerate(zip(a_C, a_S, a_G)):
    print(f"Layer {selected_layers[i]} -> a_C: {c.shape}, a_S: {s.shape}, a_G: {g.shape}")


Layer 3 -> a_C: torch.Size([1, 64, 257, 244]), a_S: torch.Size([1, 64, 257, 355]), a_G: torch.Size([1, 64, 257, 244])
Layer 8 -> a_C: torch.Size([1, 128, 128, 122]), a_S: torch.Size([1, 128, 128, 177]), a_G: torch.Size([1, 128, 128, 122])
Layer 17 -> a_C: torch.Size([1, 256, 64, 61]), a_S: torch.Size([1, 256, 64, 88]), a_G: torch.Size([1, 256, 64, 61])
Layer 26 -> a_C: torch.Size([1, 512, 32, 30]), a_S: torch.Size([1, 512, 32, 44]), a_G: torch.Size([1, 512, 32, 30])
Layer 35 -> a_C: torch.Size([1, 512, 16, 15]), a_S: torch.Size([1, 512, 16, 22]), a_G: torch.Size([1, 512, 16, 15])


In [16]:

a_C = a_C[0]
a_S = a_S[0]
a_G = a_G[0]


type(a_S)

torch.Tensor

In [17]:
# Generated spectrogram (initialized with small random noise)
a_G_var = torch.randn(a_content_torch.shape) * 1e-3
if cuda:
    a_G_var = a_G_var.cuda()

a_G_var.requires_grad = True

# Optimizer for the generated spectrogram
optimizer = torch.optim.Adam([a_G_var])

# Coefficients for content and style loss
style_param = style_weight
content_param = content_weight

# Training configuration
num_epochs = epochs
print_every = print_interval
plot_every = plot_interval


In [18]:
# Loss tracking
current_loss = 0
all_losses = []

# Utility function to compute elapsed time
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return f'{m}m {s:.2f}s'


# Training Process


In [19]:
# Training function
def trainModel(optimizer, current_loss):
    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()

        # Forward pass through the model
        a_G = model(a_G_var)
        a_G = a_G[0]

        # Compute content and style losses
        content_loss = content_param * compute_content_loss(a_C, a_G)
        style_loss = style_param * compute_layer_style_loss(a_S, a_G)

        # Total loss
        loss = content_loss + style_loss

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        # Log progress
        if epoch % print_every == 0:
            print(f"{epoch} {epoch / num_epochs * 100:.2f}% {timeSince(start)} "
                  f"content_loss: {content_loss.item():.4f} "
                  f"style_loss: {style_loss.item():.4f} "
                  f"total_loss: {loss.item():.4f}")
            current_loss += loss.item()

        # Update loss list for plotting
        if epoch % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    return

# Start training
start = time.time()
trainModel(optimizer, current_loss)


1000 5.00% 1m 56.45s content_loss: 12.2053 style_loss: 259.8648 total_loss: 272.0702
2000 10.00% 3m 53.39s content_loss: 13.4224 style_loss: 133.0526 total_loss: 146.4751
3000 15.00% 5m 48.96s content_loss: 13.7781 style_loss: 80.7092 total_loss: 94.4873
4000 20.00% 7m 43.67s content_loss: 13.7278 style_loss: 46.9445 total_loss: 60.6722
5000 25.00% 10m 27.30s content_loss: 13.4987 style_loss: 27.0963 total_loss: 40.5950
6000 30.00% 12m 38.86s content_loss: 13.2103 style_loss: 18.0150 total_loss: 31.2253
7000 35.00% 14m 32.59s content_loss: 12.9721 style_loss: 14.5875 total_loss: 27.5596
8000 40.00% 16m 26.28s content_loss: 12.8311 style_loss: 13.3538 total_loss: 26.1849
9000 45.00% 18m 19.96s content_loss: 12.7459 style_loss: 12.7828 total_loss: 25.5287
10000 50.00% 20m 13.67s content_loss: 12.6887 style_loss: 12.3966 total_loss: 25.0853
11000 55.00% 22m 8.82s content_loss: 12.6469 style_loss: 12.1267 total_loss: 24.7736
12000 60.00% 24m 3.01s content_loss: 12.6146 style_loss: 11.9112 

In [23]:
a_G_var.shape

torch.Size([1, 3, 257, 244])

# Converting output to Wav file

In [57]:
def librosa_write(outfile, x, sr):
    if version.parse(librosa.__version__) < version.parse('0.8.0'):
        librosa.output.write_wav(outfile, x, sr)
    else:
        soundfile.write(outfile, x, sr)


def spectrum2wav(spectrum, sr, outfile):
    # Return the all-zero vector with the same shape of `a_content`
    a = np.exp(spectrum) - 1
    p = 2 * np.pi * np.random.random_sample(spectrum.shape) - np.pi
    
    print("a", a.shape)
    print("p", p.shape)

    for i in range(50):
        S = a * np.exp(1j * p)
        x = librosa.istft(S)
        p = np.angle(librosa.stft(x, n_fft=N_FFT))

    x = x.mean(axis=0)  # Mix down to mono
    # x = x.flatten()
    # print("x:",x)
    print("x.shape:",x.shape)
    print("type(x)):",type(x))

    librosa_write(outfile, x, sr)


gen_spectrum = a_G_var.cpu().data.numpy().squeeze()
gen_audio_C = output + ".wav"
spectrum2wav(gen_spectrum, src, gen_audio_C)

a (3, 257, 244)
p (3, 257, 244)
x.shape: (31104,)
type(x)): <class 'numpy.ndarray'>


In [None]:

a_G_var.shape

torch.Size([1, 3, 257, 244])

In [55]:
gen_spectrum.shape

(3, 257, 244)

# Output Spectrograms

In [49]:
plt.figure()
plt.plot(all_losses)
plt.savefig('loss_curve.png')

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("Content Spectrum")
plt.imsave('Content_Spectrum.png', a_content[:400, :])

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("Style Spectrum")
plt.imsave('Style_Spectrum.png', a_style[:400, :])

plt.figure(figsize=(5, 5))
# we then use the 2nd column.
plt.subplot(1, 1, 1)
plt.title("CNN Voice Transfer Result")
plt.imsave('VGGGen_Spectrum.png', gen_spectrum[0][:400, :])
