## 构建metadata.pkl

In [2]:
import os
import pickle
import torch
import numpy as np
import glob
from math import ceil
from model_vc import Generator
from pydub import AudioSegment
from make_spect import make_spect_fun

def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0])/base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

device = 'cuda:3'
G = Generator(28,256,512,16).eval().to(device)

#load model(pre-train and train)
g_checkpoint = torch.load('autovc-zhu.ckpt')
G.load_state_dict(g_checkpoint['model'])
train = pickle.load(open("./spmel/train.pkl",'rb'))

## 数据集测试 

In [7]:
metadata_dataset = []
spect_vc = []
# Test dataset
content_Hotpot1 = np.load('./spmel/H2/EPwchina_EPwchina_86_Hotpot.npy') # 2 second
content_Hotpot2 =np.load('./spmel/H4/You_Really_Don-t_Understand_the_West_You_Really_Don-t_Understand_the_West_103_Hotpot.npy')
content_Lance1 = np.load('./spmel/L4/Lance_JPHE-005_55_Lance.npy') # 2 second
content_Lance2 = np.load('./spmel/L1/Lance_JPLE-007_102_Lance.npy')

metadata_dataset.append([train[1][0],train[1][1],content_Hotpot1])
metadata_dataset.append([train[3][0],train[3][1],content_Hotpot2])
metadata_dataset.append([train[7][0],train[7][1],content_Lance2])
metadata_dataset.append([train[4][0],train[4][1],content_Lance1])

for sbmt_i in metadata_dataset:
             
    x_org = sbmt_i[2]
    x_org, len_pad = pad_seq(x_org)

    #sourse content(1 x * x 80)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    #sourse speaker embedding(1 x 256)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
    
    for sbmt_j in metadata_dataset:
        if sbmt_i[0][0] == sbmt_j[0][0]:
            continue
        
        #target speaker embedding           
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)
        
        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
            
        if len_pad == 0:
            #content with target utterance
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
        
        spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )

with open('results.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)       

In [8]:
spect_vc

[('H2xL4',
  array([[0.24622096, 0.23738296, 0.22597304, ..., 0.02101145, 0.01691216,
          0.01219785],
         [0.26166517, 0.25897247, 0.23621976, ..., 0.0157168 , 0.01558639,
          0.00879951],
         [0.2741456 , 0.26324314, 0.2234242 , ..., 0.01226834, 0.01382287,
          0.0088155 ],
         ...,
         [0.44051415, 0.43204987, 0.42539343, ..., 0.05698032, 0.05332664,
          0.05113255],
         [0.45969173, 0.44789028, 0.42510214, ..., 0.06163758, 0.05756512,
          0.05837513],
         [0.35410523, 0.34686092, 0.34177998, ..., 0.03696606, 0.0365518 ,
          0.0366719 ]], dtype=float32)),
 ('H2xL1',
  array([[0.2290548 , 0.21811639, 0.20694557, ..., 0.01950912, 0.01555524,
          0.01170908],
         [0.234073  , 0.22894524, 0.20844509, ..., 0.01283053, 0.01353043,
          0.00946734],
         [0.24785969, 0.23489337, 0.20011842, ..., 0.00616541, 0.00929598,
          0.00794667],
         ...,
         [0.43580317, 0.41039097, 0.37913397, ...,

In [None]:
from IPython.display import Audio

display(Audio('./output/H2xH2.wav'))
display(Audio('./output/H2xH4.wav'))


## 测试集测试 

In [3]:
metadata_test = []
spect_vc = []
rootDir = './Testset/'
wav_list = glob.glob(rootDir+"*.wav")
i = 1

# Test Unknown
for wav in wav_list:   
    content_U = make_spect_fun(wav)
    metadata_test.append([train[0][0],train[0][1],content_U]) #H1
    metadata_test.append([train[4][0],train[4][1],content_U]) #L1

for index in range(len(metadata_test)):
             
    x_org = metadata_test[index][2]
    x_org, len_pad = pad_seq(x_org)
    
    #sourse content(1 x * x 80)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    #sourse speaker embedding(1 x 256)
    emb_org = torch.from_numpy(metadata_test[index][1][np.newaxis, :]).to(device)
    
    #target speaker embedding           
    emb_trg = torch.from_numpy(metadata_test[index][1][np.newaxis, :]).to(device)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

    if len_pad == 0:
        #content with target utterance
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    spect_vc.append( ('{}x{}'.format('U'+str(i), metadata_test[index][0]), uttr_trg) )
    if index % 2 ==1:
        i+=1
with open('results.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)          

In [4]:
wav_list

['./Testset/American_IMPERIALIST_Hong_Kong_Bill_American_IMPERIALIST_Hong_Kong_Bill_119_Hotpot.wav',
 './Testset/Lance_JPLE-007_167_Lance.wav',
 './Testset/CNNchina_CNNchina_22_Hotpot.wav',
 './Testset/p311_020.wav',
 './Testset/Lance_JPLE-007_165_Lance.wav',
 './Testset/p237_057.wav',
 './Testset/Lance_JPLE-007_166_Lance.wav',
 './Testset/p254_016.wav',
 './Testset/p245_010.wav',
 './Testset/p245_001.wav',
 './Testset/p286_064.wav']

## Original Test 

In [8]:
import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator
from pydub import AudioSegment

def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0])/base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

device = 'cuda:3'
G = Generator(32,256,512,16).eval().to(device)

#load model(pre-train and train)
g_checkpoint = torch.load('autovc-zhu.ckpt')
G.load_state_dict(g_checkpoint['model'])

metadata = pickle.load(open('metadata.pkl', "rb"))

spect_vc = []

for sbmt_i in metadata:
             
    x_org = sbmt_i[2]
    x_org, len_pad = pad_seq(x_org)

    #sourse content(1 x * x 80)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    #sourse speaker embedding(1 x 256)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
    
    for sbmt_j in metadata:

        #target speaker embedding           
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)
        
        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
            
        if len_pad == 0:
            #content with target utterance
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
        
        spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )
        
# metadata.pkl中包含了225、228、270和256这4个说话人，两两结合生成对应的说话人特征的spectrogram(eg. p225xp270(目标).wav)

with open('results.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)          