In [5]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet
import matplotlib.pyplot as plt

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # original saved file with DataParallel
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    # create new OrderedDict that does not contain `module.`
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = pd.DataFrame()
    test_DB = pd.DataFrame()
    
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]
    
    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def get_embeddings(use_cuda, filename, model, test_frames):
    total = []
    input, label = read_MFB(filename) # input size:(n_frames, n_dims)

    print("len(input) : ", len(input))
    
    tot_segments = math.ceil(len(input)/test_frames)# total number of segments with 'test_frames' 
    print("tot_segments : ", tot_segments)
    
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i*test_frames:i*test_frames+test_frames]
            
            TT = ToTensorTestInput()
            temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
    
            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation,_ = model(temp_input)
            total.append(list(np.array(temp_activation)[0]))
            activation += torch.sum(temp_activation, dim=0, keepdim=True)
    
    activation = l2_norm(activation, 1)
                
    return total

def l2_norm(input, alpha):
    input_size = input.size()  # size:(n_frames, dim)
    buffer = torch.pow(input, 2)  # 2 denotes a squared operation. size:(n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size:(n_frames)
    norm = torch.sqrt(normp)  # size:(n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf
    output = output * alpha
    return output

def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Output the averaged d-vector for each speaker (enrollment)
    Return the dictionary (length of n_spk)
    """
    n_files = len(DB) # 10
    enroll_speaker_list = sorted(set(DB['speaker_id']))
    
    embeddings = {}
    
    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")
    
    for i in range(n_files):
        filename = DB['filename'][i]
        spk = DB['speaker_id'][i]
        
        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation
            
        print("Aggregates the activation (spk : %s)" % (spk))
        
    if not os.path.exists(embedding_dir):
        os.makedirs(embedding_dir)
        
    # Save the embeddings
    for spk_index in enroll_speaker_list:
        embedding_path = os.path.join(embedding_dir, spk_index+'.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings
    
def main():
        
    # Settings
    use_cuda = False
    log_dir = 'model_saved'
    embedding_size = 128
    cp_num = 24 # Which checkpoint to use?
    n_classes = 251
    test_frames = 200
    
    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    
    # Get the dataframe for enroll DB
    enroll_DB, test_DB = split_enroll_and_test('filterbank/test')
    
    # Where to save embeddings
    embedding_dir = 'enroll_embeddings'
    
    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
    
    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', 
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """ 

if __name__ == '__main__':
    main()

ModuleNotFoundError: ignored

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
from sklearn.cluster import KMeans
from pydub import AudioSegment

ModuleNotFoundError: ignored

In [1]:
def extract_section(file_path, wav_file_path):
    # Settings
    use_cuda = False
    log_dir = 'model_saved'
    embedding_size = 128
    cp_num = 24 # Which checkpoint to use?
    n_classes = 251
    test_frames = 100 #1초당 feature

    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    test_embedding2 = get_embeddings(use_cuda, file_path, model, test_frames)

    X = test_embedding2
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)
    y_kmeans = kmeans.predict(X)
    print(y_kmeans)
    
    '''plt.scatter(np.array(X)[:, 0], np.array(X)[:, 1], c=y_kmeans, s=50, cmap='viridis')
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)'''
    
    section = [0]
    cnt = 0
    for i in range(0, len(y_kmeans)-1):
        if (y_kmeans[i] != y_kmeans[i+1]) && cnt>1:
            section.append((i+1)*1000)
        else:
          cnt++
            
    #export sound
    sound = AudioSegment.from_wav(wav_file_path)
    
    for i in range(1, len(section)):
        sound_cut = sound[section[i-1]:section[i]]
        print(section[i-1], section[i])
        sound_cut.export("output/voice"+str(i-1)+".wav", format="wav")

    sound_cut = sound[section[len(section)-1]:]
    sound_cut.export("output/voice"+str(len(section)-1)+".wav", format="wav")

SyntaxError: ignored

In [0]:
extract_section('test_data_pickle/rand_overlay136-rm_silence.p', 'rand_overlay_136-rm_silence.wav')

=> loading checkpoint
len(input) :  1484
tot_segments :  15
[2 2 2 2 2 2 0 0 0 1 1 1 1 1 1]
0 6000
6000 9000


In [0]:
sec = extract_section('test_data_pickle/rand_overlay136-rm_silence.p', 'rand_overlay_136-rm_silence.wav')
sound = AudioSegment.from_wav('rand_overlay_136-rm_silence.wav')

for i in range(1, len(sec)):
    sound_cut = sound[sec[i-1]:sec[i]]
    print(sec[i-1], sec[i])
    sound_cut.export("output/voice"+str(i-1)+".wav", format="wav")

sound_cut = sound[sec[len(sec)-1]:]
sound_cut.export("output/voice"+str(len(sec)-1)+".wav", format="wav")

0 6000
6000 9000


<_io.BufferedRandom name='output/voice2.wav'>