In [1]:
import streamlit as st
from stqdm import stqdm

In [2]:
from speaker_embedding import preprocess_wav, VoiceEncoder
from demo_utils import *
from itertools import groupby
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
import argparse, sys, os, json, codecs

In [3]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/numerical warnings raised by the
# libraries used in later cells.
warnings.filterwarnings('ignore')

In [4]:
import matplotlib.pyplot as plt
import matplotlib
# Registered colormaps as returned by pyplot; not referenced again in the
# visible cells.
cmaps = plt.colormaps()
# Use the 'AppleGothic' font family for matplotlib text.
plt.rcParams["font.family"] = 'AppleGothic'
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

from umap import UMAP
import plotly.express as px

In [5]:
import torch
from torch import nn
from skorch import NeuralNetClassifier

## Making Segment Vectors

In [18]:
def get_embeddings(path_sample):
    """Preprocess and embed every ``*.wav`` file found directly in a folder.

    Progress is reported with ``stqdm`` bars and ``st.success`` messages.

    :param path_sample: directory searched (non-recursively) for ``*.wav`` files
    :return: tuple ``(voice_type, wavs, utterance_embeds, voice_type_wavs,
        voice_type_embeds)``:
        ``voice_type`` - parent-directory stem of each file, used as its label;
        ``wavs`` - array of the ``preprocess_wav`` results;
        ``utterance_embeds`` - one ``encoder.embed_utterance`` result per wav;
        ``voice_type_wavs`` - dict mapping a label to the wavs carrying it;
        ``voice_type_embeds`` - one ``encoder.embed_speaker`` result per label
    """
    wav_fpaths = list(Path(path_sample).glob("*.wav"))
    # A file's label is the name of the directory that contains it.
    voice_type = [fpath.parent.stem for fpath in wav_fpaths]

    preprocess_bar = stqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), position=0)
    wavs = np.array([preprocess_wav(fpath) for fpath in preprocess_bar])
    st.success('Finished preprocessing')

    encoder = VoiceEncoder()
    embed_utterance = encoder.embed_utterance
    embed_bar = stqdm(wavs, "compute emb vec.", len(wavs))
    utterance_embeds = np.array([embed_utterance(wav) for wav in embed_bar])
    st.success('Finished embedding compute')

    def label_of(index):
        return voice_type[index]

    # itertools.groupby only merges *consecutive* equal labels; a label that
    # reappears later overwrites the earlier entry in the dict.
    voice_type_wavs = {}
    for speaker, indices in stqdm(groupby(range(len(wavs)), label_of)):
        voice_type_wavs[speaker] = wavs[list(indices)]

    per_type_embeds = []
    for group in stqdm(voice_type_wavs.values()):
        per_type_embeds.append(encoder.embed_speaker(group[:len(group)]))
    voice_type_embeds = np.array(per_type_embeds)
    st.success('Finished voice-type embeddings')

    return voice_type, wavs, utterance_embeds, voice_type_wavs, voice_type_embeds

In [19]:
# Root directory of the pickled wavs/embeddings read by the following cells.
# Set the SR_IPTV_EMBEDS_PATH environment variable to point the notebook at a
# different location; the original directory remains the default.
embeds_path = os.environ.get(
    "SR_IPTV_EMBEDS_PATH", "/home/kwangje/Desktop/sr-iptv-proto/embeds")

In [20]:
# Read the three pickled wav collections (child, adult female, adult male)
# stored under <embeds_path>/wavs, in that order.
_wavs_loaded = []
for _suffix in ('/wavs/wavs_child123_v2_1.pkl',
                '/wavs/wavs_adult_female_v2_1.pkl',
                '/wavs/wavs_adult_male_v2_1.pkl'):
    with open(embeds_path+_suffix, 'rb') as f:
        _wavs_loaded.append(pickle.load(f))

wavs_child_123_real, wavs_adult_female_real, wavs_adult_male_real = _wavs_loaded

In [22]:
# Voice-type embeddings for each group, stored under <embeds_path>/real.
# Each variable must be read from the file named after the same group: crossing
# the adult female/male files would silently swap the two labels downstream.
with open(embeds_path+'/real/voice_type_embeds_child123_v2_1.pkl','rb') as f:
    embeds_child_123_real = pickle.load(f)

with open(embeds_path+'/real/voice_type_embeds_adult_female_v2_1.pkl','rb') as f:
    embeds_adult_female_real = pickle.load(f)

with open(embeds_path+'/real/voice_type_embeds_adult_male_v2_1.pkl','rb') as f:
    embeds_adult_male_real = pickle.load(f)

In [23]:
# NOTE(review): the same import already appears in an earlier cell.
from speaker_embedding import preprocess_wav, VoiceEncoder

# Instantiate the encoder with its default constructor arguments.
encoder = VoiceEncoder()
# NOTE(review): this only references the bound method, so the notebook shows
# its repr (see the cell output below); the method is not called and no state
# dict is loaded here.
encoder.load_state_dict

Loaded the voice encoder model on cpu in 0.01 seconds.


<bound method Module.load_state_dict of VoiceEncoder(
  (lstm): LSTM(40, 256, num_layers=3, batch_first=True)
  (linear): Linear(in_features=256, out_features=256, bias=True)
  (relu): ReLU()
)>

In [24]:
# Read the pickled utterance embeddings (child, female, male) stored under
# <embeds_path>/utter, in that order.
_utter_loaded = []
for _suffix in ('/utter/utterance_embeds_child123.pkl',
                '/utter/utterance_embeds_female.pkl',
                '/utter/utterance_embeds_male.pkl'):
    with open(embeds_path+_suffix, 'rb') as f:
        _utter_loaded.append(pickle.load(f))

utterance_embeds_child123, utterance_embeds_female, utterance_embeds_male = _utter_loaded

### visualization for age groups

In [25]:
import plotly.graph_objects as go
import plotly.express as px

In [26]:
def hist(speaker_wavs, spk_embeds_total, test_embed):
    """Draw a bar chart of a test embedding's similarity to each speaker.

    The chart is rendered in the Streamlit app through ``st.plotly_chart``.

    :param speaker_wavs: mapping whose keys are used as the bar labels
    :param spk_embeds_total: speaker embeddings compared against ``test_embed``
        with ``np.inner`` (assumed to be one row per key of ``speaker_wavs``)
    :param test_embed: embedding of the sample under test
    :return: ``(stats, result)`` where ``stats`` maps each label to its
        similarity and ``result`` is the label with the highest similarity
    """
    similarity = np.inner(spk_embeds_total, test_embed)
    speaker_names = list(speaker_wavs.keys())

    stats = {name: score for name, score in zip(speaker_names, similarity)}
    fig = go.Figure(data=[go.Bar(x=speaker_names, y=similarity)])

    # Label whose similarity to the test embedding is the largest.
    result = max(stats, key=lambda name: stats[name])

    fig.update_layout(margin={"l": 5, "r": 5, "b": 5, "t": 5})
    st.plotly_chart(fig)
    return stats, result

In [27]:
from umap import UMAP
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import plot

In [21]:
# Sizes noted for the utterance-embedding arrays:
#   utterance_embeds_female     61362
#   utterance_embeds_male       59214
#   utterance_embeds_child123   50849
#   total                       171425

# Stack the three groups along axis 0 in the order female -> male -> child.
utterance_embeds_total = np.concatenate(
    (utterance_embeds_female, utterance_embeds_male, utterance_embeds_child123),
    axis=0,
)

In [56]:
# UMAP is stochastic: pinning random_state makes the 3-D projection repeatable
# across kernel restarts (umap-learn runs single-threaded when a seed is set,
# so the fit may be slower).
reducer = UMAP(n_neighbors=30, min_dist=0.2, n_components=3, random_state=42)
proj_3d = reducer.fit_transform(utterance_embeds_total)

In [57]:
# Wrap the UMAP projection in a DataFrame (default integer column labels).
df = pd.DataFrame(proj_3d)

In [58]:
# Label every projected row with its voice group. The rows follow the stacking
# order used for utterance_embeds_total (female -> male -> child), so the
# label runs are derived from the source array lengths instead of hard-coded
# row numbers, which would silently mislabel rows if the data changed.
_group_sizes = {
    "female": len(utterance_embeds_female),
    "male": len(utterance_embeds_male),
    "child": len(utterance_embeds_child123),
}
assert sum(_group_sizes.values()) == len(df), \
    "group sizes do not add up to the number of projected rows"
df["voice_type"] = np.repeat(list(_group_sizes.keys()), list(_group_sizes.values()))

In [60]:
# Sanity check: number of rows carrying each voice-type label.
df["voice_type"].value_counts()

female    61362
male      59214
child     50849
Name: voice_type, dtype: int64

In [63]:
# Interactive 3-D scatter of the projection, one colour per voice type,
# exported as a standalone HTML file.
fig_3d = px.scatter_3d(
    proj_3d,
    x=0,
    y=1,
    z=2,
    color=df["voice_type"],
    labels=dict(color='voice_type'),
    opacity=0.7,
)

# Tiny markers and no margins.
fig_3d.update_traces(marker_size=1)
fig_3d.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig_3d.write_html("clf_dvector.html")

### calculate the similarity between the groups

In [28]:
def cal_similarity(self, embeds_child, embeds_adult_female, embeds_adult_male, test,
                   threshold=0.5):
    """Score a test embedding against the three voice-group embeddings.

    The similarity to each group is the inner product of the group embedding
    and ``test``. A fourth pseudo-score, "unknown", is 1 when *every* group
    similarity is below ``threshold`` and 0 otherwise, so "unknown" is
    selected only if no group matches well enough.

    :param self: unused; kept so the function can be bound as a method
    :param embeds_child: reference embedding of the child group
    :param embeds_adult_female: reference embedding of the adult-female group
    :param embeds_adult_male: reference embedding of the adult-male group
    :param test: embedding of the utterance to classify
    :param threshold: minimum similarity for a group to count as a match
        (defaults to 0.5)
    :return: ``(stats, result, similarities)`` - ``stats`` maps each label to
        its score, ``result`` is the label with the highest score and
        ``similarities`` lists the scores in label order
    """
    labels = ["child", "adult_female", "adult_male", "unknown"]

    sim_child = np.inner(embeds_child, test)
    sim_adult_fm = np.inner(embeds_adult_female, test)
    sim_adult_m = np.inner(embeds_adult_male, test)

    # `(a and b and c) < t` would compare only one operand, because `and`
    # returns one of its operands; take the best group score explicitly.
    # np.max reduces both NumPy scalars and one-element arrays to one value.
    best_known = max(
        float(np.max(sim)) for sim in (sim_child, sim_adult_fm, sim_adult_m))
    sim_unk = 1 if best_known < threshold else 0

    similarities = [sim_child, sim_adult_fm, sim_adult_m, sim_unk]

    stats = dict(zip(labels, similarities))
    result = max(stats, key=stats.get)

    return stats, result, similarities

In [29]:
def age_clf_single(self, path_audio_file):
    """Predict the voice type of a single audio file.

    The file is preprocessed, embedded with ``self.encoder`` and compared with
    the child / adult-female / adult-male embeddings stored on ``self`` via
    ``self.cal_similarity``.

    :param path_audio_file (str): Path to the audio file to predict
    :return answer (str): voice-type predicted by the model
    """
    wav = preprocess_wav(Path(path_audio_file))
    utterance_embed = self.encoder.embed_utterance(wav)

    # Only the winning label is needed; the scores are discarded.
    _scores, answer, _similarities = self.cal_similarity(
        self.embeds_child_123,
        self.embeds_adult_female,
        self.embeds_adult_male,
        utterance_embed,
    )
    return answer

In [None]:
# Run the classifier on the configured input folder and keep its result.
# NOTE(review): `args` (presumably parsed command-line arguments) and `age_clf`
# are not defined in the visible cells; they must exist before this cell runs.
audio_folder = args.dst
output_folder = args.output
test_result = age_clf(audio_folder, output_folder)