In [None]:
import os
import sys
from os.path import join as opj 

## Configure TRIAAN-VC

In [None]:
os.getcwd()
ORIGINAL_MODEL_PATH = opj(os.getcwd(), 'models', 'triaanvc')


## Tested with Pretrained model

Parameters List
- `--config` : direct of configuration file 
- `--device` : device to run the conversion on (default: `cuda:0`)
- `--sample_path`: path to the sample data
- `--src_name`: name of the source data 
- `--trg_name`: name of the target data
- `--checkpoints`: path to the checkpoints directory
- `--model_name`: name of the model, injecting with the custom pre-trained path
- `--seed`: seed

**Path**

In [None]:
# configure path to the pretrained model
PRETRAINED_TRIAAN = os.path.join(os.getcwd(), 'pre-trained','triaan-vc')
### others pretrained models: CPC, Vocoder -------------------
PRETRAINED_CPC = os.path.join(os.getcwd(), 'pre-trained','cpc')
PRETRAINED_Vocoder = os.path.join(os.getcwd(), 'pre-trained','vocoder')

print(PRETRAINED_TRIAAN)
print(PRETRAINED_CPC)
print(PRETRAINED_Vocoder)

**Folder configurations**

In [None]:
# path to model configuration
convert_file = os.path.join(ORIGINAL_TRIAAN, 'convert.py')
original_config = os.path.join(ORIGINAL_TRIAAN, 'config', 'base.yaml')
triaan_model = os.path.join(PRETRAINED_TRIAAN, 'model-cpc-split.pth') 

device = 'cpu'

data_source = './data'
checkpoint_dir = './checkpoints'

print(convert_file)
print(original_config)
print(triaan_model)

**File source**

In [None]:
# source and target audio files
source_name = 'MONTREAL_6_happiness_m.wav'
target_name = 'ken8hapvoc.wav'

# source_name = 'film_female_1.mp3'
# target_name = 'film_male_1.mp3'


In [None]:
# convert the original model to the pre-trained model
#os.system(f'python {convert_file} --device {device} --config {original_config} --sample_path {data_source} --src_name {source_name} --trg_name {target_name} --checkpoint {checkpoint_dir}')

In [None]:
#!python  --device cpu --sample_path ./data --src_name MONTREAL_6_happiness_m.wav --trg_name ken8hapvoc.wav  --checkpoint ./checkpoints

## Train Original model and fine-tune with custom data for specific task

https://github.com/winddori2002/TriAAN-VC

## #TODO

- In preporcessing phase (`audio.py`)file, normalise the pitch and loudness of the audio files 
- Recreate the preprocess configuration file (with customise data - robotic data)
- Dataset that we need to use: `WILLOW, BEST (Possible EMOGIB)`
- Train the model with the new configuration
- Save the new model
- Evaluate the model ~ WER, CER, SV Average
  

**Original dataset:** VCTK - 110 speakers with 400 utterances per speaker.

***Custom dataset:*** Robotic data

**Expected  Output**: The model can be use to convert the human speaking voice into robotic voice and it also encapsulating all the pause, and different in timbre. Source input will be human voice, target input will be robotic voice and the output is the conversion

 TODO: Rewrite all the functions to match with custom data. We only keep the model architecture, but the data processing, training, and evaluation will be re-written

In [None]:
# import pyyaml
from modified_audio_process import * #import all the functions changes for preprocessing audio
from models.triaanvc.src.utils import Config

#import preprocess, convert, train and test functions
# from models.triaanvc.preprocess_cpc import main as model_preprocess_cpc
# from models.triaanvc.preprocess import main as model_preprocess
# from models.triaanvc.convert import main as model_convert
# from models.triaanvc.train import main as model_train
# from models.triaanvc.test import main as model_test


### 1. Modified Model

In [None]:
custom_cfg = Config('./custom-config/preprocess.yaml')

In [None]:
def get_triaan_preprocess():
    # model_preprocess
    pass

# Function to execute the train or test in the TRIAAN-VC model, with specify parameters:
# action # train / test
# --config # path to config file
# --num_worker # number of workers
# --seed # seed number
# --device # cuda device - cpu or device
# --logging # logging option
# --resume # resume option
# --checkpoint # results save path
# --model_name # best model name
# --n_uttr # number of target utterances (default 1)
def get_triaan_execute():
    # using model_train or model_test
    pass



In [None]:
# Function to execute the convert.py file in the TRIAAN-VC model, with specify parameters
def get_triaan_convert():
    # using model_convert
    pass

## Modified Input
- reduce amplitude of target (robot) to match amplitude values of source (human)
- lower pitch of robot to human range (find out human range)
- remove the breathing/silence parts from source
- adding small silence at the start and at the end of the robot audio file
or combine 2 robot audios (from same source/similar sounding) with a silence gap in between them (maybe trying with samples from R2 or Wall-E since it's easier to find samples)
- play with combinations of the things above

In [1]:
import os
import sys
from os.path import join as opj 
import librosa
import numpy as np
import soundfile as sf
import resampy

In [2]:

# A multiple processes function
def input_process(source_path, target_path, processed_path, sr=16000, top_db=60):
    # function that fully process the input audio files from source and target 

    src_wav, src_fs  = sf.read(source_path)
    tgt_wav, tgt_fs   = sf.read(target_path)

    # trim slience
    src_wav, _   = librosa.effects.trim(y=wav, top_db=top_db)
    tgt_wav, _   = librosa.effects.trim(y=wav, top_db=top_db)

    # resample
    if src_fs != sr:
        src_wav = resampy.resample(x=src_wav, sr_orig=fs, sr_new=sr, axis=0)
        fs  = sr
    if tgt_fs != sr:
        tgt_wav = resampy.resample(x=tgt_wav, sr_orig=fs, sr_new=sr, axis=0)
        fs  = sr

    peak_src_max = np.abs(src_wav).max()
    peak_src_min = np.abs(src_wav).min()

    peak_tgt_max = np.abs(tgt_wav).max()
    peak_tgt_min = np.abs(tgt_wav).min()

    # normalise amplitude of tgt_wav to match src_wav
    if peak_tgt_max > peak_src_max:
        tgt_wav /= peak_src_max
    elif peak_tgt_min < peak_src_min:
        tgt_wav *= peak_src_min / peak_tgt_min


    # lower pitch of the target audio (robot) to human pitch range
    tgt_wav = librosa.effects.pitch_shift(tgt_wav, sr, n_steps=-2)

    # save the processed audio files
    src_name = source_path.split('/')[-1].split('.')[0]
    src_name = target_path.split('/')[-1].split('.')[0]
    sf.write(opj(processed_path, f's_processed_{src_name}.wav'), src_wav, sr)
    sf.write(opj(processed_path, f't_processed_{tgt_name}.wav'), tgt_wav, sr)

    return src_wav, tgt_wav

## Seperate Processed Function

### I/O Functions

In [3]:
def resample_audio(audio, fs, sr=16000):
    if fs != sr:
        audio = resampy.resample(x=audio, sr_orig=fs, sr_new=sr, axis=0)
        fs  = sr
    return audio, fs

def trim_silence(audio, top_db=60):
    audio, _   = librosa.effects.trim(y=audio, top_db=top_db)
    return audio

In [34]:
def output_processed_audio(output_path, audio_path, processed_audio, fs, tag='fullprocessed'):
    name = audio_path.split('\\')[-1].split('.')[0]
    sf.write(opj(output_path, f'{tag}_{name}.wav'), processed_audio, fs)


### Acoustic Modification

In [42]:
def concat_audio_files(audio1_path, audio2_path, processed_path):
    """
    Concatenate two audio files without silence in between 
    and save the concatenated audio file in processed_path
    """
    # function that concatenate two audio files
    # and save the concatenated audio file in processed_path
    audio1, fs1 = sf.read(audio1_path)
    audio2, fs2 = sf.read(audio2_path)

    # resample both audio files to 16kHz
    audio1, fs1 = resample_audio(audio1, fs1)
    audio2, fs2 = resample_audio(audio2, fs2)

    # concatenate two audio files
    concatenated_audio = np.concatenate((audio1, audio2), axis=0)
    

    audio1_name = audio1_path.split('\\')[-1].split('.')[0]
    audio2_name = audio2_path.split('\\')[-1].split('.')[0]
    
    sf.write(opj(processed_path, f'concatenated_{audio1_name}_{audio2_name}.wav'), concatenated_audio, fs1)

    return concatenated_audio

In [36]:
def matching_amplitude(audio1_path, audio2_path, processed_path):
    # function that match amplitude of audio2 to audio1
    audio1, fs1 = sf.read(audio1_path) #human reading
    audio2, fs2 = sf.read(audio2_path) #robot voice
    # resample
    audio1, fs1 = resample_audio(audio1, fs1)
    audio2, fs2 = resample_audio(audio2, fs2)

    # trim silence
    audio1 = trim_silence(audio1)
    audio2 = trim_silence(audio2)

    
    # normalise audio2 amplitude
    audio2 = audio2 / np.abs(audio2).max() * np.abs(audio1).max()

    # peak_audio1_max = np.abs(audio1).max()
    # peak_audio1_min = np.abs(audio1).min()

    # peak_audio2_max = np.abs(audio2).max()
    # peak_audio2_min = np.abs(audio2).min()

    # # normalise amplitude of audio2 to match audio1
    # if peak_audio2_max > peak_audio1_max:
    #     audio2 /= peak_audio1_max
    # elif peak_audio2_min < peak_audio1_min:
    #     audio2 *= peak_audio1_min / peak_audio2_min
    # # elif peak_audio1_min < peak_audio2_min:
    #     audio2 *= peak_audio2_min / peak_audio1_min
    # outputing
    output_processed_audio(processed_path, audio1_path, audio1, fs1, tag='normamplitude1')
    output_processed_audio(processed_path, audio2_path, audio2, fs2, tag='normamplitude2')

    return audio1, audio2 #audio 2 is normalised to amplitude of audio 1

def pitch_shift(audio_path, processed_path, sr=16000, n_steps=-2):
    """
    Shift pitch of an audio by n_steps
    """
    audio, fs = sf.read(audio_path)
    audio, fs = resample_audio(audio, fs, sr) # resample before pitch shift

    audio = librosa.effects.pitch_shift(audio, fs, n_steps=n_steps)
    output_processed_audio(processed_path, audio_path, audio, fs=fs, tag='pitchshift')
    return audio

def adjust_pitch(audio_path, pitch_range_start=90, pitch_range_end=155):
    """
    Adjust pitch of an audio to a given range
    by default, range is set to 
    (90 - 155) - adult male voice range or 
    (165 - 255) - adult female voice range
    """
    audio, fs = sf.read(audio_path)

    #resample before adjust pitch
    audio, fs = resample_audio(audio, fs)

    #calculate highest pitch
    pitches, magnitudes = librosa.piptrack(y=audio, sr=fs)
    max_pitch = pitches[np.argmax(magnitudes)]

    # desire pitch from range 
    desired_pitch = np.random.uniform(pitch_range_start, pitch_range_end)
    # n_steps = librosa.hz_to_octs(desired_pitch) - librosa.hz_to_octs(max_pitch)
    n_steps = np.abs(librosa.hz_to_octs(desired_pitch) - librosa.hz_to_octs(max_pitch))
    
    normpitch_audio = librosa.effects.pitch_shift(audio, fs, n_steps)
    output_processed_audio(processed_path, audio_path, normpitch_audio, fs=fs, tag='adjustpitch')

    return normpitch_audio

def add_silence(audio_path, processed_path, duration=0.5):
    """
    Add silence to the beginning and end of the audio
    """
    audio, fs = sf.read(audio_path)
    audio, fs = resample_audio(audio, fs) # resample before adding silence, expected 16kHz

    silence = np.zeros(int(duration * fs))
    audio = np.concatenate((silence, audio, silence), axis=0)

    output_processed_audio(processed_path, audio_path, audio, fs=fs, tag='addsilence')
    
    return audio

## Testing and Observation

In [37]:
#FIX PATH
input_path = '.\data\Phuoc\\testing_conversion\input'
output_path = '.\data\Phuoc\\testing_conversion\output'

processed_path = '.\data\Phuoc\\testing_conversion\processed'

In [38]:
human1_path = opj(input_path, 'manreading1.wav') #human voice
robot1_path = opj(input_path, 'best1137surprise.wav') #robot-voice 1
robot2_path = opj(input_path, 'best1138surprise.wav') #same robot-voice 2

**Case 1** | Trim silence from both audios and matching the amplitude of target audio to source audio

In [39]:

#matching amplitude of robot1 to human1
human1, robot1 = matching_amplitude(human1_path, robot1_path, processed_path)

**Case 2**
| Lower pitch of robot to human range (target audio into human pitch range)

NOTES:
- Male voice pitch range: 90 - 155Hz
- Female voice pitch range: 165 - 255Hz

In [15]:
robot1 = adjust_pitch(robot1_path)

IndexError: index 22856 is out of bounds for axis 0 with size 1025

**Case 3** | Adding small silence at the start and at the end of the robot audio file

In [40]:
robot1 = add_silence(robot1_path, processed_path)

**Case 4** | Concatenating 2 robot audios with a silence gap in between them

In [43]:
concat_audio_files(robot1_path, robot2_path, processed_path)

array([-6.35169214e-06, -8.04107549e-06,  1.46622250e-05, ...,
        8.36818821e-06,  1.36421141e-05, -3.64212635e-06])