In [None]:
import time
from pathlib import Path
import glob
import os
import numpy as np
from tqdm import tqdm
import itertools
import csv
import matplotlib.pyplot as plt
from scipy.io import wavfile
import librosa
import IPython.display as ipd
from IPython.display import clear_output

In [None]:
def plot_spec(filepath):
    """Plot the waveform and the spectrogram of a WAV file, one above the other.

    Args:
        filepath: path of the WAV file; it is read with ``scipy.io.wavfile.read``.

    The upper panel shows the raw samples (every channel of the file), the
    lower panel the spectrogram computed with a 512-point FFT.
    """
    samplingFrequency, signalData = wavfile.read(filepath)

    # plt.specgram only accepts a 1-D signal, so a multi-channel file is
    # averaged down to one channel for the spectrogram panel.
    if signalData.ndim > 1:
        spectrogramData = signalData.mean(axis=1)
    else:
        spectrogramData = signalData

    # Plot the signal read from wav file
    plt.subplot(211)

    plt.title('Spectrogram of a wav file')

    plt.plot(signalData)
    plt.xlabel('Sample')
    plt.ylabel('Amplitude')

    plt.subplot(212)
    plt.specgram(spectrogramData, Fs=samplingFrequency, NFFT=512)
    plt.xlabel('Time')
    plt.ylabel('Frequency')

    plt.show()

In [None]:
# Every entry found under dataset/wavs. Each element is a path that already
# starts with 'dataset/wavs/'; glob does not guarantee any particular order.
folder = glob.glob('dataset/wavs/*')

Check the whole dataset

In [None]:
# `folder` comes from glob, so every element is already a 'dataset/wavs/<name>'
# path. File counts and wav listings are both derived from it so that `index`
# means the same folder everywhere.
nfiles = [len(os.listdir(x)) for x in folder]
start_id = 377
end_id = min(400, len(folder))  # never index past the last folder

for index in range(start_id, end_id):
    audio_in_folder = [str(Path(x)) for x in glob.glob(f'{folder[index]}/*.wav')]
    # 'count' is the number of files contained in all folders before this one.
    print(folder[index], 'id:', index, 'count:', sum(nfiles[:index]))
    for i, f in enumerate(audio_in_folder):
        path = str(Path(f))
        print(f"[{i + 1}/{len(audio_in_folder)}] {path}")

        audio = ipd.Audio(path, autoplay=True)
        ipd.display(audio)

        # Wait for the clip's duration plus 0.5 s before moving to the next file.
        duration = librosa.get_duration(filename=path)
        time.sleep(duration + 0.5)
    time.sleep(1.0)
    clear_output(wait=True)

Check each folder

In [None]:
# Play every wav of one folder, one clip after another.
audio_in_folder = glob.glob('dataset/wavs/734-M-41/*.wav')
total = len(audio_in_folder)
for position, wav_file in enumerate(audio_in_folder, start=1):
    path = str(Path(wav_file))
    print(f"[{position}/{total}] {path}", end=' ')

    ipd.display(ipd.Audio(path, autoplay=True))

    # Wait for the clip's duration plus 0.5 s before moving on.
    time.sleep(librosa.get_duration(filename=path) + 0.5)

Check each file

In [None]:
# Inspect one file: draw its waveform and spectrogram, then show an audio
# widget for it as the cell output.
path = "dataset/aug_wavs/272-M-26/speaker_272-10_augmented_1.wav"
plot_spec(path)
ipd.Audio(path, autoplay=True)

Check a folder and print each file path with its duration

In [None]:
# Print every wav of one folder followed by the duration librosa reports for it.
audio_in_folder = glob.glob('dataset/wavs/424-M-28/*.wav')
total = len(audio_in_folder)
for position, wav_file in enumerate(audio_in_folder, start=1):
    path = str(Path(wav_file))
    print(f"[{position}/{total}] {path}", end=' ')
    print(librosa.get_duration(filename=path), end='\n\n')

Perform VAD on a long file (> 10 s duration)

In [None]:
from utils import *

# Run the project's VAD helper on one recording and report how many segments
# it returned.
# NOTE(review): presumably write=True makes detect() save the segments to disk
# - confirm in utils.VAD.
path = "dataset/wavs/726-M-34/726-9.wav"
write = True
vad_engine = VAD(win_length=200)
segments = vad_engine.detect(path, write=write, show=False)
print(len(segments))

## Delete noise

Take the (n - thresh) shortest files of a folder and put them in `dump` for later removal, where n is the number of files in the folder and thresh is 35

## Plot graphs

In [None]:
# Root folder inspected by the plots below; `filenames` holds one path per
# entry found directly under it.
origin = "dataset/wavs_35/"
filenames = glob.glob(origin + "/*")

In [None]:
# Number of files per person
no_of_files = [len(os.listdir(speaker_dir)) for speaker_dir in filenames]

# Report the folders that contain no file at all.
for speaker_dir, file_count in zip(filenames, no_of_files):
    if file_count == 0:
        print(speaker_dir)

plt.figure(figsize=(24, 8))
plt.bar(filenames, no_of_files)
plt.ylabel('number of files')
plt.xticks([])
plt.tight_layout()

In [None]:
# mean count, folder with most files, folder with fewest files, total, max, min
(
    np.mean(no_of_files),
    filenames[np.argmax(no_of_files)],
    filenames[np.argmin(no_of_files)],
    sum(no_of_files),
    max(no_of_files),
    min(no_of_files),
)

In [None]:
# thoi gian audio cua moi nguoi
import wave
import librosa
import contextlib

def get_duration(fn):
    """Return the total playing time, in seconds, of the WAV files in folder ``fn``.

    Args:
        fn: path of a folder; only its direct entries are inspected.

    Returns:
        The sum of ``frames / frame rate`` over every entry whose name ends in
        ``.wav`` (case-insensitive); 0 when the folder holds no such entry.

    Raises:
        wave.Error: if a ``.wav`` entry cannot be parsed as a WAV file.
    """
    length = 0
    for audio in os.listdir(fn):
        # Skip anything that is not a wav (e.g. .DS_Store); wave.open would
        # otherwise raise on it and abort the whole folder.
        if not audio.lower().endswith('.wav'):
            continue
        # Wave_read objects are context managers, so the file is always closed.
        with wave.open(os.path.join(fn, audio), 'rb') as f:
            length += f.getnframes() / float(f.getframerate())
    return length
    

# Total audio length of every folder, in the same order as `filenames`.
durations = [get_duration(speaker_dir) for speaker_dir in tqdm(filenames)]

plt.figure(figsize=(24, 8))
plt.bar(filenames, durations)
plt.ylabel('durations')
plt.xticks([])
plt.tight_layout()
plt.show()

In [None]:
# mean, index of the longest folder, total, max, min of the per-folder durations
(
    np.mean(durations),
    np.argmax(durations),
    sum(durations),
    max(durations),
    min(durations),
)

In [None]:
# Average duration of one audio file, per person
import numpy as np
no_of_files = np.array(no_of_files)
durations = np.array(durations, dtype=float)
# Folders without files are possible (an earlier cell prints them). Leave their
# average at 0 instead of dividing by zero, which would put NaN/inf into every
# statistic computed from avg_durations.
avg_durations = np.divide(
    durations,
    no_of_files,
    out=np.zeros_like(durations),
    where=no_of_files > 0,
)
plt.figure(figsize=(24, 8))
plt.bar(filenames, avg_durations)
plt.ylabel('avg_durations')
plt.xticks([])
plt.tight_layout()
plt.show()

In [None]:
# mean average, index of the largest average, total duration, max and min average
(
    avg_durations.mean(),
    np.argmax(avg_durations),
    sum(durations),
    max(avg_durations),
    min(avg_durations),
)

## EQUALIZE DATASET

Check the imbalance of the dataset and print the details; return the folders that have fewer and more files than the threshold (35 files)

In [None]:
def _write_report_section(handle, title, names, counts, sizes):
    '''Write one report section: the title, then "<folder> - <file count> - <size in bytes>" per folder.'''
    handle.write(title)
    for name in names:
        handle.write(name + ' - ' + str(counts[name]) + ' - ' + str(sizes[name]) + '\n')


def check_inequality_data(root="dataset/wavs_35", thresh=35, report_path="data_inequality.txt"):
    '''Report how unevenly the files are spread over the folders of ``root``.

    Every sub-directory of ``root`` is measured twice: by the number of entries
    it contains and by the total size in bytes of those entries (the variables
    and messages call the latter "duration"). A folder counts as "greater" or
    "lower" than the mean when it is more than 10% away from it.

    Args:
        root: dataset folder holding one sub-directory per speaker.
        thresh: file count that splits the returned "add" and "elim" lists.
        report_path: text file that receives the per-category listing.

    Returns:
        A tuple ``(lower_than_thresh, greater_than_thresh, short_large)``:
        folder names with fewer than ``thresh`` files (list), with more than
        ``thresh`` files (list), and with a below-mean file count whose size is
        not below the mean (numpy array of names).

    Raises:
        ValueError: if ``root`` contains no sub-directory.
    '''
    root = Path(root)
    # Only directories are speaker folders; a stray file would break os.listdir.
    # Sorting makes the returned lists and the report reproducible.
    speaker_dirs = sorted(p for p in root.iterdir() if p.is_dir())
    if not speaker_dirs:
        raise ValueError(f"no speaker folders found under {root}")

    audio_folder_num = {d.name: len(os.listdir(d)) for d in speaker_dirs}
    counts = list(audio_folder_num.values())
    mean_num = np.mean(counts)
    print("Total files:", sum(counts))
    print("mean number of audio files each folder", mean_num)
    print("min and max of number of files:", min(counts), max(counts))
    # Name of the (first) folder with the fewest files.
    print(min(audio_folder_num, key=audio_folder_num.get))

    greater_than_mean = [k for k, v in audio_folder_num.items() if v > 1.1 * mean_num]
    lower_than_mean = [k for k, v in audio_folder_num.items() if v < 0.9 * mean_num]
    lower_than_thresh = [k for k, v in audio_folder_num.items() if v < thresh]
    greater_than_thresh = [k for k, v in audio_folder_num.items() if v > thresh]

    print('//===================================')
    # "Duration" of a folder is measured as the total size in bytes of its entries.
    audio_folder_duration = {
        d.name: sum(audio_file.stat().st_size for audio_file in d.iterdir())
        for d in speaker_dirs
    }
    sizes = list(audio_folder_duration.values())
    mean_duration = np.mean(sizes)
    bytes_per_mb = 1024 * 1024  # one divisor for every "MB" figure below
    print("Total size:", sum(sizes) / bytes_per_mb, "MB")
    print("mean duration of audio files each folder", mean_duration / bytes_per_mb, "MB")
    print("min and max of duration:", min(sizes) / bytes_per_mb, max(sizes) / bytes_per_mb, "MB")

    greater_than_mean_duration = [k for k, v in audio_folder_duration.items() if v > 1.1 * mean_duration]
    lower_than_mean_duration = [k for k, v in audio_folder_duration.items() if v < 0.9 * mean_duration]

    print('//===================================')

    def as_names(names):
        # Force a string dtype so an empty list does not become a float array
        # inside the numpy set operations.
        return np.asarray(names, dtype=str)

    long_large = np.intersect1d(as_names(greater_than_mean), as_names(greater_than_mean_duration))
    short_small = np.intersect1d(as_names(lower_than_mean), as_names(lower_than_mean_duration))
    # long_large / short_small are subsets of the count-based lists, so the
    # remaining names are a plain set difference.
    long_small = np.setdiff1d(as_names(greater_than_mean), long_large)
    short_large = np.setdiff1d(as_names(lower_than_mean), short_small)

    print("greater than mean in number of files:", len(greater_than_mean))
    print("lower than mean in number of files:", len(lower_than_mean))
    print("greater than mean in duration:", len(greater_than_mean_duration))
    print("lower than mean in duration:", len(lower_than_mean_duration))

    print('//===================================')

    print("long_large:", len(long_large))
    print("short_small:", len(short_small))
    print("long_small:", len(long_small))
    print("short_large:", len(short_large))

    sections = (
        ("long_large: \n", long_large),
        ("short_small: augment \n", short_small),
        ("long_small: \n", long_small),
        ("short_large: vad \n", short_large),
    )
    with open(report_path, 'w') as f:
        for title, names in sections:
            _write_report_section(f, title, names, audio_folder_num, audio_folder_duration)

        # For folders above the threshold, list "surplus / total" file counts.
        f.write("greater than thresh:  \n")
        for line in greater_than_thresh:
            f.write(line + ' - ' + str(audio_folder_num[line] - thresh) + ' / ' + str(audio_folder_num[line]) + '\n')

    return lower_than_thresh, greater_than_thresh, short_large
            
# add: folders with fewer files than the threshold; elim: folders with more;
# need_vad: the "short_large" folders returned as the third value.
add, elim, need_vad = check_inequality_data()

In [None]:
# Number of folders below the threshold and number of folders above it.
len(add), len(elim)

For folders with few files but a large total size, perform VAD on every audio file longer than 10 s in that folder

In [None]:
# vad for vad list
from utils import *
def vad_on_vad_list(vad_list):
    """Run VAD on every long, not yet processed wav of the given folders.

    Args:
        vad_list: folder names, relative to ``dataset/wavs_35``.

    Files whose path contains 'augment' or 'vad' are skipped; of the rest,
    only those lasting at least 10 seconds are handed to the VAD engine.
    """
    root = 'dataset/wavs_35'
    speaker_dirs = [str(Path(root, name)) for name in vad_list]

    vad_engine = VAD(frame_duration=30, win_length=200)

    # Gather the wav files of all requested folders.
    data_paths = []
    for speaker_dir in tqdm(speaker_dirs):
        data_paths += glob.glob(f'{speaker_dir}/*.wav')

    # Keep only the paths that mention neither 'augment' nor 'vad'.
    raw_paths = [
        candidate for candidate in data_paths
        if 'augment' not in str(candidate) and 'vad' not in str(candidate)
    ]

    for audio_path in tqdm(raw_paths, desc="Vad performing"):
        if librosa.get_duration(filename=audio_path) >= 10.0:
            vad_engine.detect(audio_path, write=True, show=False)

    print("Done!")

In [None]:
# Run VAD on the "short_large" folders returned by check_inequality_data.
vad_on_vad_list(need_vad)

For folders with fewer files than the threshold, copy randomly chosen augmented files from the aug_wavs dataset

Add augmented files to the short, small folders

In [None]:
import random
import shutil
from tqdm import tqdm

def add_augment_file(add, thresh=26, aug_root='dataset/aug_wavs', spk_root='dataset/wavs_35'):
    """Copy randomly chosen augmented wavs into folders that have too few files.

    Args:
        add: folder names present under both ``aug_root`` and ``spk_root``.
        thresh: number of files each folder should reach.
        aug_root: folder holding the augmented wavs, one sub-folder per name.
        spk_root: dataset folder that receives the copies.

    For every name, ``thresh`` minus the current number of entries is the
    number of files to add. Folders already at or above ``thresh`` are left
    alone. When fewer augmented files are available than needed, all of them
    are copied and a message is printed.
    """
    for f in tqdm(add):
        spk_dir = Path(spk_root, f)
        existing = set(os.listdir(spk_dir))
        gap = thresh - len(existing)
        if gap <= 0:
            # Nothing to add; random.sample would raise on a negative size.
            continue

        # Ignore augmented files whose name is already in the target folder:
        # copying them again would overwrite a file instead of filling the gap.
        # Sorting keeps the draw reproducible when the random seed is fixed.
        aug_files = sorted(
            p for p in glob.glob(f"{Path(aug_root, f)}/*.wav")
            if Path(p).name not in existing
        )
        if len(aug_files) < gap:
            print(f"{f}: only {len(aug_files)} augmented files available, {gap} needed")

        # Never request more than exist; random.sample raises ValueError then.
        for src in random.sample(aug_files, min(gap, len(aug_files))):
            shutil.copy(src, spk_dir / Path(src).name)
    print('DONE')

# Copy augmented files into every folder listed in `add`, with 35 as the target count.
add_augment_file(add, thresh=35)

Trim over-long folders (delete their shortest surplus files)

In [None]:
def delete_redundance(elim, thresh=35, root="dataset/wavs_35"):
    """Delete the shortest wavs of each folder until only ``thresh`` remain.

    Args:
        elim: folder names, relative to ``root``.
        thresh: number of wav files to keep in each folder.
        root: dataset folder containing the folders named in ``elim``.

    Folders holding ``thresh`` wav files or fewer are left untouched.
    """
    for speaker in tqdm(elim):
        audio_in_folder = [str(Path(p)) for p in glob.glob(f"{root}/{speaker}/*.wav")]
        gap = len(audio_in_folder) - thresh
        if gap <= 0:
            # A negative gap used as a slice bound ([:gap]) would select - and
            # delete - almost every file of a folder that is already small.
            continue

        # Shortest clips first; sorted() is stable, so ties keep glob order.
        by_duration = sorted(
            audio_in_folder,
            key=lambda path: librosa.get_duration(filename=path),
        )
        for path in by_duration[:gap]:
            if os.path.exists(path):
                os.remove(path)

In [None]:
# Remove the surplus files from every folder listed in `elim`.
delete_redundance(elim)