In [2]:
# !unzip dataset/dataset.zip -d dataset
# !pip install -r requirements.txt

In [4]:
# !pip install pandas

In [5]:
# !python dataprep.py --convert

In [1]:
import csv
import itertools
from pathlib import Path
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile
from tqdm import tqdm
import glob
%matplotlib inline

In [2]:
def generate_checklist(raw_path):
    """
    Generate the pair checklist for zalo data.

    Every immediate sub-directory of ``raw_path`` is treated as one class.
    For each class, all ``*.wav`` files whose path does not contain
    ``'augment'`` are collected and every unordered pair of them is emitted.

    Args:
        raw_path: dataset root, as a ``str`` or any path-like object.

    Returns:
        dict mapping a class label (the part of the directory name before
        the first ``'-'``) to a list of 2-tuples of file paths. The paths are
        strings with the ``raw_path`` text removed. Class directories without
        any non-augmented wav file are left out; a class with a single file
        maps to an empty list.
    """
    raw_prefix = str(raw_path)
    checkdict = {}
    # Sorted traversal keeps the result reproducible: iterdir()/glob() order
    # is filesystem dependent.
    for classpath in sorted(d for d in Path(raw_path).iterdir() if d.is_dir()):
        originals = sorted(
            str(p) for p in classpath.glob('*.wav') if 'augment' not in str(p))
        if not originals:
            # Nothing to pair up; indexing the first file here used to raise
            # IndexError.
            continue

        # Use the full directory name: ``.stem`` would cut the name at its
        # last dot.
        label = classpath.name.split('-')[0]

        checklist = [p.replace(raw_prefix, '') for p in originals]
        checkdict[label] = list(itertools.combinations(checklist, 2))

    return checkdict


def convert_to_csv(checkdict, save_root):
    """
    Write all pairs of ``checkdict`` to ``checklist.csv`` inside ``save_root``.

    The file starts with the header row ``ref,com``; after that, one row is
    written per pair, holding the first and second element of the pair. The
    dictionary keys are not written. Returns ``None``.
    """
    header = ['ref', 'com']
    target = Path(save_root, 'checklist.csv')
    with open(target, 'w', newline='') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(header)
        for pairs in checkdict.values():
            writer.writerows([pair[0], pair[1]] for pair in pairs)

In [3]:
def plot_spec(filepath):
    """
    Plot the waveform and the spectrogram of a wav file, stacked vertically.

    Args:
        filepath: wav file to read with ``scipy.io.wavfile.read``.

    The top panel plots the raw samples (one line per channel for
    multi-channel audio). The bottom panel shows the spectrogram; since
    ``specgram`` takes 1-D input, only the first channel is used there when
    the file has several channels.
    """
    samplingFrequency, signalData = wavfile.read(filepath)

    # Multi-channel files come back as a 2-D array (samples x channels);
    # pick the first channel for the spectrogram.
    spec_input = signalData[:, 0] if signalData.ndim > 1 else signalData

    fig, (ax_wave, ax_spec) = plt.subplots(2, 1)

    # Plot the signal read from wav file
    ax_wave.set_title('Spectrogram of a wav file')
    ax_wave.plot(signalData)
    ax_wave.set_xlabel('Sample')
    ax_wave.set_ylabel('Amplitude')

    ax_spec.specgram(spec_input, Fs=samplingFrequency)
    ax_spec.set_xlabel('Time')
    ax_spec.set_ylabel('Frequency')

    # Keep the top panel's x-label from overlapping the bottom panel.
    fig.tight_layout()
    plt.show()

## Check all speaker folders

In [14]:
# Folders under dataset/wavs/, ordered numerically by the id in front of the
# first '-' (plain string order would put '100-...' before '2-...').
# Entries that are not directories, or whose prefix is not a number, are
# skipped so that int() cannot fail on a stray file.
folder = sorted(
    (
        name for name in os.listdir('dataset/wavs/')
        if os.path.isdir(os.path.join('dataset/wavs/', name))
        and name.split('-')[0].isdecimal()
    ),
    key=lambda x: int(x.split('-')[0]),
)
# Last expression of the cell, so the folder count is actually displayed.
len(folder)

In [12]:
# Rename all files (dry run).
# Walks every entry of dataset/wavs/ and builds the "<prefix>_<position>.wav"
# name each file would get, where <prefix> is the part of the current name
# before the first '-'. The os.rename call is commented out, so nothing on
# disk is changed.
for f in tqdm(glob.glob('dataset/wavs/*')[:]):
    audio_files = os.listdir(f)
    for i, af in enumerate(audio_files):
        prefix = af.replace('.wav', '').split('-')[0]
        new_name = f"{prefix}_{i}.wav"
        if not os.path.exists(os.path.join(f, new_name)):
            # os.rename(os.path.join(f, af), os.path.join(f, new_name))
            pass


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 1052.77it/s]


In [4]:
import time
import librosa
import IPython.display as ipd
from IPython.display import clear_output

In [847]:
# Listen to whole directory
# File count per folder, in the same order as `folder`, so that
# sum(nfiles[:index]) is the number of files in the folders that come before
# folder[index]. (Counting in glob order did not line up with `folder`.)
nfiles = [len(os.listdir(os.path.join('dataset/wavs', name))) for name in folder]
start_id = 377  # position in `folder` to resume listening from

for index in range(start_id, len(folder)):
    audio_in_folder = [str(Path(x)) for x in glob.glob(f'dataset/wavs/{folder[index]}/*.wav')]
    print(folder[index], 'id:', index, 'count:', sum(nfiles[:index]))
    for i, path in enumerate(audio_in_folder):
        print(f"[{i + 1}/{len(audio_in_folder)}] {path}")

        audio = ipd.Audio(path, autoplay=True)
        ipd.display(audio)

        # Sleep for the clip's duration plus 0.5 s before showing the next
        # player.
        # NOTE(review): newer librosa releases appear to rename this keyword
        # to `path=` - confirm against the installed version.
        duration = librosa.get_duration(filename=path)
        time.sleep(duration + 0.5)
    time.sleep(1.0)
    # Wipe this folder's players once the next folder starts printing.
    clear_output(wait=True)

812-M-27 id: 399 count: 10609
[1/21] dataset\wavs\812-M-27\812_0.wav


[2/21] dataset\wavs\812-M-27\812_1.wav


[3/21] dataset\wavs\812-M-27\812_10.wav


[4/21] dataset\wavs\812-M-27\812_11.wav


[5/21] dataset\wavs\812-M-27\812_12.wav


[6/21] dataset\wavs\812-M-27\812_13.wav


[7/21] dataset\wavs\812-M-27\812_14.wav


[8/21] dataset\wavs\812-M-27\812_15.wav


[9/21] dataset\wavs\812-M-27\812_16.wav


[10/21] dataset\wavs\812-M-27\812_17.wav


[11/21] dataset\wavs\812-M-27\812_18.wav


[12/21] dataset\wavs\812-M-27\812_19.wav


[13/21] dataset\wavs\812-M-27\812_2.wav


[14/21] dataset\wavs\812-M-27\812_20.wav


[15/21] dataset\wavs\812-M-27\812_3.wav


[16/21] dataset\wavs\812-M-27\812_4.wav


[17/21] dataset\wavs\812-M-27\812_5.wav


[18/21] dataset\wavs\812-M-27\812_6.wav


[19/21] dataset\wavs\812-M-27\812_7.wav


[20/21] dataset\wavs\812-M-27\812_8.wav


[21/21] dataset\wavs\812-M-27\812_9.wav


Specific files

In [246]:
# Listen to a specific file.
# `filename` is "<folder>/<clip name>" without the .wav extension.
filename = "797-F-28/797_13"
path = str(Path("dataset/wavs") / f"{filename}.wav")
# plot_spec(path)
ipd.Audio(path, autoplay=True)

In [248]:
# Listen to a specific folder.
# Only the folder part of "<folder>/<clip>" is kept; the clip part is ignored.
folder_name = '807-F-28/807_10'.partition('/')[0]

# The first nine matches are skipped; the remaining clips are played in turn.
audio_in_folder = glob.glob(f'dataset/wavs2/{folder_name}/*.wav')[9:]
total = len(audio_in_folder)
for position, wav in enumerate(audio_in_folder, start=1):
    path = str(Path(wav))
    print(f"[{position}/{total}] {path}")

    ipd.display(ipd.Audio(path, autoplay=True))

    # Sleep for the clip's duration plus 0.5 s before showing the next player.
    time.sleep(librosa.get_duration(filename=path) + 0.5)

[1/7] dataset/wavs2/805-F-27/805-4.wav


[2/7] dataset/wavs2/805-F-27/805-5.wav


[3/7] dataset/wavs2/805-F-27/805-6.wav


[4/7] dataset/wavs2/805-F-27/805-7.wav


[5/7] dataset/wavs2/805-F-27/805-8.wav


[6/7] dataset/wavs2/805-F-27/805-9.wav


[7/7] dataset/wavs2/805-F-27/805.wav


In [16]:
# Check VAD segments.
# Import only the name this cell uses; a wildcard import hides where names
# come from and can silently shadow notebook variables.
from utils import VAD

path = "dataset/wavs/49-F-26/49_10.wav"

vad_engine = VAD(frame_duration=30, win_length=100)
# NOTE(review): detect() presumably returns the detected speech segments and
# prints its trace when show=True - confirm in utils.
segments = vad_engine.detect(path, write=False, show=True)
print("Total:", len(segments))

___111+(0.09)11111_111111111111111111__1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111__11111111111111111111111__11111111111111111111111111111111111___-(6.270000000000008)_____________111+(6.660000000000011)11111111111111111111_111111111111111111111111111111111111111111111111111111__11111111111111111111111111111-(9.92999999999998)
Total: 2


In [None]:
# Inspect one position of `folder`: number of files in the folders before it,
# number of wavs in it, and its first wav (None when it has none).
index = 133
audio_in_folder = [str(Path(x)) for x in glob.glob(f'dataset/wavs/{folder[index]}/*.wav')]
# Count in the same order as `folder`; counting in glob order would make
# sum(nfiles[:index]) refer to different folders than folder[:index].
nfiles = [len(os.listdir(os.path.join('dataset/wavs', name))) for name in folder]
sum(nfiles[:index]), len(audio_in_folder), (audio_in_folder[0] if audio_in_folder else None)

(4171, 21, 'dataset\\wavs\\494-M-33\\494_0.wav')

In [253]:
# Delete from dataset/wavs2/ every clip listed in error.txt. Each line of the
# file gets '.wav' appended to form a path relative to dataset/wavs2/; entries
# with no matching file are skipped.
with open('error.txt', 'r') as f:
    data = [entry.replace('\n', '') + '.wav' for entry in f]
for line in data:
    path = os.path.join('dataset/wavs2/', line)
    if not os.path.exists(path):
        continue
    os.remove(path)