In [8]:
import os
import glob
import pandas as pd
import textgrids
import tqdm
from utils import flac_to_wav

# Create Subset json

## Keep files in subset

In [3]:
import ast

def get_subset_df(json_dir=None, subset_dir=None):
    """Return the manifest rows whose audio file is present in the subset folder.

    The manifest is read one record per line. Every ``.flac`` file found
    (recursively) under ``subset_dir`` is expressed relative to that folder and
    matched against the manifest's ``audio_filepath`` column.

    Args:
        json_dir: Path to the JSON-lines manifest (a file, despite the name).
            Defaults to the training manifest under ``pps_split``.
        subset_dir: Folder holding the subset's ``.flac`` files. Defaults to the
            ``subset`` folder under ``pps_split``.

    Returns:
        pandas.DataFrame with one column per key of the first manifest record,
        restricted to rows whose ``audio_filepath`` exists in the subset and
        re-indexed from 0. An empty DataFrame if the manifest has no records.
    """
    import json  # local import so this cell does not depend on another cell's imports

    if not json_dir:
        json_dir = '/home/knoriy/Documents/laion/pps_split/flac_train_manifest.jsonl'
    if not subset_dir:
        subset_dir = '/home/knoriy/Documents/laion/pps_split/subset'

    records = []
    with open(json_dir, 'r') as json_file:
        for line in json_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Fall back to Python-literal records, which is what the
                # previous implementation parsed.
                records.append(ast.literal_eval(line))

    if not records:
        return pd.DataFrame()

    # Columns follow the first record's keys, as before.
    df = pd.DataFrame(records, columns=list(records[0].keys()))

    # Express every subset file relative to the subset root instead of slicing
    # off a fixed number of leading path components.
    flac_paths = glob.glob(os.path.join(subset_dir, '**', '*.flac'), recursive=True)
    subset = {os.path.relpath(flac_path, subset_dir) for flac_path in flac_paths}

    subset_df = df[df['audio_filepath'].isin(subset)].reset_index(drop=True)

    return subset_df

# Export the subset manifest as a tab-separated file without header or index;
# main() reads this same path back and supplies the column names itself.
get_subset_df().to_csv('/home/knoriy/Documents/laion/pps_split/subset.tsv', sep='\t', header=None, index=False)

# prepare MFA

In [4]:
# Remove previously generated .wav and .txt files from the subset folder - ONLY FOR TESTING
import glob

# Collect every generated artefact under the subset folder: converted .wav
# audio and the .txt transcripts.
old_wav_path = glob.glob('/home/knoriy/Documents/laion/pps_split/subset/**/*.wav', recursive=True)
old_txt_path = glob.glob('/home/knoriy/Documents/laion/pps_split/subset/**/*.txt', recursive=True)

# Delete them in one pass, audio first and transcripts second.
for stale_path in [*old_wav_path, *old_txt_path]:
    os.remove(stale_path)

In [5]:
def generate_txt(df, root='./subset'):
    """Write one transcript ``.txt`` file per row of ``df``.

    Each file is named after the row's ``audio_filepath`` with everything from
    the first ``.`` onwards replaced by ``.txt``, and is placed under ``root``.
    The target directories must already exist.

    Args:
        df: DataFrame with ``audio_filepath`` and ``text`` columns.
        root: Folder the transcript paths are resolved against. Defaults to
            ``./subset`` (relative to the current working directory).
    """
    # iterrows() with an explicit total lets tqdm show real progress; the
    # previous ``df.iloc`` iteration only worked through index probing.
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Generating .txt files for MFA"):
        # NOTE: the stem is derived exactly as main() derives the .wav name, so
        # each transcript sits next to its audio file.
        txt_path = os.path.join(root, f'{row["audio_filepath"].split(".")[0]}.txt')
        # Context manager closes the handle deterministically; explicit UTF-8
        # keeps the transcript independent of the locale's default encoding.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(row['text'])

In [10]:
# base_pps_dataset_path = '/home/knoriy/Documents/laion/pps_split/subset'

# # Converting Flac to wav
# for row in df.iloc:
#     flac_path = os.path.join(base_pps_dataset_path, f'{row[0]}')
#     wav_path = os.path.join(base_pps_dataset_path, f'{row[0].split(".")[0]}.wav')

#     flac_to_wav(flac_path, wav_path, overwrite=True, no_log=False)

## Generate TextGrid alignments

In [7]:
def generate_alignments(src, dest, overwrite=True):
    """Run ``mfa align --clean`` on ``src`` and write the alignments to ``dest``.

    Args:
        src: Corpus folder handed to ``mfa align``.
        dest: Output folder handed to ``mfa align``.
        overwrite: Accepted for backward compatibility; currently unused.

    Returns:
        The exit status of the ``mfa`` process (0 on success).

    Raises:
        FileNotFoundError: If the ``mfa`` executable is not on ``PATH``.
    """
    import subprocess  # local import: not part of the notebook's import cell

    # NOTE: dest is not checked for existence before running; an earlier
    # "destination folder already exists" guard was left disabled.
    # The two 'english' arguments are presumably the pretrained dictionary and
    # acoustic model names - confirm against the installed MFA version.
    command = ['mfa', 'align', '--clean', str(src), 'english', 'english', str(dest)]

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact as single arguments.
    return subprocess.run(command, check=False).returncode

def generate_textgrids(dataset_root_path):
    """Align the dataset at ``dataset_root_path`` into a sibling ``<root>_textgrids`` folder.

    Args:
        dataset_root_path: Corpus folder; passed unchanged to
            ``generate_alignments`` as the source.
    """
    # Drop trailing separators before appending the suffix; otherwise a root
    # such as ".../subset/" would produce ".../subset/_textgrids", i.e. an
    # output folder nested inside the corpus being aligned.
    trimmed_root = dataset_root_path.rstrip('/' + os.path.sep) or dataset_root_path
    generate_alignments(dataset_root_path, f"{trimmed_root}_textgrids")

# generate_textgrids('/home/knoriy/Documents/laion/pps_split/subset/')

# Split audio into 5-10 seconds

In [9]:
def get_potential_splits(textgrid_words):
    """Return the indices of empty-text intervals that lie inside the 5-10 s window.

    An interval qualifies when its ``text`` is the empty string, it starts
    after the 5 second mark and it ends before the 10 second mark (both bounds
    exclusive).

    Args:
        textgrid_words: Iterable of intervals exposing ``text``, ``xmin`` and
            ``xmax`` attributes.

    Returns:
        List of integer positions into ``textgrid_words``, in ascending order.
    """
    return [
        position
        for position, interval in enumerate(textgrid_words)
        if interval.text == "" and interval.xmin > 5 and interval.xmax < 10
    ]

def get_longest_silance(textgrid_words):
    """Pick the longest candidate silence and return its index and midpoint.

    Candidates come from ``get_potential_splits``. When several candidates
    share the longest duration, the earliest one wins.

    Args:
        textgrid_words: Indexable sequence of intervals exposing ``xmin`` and
            ``xmax`` attributes.

    Returns:
        Tuple ``(index, midpoint)``: the position of the chosen interval in
        ``textgrid_words`` and the time halfway between its bounds.
        ``(None, 0)`` when no candidate has a positive duration.
    """
    best_index, best_duration, midpoint = None, 0, 0

    for candidate in get_potential_splits(textgrid_words):
        interval = textgrid_words[candidate]
        duration = interval.xmax - interval.xmin

        # Only a strictly longer silence replaces the current best.
        if not duration > best_duration:
            continue

        best_index, best_duration = candidate, duration
        midpoint = (interval.xmax + interval.xmin) / 2

    return best_index, midpoint

In [10]:
def split_audio(root_textgrid_path, root_wav_path):
    """Split each aligned ``.flac`` file once, at its longest silence in the 5-10 s window.

    For every ``.TextGrid`` under ``root_textgrid_path`` the matching source
    file is looked up as ``<root_wav_path>/<parent folder>/<stem>.flac`` and
    the result is written to ``<root_wav_path>_split/<parent folder>/``.
    Files with no usable split point are copied unchanged as segment ``000``
    so that every input still has an output.

    Args:
        root_textgrid_path: Folder searched recursively for ``.TextGrid`` files.
        root_wav_path: Dataset root holding the source ``.flac`` files.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not on ``PATH``.
    """
    import shutil      # local imports: not part of the notebook's import cell
    import subprocess

    textgrid_paths = glob.glob(f'{root_textgrid_path}/**/*.TextGrid', recursive=True)

    for path in tqdm.tqdm(textgrid_paths, desc='spliting flac files into 5-10 seconds'):
        textgrid = textgrids.TextGrid(path)
        # A missing 'words' tier is treated as "nothing to split on".
        split_index, split_time = get_longest_silance(textgrid.get('words') or [])

        # Source path: same parent-folder name and stem as the TextGrid, with
        # a .flac extension.
        textgrid_dir, textgrid_name = os.path.split(path)
        wav_file_name = f'{textgrid_name.split(".")[0]}.flac'
        wav_folder_name = os.path.basename(textgrid_dir)

        src_wav_path = os.path.join(root_wav_path, wav_folder_name, wav_file_name)
        if not os.path.isfile(src_wav_path):
            # Keep going, as the shell-based version did when ffmpeg failed.
            tqdm.tqdm.write(f'Skipping {path}: source audio {src_wav_path} not found')
            continue

        # Destination folder mirrors the source folder under "<root>_split".
        processed_path = os.path.join(f'{root_wav_path}_split', wav_folder_name)
        os.makedirs(processed_path, exist_ok=True)
        # NOTE(review): segments are named "<stem>.flac_000.flac" because the
        # pattern is built from the full file name; kept as-is so existing
        # outputs and consumers are unaffected.
        dest_path = os.path.join(processed_path, f"{wav_file_name}_%03d.flac")

        if split_index is None:
            # (None, 0) means "no silence found"; 0 must not be handed to
            # ffmpeg as a real split time. Emit the whole file as segment 000.
            shutil.copyfile(src_wav_path, os.path.join(processed_path, f"{wav_file_name}_000.flac"))
            continue

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        subprocess.run(
            ['ffmpeg', '-i', src_wav_path, '-f', 'segment', '-segment_times', str(split_time), dest_path],
            check=False,
        )

In [11]:
import os
import pandas as pd
from utils import flac_to_wav
import tqdm

def main(tsv_path='/home/knoriy/Documents/laion/pps_split/subset.tsv',
         base_pps_dataset_path='/home/knoriy/Documents/laion/pps_split/subset'):
    """Run the full preparation pipeline on the subset.

    Steps, in order: write transcripts, convert every ``.flac`` to ``.wav``,
    generate TextGrid alignments, then split the audio.

    Args:
        tsv_path: Headerless tab-separated manifest with the columns
            ``audio_filepath``, ``duration``, ``shard_id`` and ``text``.
        base_pps_dataset_path: Dataset root that ``audio_filepath`` values are
            relative to.

    NOTE(review): ``generate_txt`` is called with its defaults and writes under
    the relative folder ``./subset``, so the working directory must be the
    parent of the dataset folder for transcripts to land next to the audio.
    """
    df = pd.read_csv(tsv_path, names=["audio_filepath","duration", "shard_id", "text"], header=None, sep="\t")

    # Normalise away trailing separators so the "_textgrids" suffix below
    # always names a sibling of the dataset folder.
    base_pps_dataset_path = base_pps_dataset_path.rstrip('/' + os.path.sep) or base_pps_dataset_path

    generate_txt(df)

    # Iterating the column directly gives tqdm a known total.
    for audio_filepath in tqdm.tqdm(df["audio_filepath"], desc="Converting .flac files to .wav"):
        flac_path = os.path.join(base_pps_dataset_path, audio_filepath)
        # Same stem derivation as generate_txt, so .wav and .txt names pair up.
        wav_path = os.path.join(base_pps_dataset_path, f'{audio_filepath.split(".")[0]}.wav')
        flac_to_wav(flac_path, wav_path, overwrite=True, no_log=False)

    generate_textgrids(base_pps_dataset_path)
    # generate_textgrids writes to "<root>_textgrids"; derive the same path
    # here instead of repeating it as a literal.
    split_audio(f'{base_pps_dataset_path}_textgrids', base_pps_dataset_path)

# Run the whole pipeline: transcripts, .flac -> .wav conversion, TextGrid alignment, audio splitting.
main()


Generating .txt files for MFA: 15it [00:00, 2741.85it/s]
Converting .flac files to .wav: 0it [00:00, ?it/s]ffmpeg version 5.0 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.3.0 (GCC)
  configuration: --prefix=/home/knoriy/anaconda3/envs/aligner --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1646229198505/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-demuxer=dash --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-vaapi --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1646229198505/_build_env/bin/pkg-config
  libavutil      57. 17.100 / 57. 17.100
  libavcodec     59. 18.100 / 59. 18.100
  libavformat    59. 16.100 / 59. 16.100
  libavdevice   

INFO - Setting up corpus information...
INFO - Loading corpus from source files...


100%|██████████| 15/15 [00:01<00:00, 14.80it/s]


INFO - Number of speakers in corpus: 5, average number of utterances per speaker: 3.0
INFO - Setting up training data...
INFO - Generating base features (mfcc)...
INFO - Generating MFCCs...


100%|██████████| 15/15 [00:01<00:00, 12.27it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

INFO - Calculating CMVN...
INFO - Compiling training graphs...


100%|██████████| 15/15 [00:01<00:00, 13.69it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

INFO - Performing first-pass alignment...
INFO - Generating alignments...


100%|██████████| 15/15 [00:01<00:00, 10.59it/s]


INFO - Collecting word alignments from alignment lattices...


100%|██████████| 15/15 [00:01<00:00, 13.89it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

INFO - Collecting phone alignments from alignment lattices...


100%|██████████| 15/15 [00:01<00:00, 12.08it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

INFO - Exporting TextGrids to /home/knoriy/Documents/MFA/subset_pretrained_aligner/pretrained_aligner/textgrids...


100%|██████████| 15/15 [00:01<00:00, 14.04it/s]


INFO - Finished exporting TextGrids to
                /home/knoriy/Documents/MFA/subset_pretrained_aligner/pretrained_aligner/textgrids!
INFO - Done! Everything took 12.318682432174683 seconds


spliting flac files into 5-10 seconds:   0%|          | 0/15 [00:00<?, ?it/s]ffmpeg version 5.0 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.3.0 (GCC)
  configuration: --prefix=/home/knoriy/anaconda3/envs/aligner --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1646229198505/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-demuxer=dash --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-vaapi --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1646229198505/_build_env/bin/pkg-config
  libavutil      57. 17.100 / 57. 17.100
  libavcodec     59. 18.100 / 59. 18.100
  libavformat    59. 16.100 / 59. 16.100
  libavdevice    59.  4.100 / 59.  4.100
  lib