In [1]:
import os
import glob
import pandas as pd
import textgrids
import tqdm
from utils import flac_to_wav

# Create Subset json

## Keep files in subset

In [7]:
import ast


def get_subset_df(json_dir=None):
    if not json_dir:
        json_dir = '/home/knoriy/Documents/laion/split_peoples_speech/flac_train_manifest.jsonl'

    with open(json_dir, 'r') as json_file:
        json_list = list(json_file)

    df = pd.DataFrame.from_dict(json_list)
    df[0] = df[0].apply(lambda x: ast.literal_eval(x))

    for keys in df[0][0].keys():
        df[keys] = [path[0][keys] for path in df.iloc()]

    subset = glob.glob('/home/knoriy/Documents/laion/split_peoples_speech/subset/**/*.flac', recursive=True)
    subset = [os.path.join(*(dir.split(os.path.sep)[7:])) for dir in subset]

    subset_df = df[df['audio_filepath'].isin(subset)].reset_index(drop=True)
    subset_df = subset_df.drop(0, axis=1)

    return subset_df

get_subset_df().to_csv('/home/knoriy/Documents/laion/split_peoples_speech/subset.tsv', sep='\t', header=None, index=False)

# prepare MFA

In [59]:
# Clean old wavs ONLY FOR TESTING
import glob

old_wav_path = glob.glob(f'/home/knoriy/Documents/laion/split_peoples_speech/subset/**/*.wav', recursive=True)
for wav_path in old_wav_path:
    os.remove(wav_path)

# old_txt_path = glob.glob(f'/home/knoriy/Documents/laion/split_peoples_speech/subset/**/*.txt', recursive=True)
# for txt_path in old_txt_path:
#     os.remove(txt_path)

# old_txt_path = glob.glob(f'/home/knoriy/Documents/laion/split_peoples_speech/subset_split/**/*.txt', recursive=True)
# for txt_path in old_txt_path:
#     os.remove(txt_path)

In [2]:
def generate_txt(df):
    for row in tqdm.tqdm(df.iloc, desc="Generating .txt files for MFA"):
        open(f'./subset/{row["audio_filepath"].split(".")[0]}.txt', 'w').write(row['text'])

## Generate TextGrid alignments

In [3]:
def generate_alignments(src, dest, overwrite=True):
    # if os.path.exists(dest): raise Warning("Desitination Folder already exists")
    os.system(f'mfa align --clean {src} english english {dest}')

def generate_textgrids(dataset_root_path):
    generate_alignments(dataset_root_path, f"{dataset_root_path}_textgrids")

# generate_textgrids('/home/knoriy/Documents/laion/split_peoples_speech/subset/')

# Split audio into 5-10 seconds

In [11]:
def get_potential_splits(textgrid_words):

    potential_split_points = []
    for index, word in enumerate(textgrid_words):
        if word.text != "":
            continue
        if word.xmin > 5 and word.xmax < 10: # find split that are longer than 5 sec and shorter than 10
            potential_split_points.append(index)
            # print(word.xmin, word.xmax)

    return potential_split_points

def get_longest_silance(textgrid_words):
    potential_split_points = get_potential_splits(textgrid_words)

    silance_length = 0
    word_index = None
    time = 0

    for index in potential_split_points:
        silance = textgrid_words[index]
        
        if (silance.xmax - silance.xmin) > silance_length:
            silance_length = (silance.xmax - silance.xmin)
            word_index = index

            time = (silance.xmax + silance.xmin) / 2

    return word_index, time

In [60]:
def split_audio(root_textgrid_path, root_wav_path):
    textgrid_paths = glob.glob(f'{root_textgrid_path}/**/*.TextGrid', recursive=True)

    for path in tqdm.tqdm(textgrid_paths, desc='spliting flac files into 5-10 seconds'):
        textgrid = textgrids.TextGrid(path)
        textgrid_words = textgrid.get('words')

        word_index, split_time = get_longest_silance(textgrid_words)

        # get src path
        wav_paths = os.path.split(path)
        wav_file_name = f'{str(wav_paths[-1]).split(".")[0]}.flac'
        wav_folder_name = os.path.split(wav_paths[0])[-1]

        src_wav_path = os.path.join(root_wav_path, wav_folder_name, wav_file_name) # remove aligned and replace file extension

        # create destinaltion path
        processed_path = os.path.join(f'{root_wav_path}_split', wav_folder_name)
        os.makedirs(processed_path, exist_ok=True)
        dest_path = os.path.join(processed_path, f"{wav_file_name}")

        # Split audio
        returned_path = os.system(f"ffmpeg -i {src_wav_path} -f segment -segment_times {split_time} {dest_path}_%02d.flac")

        # Split text
        sentences = [[word.text for word in textgrid_words[:word_index]], [word.text for word in textgrid_words[word_index:]]]
        for sentence in sentences:
            with open(f'{dest_path}_%03d.txt', 'w') as file:
                file.write(' '.join(sentence))



In [4]:
%%time

import os
import pandas as pd
from utils import flac_to_wav
import tqdm

def main():
    df = pd.read_csv('/home/knoriy/Documents/laion/split_peoples_speech/subset.tsv', names=["audio_filepath","duration", "shard_id", "text"], header=None, sep="\t")
    base_pps_dataset_path = '/home/knoriy/Documents/laion/split_peoples_speech/subset'

    # generate_txt(df)

    # for row in tqdm.tqdm(df.iloc, desc="Converting .flac files to .wav"):
    #     flac_path = os.path.join(base_pps_dataset_path, f'{row["audio_filepath"]}')
    #     wav_path = os.path.join(base_pps_dataset_path, f'{row["audio_filepath"].split(".")[0]}.wav')
    #     flac_to_wav(flac_path, wav_path, overwrite=True, no_log=False)

    # generate_textgrids(base_pps_dataset_path)
    split_audio('/home/knoriy/Documents/laion/split_peoples_speech/subset_textgrids', base_pps_dataset_path)

# main()


CPU times: user 797 ms, sys: 646 ms, total: 1.44 s
Wall time: 457 ms


# Tar loader

In [1]:
import tarfile

In [2]:
filename = '/home/knoriy/Downloads/subset_flac.tar'

In [4]:
with tarfile.open(filename,'r') as file_obj:
    file_names = file_obj.getnames()[1:]

    file_obj.extract(file_names[:10])

    # for file_name in file_names:
    #     file = file_obj.extractfile(file_name)
    #     # print(file.read())

AttributeError: 'list' object has no attribute 'islnk'