## Imports

In [1]:
import os, pprint
import pandas as pd
import matplotlib.pyplot as plt 
import re
import wave
import contextlib
import time

## Helper funcs

In [66]:
########## Audio duration helper funcs ##########
def get_wavs(path, result_list):
    """Recursively collect the paths of all ``.wav`` files below ``path``.

    Args:
        path: Directory to scan. Sub-directories are visited recursively.
        result_list: List that is extended in place with ``"{dir}/{file}"``
            strings, one per matching file.

    Returns:
        None. Results are delivered through ``result_list``.
    """
    # Raw string with an escaped dot: the file name must really end in ".wav".
    # An unescaped "." would also accept names such as "notawav".
    # Names containing whitespace are deliberately still not matched (\S+).
    pattern = re.compile(r"^\S+\.wav$")

    for file in os.listdir(path):
        full_path = f"{path}/{file}"
        if os.path.isdir(full_path):
            get_wavs(full_path, result_list)
        elif pattern.match(file):
            result_list.append(full_path)
            
def get_duration(fname):
    """Return the duration of a wav file in seconds.

    Args:
        fname: Path of the wav file to inspect.

    Returns:
        ``frames / framerate`` as a float. If the file cannot be parsed as a
        wav file (empty/truncated file, or content that is not RIFF/WAVE) the
        sentinel ``10000000`` is returned, so that duration filters treat the
        file as "too long" and drop it.
    """
    try:
        # wave.open() returns a context manager, so the file is always closed.
        with wave.open(fname, 'rb') as f:
            frames = f.getnframes()
            rate = f.getframerate()
    except (EOFError, wave.Error):
        # EOFError: empty/truncated file; wave.Error: not a valid wav file.
        # Both are unusable, so report them with the same sentinel instead of
        # aborting the scan of the whole corpus.
        return 10000000
    return frames / float(rate)

def get_shorter_than_10s_wavs(path):
    """List the wav files below ``path`` that last at most 10 seconds.

    Args:
        path: Root directory that is scanned recursively for wav files.

    Returns:
        A list of paths reduced by ``path_cleaner`` (last three path
        components), one per wav file whose duration is <= 10.0 seconds.
    """
    wav_paths = []
    get_wavs(path, wav_paths)
    # The former `if True:` guard made the un-cleaned return unreachable;
    # only the cleaned variant was ever used, so it is the one kept.
    return [path_cleaner(i) for i in wav_paths if get_duration(i) <= 10.0]
########## Metadata funcs ##########

def path_cleaner(path_string):
    """Reduce a path to its last three '/'-separated components.

    Dropping the leading components gives paths a common form, which is
    what the file comparisons in this notebook rely on. Paths with three or
    fewer components are returned unchanged.
    """
    components = path_string.split('/')
    tail = components[-3:]
    return '/'.join(tail)

def get_sent_paths(dataset_path):
    """Build the ``sentences.csv`` path for every sub-directory of the dataset.

    Only directory entries of ``dataset_path`` are considered; plain files
    are ignored. The existence of ``sentences.csv`` itself is not checked.
    """
    return [
        f"{dataset_path}/{entry}/sentences.csv"
        for entry in os.listdir(dataset_path)
        if os.path.isdir(f"{dataset_path}/{entry}")
    ]

def get_metadata(dataset_path, add_22k=True, ignore_longer_than_10s=True):
    """Combine the per-speaker ``sentences.csv`` files into one metadata list.

    Args:
        dataset_path: Root directory with one sub-directory per speaker, each
            holding a ``sentences.csv``.
        add_22k: If true, the first ``.wav`` of every row is rewritten to
            ``-22k.wav`` (the resampler renames the wav files that way).
        ignore_longer_than_10s: If true, rows whose wav file is not among the
            files of at most 10 seconds below ``dataset_path`` are dropped.

    Returns:
        A list of strings ``"{speaker_id}/{row}|{speaker_id}"``, where ``row``
        is a stripped line of ``sentences.csv``.
        NOTE(review): rows presumably look like ``path.wav|sentence`` -
        confirm against the data.
    """
    paths = get_sent_paths(dataset_path)
    metadata_list = []

    for i in paths:
        # The speaker id is the name of the directory holding sentences.csv
        speaker_id = i.split('/')[-2]
        with open(i) as f:
            sentences = f.readlines()
        metadata_list.extend(
            f"{speaker_id}/{row.strip()}|{speaker_id}"
            for row in sentences
            if row.strip()  # blank lines carry no sample
        )
    print("Combined all metadata files")

    if add_22k:
        # Adds -22k to the file name, because the resampler changes the wav
        # file names. Only the first ".wav" is touched, so sentence text that
        # happens to contain ".wav" is preserved rather than truncated.
        metadata_list = [row.replace('.wav', '-22k.wav', 1) for row in metadata_list]
        print("Added -22k to all soundfile paths in metadata")

    if ignore_longer_than_10s:
        # Filters out rows whose audio is longer than 10s
        print("Starting to remove lines longer than 10s, this might take a while")
        # A set makes the per-row membership test O(1) instead of O(n).
        shorter_than_10s = set(get_shorter_than_10s_wavs(dataset_path))
        new_metadata_list = [
            row for row in metadata_list
            if path_cleaner(row.split("|")[0]) in shorter_than_10s
        ]
        # Guard against an empty corpus, which would divide by zero.
        if metadata_list:
            removed_percent = round(100-(len(new_metadata_list)/len(metadata_list))*100,2)
        else:
            removed_percent = 0.0
        print(f"Filtered out lines longer than 10s, because of this removed about {removed_percent}% of the corpus")
        metadata_list = new_metadata_list

    return metadata_list

def generate_metadata_file(source_path, target_path, file_name):
    """Build the metadata for ``source_path`` and write it to a text file.

    The metadata comes from ``get_metadata(source_path)`` with its default
    options; every row is written on its own line to
    ``{target_path}/{file_name}``.
    """
    rows = get_metadata(source_path)
    output_file = f"{target_path}/{file_name}"

    with open(output_file, 'w') as out:
        for row in rows:
            out.write(f"{row}\n")
    print(f"Wrote metadata file to {target_path}")

In [99]:
# Write the metadata of the preprocessed_v2 corpus only.
# NOTE(review): this targets the same file that the combined-metadata cell
# further down writes, so whichever cell runs last decides its content.
generate_metadata_file(
    source_path='/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2',
    target_path='/gpfs/space/home/zuppur/cotatron/datasets/metadata/',
    file_name='estonian_metadata.txt',
)

Combined all metadata files
Added 22k to all soundfile paths in metadata
Starting to remove lines longer than 10s, this might take a while
Filtered out lines longer than 10s, because of this removed about 12.780000000000001% of the corpus
Wrote metadata file to /gpfs/space/home/zuppur/cotatron/datasets/metadata/


# Common Voice

## Common Voice helper functions

In [67]:
def read_commonvoice_meta(meta_file, target_path,
                          clips_path='/gpfs/space/home/zuppur/cotatron/data/cv-corpus-6.1-2020-12-11/et/clips_wav'):
    """Build metadata rows from a Common Voice TSV, keeping clips of <= 10s.

    Args:
        meta_file: Path of the tab-separated Common Voice metadata file. The
            columns ``path``, ``sentence`` and ``client_id`` are read.
        target_path: Prefix put in front of every clip file name in the rows.
        clips_path: Directory that is scanned for the converted wav clips
            when measuring durations. Defaults to the location that used to
            be hard-coded here.

    Returns:
        A list of strings ``"{target_path}/{clip}-22k.wav|{sentence}|{client_id}"``
        for every clip found among the wav files of at most 10 seconds.
    """
    # Reads the meta file to df
    print("Reading metadata file and formating it")
    df = pd.read_csv(meta_file, delimiter='\t')
    # Replaces .mp3 with -22k.wav because we change the file type
    wav_names = df['path'].str.replace('.mp3', '-22k.wav', regex=False)
    # Iterating the needed columns directly avoids the slow row-wise iterrows()
    meta = [
        f"{target_path}/{wav_name}|{sentence}|{client_id}"
        for wav_name, sentence, client_id in zip(wav_names, df['sentence'], df['client_id'])
    ]

    # Get wav paths that are shorter than 10s
    print('Starting to remove lines longer than 10s, this might take a while')
    # A set makes the per-row membership test O(1) instead of O(n).
    shorter_than_10s = set(get_shorter_than_10s_wavs(clips_path))
    # get_shorter_than_10s_wavs returns path_cleaner'ed paths, so the row path
    # is normalised the same way before comparing (as get_metadata does).
    new_meta = [row for row in meta if path_cleaner(row.split('|')[0]) in shorter_than_10s]
    # Guard against an empty metadata file, which would divide by zero.
    if meta:
        removed_percent = round(100-(len(new_meta)/len(meta))*100,2)
    else:
        removed_percent = 0.0
    print(f"Filtered out lines longer than 10s, because of this removed about {removed_percent}% of the corpus")

    return new_meta

#meta = read_commonvoice_meta('/gpfs/space/home/zuppur/cotatron/data/cv-corpus-6.1-2020-12-11/et/validated.tsv', 'et/clips_wav')

# Writing the metadata file

In [85]:
# Input locations: the preprocessed_v2 corpus root and the Common Voice TSV.
v2_source_path = '/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2'
commonvoice_source_path = '/gpfs/space/home/zuppur/cotatron/data/cv-corpus-6.1-2020-12-11/et/validated.tsv'
# Prefix that read_commonvoice_meta puts in front of every clip file name.
commonvoice_audio_target_path = 'et/clips_wav'

print("Starting with preprocessed_v2 metadata")
# Default options: -22k suffix added and clips longer than 10s filtered out.
preprocessed_v2_meta = get_metadata(v2_source_path)
print("Starting to work on commonvoice metadata")
# Both result lists are consumed by the following cell, which combines them.
commonvoice_meta = read_commonvoice_meta(commonvoice_source_path, commonvoice_audio_target_path)

Starting with preprocessed_v2 metadata
Combined all metadata files
Added -22k to all soundfile paths in metadata
Starting to remove lines longer than 10s, this might take a while
Filtered out lines longer than 10s, because of this removed about 12.78% of the corpus
Starting to work on commonvoice metadata
Reading metadata file and formating it
Starting to remove lines longer than 10s, this might take a while
Filtered out lines longer than 10s, because of this removed about 4.9% of the corpus


In [86]:
# Prefix every row with the folder name of the corpus it belongs to, so that
# the audio paths of both corpora are distinguishable in the combined file.
preprocessed_v2_meta_2 = [f"preprocessed_v2/{i}" for i in preprocessed_v2_meta]
commonvoice_meta_2 = [f"cv-corpus-6.1-2020-12-11/{i}" for i in commonvoice_meta]

# Combine the two lists: preprocessed_v2 rows first, Common Voice rows after.
combined_meta = preprocessed_v2_meta_2 + commonvoice_meta_2

# target_path already ends with '/', so the f-string below yields a double
# slash in the file path (harmless on POSIX file systems).
target_path = '/gpfs/space/home/zuppur/cotatron/datasets/metadata/'
file_name = 'estonian_metadata.txt'

# Write the combined metadata file, one row per line (overwrites the file).
with open(f"{target_path}/{file_name}", 'w') as f:
    f.writelines(f"{row}\n" for row in combined_meta)
    print(f"Wrote metadata file to {target_path}")

Wrote metadata file to /gpfs/space/home/zuppur/cotatron/datasets/metadata/


In [89]:
# Read the written metadata file back as a sanity check.
with open('/gpfs/space/home/zuppur/cotatron/datasets/metadata/estonian_metadata.txt', 'r') as f:
    sentences = f.readlines()
    
# Rows are '|'-separated; the third field is kept for every row.
# NOTE(review): judging by the recorded output this field is the speaker id,
# although the list keeps the name `sentences`.
sentences = [i.strip().split('|')[2] for i in sentences]
# The previous last line referenced the undefined name `speaker_isentences`
# (NameError). Showing the first extracted value matches the recorded
# single-string output - TODO confirm this is the intended inspection.
sentences[0]

'ERR-uudised-Tonu_Karjatse'