## Imports

In [2]:
import os, pprint
import pandas as pd
import matplotlib.pyplot as plt 
import re

import wave
import contextlib

## Metadata preparation 

In [None]:
def get_sent_paths(dataset_path):
    """Return the ``sentences.csv`` path for every sub-directory of a dataset.

    Only immediate sub-directories of ``dataset_path`` are considered; plain
    files are ignored. The paths are built by string formatting and are not
    checked for existence. Order follows ``os.listdir``.
    """
    speaker_dirs = filter(
        lambda entry: os.path.isdir(f"{dataset_path}/{entry}"),
        os.listdir(dataset_path),
    )
    return [f"{dataset_path}/{speaker}/sentences.csv" for speaker in speaker_dirs]

In [None]:
def get_metadata(dataset_path, add_22k=True):
    """Collect metadata rows from every speaker's ``sentences.csv``.

    Each non-empty line of a ``sentences.csv`` becomes one row of the form
    ``<speaker_id>/<line>|<speaker_id>``, where ``speaker_id`` is the name of
    the directory that holds the file.

    Args:
        dataset_path: Directory whose sub-directories each hold a
            ``sentences.csv`` (located via ``get_sent_paths``).
        add_22k: When true, rewrite the first ``.wav`` of every row to
            ``-22k.wav``, because the resampler changes the wav file names.

    Returns:
        A list of metadata row strings.
    """
    metadata_list = []

    for sent_path in get_sent_paths(dataset_path):
        # The speaker id is the directory that contains sentences.csv.
        speaker_id = sent_path.split('/')[-2]
        with open(sent_path) as f:
            for line in f:
                line = line.strip()
                # Blank lines carry no sentence; keeping them would produce a
                # bogus "<id>/|<id>" row.
                if line:
                    metadata_list.append(f"{speaker_id}/{line}|{speaker_id}")

    if add_22k:
        # Only the first ".wav" is rewritten (presumably the audio file name
        # at the start of the row -- verify against the sentences.csv
        # layout). Rows without ".wav" pass through unchanged instead of
        # raising IndexError, and text after a later ".wav" is no longer lost.
        metadata_list = [row.replace('.wav', '-22k.wav', 1) for row in metadata_list]

    return metadata_list

In [None]:
def generate_metadata_file(source_path, target_path, file_name):
    """Write the metadata rows of ``source_path`` to ``target_path/file_name``.

    The rows come from ``get_metadata(source_path)`` and are written one per
    line. An existing file is overwritten.
    """
    rows = get_metadata(source_path)
    output_path = f"{target_path}/{file_name}"

    with open(output_path, 'w') as out:
        for row in rows:
            out.write(f"{row}\n")
    
# Generate estonian_metadata.txt from the preprocessed_v2 dataset.
generate_metadata_file(
    source_path='/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2',
    target_path='/gpfs/space/home/zuppur/cotatron/datasets/metadata/',
    file_name='estonian_metadata.txt',
)

In [None]:
def get_speaker_ids(dataset_path):
    """Print the names of the sub-directories of ``dataset_path`` as a list.

    The printed list is meant to be used as the speaker ids in a config file.
    Nothing is returned.
    """
    speaker_ids = []
    for entry in os.listdir(dataset_path):
        if os.path.isdir(f"{dataset_path}/{entry}"):
            speaker_ids.append(entry)
    print(speaker_ids)
    
#get_speaker_ids('/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2/')

## wav Length

In [2]:
def get_wavs(path, result_list):
    """Recursively append the path of every ``.wav`` file under ``path``.

    A file matches when its name contains no whitespace and ends in a literal
    ``.wav``.

    Args:
        path: Directory to search; sub-directories are searched recursively.
        result_list: List extended in place with a ``"<dir>/<file>"`` string
            for each matching file.

    Returns:
        None; results are delivered through ``result_list``.
    """
    # Raw string so "\S" is a regex class rather than an invalid string
    # escape, and an escaped dot so only a real ".wav" extension matches
    # (the unescaped "." also accepted names such as "xywav").
    pattern = re.compile(r"^\S+\.wav$")

    for file in os.listdir(path):
        full_path = f"{path}/{file}"
        if os.path.isdir(full_path):
            get_wavs(full_path, result_list)
        elif pattern.match(file):
            result_list.append(full_path)

In [3]:
def get_duration(fname):
    """Return the duration of the wav file ``fname`` in seconds.

    The duration is the number of frames divided by the frame rate.
    """
    wav_file = wave.open(fname, 'r')
    # closing() guarantees the file is closed even if reading the header fails.
    with contextlib.closing(wav_file):
        return wav_file.getnframes() / float(wav_file.getframerate())

In [4]:
def get_all_durations(data_path):
    """Return the duration of every wav file found under ``data_path``.

    Files are located with ``get_wavs`` and each is measured with
    ``get_duration``; the result keeps the order in which files were found.
    """
    wav_paths = []
    get_wavs(data_path, wav_paths)  # fills wav_paths in place
    return list(map(get_duration, wav_paths))
    
#get_all_durations('/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2/')

In [None]:
# Report the share of wav files that are at most 10 seconds long.
durations = get_all_durations('/gpfs/space/home/zuppur/cotatron/data/preprocessed_v2/')
durations.sort()

# Guard against an empty dataset (nothing to report, and no division by zero).
if durations:
    # Counting directly gives the same number as the index of the first clip
    # longer than 10 s, but still reports a result when no clip exceeds the
    # threshold (the earlier loop printed nothing in that case).
    short_count = sum(1 for duration in durations if duration <= 10)
    print(f"Lenght under 10s proportion is : {round(short_count/len(durations), 2)}")

## Random stuff

In [None]:
# Count the rows of the Common Voice validated.tsv per speaker (client_id).
df = pd.read_csv('/gpfs/space/home/zuppur/cotatron/data/cv-corpus-6.1-2020-12-11/et/validated.tsv', delimiter='\t')
df = df.drop(['up_votes', 'down_votes','age','gender','accent','locale','segment'], axis=1)

# Accumulate straight into `commonvoice` rather than a module-level
# `speaker_counts` variable: that name belongs to the speaker_counts()
# function, and re-running this cell would otherwise overwrite the function.
# Iterating the column also avoids the much slower row-by-row iterrows().
commonvoice = {}
for client_id in df['client_id']:
    commonvoice[client_id] = commonvoice.get(client_id, 0) + 1

In [None]:
def speaker_counts(path):
    """Count how many rows each speaker has in a metadata file.

    Args:
        path: Header-less, ``|``-delimited text file whose three columns are
            read as ``path``, ``text`` and ``speaker``.

    Returns:
        A dict mapping each value of the ``speaker`` column to its number of
        rows, keyed in order of first appearance.
    """
    # Imported locally so the function does not depend on the file's import cell.
    from collections import Counter

    df = pd.read_csv(path, delimiter='|', names=['path', 'text', 'speaker'])
    # Counting the column directly replaces the slow row-by-row iterrows()
    # loop and avoids a local variable that shadowed this function's name.
    return dict(Counter(df['speaker'].tolist()))

# Per-speaker row counts for the two metadata files compared below.
estonian = speaker_counts(
    '/gpfs/space/home/zuppur/cotatron/datasets/metadata/estonian_metadata.txt'
)
english = speaker_counts(
    '/gpfs/space/home/zuppur/cotatron/datasets/metadata/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist_22k.txt'
)

In [None]:
def plot_counts(counts, label):
    """Show a bar chart of the values of ``counts`` in ascending order.

    Args:
        counts: Mapping whose values are the bar heights; its keys are not
            shown.
        label: Title of the chart.
    """
    heights = sorted(counts.values())
    positions = list(range(len(heights)))

    fig = plt.figure()
    # Axes rectangle is [left, bottom, width, height] in figure fractions,
    # so these axes span the whole figure.
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(positions, heights)
    ax.set_title(label)
    plt.show()
    
# One chart per dataset, each titled with the dataset name.
plot_counts(counts=estonian, label='preprocessed_v2')
plot_counts(counts=commonvoice, label='commonvoice')
plot_counts(counts=english, label='english')